diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/added_tokens.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/config.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e910e418df2fd4f36a2f14de30637412c2ce34aa --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/config.json @@ -0,0 +1,199 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": true, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 8, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.2, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/generation_config.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00001-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..207698dda8bbe0ef9c11aeb0e614d7ec51512f36 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b32151dd7864a4e3b06c13abed98d80bf53b2e00d56ec62510b40392f2c9d41b +size 4972489328 diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00002-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..64c7d93eaa5c35045ae44194240c54d6a66ef7c2 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c517446e85d8d0b677a7975a793431529fa4701e4d1ae249f4922758a06a8ad9 +size 4985976068 diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00003-of-00003.safetensors b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..47815c5b6cbcd45ddb36664e1fc8c673fce87896 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d87f7abcc16d06b627b721f9f8d1d1eb53b2f639b4881f008487d4b71efe3d0e +size 248943552 diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model.safetensors.index.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..7cdc5da041253f30bfca8dad5f6a64a31333d1b4 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model.safetensors.index.json @@ -0,0 +1,1033 @@ +{ + "metadata": { + "total_size": 10207261884 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.prob_flips": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/special_tokens_map.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer.model b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer_config.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/trainer_state.json b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c7c4a07bc6385376ac3039475f2a0390c2ab4e5e --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/trainer_state.json @@ -0,0 +1,249523 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.01808289, + "auxiliary_loss_mlp": 0.01789735, + "balance_loss_clip": 1.51843524, + "balance_loss_mlp": 1.50449085, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 55.551105980902214, + "language_loss": 2.85281086, + "learning_rate": 0.0, + "loss": 1.92297995, + "num_input_tokens_seen": 19155, + "step": 1, + "time_per_iteration": 16.257894039154053 + }, + { + "auxiliary_loss_clip": 0.01205479, + "auxiliary_loss_mlp": 0.01193359, + "balance_loss_clip": 1.01232791, + "balance_loss_mlp": 1.00329149, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 37.46852487146263, + "language_loss": 1.82852578, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.85251403, + "num_input_tokens_seen": 36175, + "step": 2, + "time_per_iteration": 2.4552199840545654 + }, + { + "auxiliary_loss_clip": 0.01205605, + "auxiliary_loss_mlp": 0.01193086, + "balance_loss_clip": 1.01247311, + "balance_loss_mlp": 1.00320959, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 32.624995238205535, + "language_loss": 1.57411027, + "learning_rate": 7.073439208833112e-07, + "loss": 1.59809709, + "num_input_tokens_seen": 54870, + "step": 3, + "time_per_iteration": 2.4339540004730225 + }, + { + "auxiliary_loss_clip": 0.01205424, + "auxiliary_loss_mlp": 0.01192892, + "balance_loss_clip": 1.01223946, + "balance_loss_mlp": 1.00272954, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.43668991419872, + "language_loss": 1.67446971, + "learning_rate": 8.925686513863519e-07, + "loss": 1.69845295, + "num_input_tokens_seen": 74575, + "step": 4, + "time_per_iteration": 2.467144727706909 + }, + { + "auxiliary_loss_clip": 0.01205581, + "auxiliary_loss_mlp": 0.01193768, + "balance_loss_clip": 1.012393, + "balance_loss_mlp": 1.00370085, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.20220038984734, + "language_loss": 1.91046274, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.93445635, + "num_input_tokens_seen": 92580, + "step": 5, + "time_per_iteration": 2.725346088409424 + }, + { + "auxiliary_loss_clip": 0.01205299, + "auxiliary_loss_mlp": 0.01193865, + "balance_loss_clip": 1.01211631, + "balance_loss_mlp": 1.00379789, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.376168959356, + "language_loss": 1.60788596, + "learning_rate": 1.153628246576487e-06, + "loss": 1.63187754, + "num_input_tokens_seen": 109705, + "step": 6, + "time_per_iteration": 2.6810302734375 + }, + { + "auxiliary_loss_clip": 0.01205378, + "auxiliary_loss_mlp": 0.01193715, + "balance_loss_clip": 1.012187, + "balance_loss_mlp": 1.00364757, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 25.36348409911497, + "language_loss": 1.5329355, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.55692649, + "num_input_tokens_seen": 129425, + "step": 7, + "time_per_iteration": 2.819789409637451 + }, + { + "auxiliary_loss_clip": 0.0120516, + "auxiliary_loss_mlp": 0.01193029, + "balance_loss_clip": 1.01199865, + "balance_loss_mlp": 1.0029614, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.716473093271645, + "language_loss": 1.4371953, + "learning_rate": 1.338852977079528e-06, + "loss": 1.46117711, + "num_input_tokens_seen": 149210, + "step": 8, + "time_per_iteration": 2.7823731899261475 + }, + { + "auxiliary_loss_clip": 0.01205376, + "auxiliary_loss_mlp": 0.01193742, + "balance_loss_clip": 1.01208019, + "balance_loss_mlp": 1.00376987, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 28.13629233274648, + "language_loss": 1.49908352, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.52307463, + "num_input_tokens_seen": 169055, + "step": 9, + "time_per_iteration": 2.8178865909576416 + }, + { + "auxiliary_loss_clip": 0.01205322, + "auxiliary_loss_mlp": 0.01193663, + "balance_loss_clip": 1.01217508, + "balance_loss_mlp": 1.00378633, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.998466301561454, + "language_loss": 1.44646454, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.47045422, + "num_input_tokens_seen": 188045, + "step": 10, + "time_per_iteration": 2.7466540336608887 + }, + { + "auxiliary_loss_clip": 0.01205532, + "auxiliary_loss_mlp": 0.01193956, + "balance_loss_clip": 1.01234174, + "balance_loss_mlp": 1.00398374, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.73085280871441, + "language_loss": 1.45214415, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.47613907, + "num_input_tokens_seen": 207035, + "step": 11, + "time_per_iteration": 2.729630947113037 + }, + { + "auxiliary_loss_clip": 0.01205196, + "auxiliary_loss_mlp": 0.01193268, + "balance_loss_clip": 1.01199079, + "balance_loss_mlp": 1.00320029, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.893296007646267, + "language_loss": 1.44745958, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.47144425, + "num_input_tokens_seen": 223225, + "step": 12, + "time_per_iteration": 2.7042620182037354 + }, + { + "auxiliary_loss_clip": 0.01205361, + "auxiliary_loss_mlp": 0.01192716, + "balance_loss_clip": 1.01218724, + "balance_loss_mlp": 1.00274372, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 10.96289163051649, + "language_loss": 1.23040295, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.25438368, + "num_input_tokens_seen": 242570, + "step": 13, + "time_per_iteration": 2.708280324935913 + }, + { + "auxiliary_loss_clip": 0.01205496, + "auxiliary_loss_mlp": 0.01193791, + "balance_loss_clip": 1.01224446, + "balance_loss_mlp": 1.00381923, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.820744445285194, + "language_loss": 1.20695221, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.23094511, + "num_input_tokens_seen": 261215, + "step": 14, + "time_per_iteration": 2.6613965034484863 + }, + { + "auxiliary_loss_clip": 0.01205525, + "auxiliary_loss_mlp": 0.01193562, + "balance_loss_clip": 1.01232076, + "balance_loss_mlp": 1.00339913, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 7.9070882271291, + "language_loss": 1.12966204, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.15365279, + "num_input_tokens_seen": 280035, + "step": 15, + "time_per_iteration": 2.708667755126953 + }, + { + "auxiliary_loss_clip": 0.01205312, + "auxiliary_loss_mlp": 0.01193247, + "balance_loss_clip": 1.01211226, + "balance_loss_mlp": 1.00337052, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.590878727715065, + "language_loss": 1.11390805, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.13789356, + "num_input_tokens_seen": 300265, + "step": 16, + "time_per_iteration": 5.706210374832153 + }, + { + "auxiliary_loss_clip": 0.01205367, + "auxiliary_loss_mlp": 0.01193344, + "balance_loss_clip": 1.01218116, + "balance_loss_mlp": 1.00356317, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 4.685798890073619, + "language_loss": 1.12782371, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.15181077, + "num_input_tokens_seen": 317375, + "step": 17, + "time_per_iteration": 2.70894718170166 + }, + { + "auxiliary_loss_clip": 0.01205274, + "auxiliary_loss_mlp": 0.011929, + "balance_loss_clip": 1.01215696, + "balance_loss_mlp": 1.00311899, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.895207262473496, + "language_loss": 1.08077645, + "learning_rate": 1.860972167459798e-06, + "loss": 1.10475826, + "num_input_tokens_seen": 337975, + "step": 18, + "time_per_iteration": 2.764803409576416 + }, + { + "auxiliary_loss_clip": 0.01205436, + "auxiliary_loss_mlp": 0.01192949, + "balance_loss_clip": 1.01223779, + "balance_loss_mlp": 1.00288153, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.563074736614992, + "language_loss": 1.02289855, + "learning_rate": 1.89578346593066e-06, + "loss": 1.04688239, + "num_input_tokens_seen": 356635, + "step": 19, + "time_per_iteration": 2.678893804550171 + }, + { + "auxiliary_loss_clip": 0.01205293, + "auxiliary_loss_mlp": 0.0119234, + "balance_loss_clip": 1.01221597, + "balance_loss_mlp": 1.0027492, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 4.074274314303234, + "language_loss": 1.16505253, + "learning_rate": 1.928808765521199e-06, + "loss": 1.18902898, + "num_input_tokens_seen": 375625, + "step": 20, + "time_per_iteration": 2.702845811843872 + }, + { + "auxiliary_loss_clip": 0.01205092, + "auxiliary_loss_mlp": 0.01192944, + "balance_loss_clip": 1.01188862, + "balance_loss_mlp": 1.00287676, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 5.146174758532688, + "language_loss": 1.05907798, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.08305836, + "num_input_tokens_seen": 394350, + "step": 21, + "time_per_iteration": 2.676811933517456 + }, + { + "auxiliary_loss_clip": 0.01204798, + "auxiliary_loss_mlp": 0.0119346, + "balance_loss_clip": 1.01162434, + "balance_loss_mlp": 1.00339293, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.7147136398742884, + "language_loss": 1.05547392, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.07945657, + "num_input_tokens_seen": 413255, + "step": 22, + "time_per_iteration": 2.74166202545166 + }, + { + "auxiliary_loss_clip": 0.0120493, + "auxiliary_loss_mlp": 0.0119297, + "balance_loss_clip": 1.01170814, + "balance_loss_mlp": 1.00318885, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.996232295281037, + "language_loss": 0.91725898, + "learning_rate": 2.018794797290208e-06, + "loss": 0.94123799, + "num_input_tokens_seen": 433065, + "step": 23, + "time_per_iteration": 2.7226779460906982 + }, + { + "auxiliary_loss_clip": 0.01205077, + "auxiliary_loss_mlp": 0.01193553, + "balance_loss_clip": 1.01183772, + "balance_loss_mlp": 1.00367606, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 4.474252274042546, + "language_loss": 1.08154845, + "learning_rate": 2.046196897962839e-06, + "loss": 1.10553467, + "num_input_tokens_seen": 451175, + "step": 24, + "time_per_iteration": 2.694331645965576 + }, + { + "auxiliary_loss_clip": 0.01204747, + "auxiliary_loss_mlp": 0.0119273, + "balance_loss_clip": 1.01163352, + "balance_loss_mlp": 1.00285375, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.7967266320231254, + "language_loss": 1.01275563, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.03673041, + "num_input_tokens_seen": 468775, + "step": 25, + "time_per_iteration": 2.67570424079895 + }, + { + "auxiliary_loss_clip": 0.01204904, + "auxiliary_loss_mlp": 0.01192496, + "balance_loss_clip": 1.01177132, + "balance_loss_mlp": 1.00261962, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 3.2880948991236614, + "language_loss": 1.06507587, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.08904994, + "num_input_tokens_seen": 488530, + "step": 26, + "time_per_iteration": 2.7398576736450195 + }, + { + "auxiliary_loss_clip": 0.01204867, + "auxiliary_loss_mlp": 0.01192733, + "balance_loss_clip": 1.01174927, + "balance_loss_mlp": 1.00276065, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 2.8460596814313575, + "language_loss": 0.95351124, + "learning_rate": 2.122031762649933e-06, + "loss": 0.97748721, + "num_input_tokens_seen": 510495, + "step": 27, + "time_per_iteration": 2.7592060565948486 + }, + { + "auxiliary_loss_clip": 0.01205005, + "auxiliary_loss_mlp": 0.01192566, + "balance_loss_clip": 1.01193345, + "balance_loss_mlp": 1.00297534, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.4370511561885695, + "language_loss": 1.06444144, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.08841705, + "num_input_tokens_seen": 528605, + "step": 28, + "time_per_iteration": 2.70330548286438 + }, + { + "auxiliary_loss_clip": 0.01204945, + "auxiliary_loss_mlp": 0.01192978, + "balance_loss_clip": 1.01172519, + "balance_loss_mlp": 1.00319672, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 4.259580732030081, + "language_loss": 1.02448964, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.04846883, + "num_input_tokens_seen": 548515, + "step": 29, + "time_per_iteration": 2.731755018234253 + }, + { + "auxiliary_loss_clip": 0.01204945, + "auxiliary_loss_mlp": 0.01193186, + "balance_loss_clip": 1.0116812, + "balance_loss_mlp": 1.00311852, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.792155883902767, + "language_loss": 1.19410634, + "learning_rate": 2.189868360711334e-06, + "loss": 1.21808767, + "num_input_tokens_seen": 564025, + "step": 30, + "time_per_iteration": 2.6613121032714844 + }, + { + "auxiliary_loss_clip": 0.01204784, + "auxiliary_loss_mlp": 0.01193479, + "balance_loss_clip": 1.01160145, + "balance_loss_mlp": 1.00360262, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 4.978146604949817, + "language_loss": 1.02441764, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.0484004, + "num_input_tokens_seen": 583345, + "step": 31, + "time_per_iteration": 2.7494075298309326 + }, + { + "auxiliary_loss_clip": 0.01204682, + "auxiliary_loss_mlp": 0.01193323, + "balance_loss_clip": 1.01158786, + "balance_loss_mlp": 1.00335121, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.9793429190260254, + "language_loss": 0.95690095, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.98088098, + "num_input_tokens_seen": 600010, + "step": 32, + "time_per_iteration": 2.6644530296325684 + }, + { + "auxiliary_loss_clip": 0.01204886, + "auxiliary_loss_mlp": 0.01193063, + "balance_loss_clip": 1.01171541, + "balance_loss_mlp": 1.00318646, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 5.643472605000137, + "language_loss": 0.95272982, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.97670937, + "num_input_tokens_seen": 616295, + "step": 33, + "time_per_iteration": 2.7133309841156006 + }, + { + "auxiliary_loss_clip": 0.01204619, + "auxiliary_loss_mlp": 0.01193014, + "balance_loss_clip": 1.01155615, + "balance_loss_mlp": 1.00332856, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 1.9459367896872999, + "language_loss": 0.91510415, + "learning_rate": 2.270454923596497e-06, + "loss": 0.93908042, + "num_input_tokens_seen": 637640, + "step": 34, + "time_per_iteration": 2.7458982467651367 + }, + { + "auxiliary_loss_clip": 0.0120393, + "auxiliary_loss_mlp": 0.01192818, + "balance_loss_clip": 1.01074171, + "balance_loss_mlp": 1.00294197, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.92708587639214, + "language_loss": 0.7674005, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.79136801, + "num_input_tokens_seen": 659710, + "step": 35, + "time_per_iteration": 2.9707651138305664 + }, + { + "auxiliary_loss_clip": 0.01204104, + "auxiliary_loss_mlp": 0.01192321, + "balance_loss_clip": 1.01097786, + "balance_loss_mlp": 1.00253987, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.919543734063356, + "language_loss": 0.88887858, + "learning_rate": 2.307256493152974e-06, + "loss": 0.91284275, + "num_input_tokens_seen": 679670, + "step": 36, + "time_per_iteration": 2.7324604988098145 + }, + { + "auxiliary_loss_clip": 0.01204157, + "auxiliary_loss_mlp": 0.01192752, + "balance_loss_clip": 1.01097298, + "balance_loss_mlp": 1.00287509, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 9.308720836376782, + "language_loss": 0.93080491, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.95477402, + "num_input_tokens_seen": 700170, + "step": 37, + "time_per_iteration": 2.772854804992676 + }, + { + "auxiliary_loss_clip": 0.01204217, + "auxiliary_loss_mlp": 0.01193219, + "balance_loss_clip": 1.01105714, + "balance_loss_mlp": 1.00362897, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.0486961078054455, + "language_loss": 1.04056287, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.06453729, + "num_input_tokens_seen": 718545, + "step": 38, + "time_per_iteration": 2.690763473510742 + }, + { + "auxiliary_loss_clip": 0.01204097, + "auxiliary_loss_mlp": 0.01192473, + "balance_loss_clip": 1.01092839, + "balance_loss_mlp": 1.00259686, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.4418646562175446, + "language_loss": 0.85599446, + "learning_rate": 2.358792165262154e-06, + "loss": 0.87996006, + "num_input_tokens_seen": 739865, + "step": 39, + "time_per_iteration": 2.7373363971710205 + }, + { + "auxiliary_loss_clip": 0.01204008, + "auxiliary_loss_mlp": 0.01192912, + "balance_loss_clip": 1.01076555, + "balance_loss_mlp": 1.00293994, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.567536064115386, + "language_loss": 0.90394807, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.92791724, + "num_input_tokens_seen": 755770, + "step": 40, + "time_per_iteration": 2.6736485958099365 + }, + { + "auxiliary_loss_clip": 0.01203926, + "auxiliary_loss_mlp": 0.01192927, + "balance_loss_clip": 1.01076674, + "balance_loss_mlp": 1.00314605, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 3.059533277286571, + "language_loss": 0.93444949, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.95841801, + "num_input_tokens_seen": 773440, + "step": 41, + "time_per_iteration": 2.7152698040008545 + }, + { + "auxiliary_loss_clip": 0.01203744, + "auxiliary_loss_mlp": 0.01192209, + "balance_loss_clip": 1.01056886, + "balance_loss_mlp": 1.00252366, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 3.5580692453168594, + "language_loss": 0.97659492, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00055456, + "num_input_tokens_seen": 790455, + "step": 42, + "time_per_iteration": 2.652796745300293 + }, + { + "auxiliary_loss_clip": 0.01203774, + "auxiliary_loss_mlp": 0.01192452, + "balance_loss_clip": 1.01068115, + "balance_loss_mlp": 1.00267124, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.16254598658149, + "language_loss": 0.97663873, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.00060105, + "num_input_tokens_seen": 810645, + "step": 43, + "time_per_iteration": 2.746044158935547 + }, + { + "auxiliary_loss_clip": 0.01204174, + "auxiliary_loss_mlp": 0.01192607, + "balance_loss_clip": 1.0109278, + "balance_loss_mlp": 1.00253928, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.4307014468922636, + "language_loss": 0.93887001, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.96283776, + "num_input_tokens_seen": 827470, + "step": 44, + "time_per_iteration": 2.7673892974853516 + }, + { + "auxiliary_loss_clip": 0.01204041, + "auxiliary_loss_mlp": 0.01192602, + "balance_loss_clip": 1.01085854, + "balance_loss_mlp": 1.00263035, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 2.2097821331760485, + "language_loss": 0.9872992, + "learning_rate": 2.450927955901469e-06, + "loss": 1.01126564, + "num_input_tokens_seen": 847285, + "step": 45, + "time_per_iteration": 2.7115988731384277 + }, + { + "auxiliary_loss_clip": 0.01203943, + "auxiliary_loss_mlp": 0.01191979, + "balance_loss_clip": 1.01084304, + "balance_loss_mlp": 1.00210214, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.7840530794910496, + "language_loss": 1.02705681, + "learning_rate": 2.465079122983384e-06, + "loss": 1.05101597, + "num_input_tokens_seen": 867545, + "step": 46, + "time_per_iteration": 2.740617513656616 + }, + { + "auxiliary_loss_clip": 0.01203732, + "auxiliary_loss_mlp": 0.01191883, + "balance_loss_clip": 1.0105145, + "balance_loss_mlp": 1.00219703, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.5616304217309707, + "language_loss": 0.88188404, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.90584016, + "num_input_tokens_seen": 889915, + "step": 47, + "time_per_iteration": 2.87392258644104 + }, + { + "auxiliary_loss_clip": 0.01203472, + "auxiliary_loss_mlp": 0.01191983, + "balance_loss_clip": 1.01038516, + "balance_loss_mlp": 1.00239301, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 1.936610024798927, + "language_loss": 0.87870848, + "learning_rate": 2.492481223656015e-06, + "loss": 0.90266305, + "num_input_tokens_seen": 908975, + "step": 48, + "time_per_iteration": 2.710462808609009 + }, + { + "auxiliary_loss_clip": 0.01203287, + "auxiliary_loss_mlp": 0.01192095, + "balance_loss_clip": 1.01019096, + "balance_loss_mlp": 1.00212336, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.4520744962843586, + "language_loss": 0.89686167, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.92081547, + "num_input_tokens_seen": 929810, + "step": 49, + "time_per_iteration": 2.7595748901367188 + }, + { + "auxiliary_loss_clip": 0.0120342, + "auxiliary_loss_mlp": 0.01192751, + "balance_loss_clip": 1.01024818, + "balance_loss_mlp": 1.00277901, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.6504182724960916, + "language_loss": 0.91075599, + "learning_rate": 2.51876455396287e-06, + "loss": 0.93471771, + "num_input_tokens_seen": 948650, + "step": 50, + "time_per_iteration": 2.6953978538513184 + }, + { + "auxiliary_loss_clip": 0.01203367, + "auxiliary_loss_mlp": 0.01192197, + "balance_loss_clip": 1.01017547, + "balance_loss_mlp": 1.0022254, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 5.5542883110157675, + "language_loss": 0.87062955, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.89458525, + "num_input_tokens_seen": 966455, + "step": 51, + "time_per_iteration": 2.7925755977630615 + }, + { + "auxiliary_loss_clip": 0.01203116, + "auxiliary_loss_mlp": 0.0119264, + "balance_loss_clip": 1.00999713, + "balance_loss_mlp": 1.00285864, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.1141822833344768, + "language_loss": 0.95184505, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.9758026, + "num_input_tokens_seen": 988110, + "step": 52, + "time_per_iteration": 2.845551013946533 + }, + { + "auxiliary_loss_clip": 0.01203344, + "auxiliary_loss_mlp": 0.0119252, + "balance_loss_clip": 1.01021099, + "balance_loss_mlp": 1.00283432, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 2.2411194451459284, + "language_loss": 0.92281389, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.94677258, + "num_input_tokens_seen": 1008550, + "step": 53, + "time_per_iteration": 2.747082233428955 + }, + { + "auxiliary_loss_clip": 0.01203292, + "auxiliary_loss_mlp": 0.01192731, + "balance_loss_clip": 1.01017857, + "balance_loss_mlp": 1.00295019, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.6053165109811176, + "language_loss": 0.8299998, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85396004, + "num_input_tokens_seen": 1026840, + "step": 54, + "time_per_iteration": 4.122062921524048 + }, + { + "auxiliary_loss_clip": 0.01203096, + "auxiliary_loss_mlp": 0.01192603, + "balance_loss_clip": 1.00999427, + "balance_loss_mlp": 1.00272691, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.5509669950849023, + "language_loss": 0.81341243, + "learning_rate": 2.580130221340046e-06, + "loss": 0.83736944, + "num_input_tokens_seen": 1048875, + "step": 55, + "time_per_iteration": 5.679103851318359 + }, + { + "auxiliary_loss_clip": 0.01202971, + "auxiliary_loss_mlp": 0.0119226, + "balance_loss_clip": 1.00978661, + "balance_loss_mlp": 1.00238359, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.539877894065937, + "language_loss": 0.8678329, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.89178526, + "num_input_tokens_seen": 1066435, + "step": 56, + "time_per_iteration": 2.6871302127838135 + }, + { + "auxiliary_loss_clip": 0.01202982, + "auxiliary_loss_mlp": 0.01192761, + "balance_loss_clip": 1.00973535, + "balance_loss_mlp": 1.00288439, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 1.8815725115491169, + "language_loss": 0.92683601, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95079339, + "num_input_tokens_seen": 1090330, + "step": 57, + "time_per_iteration": 2.77667236328125 + }, + { + "auxiliary_loss_clip": 0.0120279, + "auxiliary_loss_mlp": 0.01191931, + "balance_loss_clip": 1.00971842, + "balance_loss_mlp": 1.00243556, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.853708872897648, + "language_loss": 0.99573338, + "learning_rate": 2.614325098333948e-06, + "loss": 1.01968062, + "num_input_tokens_seen": 1109840, + "step": 58, + "time_per_iteration": 2.699157476425171 + }, + { + "auxiliary_loss_clip": 0.01202842, + "auxiliary_loss_mlp": 0.01192407, + "balance_loss_clip": 1.00981498, + "balance_loss_mlp": 1.00253057, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.4358098944808964, + "language_loss": 0.88168806, + "learning_rate": 2.625331386578098e-06, + "loss": 0.90564048, + "num_input_tokens_seen": 1128415, + "step": 59, + "time_per_iteration": 2.708672523498535 + }, + { + "auxiliary_loss_clip": 0.01203045, + "auxiliary_loss_mlp": 0.01192043, + "balance_loss_clip": 1.00992906, + "balance_loss_mlp": 1.00235748, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.1671948175096, + "language_loss": 0.93312693, + "learning_rate": 2.63615268640451e-06, + "loss": 0.9570778, + "num_input_tokens_seen": 1146515, + "step": 60, + "time_per_iteration": 2.713945150375366 + }, + { + "auxiliary_loss_clip": 0.01202834, + "auxiliary_loss_mlp": 0.01192306, + "balance_loss_clip": 1.00965881, + "balance_loss_mlp": 1.0026207, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 6.94777850036224, + "language_loss": 0.89840573, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.92235708, + "num_input_tokens_seen": 1166330, + "step": 61, + "time_per_iteration": 2.72920298576355 + }, + { + "auxiliary_loss_clip": 0.01202719, + "auxiliary_loss_mlp": 0.01192366, + "balance_loss_clip": 1.00956321, + "balance_loss_mlp": 1.0025847, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 1.9122852542568736, + "language_loss": 0.88334107, + "learning_rate": 2.657264485425803e-06, + "loss": 0.90729189, + "num_input_tokens_seen": 1186010, + "step": 62, + "time_per_iteration": 2.7062742710113525 + }, + { + "auxiliary_loss_clip": 0.01202576, + "auxiliary_loss_mlp": 0.01192221, + "balance_loss_clip": 1.00941277, + "balance_loss_mlp": 1.00263071, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.7182762429389973, + "language_loss": 0.96151775, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.98546565, + "num_input_tokens_seen": 1204985, + "step": 63, + "time_per_iteration": 2.6985368728637695 + }, + { + "auxiliary_loss_clip": 0.01202852, + "auxiliary_loss_mlp": 0.01192313, + "balance_loss_clip": 1.00969779, + "balance_loss_mlp": 1.0027225, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.522856657795024, + "language_loss": 0.98950517, + "learning_rate": 2.677705954159056e-06, + "loss": 1.01345682, + "num_input_tokens_seen": 1223545, + "step": 64, + "time_per_iteration": 2.677534341812134 + }, + { + "auxiliary_loss_clip": 0.01202898, + "auxiliary_loss_mlp": 0.0119144, + "balance_loss_clip": 1.00973642, + "balance_loss_mlp": 1.00184929, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.2176641076600516, + "language_loss": 0.85498071, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.87892413, + "num_input_tokens_seen": 1241175, + "step": 65, + "time_per_iteration": 2.7120254039764404 + }, + { + "auxiliary_loss_clip": 0.01202628, + "auxiliary_loss_mlp": 0.01192152, + "balance_loss_clip": 1.00942802, + "balance_loss_mlp": 1.00246584, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 1.7925049536284374, + "language_loss": 0.85163844, + "learning_rate": 2.697518353781685e-06, + "loss": 0.87558627, + "num_input_tokens_seen": 1259315, + "step": 66, + "time_per_iteration": 2.663815498352051 + }, + { + "auxiliary_loss_clip": 0.01202722, + "auxiliary_loss_mlp": 0.01192353, + "balance_loss_clip": 1.00948262, + "balance_loss_mlp": 1.00247622, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.3768744958494374, + "language_loss": 0.96358913, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.98753989, + "num_input_tokens_seen": 1277055, + "step": 67, + "time_per_iteration": 2.680330276489258 + }, + { + "auxiliary_loss_clip": 0.01202332, + "auxiliary_loss_mlp": 0.01191716, + "balance_loss_clip": 1.00928676, + "balance_loss_mlp": 1.00231636, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.209090840342662, + "language_loss": 0.94318187, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.96712232, + "num_input_tokens_seen": 1294355, + "step": 68, + "time_per_iteration": 2.671347141265869 + }, + { + "auxiliary_loss_clip": 0.01202462, + "auxiliary_loss_mlp": 0.0119211, + "balance_loss_clip": 1.00933206, + "balance_loss_mlp": 1.00261462, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 2.364737929765372, + "language_loss": 0.95688856, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98083425, + "num_input_tokens_seen": 1313525, + "step": 69, + "time_per_iteration": 2.7072455883026123 + }, + { + "auxiliary_loss_clip": 0.01202381, + "auxiliary_loss_mlp": 0.01192283, + "balance_loss_clip": 1.00924969, + "balance_loss_mlp": 1.00250196, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.4937733117996324, + "language_loss": 0.97955108, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00349784, + "num_input_tokens_seen": 1330505, + "step": 70, + "time_per_iteration": 2.6867001056671143 + }, + { + "auxiliary_loss_clip": 0.01202256, + "auxiliary_loss_mlp": 0.01192044, + "balance_loss_clip": 1.0091815, + "balance_loss_mlp": 1.00235796, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 3.036226177781071, + "language_loss": 0.93708861, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96103162, + "num_input_tokens_seen": 1349615, + "step": 71, + "time_per_iteration": 2.7110402584075928 + }, + { + "auxiliary_loss_clip": 0.0120832, + "auxiliary_loss_mlp": 0.01201118, + "balance_loss_clip": 1.01507068, + "balance_loss_mlp": 1.01190889, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4221661458051074, + "language_loss": 0.65775275, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68184716, + "num_input_tokens_seen": 1410275, + "step": 72, + "time_per_iteration": 3.1846799850463867 + }, + { + "auxiliary_loss_clip": 0.01208272, + "auxiliary_loss_mlp": 0.01201049, + "balance_loss_clip": 1.01505041, + "balance_loss_mlp": 1.01184022, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.242120832259664, + "language_loss": 0.63791114, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66200435, + "num_input_tokens_seen": 1473020, + "step": 73, + "time_per_iteration": 3.221299409866333 + }, + { + "auxiliary_loss_clip": 0.01202182, + "auxiliary_loss_mlp": 0.01192072, + "balance_loss_clip": 1.00907063, + "balance_loss_mlp": 1.00257719, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 5.029968868891697, + "language_loss": 0.85880733, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88274992, + "num_input_tokens_seen": 1490385, + "step": 74, + "time_per_iteration": 2.67661714553833 + }, + { + "auxiliary_loss_clip": 0.01202177, + "auxiliary_loss_mlp": 0.01191719, + "balance_loss_clip": 1.00898385, + "balance_loss_mlp": 1.00231886, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.250649769108811, + "language_loss": 0.96879399, + "learning_rate": 2.779824149153005e-06, + "loss": 0.992733, + "num_input_tokens_seen": 1509725, + "step": 75, + "time_per_iteration": 2.681972026824951 + }, + { + "auxiliary_loss_clip": 0.01201882, + "auxiliary_loss_mlp": 0.01191672, + "balance_loss_clip": 1.00875425, + "balance_loss_mlp": 1.002177, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.1796224809058646, + "language_loss": 0.8758018, + "learning_rate": 2.788352117317012e-06, + "loss": 0.89973736, + "num_input_tokens_seen": 1527245, + "step": 76, + "time_per_iteration": 2.696700096130371 + }, + { + "auxiliary_loss_clip": 0.01202056, + "auxiliary_loss_mlp": 0.01191926, + "balance_loss_clip": 1.00892425, + "balance_loss_mlp": 1.00262201, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 1.920821819455277, + "language_loss": 0.91729307, + "learning_rate": 2.796768605577095e-06, + "loss": 0.9412328, + "num_input_tokens_seen": 1548930, + "step": 77, + "time_per_iteration": 2.7773232460021973 + }, + { + "auxiliary_loss_clip": 0.01201895, + "auxiliary_loss_mlp": 0.01191812, + "balance_loss_clip": 1.00883889, + "balance_loss_mlp": 1.00260258, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 3.9763668672164347, + "language_loss": 0.92237633, + "learning_rate": 2.80507649095533e-06, + "loss": 0.94631344, + "num_input_tokens_seen": 1565695, + "step": 78, + "time_per_iteration": 2.6610281467437744 + }, + { + "auxiliary_loss_clip": 0.01201745, + "auxiliary_loss_mlp": 0.01191835, + "balance_loss_clip": 1.00871801, + "balance_loss_mlp": 1.00233984, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.125445108882331, + "language_loss": 0.824458, + "learning_rate": 2.813278540517843e-06, + "loss": 0.84839386, + "num_input_tokens_seen": 1582625, + "step": 79, + "time_per_iteration": 2.682716131210327 + }, + { + "auxiliary_loss_clip": 0.012018, + "auxiliary_loss_mlp": 0.01191285, + "balance_loss_clip": 1.0086143, + "balance_loss_mlp": 1.0019803, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 3.5033211965791025, + "language_loss": 0.91192937, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.93586022, + "num_input_tokens_seen": 1601725, + "step": 80, + "time_per_iteration": 2.678161382675171 + }, + { + "auxiliary_loss_clip": 0.01201797, + "auxiliary_loss_mlp": 0.01191999, + "balance_loss_clip": 1.0086236, + "balance_loss_mlp": 1.00240874, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 1.993204258950802, + "language_loss": 0.94968069, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97361863, + "num_input_tokens_seen": 1622420, + "step": 81, + "time_per_iteration": 2.71059250831604 + }, + { + "auxiliary_loss_clip": 0.01201679, + "auxiliary_loss_mlp": 0.0119217, + "balance_loss_clip": 1.00857055, + "balance_loss_mlp": 1.00238872, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.8395905581402143, + "language_loss": 0.95886588, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.98280442, + "num_input_tokens_seen": 1640715, + "step": 82, + "time_per_iteration": 2.662534236907959 + }, + { + "auxiliary_loss_clip": 0.01201526, + "auxiliary_loss_mlp": 0.01191659, + "balance_loss_clip": 1.00840151, + "balance_loss_mlp": 1.00216413, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.689773726716251, + "language_loss": 0.86454731, + "learning_rate": 2.84508017388607e-06, + "loss": 0.88847917, + "num_input_tokens_seen": 1662210, + "step": 83, + "time_per_iteration": 2.7104005813598633 + }, + { + "auxiliary_loss_clip": 0.01201458, + "auxiliary_loss_mlp": 0.01192056, + "balance_loss_clip": 1.00835812, + "balance_loss_mlp": 1.00256109, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 2.5634970130638517, + "language_loss": 0.91655743, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94049257, + "num_input_tokens_seen": 1681070, + "step": 84, + "time_per_iteration": 2.67033052444458 + }, + { + "auxiliary_loss_clip": 0.01207541, + "auxiliary_loss_mlp": 0.01199821, + "balance_loss_clip": 1.01429105, + "balance_loss_mlp": 1.01061237, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.3643657223854007, + "language_loss": 0.6258291, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.6499027, + "num_input_tokens_seen": 1747140, + "step": 85, + "time_per_iteration": 3.2235703468322754 + }, + { + "auxiliary_loss_clip": 0.0120145, + "auxiliary_loss_mlp": 0.01191688, + "balance_loss_clip": 1.00828719, + "balance_loss_mlp": 1.00228822, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.733838490641841, + "language_loss": 0.90696222, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93089354, + "num_input_tokens_seen": 1767475, + "step": 86, + "time_per_iteration": 2.720186710357666 + }, + { + "auxiliary_loss_clip": 0.01201522, + "auxiliary_loss_mlp": 0.0119198, + "balance_loss_clip": 1.00835156, + "balance_loss_mlp": 1.0021987, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.466446398932318, + "language_loss": 0.81959724, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84353232, + "num_input_tokens_seen": 1784980, + "step": 87, + "time_per_iteration": 2.667060136795044 + }, + { + "auxiliary_loss_clip": 0.01201447, + "auxiliary_loss_mlp": 0.01191521, + "balance_loss_clip": 1.00833488, + "balance_loss_mlp": 1.00212145, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.965594857551916, + "language_loss": 0.95880413, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98273385, + "num_input_tokens_seen": 1803030, + "step": 88, + "time_per_iteration": 2.6912927627563477 + }, + { + "auxiliary_loss_clip": 0.01201504, + "auxiliary_loss_mlp": 0.01191459, + "balance_loss_clip": 1.00835824, + "balance_loss_mlp": 1.00196433, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 3.103162746939358, + "language_loss": 0.86137259, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88530219, + "num_input_tokens_seen": 1822865, + "step": 89, + "time_per_iteration": 2.683427095413208 + }, + { + "auxiliary_loss_clip": 0.01201527, + "auxiliary_loss_mlp": 0.01191539, + "balance_loss_clip": 1.0083766, + "balance_loss_mlp": 1.00223482, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 2.6109037216087287, + "language_loss": 0.91540909, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.9393397, + "num_input_tokens_seen": 1842435, + "step": 90, + "time_per_iteration": 2.7124786376953125 + }, + { + "auxiliary_loss_clip": 0.01201239, + "auxiliary_loss_mlp": 0.01191141, + "balance_loss_clip": 1.00813437, + "balance_loss_mlp": 1.0018369, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.1797578423982897, + "language_loss": 0.85724354, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88116729, + "num_input_tokens_seen": 1860065, + "step": 91, + "time_per_iteration": 2.6811702251434326 + }, + { + "auxiliary_loss_clip": 0.01201223, + "auxiliary_loss_mlp": 0.01191194, + "balance_loss_clip": 1.00811136, + "balance_loss_mlp": 1.00188994, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.195245364827302, + "language_loss": 0.87033999, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89426422, + "num_input_tokens_seen": 1878135, + "step": 92, + "time_per_iteration": 4.056005001068115 + }, + { + "auxiliary_loss_clip": 0.01201249, + "auxiliary_loss_mlp": 0.01191936, + "balance_loss_clip": 1.00803113, + "balance_loss_mlp": 1.00215447, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 3.449408210155871, + "language_loss": 0.92024493, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94417679, + "num_input_tokens_seen": 1894895, + "step": 93, + "time_per_iteration": 6.904150724411011 + }, + { + "auxiliary_loss_clip": 0.01201371, + "auxiliary_loss_mlp": 0.01191843, + "balance_loss_clip": 1.00817418, + "balance_loss_mlp": 1.00215685, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 3.1262526105575206, + "language_loss": 0.87352473, + "learning_rate": 2.925210265866963e-06, + "loss": 0.89745688, + "num_input_tokens_seen": 1913220, + "step": 94, + "time_per_iteration": 2.678025245666504 + }, + { + "auxiliary_loss_clip": 0.01207068, + "auxiliary_loss_mlp": 0.01198526, + "balance_loss_clip": 1.01390338, + "balance_loss_mlp": 1.00931728, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3627465481340342, + "language_loss": 0.68102241, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70507836, + "num_input_tokens_seen": 1970970, + "step": 95, + "time_per_iteration": 3.053687810897827 + }, + { + "auxiliary_loss_clip": 0.01201071, + "auxiliary_loss_mlp": 0.01191677, + "balance_loss_clip": 1.00797319, + "balance_loss_mlp": 1.00246787, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.470424686576594, + "language_loss": 0.90016311, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92409062, + "num_input_tokens_seen": 1988930, + "step": 96, + "time_per_iteration": 2.6483566761016846 + }, + { + "auxiliary_loss_clip": 0.01201042, + "auxiliary_loss_mlp": 0.01191637, + "balance_loss_clip": 1.00785375, + "balance_loss_mlp": 1.00223732, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.6444495875457132, + "language_loss": 0.89893371, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.9228605, + "num_input_tokens_seen": 2006285, + "step": 97, + "time_per_iteration": 2.729792833328247 + }, + { + "auxiliary_loss_clip": 0.01200968, + "auxiliary_loss_mlp": 0.01191632, + "balance_loss_clip": 1.00778699, + "balance_loss_mlp": 1.00213659, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.0121707329460383, + "language_loss": 0.76604819, + "learning_rate": 2.952041322436969e-06, + "loss": 0.78997421, + "num_input_tokens_seen": 2024905, + "step": 98, + "time_per_iteration": 2.696692705154419 + }, + { + "auxiliary_loss_clip": 0.01206914, + "auxiliary_loss_mlp": 0.01197886, + "balance_loss_clip": 1.01374936, + "balance_loss_mlp": 1.00867677, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0322467645886058, + "language_loss": 0.65479195, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.67883992, + "num_input_tokens_seen": 2086220, + "step": 99, + "time_per_iteration": 3.2099931240081787 + }, + { + "auxiliary_loss_clip": 0.01200982, + "auxiliary_loss_mlp": 0.0119146, + "balance_loss_clip": 1.00792253, + "balance_loss_mlp": 1.00225139, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.0371130838962763, + "language_loss": 0.90812051, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93204486, + "num_input_tokens_seen": 2103365, + "step": 100, + "time_per_iteration": 2.699645757675171 + }, + { + "auxiliary_loss_clip": 0.01201238, + "auxiliary_loss_mlp": 0.01191595, + "balance_loss_clip": 1.00804138, + "balance_loss_mlp": 1.00209999, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 2.6178021089949177, + "language_loss": 0.91100061, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93492901, + "num_input_tokens_seen": 2121995, + "step": 101, + "time_per_iteration": 2.6519699096679688 + }, + { + "auxiliary_loss_clip": 0.01201033, + "auxiliary_loss_mlp": 0.01191358, + "balance_loss_clip": 1.00796127, + "balance_loss_mlp": 1.00214863, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.0461836998378895, + "language_loss": 0.90582156, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.92974544, + "num_input_tokens_seen": 2141815, + "step": 102, + "time_per_iteration": 2.7246930599212646 + }, + { + "auxiliary_loss_clip": 0.01201073, + "auxiliary_loss_mlp": 0.01191116, + "balance_loss_clip": 1.00798881, + "balance_loss_mlp": 1.00181174, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 3.066935787093614, + "language_loss": 0.88040662, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90432853, + "num_input_tokens_seen": 2161125, + "step": 103, + "time_per_iteration": 2.676809787750244 + }, + { + "auxiliary_loss_clip": 0.0120091, + "auxiliary_loss_mlp": 0.01191302, + "balance_loss_clip": 1.00785959, + "balance_loss_mlp": 1.00209296, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 2.209009336180435, + "language_loss": 0.9363637, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96028578, + "num_input_tokens_seen": 2179510, + "step": 104, + "time_per_iteration": 2.6835174560546875 + }, + { + "auxiliary_loss_clip": 0.01201045, + "auxiliary_loss_mlp": 0.01191239, + "balance_loss_clip": 1.00798583, + "balance_loss_mlp": 1.00222051, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.9683183901207655, + "language_loss": 0.96584713, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.98976994, + "num_input_tokens_seen": 2197870, + "step": 105, + "time_per_iteration": 2.673766613006592 + }, + { + "auxiliary_loss_clip": 0.01201019, + "auxiliary_loss_mlp": 0.01191497, + "balance_loss_clip": 1.00792503, + "balance_loss_mlp": 1.00200152, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.350030076958581, + "language_loss": 0.87049264, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89441776, + "num_input_tokens_seen": 2217495, + "step": 106, + "time_per_iteration": 2.715744733810425 + }, + { + "auxiliary_loss_clip": 0.01200565, + "auxiliary_loss_mlp": 0.01191446, + "balance_loss_clip": 1.00752163, + "balance_loss_mlp": 1.00204611, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 4.578424971093535, + "language_loss": 0.83374476, + "learning_rate": 3.008611048208843e-06, + "loss": 0.85766482, + "num_input_tokens_seen": 2236520, + "step": 107, + "time_per_iteration": 2.683683156967163 + }, + { + "auxiliary_loss_clip": 0.01206343, + "auxiliary_loss_mlp": 0.01196809, + "balance_loss_clip": 1.01319242, + "balance_loss_mlp": 1.00759971, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9890796984553928, + "language_loss": 0.6476953, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67172682, + "num_input_tokens_seen": 2300140, + "step": 108, + "time_per_iteration": 3.2100865840911865 + }, + { + "auxiliary_loss_clip": 0.01200571, + "auxiliary_loss_mlp": 0.01191051, + "balance_loss_clip": 1.00755918, + "balance_loss_mlp": 1.00184226, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 1.8688084365331237, + "language_loss": 0.97576439, + "learning_rate": 3.0205345775501937e-06, + "loss": 0.9996807, + "num_input_tokens_seen": 2317320, + "step": 109, + "time_per_iteration": 2.6887149810791016 + }, + { + "auxiliary_loss_clip": 0.01200606, + "auxiliary_loss_mlp": 0.01191191, + "balance_loss_clip": 1.00766325, + "balance_loss_mlp": 1.00179088, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.738999735155943, + "language_loss": 0.84022093, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86413896, + "num_input_tokens_seen": 2337820, + "step": 110, + "time_per_iteration": 2.706954002380371 + }, + { + "auxiliary_loss_clip": 0.0120053, + "auxiliary_loss_mlp": 0.01191578, + "balance_loss_clip": 1.00736642, + "balance_loss_mlp": 1.00246394, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.1300753810467836, + "language_loss": 0.82937658, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85329771, + "num_input_tokens_seen": 2358560, + "step": 111, + "time_per_iteration": 2.727750062942505 + }, + { + "auxiliary_loss_clip": 0.0120043, + "auxiliary_loss_mlp": 0.01191006, + "balance_loss_clip": 1.00739586, + "balance_loss_mlp": 1.00170195, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 2.07780713278782, + "language_loss": 0.93818289, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96209735, + "num_input_tokens_seen": 2379005, + "step": 112, + "time_per_iteration": 2.6964473724365234 + }, + { + "auxiliary_loss_clip": 0.01200627, + "auxiliary_loss_mlp": 0.01190585, + "balance_loss_clip": 1.00756788, + "balance_loss_mlp": 1.00175762, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.659034163676628, + "language_loss": 0.79426444, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81817651, + "num_input_tokens_seen": 2395610, + "step": 113, + "time_per_iteration": 2.6729114055633545 + }, + { + "auxiliary_loss_clip": 0.01200507, + "auxiliary_loss_mlp": 0.01190744, + "balance_loss_clip": 1.00748706, + "balance_loss_mlp": 1.0017252, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 1.8657921563871493, + "language_loss": 0.93204159, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95595413, + "num_input_tokens_seen": 2415005, + "step": 114, + "time_per_iteration": 2.6781115531921387 + }, + { + "auxiliary_loss_clip": 0.01200545, + "auxiliary_loss_mlp": 0.0119122, + "balance_loss_clip": 1.00742257, + "balance_loss_mlp": 1.00201154, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 1.9834658537496774, + "language_loss": 0.94539237, + "learning_rate": 3.055034911425055e-06, + "loss": 0.96931005, + "num_input_tokens_seen": 2433965, + "step": 115, + "time_per_iteration": 2.7247376441955566 + }, + { + "auxiliary_loss_clip": 0.01200403, + "auxiliary_loss_mlp": 0.01190591, + "balance_loss_clip": 1.00728738, + "balance_loss_mlp": 1.0014776, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 2.3236154696161577, + "language_loss": 0.81752491, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84143484, + "num_input_tokens_seen": 2451605, + "step": 116, + "time_per_iteration": 2.6660854816436768 + }, + { + "auxiliary_loss_clip": 0.01200113, + "auxiliary_loss_mlp": 0.01190871, + "balance_loss_clip": 1.00710869, + "balance_loss_mlp": 1.00175726, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.298093570054833, + "language_loss": 0.8810761, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.9049859, + "num_input_tokens_seen": 2472035, + "step": 117, + "time_per_iteration": 2.726045608520508 + }, + { + "auxiliary_loss_clip": 0.01200198, + "auxiliary_loss_mlp": 0.01190954, + "balance_loss_clip": 1.00723624, + "balance_loss_mlp": 1.00184083, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.5273573083475185, + "language_loss": 0.84376955, + "learning_rate": 3.071615712271274e-06, + "loss": 0.86768103, + "num_input_tokens_seen": 2489285, + "step": 118, + "time_per_iteration": 2.636115074157715 + }, + { + "auxiliary_loss_clip": 0.01200354, + "auxiliary_loss_mlp": 0.01191566, + "balance_loss_clip": 1.00731635, + "balance_loss_mlp": 1.00254798, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.547269223772394, + "language_loss": 0.99089587, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.01481509, + "num_input_tokens_seen": 2506460, + "step": 119, + "time_per_iteration": 2.6609716415405273 + }, + { + "auxiliary_loss_clip": 0.01200378, + "auxiliary_loss_mlp": 0.01190828, + "balance_loss_clip": 1.00718641, + "balance_loss_mlp": 1.001333, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.2297637175599156, + "language_loss": 0.89180255, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91571456, + "num_input_tokens_seen": 2525565, + "step": 120, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.01200193, + "auxiliary_loss_mlp": 0.01190682, + "balance_loss_clip": 1.00716424, + "balance_loss_mlp": 1.00147271, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.8112389793640113, + "language_loss": 0.9322226, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.95613146, + "num_input_tokens_seen": 2546605, + "step": 121, + "time_per_iteration": 2.701289176940918 + }, + { + "auxiliary_loss_clip": 0.01200227, + "auxiliary_loss_mlp": 0.01191268, + "balance_loss_clip": 1.00720823, + "balance_loss_mlp": 1.00205934, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 2.1711071948635134, + "language_loss": 0.90200216, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92591715, + "num_input_tokens_seen": 2560730, + "step": 122, + "time_per_iteration": 2.6204683780670166 + }, + { + "auxiliary_loss_clip": 0.01200022, + "auxiliary_loss_mlp": 0.01190887, + "balance_loss_clip": 1.00700927, + "balance_loss_mlp": 1.00186884, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 10.100257351534506, + "language_loss": 0.92551339, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.94942248, + "num_input_tokens_seen": 2579550, + "step": 123, + "time_per_iteration": 2.7071378231048584 + }, + { + "auxiliary_loss_clip": 0.01199949, + "auxiliary_loss_mlp": 0.01190926, + "balance_loss_clip": 1.00682974, + "balance_loss_mlp": 1.00171673, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 2.1238249346842557, + "language_loss": 0.70885348, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73276222, + "num_input_tokens_seen": 2600390, + "step": 124, + "time_per_iteration": 2.7617008686065674 + }, + { + "auxiliary_loss_clip": 0.0119989, + "auxiliary_loss_mlp": 0.01190457, + "balance_loss_clip": 1.00692344, + "balance_loss_mlp": 1.00143874, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.0040710562120556, + "language_loss": 0.88375604, + "learning_rate": 3.108720342404542e-06, + "loss": 0.90765953, + "num_input_tokens_seen": 2620770, + "step": 125, + "time_per_iteration": 2.726466178894043 + }, + { + "auxiliary_loss_clip": 0.01199958, + "auxiliary_loss_mlp": 0.01191084, + "balance_loss_clip": 1.00698876, + "balance_loss_mlp": 1.00187492, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.76634541649663, + "language_loss": 0.82200587, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.84591627, + "num_input_tokens_seen": 2639900, + "step": 126, + "time_per_iteration": 2.69254469871521 + }, + { + "auxiliary_loss_clip": 0.0119991, + "auxiliary_loss_mlp": 0.01191043, + "balance_loss_clip": 1.00682807, + "balance_loss_mlp": 1.00192928, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 2.7712895507979565, + "language_loss": 0.67190921, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69581872, + "num_input_tokens_seen": 2657450, + "step": 127, + "time_per_iteration": 2.736367702484131 + }, + { + "auxiliary_loss_clip": 0.01199937, + "auxiliary_loss_mlp": 0.0119069, + "balance_loss_clip": 1.00697303, + "balance_loss_mlp": 1.00157642, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 1.9675556040100801, + "language_loss": 0.8824473, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90635359, + "num_input_tokens_seen": 2678150, + "step": 128, + "time_per_iteration": 2.7587883472442627 + }, + { + "auxiliary_loss_clip": 0.01199845, + "auxiliary_loss_mlp": 0.01191152, + "balance_loss_clip": 1.00683761, + "balance_loss_mlp": 1.00194311, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.7029530823755903, + "language_loss": 0.84568721, + "learning_rate": 3.129000827968184e-06, + "loss": 0.8695972, + "num_input_tokens_seen": 2698290, + "step": 129, + "time_per_iteration": 2.6792197227478027 + }, + { + "auxiliary_loss_clip": 0.01199842, + "auxiliary_loss_mlp": 0.01190797, + "balance_loss_clip": 1.00691366, + "balance_loss_mlp": 1.00158811, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.0661960625671685, + "language_loss": 0.97281349, + "learning_rate": 3.133972684206866e-06, + "loss": 0.99671984, + "num_input_tokens_seen": 2717630, + "step": 130, + "time_per_iteration": 2.6917972564697266 + }, + { + "auxiliary_loss_clip": 0.01199696, + "auxiliary_loss_mlp": 0.01190931, + "balance_loss_clip": 1.00671101, + "balance_loss_mlp": 1.0018177, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.0392407409618976, + "language_loss": 0.82460827, + "learning_rate": 3.138906441556014e-06, + "loss": 0.84851456, + "num_input_tokens_seen": 2735835, + "step": 131, + "time_per_iteration": 5.523234844207764 + }, + { + "auxiliary_loss_clip": 0.01199918, + "auxiliary_loss_mlp": 0.01190838, + "balance_loss_clip": 1.00684464, + "balance_loss_mlp": 1.00172424, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 3.0674781528524506, + "language_loss": 0.83004701, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85395455, + "num_input_tokens_seen": 2756335, + "step": 132, + "time_per_iteration": 2.710916519165039 + }, + { + "auxiliary_loss_clip": 0.01199613, + "auxiliary_loss_mlp": 0.01190674, + "balance_loss_clip": 1.00663209, + "balance_loss_mlp": 1.00165606, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.5255598363500464, + "language_loss": 0.95474339, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.97864622, + "num_input_tokens_seen": 2775090, + "step": 133, + "time_per_iteration": 2.6750409603118896 + }, + { + "auxiliary_loss_clip": 0.01199642, + "auxiliary_loss_mlp": 0.01190548, + "balance_loss_clip": 1.00673294, + "balance_loss_mlp": 1.00172007, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.6475441930760022, + "language_loss": 0.73411036, + "learning_rate": 3.153484849651286e-06, + "loss": 0.75801224, + "num_input_tokens_seen": 2795320, + "step": 134, + "time_per_iteration": 2.7056524753570557 + }, + { + "auxiliary_loss_clip": 0.01199457, + "auxiliary_loss_mlp": 0.01190774, + "balance_loss_clip": 1.00646734, + "balance_loss_mlp": 1.00175548, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 2.8097811402032806, + "language_loss": 0.88916457, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91306686, + "num_input_tokens_seen": 2812815, + "step": 135, + "time_per_iteration": 2.6531431674957275 + }, + { + "auxiliary_loss_clip": 0.01199466, + "auxiliary_loss_mlp": 0.01190706, + "balance_loss_clip": 1.00653946, + "balance_loss_mlp": 1.00178289, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.246927016072895, + "language_loss": 0.89024323, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.91414493, + "num_input_tokens_seen": 2830445, + "step": 136, + "time_per_iteration": 2.659677028656006 + }, + { + "auxiliary_loss_clip": 0.01199596, + "auxiliary_loss_mlp": 0.01190549, + "balance_loss_clip": 1.00653911, + "balance_loss_mlp": 1.00153065, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 2.098471994175296, + "language_loss": 0.83954573, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86344719, + "num_input_tokens_seen": 2846965, + "step": 137, + "time_per_iteration": 2.686657428741455 + }, + { + "auxiliary_loss_clip": 0.01199499, + "auxiliary_loss_mlp": 0.01190627, + "balance_loss_clip": 1.00654411, + "balance_loss_mlp": 1.00160873, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.868665419186176, + "language_loss": 0.90044308, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.9243443, + "num_input_tokens_seen": 2867520, + "step": 138, + "time_per_iteration": 2.6977927684783936 + }, + { + "auxiliary_loss_clip": 0.01199283, + "auxiliary_loss_mlp": 0.01190433, + "balance_loss_clip": 1.00636137, + "balance_loss_mlp": 1.00151038, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.2985498957787387, + "language_loss": 0.91296238, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93685949, + "num_input_tokens_seen": 2885675, + "step": 139, + "time_per_iteration": 2.7075204849243164 + }, + { + "auxiliary_loss_clip": 0.01199464, + "auxiliary_loss_mlp": 0.01190572, + "balance_loss_clip": 1.00650096, + "balance_loss_mlp": 1.00155354, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.6010653495544744, + "language_loss": 0.85523689, + "learning_rate": 3.181687263893095e-06, + "loss": 0.87913722, + "num_input_tokens_seen": 2905960, + "step": 140, + "time_per_iteration": 2.710655927658081 + }, + { + "auxiliary_loss_clip": 0.01199488, + "auxiliary_loss_mlp": 0.01190867, + "balance_loss_clip": 1.00653362, + "balance_loss_mlp": 1.00175381, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.9264742555000787, + "language_loss": 0.84275311, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86665666, + "num_input_tokens_seen": 2922780, + "step": 141, + "time_per_iteration": 2.644075870513916 + }, + { + "auxiliary_loss_clip": 0.01199414, + "auxiliary_loss_mlp": 0.01190983, + "balance_loss_clip": 1.00640821, + "balance_loss_mlp": 1.00177372, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.2314405921898306, + "language_loss": 0.81250578, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.83640969, + "num_input_tokens_seen": 2938765, + "step": 142, + "time_per_iteration": 2.664179563522339 + }, + { + "auxiliary_loss_clip": 0.01204763, + "auxiliary_loss_mlp": 0.01193251, + "balance_loss_clip": 1.01188016, + "balance_loss_mlp": 1.00480449, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.2876602782112205, + "language_loss": 0.66947103, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69345117, + "num_input_tokens_seen": 3006665, + "step": 143, + "time_per_iteration": 3.3164546489715576 + }, + { + "auxiliary_loss_clip": 0.01199239, + "auxiliary_loss_mlp": 0.01190654, + "balance_loss_clip": 1.00634193, + "balance_loss_mlp": 1.00163555, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.2578592011800755, + "language_loss": 0.84007621, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86397517, + "num_input_tokens_seen": 3024335, + "step": 144, + "time_per_iteration": 2.6313388347625732 + }, + { + "auxiliary_loss_clip": 0.01199292, + "auxiliary_loss_mlp": 0.01190522, + "balance_loss_clip": 1.00632322, + "balance_loss_mlp": 1.00150371, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 1.7861102414241643, + "language_loss": 0.88318157, + "learning_rate": 3.204280886775619e-06, + "loss": 0.9070797, + "num_input_tokens_seen": 3043300, + "step": 145, + "time_per_iteration": 2.728743314743042 + }, + { + "auxiliary_loss_clip": 0.01199163, + "auxiliary_loss_mlp": 0.01190903, + "balance_loss_clip": 1.00610769, + "balance_loss_mlp": 1.00178981, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.8249224327850706, + "language_loss": 0.86026573, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88416636, + "num_input_tokens_seen": 3064610, + "step": 146, + "time_per_iteration": 2.6917264461517334 + }, + { + "auxiliary_loss_clip": 0.01204432, + "auxiliary_loss_mlp": 0.01192716, + "balance_loss_clip": 1.01160598, + "balance_loss_mlp": 1.00427043, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8698282401967848, + "language_loss": 0.60143185, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62540334, + "num_input_tokens_seen": 3130385, + "step": 147, + "time_per_iteration": 3.255659580230713 + }, + { + "auxiliary_loss_clip": 0.01199325, + "auxiliary_loss_mlp": 0.01190567, + "balance_loss_clip": 1.00638032, + "balance_loss_mlp": 1.00164366, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8184608797318396, + "language_loss": 0.84620166, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87010056, + "num_input_tokens_seen": 3149760, + "step": 148, + "time_per_iteration": 2.673474073410034 + }, + { + "auxiliary_loss_clip": 0.01198978, + "auxiliary_loss_mlp": 0.01190704, + "balance_loss_clip": 1.0061394, + "balance_loss_mlp": 1.00206757, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.2867390313582368, + "language_loss": 0.88541842, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.90931523, + "num_input_tokens_seen": 3164500, + "step": 149, + "time_per_iteration": 2.6563515663146973 + }, + { + "auxiliary_loss_clip": 0.01199121, + "auxiliary_loss_mlp": 0.01190371, + "balance_loss_clip": 1.00622964, + "balance_loss_mlp": 1.00135326, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.036319357509972, + "language_loss": 0.93096709, + "learning_rate": 3.226108474846181e-06, + "loss": 0.954862, + "num_input_tokens_seen": 3182455, + "step": 150, + "time_per_iteration": 2.7340142726898193 + }, + { + "auxiliary_loss_clip": 0.01198799, + "auxiliary_loss_mlp": 0.01190356, + "balance_loss_clip": 1.00586319, + "balance_loss_mlp": 1.0015285, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.8689226781590667, + "language_loss": 0.74075639, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76464796, + "num_input_tokens_seen": 3203995, + "step": 151, + "time_per_iteration": 2.7932639122009277 + }, + { + "auxiliary_loss_clip": 0.01199026, + "auxiliary_loss_mlp": 0.01190339, + "balance_loss_clip": 1.00616872, + "balance_loss_mlp": 1.0016067, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 4.6497153241637434, + "language_loss": 0.88304102, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90693462, + "num_input_tokens_seen": 3222575, + "step": 152, + "time_per_iteration": 2.6594250202178955 + }, + { + "auxiliary_loss_clip": 0.01199062, + "auxiliary_loss_mlp": 0.01190058, + "balance_loss_clip": 1.00619018, + "balance_loss_mlp": 1.00142145, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 5.094819822575093, + "language_loss": 0.83622873, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86011994, + "num_input_tokens_seen": 3240180, + "step": 153, + "time_per_iteration": 2.673509120941162 + }, + { + "auxiliary_loss_clip": 0.01199051, + "auxiliary_loss_mlp": 0.01190525, + "balance_loss_clip": 1.00615406, + "balance_loss_mlp": 1.0018878, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 2.1465497253661456, + "language_loss": 0.89662892, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92052472, + "num_input_tokens_seen": 3259800, + "step": 154, + "time_per_iteration": 2.6985175609588623 + }, + { + "auxiliary_loss_clip": 0.01198907, + "auxiliary_loss_mlp": 0.01191056, + "balance_loss_clip": 1.00608945, + "balance_loss_mlp": 1.00232351, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.095136628399198, + "language_loss": 0.89636987, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92026949, + "num_input_tokens_seen": 3280400, + "step": 155, + "time_per_iteration": 2.7938153743743896 + }, + { + "auxiliary_loss_clip": 0.01198912, + "auxiliary_loss_mlp": 0.01190386, + "balance_loss_clip": 1.00602055, + "balance_loss_mlp": 1.00174963, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 2.8007252928611805, + "language_loss": 0.86661625, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89050925, + "num_input_tokens_seen": 3297600, + "step": 156, + "time_per_iteration": 2.6487605571746826 + }, + { + "auxiliary_loss_clip": 0.01198953, + "auxiliary_loss_mlp": 0.0119045, + "balance_loss_clip": 1.00607026, + "balance_loss_mlp": 1.00143135, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.3391293876581836, + "language_loss": 0.99444991, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.01834404, + "num_input_tokens_seen": 3313635, + "step": 157, + "time_per_iteration": 2.6483154296875 + }, + { + "auxiliary_loss_clip": 0.01198942, + "auxiliary_loss_mlp": 0.01190737, + "balance_loss_clip": 1.00617456, + "balance_loss_mlp": 1.0020045, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.2560202518292223, + "language_loss": 0.88306493, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.90696174, + "num_input_tokens_seen": 3333735, + "step": 158, + "time_per_iteration": 2.6611311435699463 + }, + { + "auxiliary_loss_clip": 0.01198691, + "auxiliary_loss_mlp": 0.01190797, + "balance_loss_clip": 1.00585115, + "balance_loss_mlp": 1.00196922, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 2.2880126195556634, + "language_loss": 0.86483395, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.88872886, + "num_input_tokens_seen": 3348800, + "step": 159, + "time_per_iteration": 2.637525796890259 + }, + { + "auxiliary_loss_clip": 0.01198441, + "auxiliary_loss_mlp": 0.01190712, + "balance_loss_clip": 1.00572491, + "balance_loss_mlp": 1.0018847, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.733375334862201, + "language_loss": 0.86428297, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.88817447, + "num_input_tokens_seen": 3368595, + "step": 160, + "time_per_iteration": 2.651517391204834 + }, + { + "auxiliary_loss_clip": 0.01198624, + "auxiliary_loss_mlp": 0.01190621, + "balance_loss_clip": 1.005795, + "balance_loss_mlp": 1.00188899, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.4047287277287657, + "language_loss": 0.91590428, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.93979669, + "num_input_tokens_seen": 3384975, + "step": 161, + "time_per_iteration": 2.6231582164764404 + }, + { + "auxiliary_loss_clip": 0.01198786, + "auxiliary_loss_mlp": 0.01190112, + "balance_loss_clip": 1.00591552, + "balance_loss_mlp": 1.00147521, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 1.841716274294259, + "language_loss": 0.91338134, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.93727034, + "num_input_tokens_seen": 3404755, + "step": 162, + "time_per_iteration": 2.6475508213043213 + }, + { + "auxiliary_loss_clip": 0.01203588, + "auxiliary_loss_mlp": 0.01191459, + "balance_loss_clip": 1.01094556, + "balance_loss_mlp": 1.00301313, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.1704181225416908, + "language_loss": 0.72385973, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74781024, + "num_input_tokens_seen": 3467210, + "step": 163, + "time_per_iteration": 3.150733470916748 + }, + { + "auxiliary_loss_clip": 0.01198622, + "auxiliary_loss_mlp": 0.01190444, + "balance_loss_clip": 1.00589538, + "balance_loss_mlp": 1.00161624, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.1502323241010592, + "language_loss": 0.8445617, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86845231, + "num_input_tokens_seen": 3483220, + "step": 164, + "time_per_iteration": 2.6748228073120117 + }, + { + "auxiliary_loss_clip": 0.01198388, + "auxiliary_loss_mlp": 0.0118988, + "balance_loss_clip": 1.00565028, + "balance_loss_mlp": 1.00124288, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 1.9861087669238262, + "language_loss": 0.8901279, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91401052, + "num_input_tokens_seen": 3501465, + "step": 165, + "time_per_iteration": 2.668602705001831 + }, + { + "auxiliary_loss_clip": 0.01198406, + "auxiliary_loss_mlp": 0.01190409, + "balance_loss_clip": 1.00567865, + "balance_loss_mlp": 1.00177169, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 1.8180971238731718, + "language_loss": 0.79624015, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82012826, + "num_input_tokens_seen": 3520480, + "step": 166, + "time_per_iteration": 2.6865901947021484 + }, + { + "auxiliary_loss_clip": 0.01198391, + "auxiliary_loss_mlp": 0.0119054, + "balance_loss_clip": 1.0056392, + "balance_loss_mlp": 1.00171268, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 3.346643999573497, + "language_loss": 0.91763169, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94152093, + "num_input_tokens_seen": 3539570, + "step": 167, + "time_per_iteration": 2.760429859161377 + }, + { + "auxiliary_loss_clip": 0.0119838, + "auxiliary_loss_mlp": 0.01190389, + "balance_loss_clip": 1.00563359, + "balance_loss_mlp": 1.00194335, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 5.068385073800942, + "language_loss": 0.90585464, + "learning_rate": 3.299075396334735e-06, + "loss": 0.92974234, + "num_input_tokens_seen": 3555465, + "step": 168, + "time_per_iteration": 2.6143341064453125 + }, + { + "auxiliary_loss_clip": 0.01198311, + "auxiliary_loss_mlp": 0.01189868, + "balance_loss_clip": 1.00559604, + "balance_loss_mlp": 1.00132704, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 1.6193474381746311, + "language_loss": 0.87125385, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89513564, + "num_input_tokens_seen": 3578970, + "step": 169, + "time_per_iteration": 7.02229118347168 + }, + { + "auxiliary_loss_clip": 0.01198211, + "auxiliary_loss_mlp": 0.0119011, + "balance_loss_clip": 1.00554001, + "balance_loss_mlp": 1.0013777, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 1.804080489923051, + "language_loss": 0.84648842, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87037158, + "num_input_tokens_seen": 3597275, + "step": 170, + "time_per_iteration": 3.966259717941284 + }, + { + "auxiliary_loss_clip": 0.01198294, + "auxiliary_loss_mlp": 0.01190599, + "balance_loss_clip": 1.00548029, + "balance_loss_mlp": 1.00205779, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.583181844149415, + "language_loss": 0.89972568, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92361468, + "num_input_tokens_seen": 3618905, + "step": 171, + "time_per_iteration": 2.7385964393615723 + }, + { + "auxiliary_loss_clip": 0.01198532, + "auxiliary_loss_mlp": 0.01189872, + "balance_loss_clip": 1.00569892, + "balance_loss_mlp": 1.00133085, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 1.884060255927886, + "language_loss": 0.88942313, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91330719, + "num_input_tokens_seen": 3639610, + "step": 172, + "time_per_iteration": 2.6427156925201416 + }, + { + "auxiliary_loss_clip": 0.01198161, + "auxiliary_loss_mlp": 0.01190103, + "balance_loss_clip": 1.00552452, + "balance_loss_mlp": 1.00165725, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.6292787527186177, + "language_loss": 0.80986476, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83374739, + "num_input_tokens_seen": 3664030, + "step": 173, + "time_per_iteration": 2.7262039184570312 + }, + { + "auxiliary_loss_clip": 0.01198429, + "auxiliary_loss_mlp": 0.01190127, + "balance_loss_clip": 1.00567889, + "balance_loss_mlp": 1.00139534, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 1.8383928409693635, + "language_loss": 0.82665408, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85053962, + "num_input_tokens_seen": 3683615, + "step": 174, + "time_per_iteration": 2.723815441131592 + }, + { + "auxiliary_loss_clip": 0.01198371, + "auxiliary_loss_mlp": 0.01190318, + "balance_loss_clip": 1.00562453, + "balance_loss_mlp": 1.00177705, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.1966871361736384, + "language_loss": 0.72558087, + "learning_rate": 3.325358726641591e-06, + "loss": 0.74946779, + "num_input_tokens_seen": 3704540, + "step": 175, + "time_per_iteration": 2.7073423862457275 + }, + { + "auxiliary_loss_clip": 0.01198314, + "auxiliary_loss_mlp": 0.01190198, + "balance_loss_clip": 1.00563645, + "balance_loss_mlp": 1.00175178, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.465803524674434, + "language_loss": 0.97798854, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00187361, + "num_input_tokens_seen": 3721320, + "step": 176, + "time_per_iteration": 2.653751850128174 + }, + { + "auxiliary_loss_clip": 0.01198303, + "auxiliary_loss_mlp": 0.0119012, + "balance_loss_clip": 1.00555253, + "balance_loss_mlp": 1.00176942, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.2985862991518586, + "language_loss": 0.76644039, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79032457, + "num_input_tokens_seen": 3739385, + "step": 177, + "time_per_iteration": 2.6357083320617676 + }, + { + "auxiliary_loss_clip": 0.01198312, + "auxiliary_loss_mlp": 0.01190106, + "balance_loss_clip": 1.00552654, + "balance_loss_mlp": 1.00137436, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 3.205987655819358, + "language_loss": 0.76575351, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.78963768, + "num_input_tokens_seen": 3756360, + "step": 178, + "time_per_iteration": 2.6140151023864746 + }, + { + "auxiliary_loss_clip": 0.01198405, + "auxiliary_loss_mlp": 0.01189684, + "balance_loss_clip": 1.00565672, + "balance_loss_mlp": 1.00133371, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.15828026220234, + "language_loss": 0.84126055, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86514139, + "num_input_tokens_seen": 3773930, + "step": 179, + "time_per_iteration": 2.6074414253234863 + }, + { + "auxiliary_loss_clip": 0.0119807, + "auxiliary_loss_mlp": 0.01190047, + "balance_loss_clip": 1.00533748, + "balance_loss_mlp": 1.00141001, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 1.9973460935351353, + "language_loss": 0.83738095, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.8612622, + "num_input_tokens_seen": 3793630, + "step": 180, + "time_per_iteration": 2.7164783477783203 + }, + { + "auxiliary_loss_clip": 0.01198096, + "auxiliary_loss_mlp": 0.01190033, + "balance_loss_clip": 1.00543427, + "balance_loss_mlp": 1.00149107, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 2.4991970237588585, + "language_loss": 0.7777741, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80165541, + "num_input_tokens_seen": 3813610, + "step": 181, + "time_per_iteration": 2.6593470573425293 + }, + { + "auxiliary_loss_clip": 0.0119822, + "auxiliary_loss_mlp": 0.01190701, + "balance_loss_clip": 1.00535154, + "balance_loss_mlp": 1.0021596, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 2.7701479354182803, + "language_loss": 0.76390517, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78779435, + "num_input_tokens_seen": 3831390, + "step": 182, + "time_per_iteration": 2.6334190368652344 + }, + { + "auxiliary_loss_clip": 0.01198197, + "auxiliary_loss_mlp": 0.01190015, + "balance_loss_clip": 1.00544381, + "balance_loss_mlp": 1.00175929, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.1881675322572094, + "language_loss": 0.8748337, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.89871579, + "num_input_tokens_seen": 3849705, + "step": 183, + "time_per_iteration": 2.6205086708068848 + }, + { + "auxiliary_loss_clip": 0.01198345, + "auxiliary_loss_mlp": 0.01189898, + "balance_loss_clip": 1.00558317, + "balance_loss_mlp": 1.00145197, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.272149973708258, + "language_loss": 0.86490238, + "learning_rate": 3.357647774369736e-06, + "loss": 0.88878483, + "num_input_tokens_seen": 3869230, + "step": 184, + "time_per_iteration": 2.637148141860962 + }, + { + "auxiliary_loss_clip": 0.01198164, + "auxiliary_loss_mlp": 0.01190055, + "balance_loss_clip": 1.00543356, + "balance_loss_mlp": 1.00170445, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.7812149067688587, + "language_loss": 0.8336553, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.85753751, + "num_input_tokens_seen": 3889735, + "step": 185, + "time_per_iteration": 2.7105400562286377 + }, + { + "auxiliary_loss_clip": 0.01198161, + "auxiliary_loss_mlp": 0.01189948, + "balance_loss_clip": 1.0053761, + "balance_loss_mlp": 1.00140643, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.376841115293308, + "language_loss": 0.71034205, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73422313, + "num_input_tokens_seen": 3908855, + "step": 186, + "time_per_iteration": 2.652974843978882 + }, + { + "auxiliary_loss_clip": 0.01198167, + "auxiliary_loss_mlp": 0.01190003, + "balance_loss_clip": 1.00543177, + "balance_loss_mlp": 1.00136662, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 2.288830010042691, + "language_loss": 1.02119637, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04507816, + "num_input_tokens_seen": 3923865, + "step": 187, + "time_per_iteration": 2.6372804641723633 + }, + { + "auxiliary_loss_clip": 0.01198035, + "auxiliary_loss_mlp": 0.01189621, + "balance_loss_clip": 1.00538599, + "balance_loss_mlp": 1.00127029, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.8577322568119317, + "language_loss": 0.74988306, + "learning_rate": 3.371494591560139e-06, + "loss": 0.7737596, + "num_input_tokens_seen": 3946870, + "step": 188, + "time_per_iteration": 2.8312370777130127 + }, + { + "auxiliary_loss_clip": 0.01202994, + "auxiliary_loss_mlp": 0.01189679, + "balance_loss_clip": 1.010535, + "balance_loss_mlp": 1.00199628, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.752290508779987, + "language_loss": 0.56248009, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58640683, + "num_input_tokens_seen": 4010005, + "step": 189, + "time_per_iteration": 3.22642183303833 + }, + { + "auxiliary_loss_clip": 0.0119789, + "auxiliary_loss_mlp": 0.01190266, + "balance_loss_clip": 1.00515938, + "balance_loss_mlp": 1.00182021, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.352735731180211, + "language_loss": 0.95013458, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97401607, + "num_input_tokens_seen": 4029035, + "step": 190, + "time_per_iteration": 2.6699676513671875 + }, + { + "auxiliary_loss_clip": 0.01197983, + "auxiliary_loss_mlp": 0.01189797, + "balance_loss_clip": 1.00530207, + "balance_loss_mlp": 1.00135112, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 3.2049502844535587, + "language_loss": 0.84619117, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.87006903, + "num_input_tokens_seen": 4046995, + "step": 191, + "time_per_iteration": 2.731593370437622 + }, + { + "auxiliary_loss_clip": 0.0119788, + "auxiliary_loss_mlp": 0.01190559, + "balance_loss_clip": 1.00518167, + "balance_loss_mlp": 1.00220811, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 3.921759266177051, + "language_loss": 0.91764295, + "learning_rate": 3.385049875042367e-06, + "loss": 0.94152737, + "num_input_tokens_seen": 4065865, + "step": 192, + "time_per_iteration": 2.6758694648742676 + }, + { + "auxiliary_loss_clip": 0.01197824, + "auxiliary_loss_mlp": 0.01189692, + "balance_loss_clip": 1.00511312, + "balance_loss_mlp": 1.00134146, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.247650125293302, + "language_loss": 0.86754018, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89141536, + "num_input_tokens_seen": 4085305, + "step": 193, + "time_per_iteration": 2.6542675495147705 + }, + { + "auxiliary_loss_clip": 0.01198003, + "auxiliary_loss_mlp": 0.01189585, + "balance_loss_clip": 1.00532341, + "balance_loss_mlp": 1.00123405, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.240136553203972, + "language_loss": 0.92294502, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.94682097, + "num_input_tokens_seen": 4105185, + "step": 194, + "time_per_iteration": 2.692167043685913 + }, + { + "auxiliary_loss_clip": 0.01197728, + "auxiliary_loss_mlp": 0.0118975, + "balance_loss_clip": 1.00510573, + "balance_loss_mlp": 1.00130439, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 2.7714753323166628, + "language_loss": 0.896056, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.91993082, + "num_input_tokens_seen": 4123160, + "step": 195, + "time_per_iteration": 2.610602617263794 + }, + { + "auxiliary_loss_clip": 0.0119779, + "auxiliary_loss_mlp": 0.01190443, + "balance_loss_clip": 1.00508952, + "balance_loss_mlp": 1.00190187, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.5105009938192797, + "language_loss": 0.85784703, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.8817293, + "num_input_tokens_seen": 4140425, + "step": 196, + "time_per_iteration": 2.7884161472320557 + }, + { + "auxiliary_loss_clip": 0.01197607, + "auxiliary_loss_mlp": 0.01189971, + "balance_loss_clip": 1.00493288, + "balance_loss_mlp": 1.00152493, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.417677455953858, + "language_loss": 0.93289399, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95676982, + "num_input_tokens_seen": 4159555, + "step": 197, + "time_per_iteration": 2.6382532119750977 + }, + { + "auxiliary_loss_clip": 0.01197908, + "auxiliary_loss_mlp": 0.01189887, + "balance_loss_clip": 1.00523889, + "balance_loss_mlp": 1.00153637, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.903438556882413, + "language_loss": 0.79010367, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81398165, + "num_input_tokens_seen": 4180480, + "step": 198, + "time_per_iteration": 2.6955442428588867 + }, + { + "auxiliary_loss_clip": 0.01197884, + "auxiliary_loss_mlp": 0.01189659, + "balance_loss_clip": 1.00525808, + "balance_loss_mlp": 1.00169003, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 1.7725003917474442, + "language_loss": 0.88241929, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.9062947, + "num_input_tokens_seen": 4198835, + "step": 199, + "time_per_iteration": 2.625756025314331 + }, + { + "auxiliary_loss_clip": 0.01197777, + "auxiliary_loss_mlp": 0.01189881, + "balance_loss_clip": 1.00508356, + "balance_loss_mlp": 1.00153005, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 2.0330616146775617, + "language_loss": 0.81227303, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83614957, + "num_input_tokens_seen": 4219335, + "step": 200, + "time_per_iteration": 2.6357421875 + }, + { + "auxiliary_loss_clip": 0.01197777, + "auxiliary_loss_mlp": 0.0118976, + "balance_loss_clip": 1.00509274, + "balance_loss_mlp": 1.00131392, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.263371094575809, + "language_loss": 0.87553674, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.89941216, + "num_input_tokens_seen": 4236940, + "step": 201, + "time_per_iteration": 2.583216428756714 + }, + { + "auxiliary_loss_clip": 0.01197672, + "auxiliary_loss_mlp": 0.01189694, + "balance_loss_clip": 1.00511956, + "balance_loss_mlp": 1.00153422, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 1.674980022978983, + "language_loss": 0.8418839, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86575752, + "num_input_tokens_seen": 4256755, + "step": 202, + "time_per_iteration": 2.6771607398986816 + }, + { + "auxiliary_loss_clip": 0.01197492, + "auxiliary_loss_mlp": 0.01189771, + "balance_loss_clip": 1.00492942, + "balance_loss_mlp": 1.00151587, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.7568429595638613, + "language_loss": 0.90073991, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.92461258, + "num_input_tokens_seen": 4276505, + "step": 203, + "time_per_iteration": 2.6277027130126953 + }, + { + "auxiliary_loss_clip": 0.01202206, + "auxiliary_loss_mlp": 0.01189387, + "balance_loss_clip": 1.00995922, + "balance_loss_mlp": 1.00170374, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.01152319481204, + "language_loss": 0.61258155, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.6364975, + "num_input_tokens_seen": 4330965, + "step": 204, + "time_per_iteration": 3.0880796909332275 + }, + { + "auxiliary_loss_clip": 0.01197688, + "auxiliary_loss_mlp": 0.01189831, + "balance_loss_clip": 1.00505161, + "balance_loss_mlp": 1.00157595, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 3.6367781696819246, + "language_loss": 0.91442347, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.9382987, + "num_input_tokens_seen": 4348200, + "step": 205, + "time_per_iteration": 2.5958166122436523 + }, + { + "auxiliary_loss_clip": 0.01197673, + "auxiliary_loss_mlp": 0.01190005, + "balance_loss_clip": 1.00506639, + "balance_loss_mlp": 1.00174952, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 3.441022349519803, + "language_loss": 0.8916406, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91551739, + "num_input_tokens_seen": 4365460, + "step": 206, + "time_per_iteration": 2.6325695514678955 + }, + { + "auxiliary_loss_clip": 0.01197665, + "auxiliary_loss_mlp": 0.01189797, + "balance_loss_clip": 1.00499797, + "balance_loss_mlp": 1.00144625, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.6002904521291366, + "language_loss": 0.95663154, + "learning_rate": 3.43348263905683e-06, + "loss": 0.98050618, + "num_input_tokens_seen": 4383650, + "step": 207, + "time_per_iteration": 5.518757343292236 + }, + { + "auxiliary_loss_clip": 0.0119752, + "auxiliary_loss_mlp": 0.01190048, + "balance_loss_clip": 1.0048306, + "balance_loss_mlp": 1.00169778, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 1.7169301655930678, + "language_loss": 0.75887096, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78274661, + "num_input_tokens_seen": 4403765, + "step": 208, + "time_per_iteration": 5.56721568107605 + }, + { + "auxiliary_loss_clip": 0.01197497, + "auxiliary_loss_mlp": 0.01189559, + "balance_loss_clip": 1.00490224, + "balance_loss_mlp": 1.00149453, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.202118428309667, + "language_loss": 0.98465627, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.0085268, + "num_input_tokens_seen": 4421935, + "step": 209, + "time_per_iteration": 2.67669415473938 + }, + { + "auxiliary_loss_clip": 0.01197515, + "auxiliary_loss_mlp": 0.01189921, + "balance_loss_clip": 1.00493121, + "balance_loss_mlp": 1.00166607, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 3.400877483703355, + "language_loss": 0.85759866, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.88147306, + "num_input_tokens_seen": 4441470, + "step": 210, + "time_per_iteration": 2.761005163192749 + }, + { + "auxiliary_loss_clip": 0.01197392, + "auxiliary_loss_mlp": 0.01190047, + "balance_loss_clip": 1.00482202, + "balance_loss_mlp": 1.00179172, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.3804961489436276, + "language_loss": 0.9695977, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99347204, + "num_input_tokens_seen": 4459950, + "step": 211, + "time_per_iteration": 2.6928277015686035 + }, + { + "auxiliary_loss_clip": 0.01197561, + "auxiliary_loss_mlp": 0.01189776, + "balance_loss_clip": 1.00498986, + "balance_loss_mlp": 1.00180686, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 2.8344804610920007, + "language_loss": 0.94812012, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97199357, + "num_input_tokens_seen": 4478390, + "step": 212, + "time_per_iteration": 2.644266366958618 + }, + { + "auxiliary_loss_clip": 0.01197285, + "auxiliary_loss_mlp": 0.01189499, + "balance_loss_clip": 1.00475073, + "balance_loss_mlp": 1.00172067, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 1.8009076871568994, + "language_loss": 0.76009393, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78396177, + "num_input_tokens_seen": 4501665, + "step": 213, + "time_per_iteration": 2.916585922241211 + }, + { + "auxiliary_loss_clip": 0.01197324, + "auxiliary_loss_mlp": 0.01189758, + "balance_loss_clip": 1.00471318, + "balance_loss_mlp": 1.00159812, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 3.8414626108550434, + "language_loss": 0.86762214, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89149296, + "num_input_tokens_seen": 4519055, + "step": 214, + "time_per_iteration": 2.607252597808838 + }, + { + "auxiliary_loss_clip": 0.01197228, + "auxiliary_loss_mlp": 0.01189801, + "balance_loss_clip": 1.00469732, + "balance_loss_mlp": 1.00192714, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 4.889485403533222, + "language_loss": 0.77288663, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.79675686, + "num_input_tokens_seen": 4540870, + "step": 215, + "time_per_iteration": 2.6508991718292236 + }, + { + "auxiliary_loss_clip": 0.01197311, + "auxiliary_loss_mlp": 0.0118966, + "balance_loss_clip": 1.00476265, + "balance_loss_mlp": 1.00159526, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.1767710074477358, + "language_loss": 0.90497994, + "learning_rate": 3.460884739729461e-06, + "loss": 0.92884958, + "num_input_tokens_seen": 4560395, + "step": 216, + "time_per_iteration": 2.6622304916381836 + }, + { + "auxiliary_loss_clip": 0.01197184, + "auxiliary_loss_mlp": 0.01189714, + "balance_loss_clip": 1.00461793, + "balance_loss_mlp": 1.00145888, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.941309090047621, + "language_loss": 0.93586677, + "learning_rate": 3.463858658104523e-06, + "loss": 0.95973575, + "num_input_tokens_seen": 4575785, + "step": 217, + "time_per_iteration": 2.6234285831451416 + }, + { + "auxiliary_loss_clip": 0.01197246, + "auxiliary_loss_mlp": 0.01189164, + "balance_loss_clip": 1.00470448, + "balance_loss_mlp": 1.00128996, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 2.1824201233483547, + "language_loss": 0.9356792, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.95954323, + "num_input_tokens_seen": 4594985, + "step": 218, + "time_per_iteration": 2.612476348876953 + }, + { + "auxiliary_loss_clip": 0.01197187, + "auxiliary_loss_mlp": 0.01189451, + "balance_loss_clip": 1.00468588, + "balance_loss_mlp": 1.00157762, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 2.520061632054662, + "language_loss": 0.86043602, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88430238, + "num_input_tokens_seen": 4616125, + "step": 219, + "time_per_iteration": 2.6598451137542725 + }, + { + "auxiliary_loss_clip": 0.01196989, + "auxiliary_loss_mlp": 0.0118924, + "balance_loss_clip": 1.00448966, + "balance_loss_mlp": 1.00127029, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 2.0776285600863016, + "language_loss": 0.8770684, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90093064, + "num_input_tokens_seen": 4637795, + "step": 220, + "time_per_iteration": 2.7263011932373047 + }, + { + "auxiliary_loss_clip": 0.01197112, + "auxiliary_loss_mlp": 0.01189517, + "balance_loss_clip": 1.00453687, + "balance_loss_mlp": 1.00183403, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.9719401608550822, + "language_loss": 0.86287987, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88674617, + "num_input_tokens_seen": 4656835, + "step": 221, + "time_per_iteration": 2.6072022914886475 + }, + { + "auxiliary_loss_clip": 0.01197048, + "auxiliary_loss_mlp": 0.01189656, + "balance_loss_clip": 1.00450778, + "balance_loss_mlp": 1.00178182, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.326744824099607, + "language_loss": 0.92212188, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94598889, + "num_input_tokens_seen": 4673015, + "step": 222, + "time_per_iteration": 2.600328207015991 + }, + { + "auxiliary_loss_clip": 0.01197027, + "auxiliary_loss_mlp": 0.0118938, + "balance_loss_clip": 1.00460219, + "balance_loss_mlp": 1.00150657, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.481368925460819, + "language_loss": 0.95646131, + "learning_rate": 3.481419351635897e-06, + "loss": 0.9803254, + "num_input_tokens_seen": 4692355, + "step": 223, + "time_per_iteration": 2.6082961559295654 + }, + { + "auxiliary_loss_clip": 0.01197027, + "auxiliary_loss_mlp": 0.01189539, + "balance_loss_clip": 1.00457954, + "balance_loss_mlp": 1.00156951, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 6.376938208513726, + "language_loss": 0.88175416, + "learning_rate": 3.484300126837776e-06, + "loss": 0.9056198, + "num_input_tokens_seen": 4710080, + "step": 224, + "time_per_iteration": 2.5898544788360596 + }, + { + "auxiliary_loss_clip": 0.01196888, + "auxiliary_loss_mlp": 0.01189148, + "balance_loss_clip": 1.00440264, + "balance_loss_mlp": 1.00146508, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 3.0460931682712973, + "language_loss": 0.89388084, + "learning_rate": 3.487168070036317e-06, + "loss": 0.91774124, + "num_input_tokens_seen": 4728980, + "step": 225, + "time_per_iteration": 2.6307175159454346 + }, + { + "auxiliary_loss_clip": 0.01196874, + "auxiliary_loss_mlp": 0.01189716, + "balance_loss_clip": 1.00440145, + "balance_loss_mlp": 1.00165105, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 1.8761166706248384, + "language_loss": 0.98869747, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01256347, + "num_input_tokens_seen": 4747020, + "step": 226, + "time_per_iteration": 2.6372244358062744 + }, + { + "auxiliary_loss_clip": 0.01197079, + "auxiliary_loss_mlp": 0.01189632, + "balance_loss_clip": 1.00456381, + "balance_loss_mlp": 1.00156784, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 3.503857125411034, + "language_loss": 0.90811473, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.9319818, + "num_input_tokens_seen": 4765000, + "step": 227, + "time_per_iteration": 2.635289192199707 + }, + { + "auxiliary_loss_clip": 0.01200947, + "auxiliary_loss_mlp": 0.01188811, + "balance_loss_clip": 1.00878501, + "balance_loss_mlp": 1.00189042, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9367162195034241, + "language_loss": 0.57635546, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.60025299, + "num_input_tokens_seen": 4833210, + "step": 228, + "time_per_iteration": 3.251800298690796 + }, + { + "auxiliary_loss_clip": 0.0119667, + "auxiliary_loss_mlp": 0.01189172, + "balance_loss_clip": 1.00424194, + "balance_loss_mlp": 1.00139356, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.5220101962496675, + "language_loss": 0.87815011, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90200853, + "num_input_tokens_seen": 4850120, + "step": 229, + "time_per_iteration": 2.62094783782959 + }, + { + "auxiliary_loss_clip": 0.01196958, + "auxiliary_loss_mlp": 0.01189554, + "balance_loss_clip": 1.00442719, + "balance_loss_mlp": 1.0014894, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 3.003700298973854, + "language_loss": 0.84325755, + "learning_rate": 3.501319237118231e-06, + "loss": 0.86712265, + "num_input_tokens_seen": 4866215, + "step": 230, + "time_per_iteration": 2.6376359462738037 + }, + { + "auxiliary_loss_clip": 0.01196983, + "auxiliary_loss_mlp": 0.01189764, + "balance_loss_clip": 1.00444627, + "balance_loss_mlp": 1.00189018, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.9448086182811335, + "language_loss": 0.90416396, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92803144, + "num_input_tokens_seen": 4885630, + "step": 231, + "time_per_iteration": 2.6210241317749023 + }, + { + "auxiliary_loss_clip": 0.01196966, + "auxiliary_loss_mlp": 0.01189326, + "balance_loss_clip": 1.00451612, + "balance_loss_mlp": 1.00135744, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.006576973478107, + "language_loss": 0.83577144, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.8596344, + "num_input_tokens_seen": 4905570, + "step": 232, + "time_per_iteration": 2.653870105743408 + }, + { + "auxiliary_loss_clip": 0.0119679, + "auxiliary_loss_mlp": 0.01189096, + "balance_loss_clip": 1.00429773, + "balance_loss_mlp": 1.00131798, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 7.388985372097826, + "language_loss": 0.73891145, + "learning_rate": 3.509663010692652e-06, + "loss": 0.7627703, + "num_input_tokens_seen": 4923535, + "step": 233, + "time_per_iteration": 2.665163278579712 + }, + { + "auxiliary_loss_clip": 0.01197032, + "auxiliary_loss_mlp": 0.01189585, + "balance_loss_clip": 1.0045929, + "balance_loss_mlp": 1.00171089, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.3082826904198317, + "language_loss": 0.85722268, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88108885, + "num_input_tokens_seen": 4939200, + "step": 234, + "time_per_iteration": 2.6334338188171387 + }, + { + "auxiliary_loss_clip": 0.0119675, + "auxiliary_loss_mlp": 0.01189305, + "balance_loss_clip": 1.00431967, + "balance_loss_mlp": 1.00152612, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.4626154186771614, + "language_loss": 0.89326119, + "learning_rate": 3.515166054308634e-06, + "loss": 0.91712177, + "num_input_tokens_seen": 4956620, + "step": 235, + "time_per_iteration": 2.5753941535949707 + }, + { + "auxiliary_loss_clip": 0.01196714, + "auxiliary_loss_mlp": 0.01189409, + "balance_loss_clip": 1.00433826, + "balance_loss_mlp": 1.00144005, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 2.3022497606773236, + "language_loss": 0.85472214, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.87858331, + "num_input_tokens_seen": 4975650, + "step": 236, + "time_per_iteration": 2.6254401206970215 + }, + { + "auxiliary_loss_clip": 0.01196614, + "auxiliary_loss_mlp": 0.01189237, + "balance_loss_clip": 1.00417972, + "balance_loss_mlp": 1.00145876, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.311252398868999, + "language_loss": 0.82422549, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84808403, + "num_input_tokens_seen": 4997415, + "step": 237, + "time_per_iteration": 2.7351584434509277 + }, + { + "auxiliary_loss_clip": 0.01196747, + "auxiliary_loss_mlp": 0.01189751, + "balance_loss_clip": 1.00438726, + "balance_loss_mlp": 1.00178182, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 1.9575272381392193, + "language_loss": 0.77236903, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79623401, + "num_input_tokens_seen": 5013905, + "step": 238, + "time_per_iteration": 2.6617209911346436 + }, + { + "auxiliary_loss_clip": 0.01196809, + "auxiliary_loss_mlp": 0.01189539, + "balance_loss_clip": 1.0044198, + "balance_loss_mlp": 1.00176072, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.8098447372243778, + "language_loss": 0.87048137, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89434493, + "num_input_tokens_seen": 5033645, + "step": 239, + "time_per_iteration": 2.6498730182647705 + }, + { + "auxiliary_loss_clip": 0.01196694, + "auxiliary_loss_mlp": 0.01189093, + "balance_loss_clip": 1.00426149, + "balance_loss_mlp": 1.00141013, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.116379377141379, + "language_loss": 0.93080628, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95466411, + "num_input_tokens_seen": 5052875, + "step": 240, + "time_per_iteration": 2.6558585166931152 + }, + { + "auxiliary_loss_clip": 0.0119656, + "auxiliary_loss_mlp": 0.01189268, + "balance_loss_clip": 1.00417233, + "balance_loss_mlp": 1.00158536, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.1649776752487755, + "language_loss": 0.84932435, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87318254, + "num_input_tokens_seen": 5075005, + "step": 241, + "time_per_iteration": 2.6592862606048584 + }, + { + "auxiliary_loss_clip": 0.01196554, + "auxiliary_loss_mlp": 0.01189324, + "balance_loss_clip": 1.00426841, + "balance_loss_mlp": 1.00173676, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 3.1845268278008874, + "language_loss": 0.88347769, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90733647, + "num_input_tokens_seen": 5091875, + "step": 242, + "time_per_iteration": 2.5946109294891357 + }, + { + "auxiliary_loss_clip": 0.01196566, + "auxiliary_loss_mlp": 0.01189185, + "balance_loss_clip": 1.00415659, + "balance_loss_mlp": 1.00140631, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.321088157039347, + "language_loss": 0.8698957, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89375317, + "num_input_tokens_seen": 5111290, + "step": 243, + "time_per_iteration": 2.629685401916504 + }, + { + "auxiliary_loss_clip": 0.0119659, + "auxiliary_loss_mlp": 0.01189487, + "balance_loss_clip": 1.00423169, + "balance_loss_mlp": 1.00170839, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.5966875515349568, + "language_loss": 0.84253514, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86639589, + "num_input_tokens_seen": 5132265, + "step": 244, + "time_per_iteration": 2.6218924522399902 + }, + { + "auxiliary_loss_clip": 0.01196674, + "auxiliary_loss_mlp": 0.0118922, + "balance_loss_clip": 1.00423479, + "balance_loss_mlp": 1.0014416, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.57022906867706, + "language_loss": 0.7891466, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81300557, + "num_input_tokens_seen": 5148575, + "step": 245, + "time_per_iteration": 3.9794790744781494 + }, + { + "auxiliary_loss_clip": 0.01196571, + "auxiliary_loss_mlp": 0.01189269, + "balance_loss_clip": 1.00420749, + "balance_loss_mlp": 1.00149012, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 3.134292860402328, + "language_loss": 0.8408699, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86472827, + "num_input_tokens_seen": 5170415, + "step": 246, + "time_per_iteration": 5.46756911277771 + }, + { + "auxiliary_loss_clip": 0.01196522, + "auxiliary_loss_mlp": 0.01189146, + "balance_loss_clip": 1.00415397, + "balance_loss_mlp": 1.00146294, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 3.7313659938596477, + "language_loss": 0.90024018, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92409682, + "num_input_tokens_seen": 5188565, + "step": 247, + "time_per_iteration": 3.9485960006713867 + }, + { + "auxiliary_loss_clip": 0.01196433, + "auxiliary_loss_mlp": 0.01189091, + "balance_loss_clip": 1.00406981, + "balance_loss_mlp": 1.00131249, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.2123911365366626, + "language_loss": 0.78255159, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80640686, + "num_input_tokens_seen": 5207810, + "step": 248, + "time_per_iteration": 2.64158296585083 + }, + { + "auxiliary_loss_clip": 0.01196529, + "auxiliary_loss_mlp": 0.01189183, + "balance_loss_clip": 1.00415993, + "balance_loss_mlp": 1.00149965, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.105798600639768, + "language_loss": 0.83739758, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86125469, + "num_input_tokens_seen": 5226210, + "step": 249, + "time_per_iteration": 2.6466305255889893 + }, + { + "auxiliary_loss_clip": 0.01196415, + "auxiliary_loss_mlp": 0.01189201, + "balance_loss_clip": 1.00402021, + "balance_loss_mlp": 1.00151753, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.031822679091406, + "language_loss": 0.93383342, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95768958, + "num_input_tokens_seen": 5241660, + "step": 250, + "time_per_iteration": 2.6302542686462402 + }, + { + "auxiliary_loss_clip": 0.01196681, + "auxiliary_loss_mlp": 0.01189716, + "balance_loss_clip": 1.00425625, + "balance_loss_mlp": 1.0018419, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.372120861887689, + "language_loss": 0.96491522, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.98877919, + "num_input_tokens_seen": 5261090, + "step": 251, + "time_per_iteration": 2.6296637058258057 + }, + { + "auxiliary_loss_clip": 0.0119651, + "auxiliary_loss_mlp": 0.01189196, + "balance_loss_clip": 1.00420308, + "balance_loss_mlp": 1.00141716, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.27985916359479, + "language_loss": 0.8425706, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86642766, + "num_input_tokens_seen": 5279175, + "step": 252, + "time_per_iteration": 2.6380090713500977 + }, + { + "auxiliary_loss_clip": 0.01196391, + "auxiliary_loss_mlp": 0.01189299, + "balance_loss_clip": 1.00411808, + "balance_loss_mlp": 1.00152099, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.259495410737159, + "language_loss": 0.98140025, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00525713, + "num_input_tokens_seen": 5296975, + "step": 253, + "time_per_iteration": 2.5986509323120117 + }, + { + "auxiliary_loss_clip": 0.01200616, + "auxiliary_loss_mlp": 0.01188156, + "balance_loss_clip": 1.00876474, + "balance_loss_mlp": 1.00123537, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8467665854645436, + "language_loss": 0.55620903, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.58009672, + "num_input_tokens_seen": 5358375, + "step": 254, + "time_per_iteration": 3.1496598720550537 + }, + { + "auxiliary_loss_clip": 0.01196415, + "auxiliary_loss_mlp": 0.01189618, + "balance_loss_clip": 1.00401247, + "balance_loss_mlp": 1.00174451, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.0556316309230906, + "language_loss": 0.90028572, + "learning_rate": 3.567754632921479e-06, + "loss": 0.924146, + "num_input_tokens_seen": 5377255, + "step": 255, + "time_per_iteration": 2.6362111568450928 + }, + { + "auxiliary_loss_clip": 0.01196427, + "auxiliary_loss_mlp": 0.01190001, + "balance_loss_clip": 1.00406337, + "balance_loss_mlp": 1.00222278, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.4215518487357115, + "language_loss": 0.85414088, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.87800515, + "num_input_tokens_seen": 5395320, + "step": 256, + "time_per_iteration": 2.6034648418426514 + }, + { + "auxiliary_loss_clip": 0.01196388, + "auxiliary_loss_mlp": 0.01189332, + "balance_loss_clip": 1.00399816, + "balance_loss_mlp": 1.00164938, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.830215934883893, + "language_loss": 0.71704412, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.74090135, + "num_input_tokens_seen": 5411970, + "step": 257, + "time_per_iteration": 2.55806565284729 + }, + { + "auxiliary_loss_clip": 0.01196356, + "auxiliary_loss_mlp": 0.01189114, + "balance_loss_clip": 1.00410199, + "balance_loss_mlp": 1.00152659, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 2.492586721260394, + "language_loss": 0.94714808, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97100282, + "num_input_tokens_seen": 5430245, + "step": 258, + "time_per_iteration": 2.603466272354126 + }, + { + "auxiliary_loss_clip": 0.01196213, + "auxiliary_loss_mlp": 0.01189012, + "balance_loss_clip": 1.00394702, + "balance_loss_mlp": 1.00142407, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 2.7012615126628767, + "language_loss": 0.92889118, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95274341, + "num_input_tokens_seen": 5448905, + "step": 259, + "time_per_iteration": 2.5960710048675537 + }, + { + "auxiliary_loss_clip": 0.01196124, + "auxiliary_loss_mlp": 0.01188929, + "balance_loss_clip": 1.00396061, + "balance_loss_mlp": 1.00134134, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 1.7811576153196267, + "language_loss": 0.97117329, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99502385, + "num_input_tokens_seen": 5466405, + "step": 260, + "time_per_iteration": 2.6157524585723877 + }, + { + "auxiliary_loss_clip": 0.011963, + "auxiliary_loss_mlp": 0.01189317, + "balance_loss_clip": 1.00400734, + "balance_loss_mlp": 1.00172901, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.0513827058003478, + "language_loss": 0.87933493, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90319109, + "num_input_tokens_seen": 5487055, + "step": 261, + "time_per_iteration": 2.686302661895752 + }, + { + "auxiliary_loss_clip": 0.01196366, + "auxiliary_loss_mlp": 0.01189294, + "balance_loss_clip": 1.00404692, + "balance_loss_mlp": 1.00170684, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 2.9283301100771526, + "language_loss": 0.67220688, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69606352, + "num_input_tokens_seen": 5506600, + "step": 262, + "time_per_iteration": 2.623112678527832 + }, + { + "auxiliary_loss_clip": 0.01196206, + "auxiliary_loss_mlp": 0.01189376, + "balance_loss_clip": 1.00387311, + "balance_loss_mlp": 1.00197923, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.343257237515015, + "language_loss": 0.68159056, + "learning_rate": 3.587643540438383e-06, + "loss": 0.70544636, + "num_input_tokens_seen": 5524350, + "step": 263, + "time_per_iteration": 2.6193137168884277 + }, + { + "auxiliary_loss_clip": 0.01196233, + "auxiliary_loss_mlp": 0.01189266, + "balance_loss_clip": 1.00391436, + "balance_loss_mlp": 1.00148797, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 2.4996685328477772, + "language_loss": 0.84968626, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87354124, + "num_input_tokens_seen": 5542145, + "step": 264, + "time_per_iteration": 2.6315152645111084 + }, + { + "auxiliary_loss_clip": 0.01196269, + "auxiliary_loss_mlp": 0.01188893, + "balance_loss_clip": 1.00399041, + "balance_loss_mlp": 1.0013051, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 3.26720664131763, + "language_loss": 1.04177427, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06562591, + "num_input_tokens_seen": 5557920, + "step": 265, + "time_per_iteration": 2.614022731781006 + }, + { + "auxiliary_loss_clip": 0.01196401, + "auxiliary_loss_mlp": 0.01189395, + "balance_loss_clip": 1.00409627, + "balance_loss_mlp": 1.00180757, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.4715371463528264, + "language_loss": 0.74755454, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77141249, + "num_input_tokens_seen": 5576290, + "step": 266, + "time_per_iteration": 2.608466625213623 + }, + { + "auxiliary_loss_clip": 0.01196232, + "auxiliary_loss_mlp": 0.0118917, + "balance_loss_clip": 1.00388598, + "balance_loss_mlp": 1.00167799, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.968498966841145, + "language_loss": 0.907462, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93131602, + "num_input_tokens_seen": 5595205, + "step": 267, + "time_per_iteration": 2.5959932804107666 + }, + { + "auxiliary_loss_clip": 0.01196243, + "auxiliary_loss_mlp": 0.01189174, + "balance_loss_clip": 1.00402141, + "balance_loss_mlp": 1.0015862, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.528509821708748, + "language_loss": 0.85926867, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88312286, + "num_input_tokens_seen": 5612645, + "step": 268, + "time_per_iteration": 2.5890953540802 + }, + { + "auxiliary_loss_clip": 0.01196307, + "auxiliary_loss_mlp": 0.01188738, + "balance_loss_clip": 1.00409174, + "balance_loss_mlp": 1.00124562, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 2.171604349054064, + "language_loss": 0.88212365, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90597415, + "num_input_tokens_seen": 5628345, + "step": 269, + "time_per_iteration": 2.5602545738220215 + }, + { + "auxiliary_loss_clip": 0.01196157, + "auxiliary_loss_mlp": 0.01189023, + "balance_loss_clip": 1.00391734, + "balance_loss_mlp": 1.00134015, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.069023463520461, + "language_loss": 0.96810937, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99196124, + "num_input_tokens_seen": 5645940, + "step": 270, + "time_per_iteration": 2.583491802215576 + }, + { + "auxiliary_loss_clip": 0.01196161, + "auxiliary_loss_mlp": 0.01188986, + "balance_loss_clip": 1.00402546, + "balance_loss_mlp": 1.00168419, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.150667786230526, + "language_loss": 0.85978532, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88363677, + "num_input_tokens_seen": 5665690, + "step": 271, + "time_per_iteration": 2.6140856742858887 + }, + { + "auxiliary_loss_clip": 0.01196232, + "auxiliary_loss_mlp": 0.01189054, + "balance_loss_clip": 1.0039711, + "balance_loss_mlp": 1.0015614, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 3.777119505217844, + "language_loss": 0.81091458, + "learning_rate": 3.609307900676025e-06, + "loss": 0.8347674, + "num_input_tokens_seen": 5683190, + "step": 272, + "time_per_iteration": 2.6258769035339355 + }, + { + "auxiliary_loss_clip": 0.01196041, + "auxiliary_loss_mlp": 0.01189345, + "balance_loss_clip": 1.00385451, + "balance_loss_mlp": 1.00185275, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.321004855224325, + "language_loss": 0.81243157, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83628547, + "num_input_tokens_seen": 5699780, + "step": 273, + "time_per_iteration": 2.6371164321899414 + }, + { + "auxiliary_loss_clip": 0.01196154, + "auxiliary_loss_mlp": 0.01188807, + "balance_loss_clip": 1.00387549, + "balance_loss_mlp": 1.00121951, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.4341819304872243, + "language_loss": 0.91391158, + "learning_rate": 3.614024787585744e-06, + "loss": 0.93776125, + "num_input_tokens_seen": 5716980, + "step": 274, + "time_per_iteration": 2.666344165802002 + }, + { + "auxiliary_loss_clip": 0.01196022, + "auxiliary_loss_mlp": 0.01189129, + "balance_loss_clip": 1.00378752, + "balance_loss_mlp": 1.00173223, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.8197571138976538, + "language_loss": 0.87776423, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90161568, + "num_input_tokens_seen": 5737780, + "step": 275, + "time_per_iteration": 2.621798515319824 + }, + { + "auxiliary_loss_clip": 0.01195993, + "auxiliary_loss_mlp": 0.01188964, + "balance_loss_clip": 1.0037905, + "balance_loss_mlp": 1.00147152, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 2.2510947547207807, + "language_loss": 0.80580497, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.82965451, + "num_input_tokens_seen": 5758330, + "step": 276, + "time_per_iteration": 2.599912166595459 + }, + { + "auxiliary_loss_clip": 0.01196162, + "auxiliary_loss_mlp": 0.01189018, + "balance_loss_clip": 1.00400543, + "balance_loss_mlp": 1.00143015, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 3.309913633731033, + "language_loss": 0.81235683, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83620858, + "num_input_tokens_seen": 5778340, + "step": 277, + "time_per_iteration": 2.6981449127197266 + }, + { + "auxiliary_loss_clip": 0.01195755, + "auxiliary_loss_mlp": 0.01188782, + "balance_loss_clip": 1.00364256, + "balance_loss_mlp": 1.00129008, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.220134016126655, + "language_loss": 0.80438411, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82822943, + "num_input_tokens_seen": 5794295, + "step": 278, + "time_per_iteration": 2.5426158905029297 + }, + { + "auxiliary_loss_clip": 0.01196037, + "auxiliary_loss_mlp": 0.01188772, + "balance_loss_clip": 1.00384665, + "balance_loss_mlp": 1.0013746, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.709106971756958, + "language_loss": 0.90861773, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.93246579, + "num_input_tokens_seen": 5814405, + "step": 279, + "time_per_iteration": 2.6168596744537354 + }, + { + "auxiliary_loss_clip": 0.01195917, + "auxiliary_loss_mlp": 0.01189111, + "balance_loss_clip": 1.00377822, + "balance_loss_mlp": 1.00180936, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 2.2912235440720563, + "language_loss": 0.9409914, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.9648416, + "num_input_tokens_seen": 5832795, + "step": 280, + "time_per_iteration": 2.564607620239258 + }, + { + "auxiliary_loss_clip": 0.01195829, + "auxiliary_loss_mlp": 0.01188951, + "balance_loss_clip": 1.00369227, + "balance_loss_mlp": 1.00174439, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 2.416434623161272, + "language_loss": 0.74121857, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76506639, + "num_input_tokens_seen": 5855750, + "step": 281, + "time_per_iteration": 2.651789903640747 + }, + { + "auxiliary_loss_clip": 0.01196083, + "auxiliary_loss_mlp": 0.01188957, + "balance_loss_clip": 1.00391746, + "balance_loss_mlp": 1.00156045, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.617236314453541, + "language_loss": 0.80218887, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82603925, + "num_input_tokens_seen": 5872610, + "step": 282, + "time_per_iteration": 2.545279026031494 + }, + { + "auxiliary_loss_clip": 0.01195934, + "auxiliary_loss_mlp": 0.01189546, + "balance_loss_clip": 1.00376558, + "balance_loss_mlp": 1.00195813, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.0775163723876284, + "language_loss": 0.77764386, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80149865, + "num_input_tokens_seen": 5892985, + "step": 283, + "time_per_iteration": 2.581019878387451 + }, + { + "auxiliary_loss_clip": 0.01196142, + "auxiliary_loss_mlp": 0.0118854, + "balance_loss_clip": 1.00404239, + "balance_loss_mlp": 1.00133348, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.1533051253623436, + "language_loss": 0.84320319, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86705005, + "num_input_tokens_seen": 5914060, + "step": 284, + "time_per_iteration": 5.6949546337127686 + }, + { + "auxiliary_loss_clip": 0.01195889, + "auxiliary_loss_mlp": 0.01188782, + "balance_loss_clip": 1.00379443, + "balance_loss_mlp": 1.00148022, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 2.964390884785365, + "language_loss": 0.967484, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99133068, + "num_input_tokens_seen": 5932860, + "step": 285, + "time_per_iteration": 4.006603002548218 + }, + { + "auxiliary_loss_clip": 0.01195979, + "auxiliary_loss_mlp": 0.0118901, + "balance_loss_clip": 1.00381184, + "balance_loss_mlp": 1.00132728, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.3395439751215252, + "language_loss": 0.93863863, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96248853, + "num_input_tokens_seen": 5952725, + "step": 286, + "time_per_iteration": 2.6471364498138428 + }, + { + "auxiliary_loss_clip": 0.01195836, + "auxiliary_loss_mlp": 0.01188717, + "balance_loss_clip": 1.00379634, + "balance_loss_mlp": 1.0012244, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.803151441748243, + "language_loss": 0.92239988, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94624531, + "num_input_tokens_seen": 5970560, + "step": 287, + "time_per_iteration": 2.6287968158721924 + }, + { + "auxiliary_loss_clip": 0.01195743, + "auxiliary_loss_mlp": 0.01188543, + "balance_loss_clip": 1.0036025, + "balance_loss_mlp": 1.00124097, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 2.7527520380660637, + "language_loss": 1.01791787, + "learning_rate": 3.646109470232502e-06, + "loss": 1.0417608, + "num_input_tokens_seen": 5982980, + "step": 288, + "time_per_iteration": 2.552377462387085 + }, + { + "auxiliary_loss_clip": 0.01200336, + "auxiliary_loss_mlp": 0.01187577, + "balance_loss_clip": 1.0087254, + "balance_loss_mlp": 1.00065696, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9000190490973553, + "language_loss": 0.63852626, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66240543, + "num_input_tokens_seen": 6049445, + "step": 289, + "time_per_iteration": 3.2744288444519043 + }, + { + "auxiliary_loss_clip": 0.01195869, + "auxiliary_loss_mlp": 0.011893, + "balance_loss_clip": 1.00377953, + "balance_loss_mlp": 1.00180769, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.8926025318219875, + "language_loss": 0.8854053, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90925699, + "num_input_tokens_seen": 6064150, + "step": 290, + "time_per_iteration": 2.5803816318511963 + }, + { + "auxiliary_loss_clip": 0.01195829, + "auxiliary_loss_mlp": 0.01188305, + "balance_loss_clip": 1.00374341, + "balance_loss_mlp": 1.00100374, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 1.982544111726939, + "language_loss": 0.84558254, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.86942393, + "num_input_tokens_seen": 6083920, + "step": 291, + "time_per_iteration": 2.636390447616577 + }, + { + "auxiliary_loss_clip": 0.01195781, + "auxiliary_loss_mlp": 0.01188713, + "balance_loss_clip": 1.00381708, + "balance_loss_mlp": 1.00150621, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.583754348187385, + "language_loss": 0.72663367, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75047851, + "num_input_tokens_seen": 6105460, + "step": 292, + "time_per_iteration": 2.684086799621582 + }, + { + "auxiliary_loss_clip": 0.01195733, + "auxiliary_loss_mlp": 0.01188977, + "balance_loss_clip": 1.00370502, + "balance_loss_mlp": 1.00157976, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.2026247415434295, + "language_loss": 0.87325001, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89709711, + "num_input_tokens_seen": 6122890, + "step": 293, + "time_per_iteration": 2.581516981124878 + }, + { + "auxiliary_loss_clip": 0.01195877, + "auxiliary_loss_mlp": 0.01188881, + "balance_loss_clip": 1.00384974, + "balance_loss_mlp": 1.00157952, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.981256114552499, + "language_loss": 0.80932868, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83317626, + "num_input_tokens_seen": 6142890, + "step": 294, + "time_per_iteration": 2.627128839492798 + }, + { + "auxiliary_loss_clip": 0.01195513, + "auxiliary_loss_mlp": 0.01188955, + "balance_loss_clip": 1.00350797, + "balance_loss_mlp": 1.00174868, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 1.7574786943235818, + "language_loss": 0.83951032, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86335504, + "num_input_tokens_seen": 6162030, + "step": 295, + "time_per_iteration": 2.6295437812805176 + }, + { + "auxiliary_loss_clip": 0.01195845, + "auxiliary_loss_mlp": 0.01188691, + "balance_loss_clip": 1.0038259, + "balance_loss_mlp": 1.00167525, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.6831430224791952, + "language_loss": 0.84508246, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.86892784, + "num_input_tokens_seen": 6180540, + "step": 296, + "time_per_iteration": 2.6106345653533936 + }, + { + "auxiliary_loss_clip": 0.0119593, + "auxiliary_loss_mlp": 0.01188629, + "balance_loss_clip": 1.00385833, + "balance_loss_mlp": 1.00151849, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 3.09600894028505, + "language_loss": 0.87649119, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90033674, + "num_input_tokens_seen": 6199425, + "step": 297, + "time_per_iteration": 2.575117826461792 + }, + { + "auxiliary_loss_clip": 0.01195718, + "auxiliary_loss_mlp": 0.01188867, + "balance_loss_clip": 1.00368524, + "balance_loss_mlp": 1.00156498, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.656018462211922, + "language_loss": 0.88507712, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90892291, + "num_input_tokens_seen": 6219170, + "step": 298, + "time_per_iteration": 2.5842907428741455 + }, + { + "auxiliary_loss_clip": 0.01195649, + "auxiliary_loss_mlp": 0.01189099, + "balance_loss_clip": 1.00375581, + "balance_loss_mlp": 1.00208378, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 5.1357415347000925, + "language_loss": 0.88793486, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91178238, + "num_input_tokens_seen": 6237930, + "step": 299, + "time_per_iteration": 2.549039125442505 + }, + { + "auxiliary_loss_clip": 0.01195517, + "auxiliary_loss_mlp": 0.01188703, + "balance_loss_clip": 1.00359499, + "balance_loss_mlp": 1.00159168, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 29.292687862251523, + "language_loss": 0.64757854, + "learning_rate": 3.672392800539357e-06, + "loss": 0.67142069, + "num_input_tokens_seen": 6257170, + "step": 300, + "time_per_iteration": 2.6170897483825684 + }, + { + "auxiliary_loss_clip": 0.01195644, + "auxiliary_loss_mlp": 0.0118893, + "balance_loss_clip": 1.00376129, + "balance_loss_mlp": 1.00172353, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.060264953928373, + "language_loss": 0.88389766, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90774339, + "num_input_tokens_seen": 6274780, + "step": 301, + "time_per_iteration": 2.5831546783447266 + }, + { + "auxiliary_loss_clip": 0.01200129, + "auxiliary_loss_mlp": 0.011876, + "balance_loss_clip": 1.00860333, + "balance_loss_mlp": 1.00068021, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8323369525012493, + "language_loss": 0.62187815, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64575541, + "num_input_tokens_seen": 6340435, + "step": 302, + "time_per_iteration": 3.293363094329834 + }, + { + "auxiliary_loss_clip": 0.01195381, + "auxiliary_loss_mlp": 0.01189064, + "balance_loss_clip": 1.00346816, + "balance_loss_mlp": 1.00176215, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.461075331699343, + "language_loss": 0.89666498, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92050946, + "num_input_tokens_seen": 6358160, + "step": 303, + "time_per_iteration": 2.568181276321411 + }, + { + "auxiliary_loss_clip": 0.01195629, + "auxiliary_loss_mlp": 0.01189185, + "balance_loss_clip": 1.0036453, + "balance_loss_mlp": 1.00197923, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 1.7300593648850187, + "language_loss": 0.80278075, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82662892, + "num_input_tokens_seen": 6378485, + "step": 304, + "time_per_iteration": 2.6392970085144043 + }, + { + "auxiliary_loss_clip": 0.01195553, + "auxiliary_loss_mlp": 0.01188649, + "balance_loss_clip": 1.00368834, + "balance_loss_mlp": 1.00153828, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.600144989189916, + "language_loss": 0.82762557, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85146761, + "num_input_tokens_seen": 6397845, + "step": 305, + "time_per_iteration": 2.6197519302368164 + }, + { + "auxiliary_loss_clip": 0.01195659, + "auxiliary_loss_mlp": 0.01188406, + "balance_loss_clip": 1.00369287, + "balance_loss_mlp": 1.00129485, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.972801560656903, + "language_loss": 0.91021764, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93405831, + "num_input_tokens_seen": 6416475, + "step": 306, + "time_per_iteration": 2.581664800643921 + }, + { + "auxiliary_loss_clip": 0.01195469, + "auxiliary_loss_mlp": 0.01188386, + "balance_loss_clip": 1.00354242, + "balance_loss_mlp": 1.00137019, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 3.962062880276381, + "language_loss": 0.86700284, + "learning_rate": 3.687243426879095e-06, + "loss": 0.89084136, + "num_input_tokens_seen": 6437520, + "step": 307, + "time_per_iteration": 2.7095370292663574 + }, + { + "auxiliary_loss_clip": 0.01195473, + "auxiliary_loss_mlp": 0.01188597, + "balance_loss_clip": 1.00362241, + "balance_loss_mlp": 1.00158131, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.3879511307987302, + "language_loss": 0.71733969, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74118042, + "num_input_tokens_seen": 6455680, + "step": 308, + "time_per_iteration": 2.5967280864715576 + }, + { + "auxiliary_loss_clip": 0.01195551, + "auxiliary_loss_mlp": 0.01188716, + "balance_loss_clip": 1.0035696, + "balance_loss_mlp": 1.00150979, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.095328864770589, + "language_loss": 0.91570282, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.93954545, + "num_input_tokens_seen": 6474880, + "step": 309, + "time_per_iteration": 2.6106972694396973 + }, + { + "auxiliary_loss_clip": 0.01195667, + "auxiliary_loss_mlp": 0.01188623, + "balance_loss_clip": 1.00361145, + "balance_loss_mlp": 1.00132096, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.721082847260732, + "language_loss": 0.72593683, + "learning_rate": 3.69350459956065e-06, + "loss": 0.7497797, + "num_input_tokens_seen": 6495945, + "step": 310, + "time_per_iteration": 2.6513044834136963 + }, + { + "auxiliary_loss_clip": 0.01195448, + "auxiliary_loss_mlp": 0.011888, + "balance_loss_clip": 1.00355136, + "balance_loss_mlp": 1.00168884, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 2.0718102649051984, + "language_loss": 0.73915815, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76300061, + "num_input_tokens_seen": 6519930, + "step": 311, + "time_per_iteration": 2.80580472946167 + }, + { + "auxiliary_loss_clip": 0.01195536, + "auxiliary_loss_mlp": 0.01188588, + "balance_loss_clip": 1.00367749, + "balance_loss_mlp": 1.0016681, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 3.8653247995189677, + "language_loss": 0.91332006, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93716133, + "num_input_tokens_seen": 6535070, + "step": 312, + "time_per_iteration": 2.5719566345214844 + }, + { + "auxiliary_loss_clip": 0.01195451, + "auxiliary_loss_mlp": 0.01188721, + "balance_loss_clip": 1.00353599, + "balance_loss_mlp": 1.00180042, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.1679773149126214, + "language_loss": 0.89780474, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92164642, + "num_input_tokens_seen": 6554135, + "step": 313, + "time_per_iteration": 2.5847339630126953 + }, + { + "auxiliary_loss_clip": 0.01195374, + "auxiliary_loss_mlp": 0.01188613, + "balance_loss_clip": 1.00349164, + "balance_loss_mlp": 1.00150251, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.314220830426184, + "language_loss": 0.73118353, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75502342, + "num_input_tokens_seen": 6572275, + "step": 314, + "time_per_iteration": 2.61759352684021 + }, + { + "auxiliary_loss_clip": 0.01195382, + "auxiliary_loss_mlp": 0.01188646, + "balance_loss_clip": 1.0035187, + "balance_loss_mlp": 1.00143921, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.603604123704256, + "language_loss": 0.8954159, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.91925621, + "num_input_tokens_seen": 6594520, + "step": 315, + "time_per_iteration": 2.6917049884796143 + }, + { + "auxiliary_loss_clip": 0.01195478, + "auxiliary_loss_mlp": 0.01188318, + "balance_loss_clip": 1.00354171, + "balance_loss_mlp": 1.00120676, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.7733430587619035, + "language_loss": 0.80513006, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.82896805, + "num_input_tokens_seen": 6614245, + "step": 316, + "time_per_iteration": 2.587440252304077 + }, + { + "auxiliary_loss_clip": 0.01195221, + "auxiliary_loss_mlp": 0.0118829, + "balance_loss_clip": 1.0034368, + "balance_loss_mlp": 1.00117946, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 3.3732873862899795, + "language_loss": 0.90190047, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92573559, + "num_input_tokens_seen": 6632015, + "step": 317, + "time_per_iteration": 2.5647947788238525 + }, + { + "auxiliary_loss_clip": 0.01195349, + "auxiliary_loss_mlp": 0.01188463, + "balance_loss_clip": 1.00359464, + "balance_loss_mlp": 1.00154257, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.7161321576850836, + "language_loss": 0.9079169, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93175507, + "num_input_tokens_seen": 6649015, + "step": 318, + "time_per_iteration": 2.5558419227600098 + }, + { + "auxiliary_loss_clip": 0.01195255, + "auxiliary_loss_mlp": 0.01188349, + "balance_loss_clip": 1.00341797, + "balance_loss_mlp": 1.00142884, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.3643659781338924, + "language_loss": 0.9346894, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.95852542, + "num_input_tokens_seen": 6669225, + "step": 319, + "time_per_iteration": 2.6755619049072266 + }, + { + "auxiliary_loss_clip": 0.01199647, + "auxiliary_loss_mlp": 0.01187582, + "balance_loss_clip": 1.00820553, + "balance_loss_mlp": 1.00066209, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9270643241184998, + "language_loss": 0.59817725, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62204951, + "num_input_tokens_seen": 6725775, + "step": 320, + "time_per_iteration": 3.0912675857543945 + }, + { + "auxiliary_loss_clip": 0.01195145, + "auxiliary_loss_mlp": 0.01188614, + "balance_loss_clip": 1.00342691, + "balance_loss_mlp": 1.00169373, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.007136238291209, + "language_loss": 0.89952546, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92336309, + "num_input_tokens_seen": 6744170, + "step": 321, + "time_per_iteration": 2.5621864795684814 + }, + { + "auxiliary_loss_clip": 0.01195438, + "auxiliary_loss_mlp": 0.01188602, + "balance_loss_clip": 1.00366247, + "balance_loss_mlp": 1.00168133, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.2282896319654997, + "language_loss": 0.82848024, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85232067, + "num_input_tokens_seen": 6764565, + "step": 322, + "time_per_iteration": 4.085370779037476 + }, + { + "auxiliary_loss_clip": 0.01195371, + "auxiliary_loss_mlp": 0.01188282, + "balance_loss_clip": 1.00349557, + "balance_loss_mlp": 1.001266, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.433947934857394, + "language_loss": 0.72347045, + "learning_rate": 3.719954063833981e-06, + "loss": 0.74730706, + "num_input_tokens_seen": 6785310, + "step": 323, + "time_per_iteration": 5.480280876159668 + }, + { + "auxiliary_loss_clip": 0.01195295, + "auxiliary_loss_mlp": 0.01188168, + "balance_loss_clip": 1.00339425, + "balance_loss_mlp": 1.00124788, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.111462608660464, + "language_loss": 0.92621696, + "learning_rate": 3.721944334919596e-06, + "loss": 0.95005155, + "num_input_tokens_seen": 6803290, + "step": 324, + "time_per_iteration": 2.596975803375244 + }, + { + "auxiliary_loss_clip": 0.01195367, + "auxiliary_loss_mlp": 0.01188111, + "balance_loss_clip": 1.00352669, + "balance_loss_mlp": 1.00128627, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 2.7300967576181705, + "language_loss": 0.65461171, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67844647, + "num_input_tokens_seen": 6822570, + "step": 325, + "time_per_iteration": 2.5727033615112305 + }, + { + "auxiliary_loss_clip": 0.0119534, + "auxiliary_loss_mlp": 0.01188389, + "balance_loss_clip": 1.00357401, + "balance_loss_mlp": 1.00175452, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 2.08395278026487, + "language_loss": 0.7641443, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78798151, + "num_input_tokens_seen": 6841910, + "step": 326, + "time_per_iteration": 2.632094383239746 + }, + { + "auxiliary_loss_clip": 0.01195136, + "auxiliary_loss_mlp": 0.01188522, + "balance_loss_clip": 1.00329888, + "balance_loss_mlp": 1.00179267, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 4.92515585273936, + "language_loss": 0.79795378, + "learning_rate": 3.727878498433505e-06, + "loss": 0.82179034, + "num_input_tokens_seen": 6862480, + "step": 327, + "time_per_iteration": 2.6212589740753174 + }, + { + "auxiliary_loss_clip": 0.01195361, + "auxiliary_loss_mlp": 0.0118826, + "balance_loss_clip": 1.00367951, + "balance_loss_mlp": 1.00153017, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.5170098606751963, + "language_loss": 0.80955625, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83339244, + "num_input_tokens_seen": 6882015, + "step": 328, + "time_per_iteration": 2.6221635341644287 + }, + { + "auxiliary_loss_clip": 0.0119531, + "auxiliary_loss_mlp": 0.01188095, + "balance_loss_clip": 1.00342894, + "balance_loss_mlp": 1.00127006, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.957675046723522, + "language_loss": 0.93389726, + "learning_rate": 3.731804438545683e-06, + "loss": 0.95773125, + "num_input_tokens_seen": 6899785, + "step": 329, + "time_per_iteration": 2.6097590923309326 + }, + { + "auxiliary_loss_clip": 0.01195219, + "auxiliary_loss_mlp": 0.01188589, + "balance_loss_clip": 1.00345469, + "balance_loss_mlp": 1.00185943, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.3914242647949315, + "language_loss": 0.74414313, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.76798123, + "num_input_tokens_seen": 6918575, + "step": 330, + "time_per_iteration": 2.6095356941223145 + }, + { + "auxiliary_loss_clip": 0.01195166, + "auxiliary_loss_mlp": 0.01188536, + "balance_loss_clip": 1.0034585, + "balance_loss_mlp": 1.00171113, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 3.508696534673564, + "language_loss": 0.93723476, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96107185, + "num_input_tokens_seen": 6936965, + "step": 331, + "time_per_iteration": 2.5387051105499268 + }, + { + "auxiliary_loss_clip": 0.01195184, + "auxiliary_loss_mlp": 0.0118797, + "balance_loss_clip": 1.00353456, + "balance_loss_mlp": 1.00114536, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.7805604225571, + "language_loss": 0.92995578, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95378733, + "num_input_tokens_seen": 6953475, + "step": 332, + "time_per_iteration": 2.657576084136963 + }, + { + "auxiliary_loss_clip": 0.01195148, + "auxiliary_loss_mlp": 0.01188002, + "balance_loss_clip": 1.00341845, + "balance_loss_mlp": 1.00117731, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.3964764082415817, + "language_loss": 0.75653875, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78037024, + "num_input_tokens_seen": 6971630, + "step": 333, + "time_per_iteration": 2.578585147857666 + }, + { + "auxiliary_loss_clip": 0.01195311, + "auxiliary_loss_mlp": 0.0118811, + "balance_loss_clip": 1.00357318, + "balance_loss_mlp": 1.00128508, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.2396453046684077, + "language_loss": 0.78543758, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.80927181, + "num_input_tokens_seen": 6992775, + "step": 334, + "time_per_iteration": 2.703664779663086 + }, + { + "auxiliary_loss_clip": 0.01195068, + "auxiliary_loss_mlp": 0.01188354, + "balance_loss_clip": 1.00335968, + "balance_loss_mlp": 1.00152946, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 1.8206182653606637, + "language_loss": 0.83202863, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85586286, + "num_input_tokens_seen": 7011425, + "step": 335, + "time_per_iteration": 2.6080803871154785 + }, + { + "auxiliary_loss_clip": 0.01195057, + "auxiliary_loss_mlp": 0.01187976, + "balance_loss_clip": 1.00328243, + "balance_loss_mlp": 1.00115073, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.115445048889141, + "language_loss": 0.92351973, + "learning_rate": 3.745359722027911e-06, + "loss": 0.94735008, + "num_input_tokens_seen": 7029450, + "step": 336, + "time_per_iteration": 2.586108446121216 + }, + { + "auxiliary_loss_clip": 0.0119497, + "auxiliary_loss_mlp": 0.01188071, + "balance_loss_clip": 1.00333071, + "balance_loss_mlp": 1.00124621, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 2.025870797017505, + "language_loss": 0.88506806, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90889841, + "num_input_tokens_seen": 7047555, + "step": 337, + "time_per_iteration": 2.5777862071990967 + }, + { + "auxiliary_loss_clip": 0.01194855, + "auxiliary_loss_mlp": 0.01188131, + "balance_loss_clip": 1.00320673, + "balance_loss_mlp": 1.00149679, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.452953378610999, + "language_loss": 0.89973408, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.9235639, + "num_input_tokens_seen": 7068185, + "step": 338, + "time_per_iteration": 2.630661964416504 + }, + { + "auxiliary_loss_clip": 0.01195116, + "auxiliary_loss_mlp": 0.01188252, + "balance_loss_clip": 1.00343645, + "balance_loss_mlp": 1.00152254, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.1087454361802087, + "language_loss": 0.85121131, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87504494, + "num_input_tokens_seen": 7085955, + "step": 339, + "time_per_iteration": 2.551159620285034 + }, + { + "auxiliary_loss_clip": 0.01195074, + "auxiliary_loss_mlp": 0.01188408, + "balance_loss_clip": 1.00332355, + "balance_loss_mlp": 1.00158286, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7802967179365616, + "language_loss": 0.8892349, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91306973, + "num_input_tokens_seen": 7106345, + "step": 340, + "time_per_iteration": 2.614140748977661 + }, + { + "auxiliary_loss_clip": 0.01194994, + "auxiliary_loss_mlp": 0.01188041, + "balance_loss_clip": 1.00334215, + "balance_loss_mlp": 1.00112045, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.01825787940505, + "language_loss": 0.88235831, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90618867, + "num_input_tokens_seen": 7125070, + "step": 341, + "time_per_iteration": 2.579399824142456 + }, + { + "auxiliary_loss_clip": 0.01195036, + "auxiliary_loss_mlp": 0.01188405, + "balance_loss_clip": 1.00336516, + "balance_loss_mlp": 1.00186658, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 2.4688771007647996, + "language_loss": 0.80313289, + "learning_rate": 3.756755633390458e-06, + "loss": 0.8269673, + "num_input_tokens_seen": 7144675, + "step": 342, + "time_per_iteration": 2.6115124225616455 + }, + { + "auxiliary_loss_clip": 0.0119495, + "auxiliary_loss_mlp": 0.01188416, + "balance_loss_clip": 1.00324249, + "balance_loss_mlp": 1.00159144, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 1.7001247066259881, + "language_loss": 0.89240527, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91623896, + "num_input_tokens_seen": 7165505, + "step": 343, + "time_per_iteration": 2.591172218322754 + }, + { + "auxiliary_loss_clip": 0.01195031, + "auxiliary_loss_mlp": 0.01187728, + "balance_loss_clip": 1.0034411, + "balance_loss_mlp": 1.00138021, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 1.7924696437894792, + "language_loss": 0.78072643, + "learning_rate": 3.7605098841644e-06, + "loss": 0.8045541, + "num_input_tokens_seen": 7184605, + "step": 344, + "time_per_iteration": 2.6474668979644775 + }, + { + "auxiliary_loss_clip": 0.01194842, + "auxiliary_loss_mlp": 0.01188201, + "balance_loss_clip": 1.00319898, + "balance_loss_mlp": 1.00156713, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.8905738124988172, + "language_loss": 0.74931753, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77314794, + "num_input_tokens_seen": 7203065, + "step": 345, + "time_per_iteration": 2.577573537826538 + }, + { + "auxiliary_loss_clip": 0.01194973, + "auxiliary_loss_mlp": 0.01188171, + "balance_loss_clip": 1.00332654, + "balance_loss_mlp": 1.00163233, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.021016522332556, + "language_loss": 0.90005589, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92388731, + "num_input_tokens_seen": 7222995, + "step": 346, + "time_per_iteration": 2.6421661376953125 + }, + { + "auxiliary_loss_clip": 0.01194768, + "auxiliary_loss_mlp": 0.01188208, + "balance_loss_clip": 1.00315368, + "balance_loss_mlp": 1.00138283, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.616311300460923, + "language_loss": 0.79171324, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.815543, + "num_input_tokens_seen": 7244625, + "step": 347, + "time_per_iteration": 2.6123108863830566 + }, + { + "auxiliary_loss_clip": 0.01194822, + "auxiliary_loss_mlp": 0.01188276, + "balance_loss_clip": 1.00334525, + "balance_loss_mlp": 1.00154686, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.3490393248047146, + "language_loss": 0.71176416, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.73559517, + "num_input_tokens_seen": 7263255, + "step": 348, + "time_per_iteration": 2.6201136112213135 + }, + { + "auxiliary_loss_clip": 0.01194849, + "auxiliary_loss_mlp": 0.01188275, + "balance_loss_clip": 1.00316274, + "balance_loss_mlp": 1.00154543, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 2.5558047135656654, + "language_loss": 0.77031118, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79414248, + "num_input_tokens_seen": 7279275, + "step": 349, + "time_per_iteration": 2.5503854751586914 + }, + { + "auxiliary_loss_clip": 0.01194959, + "auxiliary_loss_mlp": 0.0118827, + "balance_loss_clip": 1.0034014, + "balance_loss_mlp": 1.00163579, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 2.3065739795075118, + "language_loss": 0.85015368, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87398601, + "num_input_tokens_seen": 7300180, + "step": 350, + "time_per_iteration": 2.634120225906372 + }, + { + "auxiliary_loss_clip": 0.0119492, + "auxiliary_loss_mlp": 0.01187735, + "balance_loss_clip": 1.00328493, + "balance_loss_mlp": 1.00129175, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.175876214385835, + "language_loss": 0.79830754, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82213408, + "num_input_tokens_seen": 7317430, + "step": 351, + "time_per_iteration": 2.5924086570739746 + }, + { + "auxiliary_loss_clip": 0.01194915, + "auxiliary_loss_mlp": 0.01188297, + "balance_loss_clip": 1.00338173, + "balance_loss_mlp": 1.00185406, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 2.1914925616598104, + "language_loss": 0.8720628, + "learning_rate": 3.775311735671078e-06, + "loss": 0.895895, + "num_input_tokens_seen": 7334875, + "step": 352, + "time_per_iteration": 2.6221508979797363 + }, + { + "auxiliary_loss_clip": 0.01194958, + "auxiliary_loss_mlp": 0.01188206, + "balance_loss_clip": 1.00337732, + "balance_loss_mlp": 1.00166726, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.845408340654051, + "language_loss": 0.82455993, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84839159, + "num_input_tokens_seen": 7355185, + "step": 353, + "time_per_iteration": 2.62787127494812 + }, + { + "auxiliary_loss_clip": 0.0119496, + "auxiliary_loss_mlp": 0.01187705, + "balance_loss_clip": 1.00343561, + "balance_loss_mlp": 1.00116599, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.4852208564260816, + "language_loss": 0.80992502, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83375168, + "num_input_tokens_seen": 7374425, + "step": 354, + "time_per_iteration": 2.608222484588623 + }, + { + "auxiliary_loss_clip": 0.01194782, + "auxiliary_loss_mlp": 0.01187867, + "balance_loss_clip": 1.00318742, + "balance_loss_mlp": 1.00132787, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.04563273431837, + "language_loss": 0.80894756, + "learning_rate": 3.780775860546545e-06, + "loss": 0.8327741, + "num_input_tokens_seen": 7394175, + "step": 355, + "time_per_iteration": 2.5919992923736572 + }, + { + "auxiliary_loss_clip": 0.01194793, + "auxiliary_loss_mlp": 0.01187874, + "balance_loss_clip": 1.00322604, + "balance_loss_mlp": 1.00133514, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2513378654304197, + "language_loss": 0.8925904, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91641712, + "num_input_tokens_seen": 7412645, + "step": 356, + "time_per_iteration": 2.579740524291992 + }, + { + "auxiliary_loss_clip": 0.01194867, + "auxiliary_loss_mlp": 0.01187735, + "balance_loss_clip": 1.00329685, + "balance_loss_mlp": 1.00119615, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 1.9218305817681935, + "language_loss": 0.80151606, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82534206, + "num_input_tokens_seen": 7432275, + "step": 357, + "time_per_iteration": 2.655843734741211 + }, + { + "auxiliary_loss_clip": 0.01194764, + "auxiliary_loss_mlp": 0.01187784, + "balance_loss_clip": 1.00320637, + "balance_loss_mlp": 1.0013411, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.575271850640687, + "language_loss": 0.76697099, + "learning_rate": 3.786194003461506e-06, + "loss": 0.79079652, + "num_input_tokens_seen": 7450245, + "step": 358, + "time_per_iteration": 2.5332231521606445 + }, + { + "auxiliary_loss_clip": 0.01194729, + "auxiliary_loss_mlp": 0.01187732, + "balance_loss_clip": 1.00316226, + "balance_loss_mlp": 1.00128865, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 2.2747348874679765, + "language_loss": 0.885849, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90967369, + "num_input_tokens_seen": 7466845, + "step": 359, + "time_per_iteration": 2.5307300090789795 + }, + { + "auxiliary_loss_clip": 0.01194728, + "auxiliary_loss_mlp": 0.0118774, + "balance_loss_clip": 1.00320899, + "balance_loss_mlp": 1.00129628, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.440508997599465, + "language_loss": 0.76219255, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78601718, + "num_input_tokens_seen": 7485450, + "step": 360, + "time_per_iteration": 5.442927122116089 + }, + { + "auxiliary_loss_clip": 0.01199493, + "auxiliary_loss_mlp": 0.01187127, + "balance_loss_clip": 1.00822127, + "balance_loss_mlp": 1.00097013, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8360019774882239, + "language_loss": 0.64918256, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67304879, + "num_input_tokens_seen": 7553780, + "step": 361, + "time_per_iteration": 4.73838996887207 + }, + { + "auxiliary_loss_clip": 0.01194598, + "auxiliary_loss_mlp": 0.01187674, + "balance_loss_clip": 1.003075, + "balance_loss_mlp": 1.00132644, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 2.9679594230888853, + "language_loss": 0.78447366, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80829644, + "num_input_tokens_seen": 7574155, + "step": 362, + "time_per_iteration": 2.601677894592285 + }, + { + "auxiliary_loss_clip": 0.01194607, + "auxiliary_loss_mlp": 0.01187904, + "balance_loss_clip": 1.00304759, + "balance_loss_mlp": 1.00136495, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 3.967864027370434, + "language_loss": 0.92456818, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94839323, + "num_input_tokens_seen": 7592320, + "step": 363, + "time_per_iteration": 2.5805792808532715 + }, + { + "auxiliary_loss_clip": 0.01194843, + "auxiliary_loss_mlp": 0.0118791, + "balance_loss_clip": 1.00320423, + "balance_loss_mlp": 1.00165784, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.5721113159834252, + "language_loss": 0.89596379, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.91979128, + "num_input_tokens_seen": 7611185, + "step": 364, + "time_per_iteration": 2.587489366531372 + }, + { + "auxiliary_loss_clip": 0.0119486, + "auxiliary_loss_mlp": 0.01187669, + "balance_loss_clip": 1.0032649, + "balance_loss_mlp": 1.00141633, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.1338885965183327, + "language_loss": 0.79582137, + "learning_rate": 3.798661793553676e-06, + "loss": 0.8196466, + "num_input_tokens_seen": 7631970, + "step": 365, + "time_per_iteration": 2.584927558898926 + }, + { + "auxiliary_loss_clip": 0.0119457, + "auxiliary_loss_mlp": 0.01187531, + "balance_loss_clip": 1.00312734, + "balance_loss_mlp": 1.00146914, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 1.9018488061455643, + "language_loss": 0.84250367, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86632478, + "num_input_tokens_seen": 7649745, + "step": 366, + "time_per_iteration": 2.535595178604126 + }, + { + "auxiliary_loss_clip": 0.01194794, + "auxiliary_loss_mlp": 0.01187665, + "balance_loss_clip": 1.0032711, + "balance_loss_mlp": 1.00150824, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.078130034729267, + "language_loss": 0.86971545, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89354008, + "num_input_tokens_seen": 7668830, + "step": 367, + "time_per_iteration": 2.566455364227295 + }, + { + "auxiliary_loss_clip": 0.01194738, + "auxiliary_loss_mlp": 0.01187827, + "balance_loss_clip": 1.00319099, + "balance_loss_mlp": 1.00147939, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 2.102676877449925, + "language_loss": 0.84712195, + "learning_rate": 3.803932100062912e-06, + "loss": 0.8709476, + "num_input_tokens_seen": 7687240, + "step": 368, + "time_per_iteration": 2.5724306106567383 + }, + { + "auxiliary_loss_clip": 0.01194703, + "auxiliary_loss_mlp": 0.01187736, + "balance_loss_clip": 1.0031383, + "balance_loss_mlp": 1.0011977, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 6.237232459975241, + "language_loss": 0.75404429, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77786863, + "num_input_tokens_seen": 7704440, + "step": 369, + "time_per_iteration": 2.5734853744506836 + }, + { + "auxiliary_loss_clip": 0.01194691, + "auxiliary_loss_mlp": 0.01188191, + "balance_loss_clip": 1.00315309, + "balance_loss_mlp": 1.00174737, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.350867254190639, + "language_loss": 0.82711917, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85094798, + "num_input_tokens_seen": 7727160, + "step": 370, + "time_per_iteration": 2.669332981109619 + }, + { + "auxiliary_loss_clip": 0.01194532, + "auxiliary_loss_mlp": 0.01187796, + "balance_loss_clip": 1.00305676, + "balance_loss_mlp": 1.00144768, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.7155135984288248, + "language_loss": 0.81752968, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.841353, + "num_input_tokens_seen": 7747730, + "step": 371, + "time_per_iteration": 2.577413558959961 + }, + { + "auxiliary_loss_clip": 0.01194772, + "auxiliary_loss_mlp": 0.01187796, + "balance_loss_clip": 1.00332737, + "balance_loss_mlp": 1.00116217, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.1902417479230993, + "language_loss": 0.83294153, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85676724, + "num_input_tokens_seen": 7766765, + "step": 372, + "time_per_iteration": 2.5835771560668945 + }, + { + "auxiliary_loss_clip": 0.01194431, + "auxiliary_loss_mlp": 0.01187703, + "balance_loss_clip": 1.00304055, + "balance_loss_mlp": 1.00154543, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.1998327841423606, + "language_loss": 0.7893573, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.81317872, + "num_input_tokens_seen": 7784010, + "step": 373, + "time_per_iteration": 2.533339262008667 + }, + { + "auxiliary_loss_clip": 0.01194568, + "auxiliary_loss_mlp": 0.01187456, + "balance_loss_clip": 1.0031476, + "balance_loss_mlp": 1.00129867, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.4439641630274416, + "language_loss": 0.7781496, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.80196983, + "num_input_tokens_seen": 7801305, + "step": 374, + "time_per_iteration": 2.5594615936279297 + }, + { + "auxiliary_loss_clip": 0.01194324, + "auxiliary_loss_mlp": 0.01187097, + "balance_loss_clip": 1.00288463, + "balance_loss_mlp": 1.00103533, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 1.6677828518264066, + "language_loss": 0.85898423, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88279843, + "num_input_tokens_seen": 7823965, + "step": 375, + "time_per_iteration": 2.6382100582122803 + }, + { + "auxiliary_loss_clip": 0.01194584, + "auxiliary_loss_mlp": 0.01188074, + "balance_loss_clip": 1.00315857, + "balance_loss_mlp": 1.00163078, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 2.4259077097914443, + "language_loss": 0.89069676, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91452336, + "num_input_tokens_seen": 7842115, + "step": 376, + "time_per_iteration": 2.577467679977417 + }, + { + "auxiliary_loss_clip": 0.01194578, + "auxiliary_loss_mlp": 0.01187417, + "balance_loss_clip": 1.00315046, + "balance_loss_mlp": 1.00116456, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 2.8312441711067673, + "language_loss": 0.75062382, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77444375, + "num_input_tokens_seen": 7857830, + "step": 377, + "time_per_iteration": 2.557812452316284 + }, + { + "auxiliary_loss_clip": 0.01194479, + "auxiliary_loss_mlp": 0.01187158, + "balance_loss_clip": 1.00309372, + "balance_loss_mlp": 1.001001, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 2.9426870086229897, + "language_loss": 0.99169958, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01551592, + "num_input_tokens_seen": 7875840, + "step": 378, + "time_per_iteration": 2.5797104835510254 + }, + { + "auxiliary_loss_clip": 0.011985, + "auxiliary_loss_mlp": 0.01186443, + "balance_loss_clip": 1.00751114, + "balance_loss_mlp": 1.00028563, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 1.1580428979706294, + "language_loss": 0.7545737, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77842319, + "num_input_tokens_seen": 7940190, + "step": 379, + "time_per_iteration": 3.2116410732269287 + }, + { + "auxiliary_loss_clip": 0.01194477, + "auxiliary_loss_mlp": 0.01187924, + "balance_loss_clip": 1.00301814, + "balance_loss_mlp": 1.00138521, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 2.5559873366336268, + "language_loss": 0.78308403, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80690801, + "num_input_tokens_seen": 7960840, + "step": 380, + "time_per_iteration": 2.7648441791534424 + }, + { + "auxiliary_loss_clip": 0.01194539, + "auxiliary_loss_mlp": 0.01187731, + "balance_loss_clip": 1.0031203, + "balance_loss_mlp": 1.00166893, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.536623511879267, + "language_loss": 0.96634555, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99016821, + "num_input_tokens_seen": 7975500, + "step": 381, + "time_per_iteration": 2.5596296787261963 + }, + { + "auxiliary_loss_clip": 0.01194642, + "auxiliary_loss_mlp": 0.01187646, + "balance_loss_clip": 1.00316882, + "balance_loss_mlp": 1.00158465, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.3616166185423655, + "language_loss": 0.87945324, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90327615, + "num_input_tokens_seen": 7993880, + "step": 382, + "time_per_iteration": 2.5854296684265137 + }, + { + "auxiliary_loss_clip": 0.01194614, + "auxiliary_loss_mlp": 0.01188111, + "balance_loss_clip": 1.00319171, + "balance_loss_mlp": 1.001858, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.077533626593386, + "language_loss": 0.84812057, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87194777, + "num_input_tokens_seen": 8012730, + "step": 383, + "time_per_iteration": 2.554922580718994 + }, + { + "auxiliary_loss_clip": 0.01194481, + "auxiliary_loss_mlp": 0.01188252, + "balance_loss_clip": 1.00309849, + "balance_loss_mlp": 1.00209451, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.2249611566225473, + "language_loss": 0.83265936, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85648668, + "num_input_tokens_seen": 8031275, + "step": 384, + "time_per_iteration": 2.550968647003174 + }, + { + "auxiliary_loss_clip": 0.01194643, + "auxiliary_loss_mlp": 0.0118753, + "balance_loss_clip": 1.00327563, + "balance_loss_mlp": 1.0014683, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 2.0336159693434537, + "language_loss": 0.89175045, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91557217, + "num_input_tokens_seen": 8051600, + "step": 385, + "time_per_iteration": 2.5597236156463623 + }, + { + "auxiliary_loss_clip": 0.01194607, + "auxiliary_loss_mlp": 0.01188377, + "balance_loss_clip": 1.00319076, + "balance_loss_mlp": 1.00231516, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 2.036711687908669, + "language_loss": 0.69716465, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72099447, + "num_input_tokens_seen": 8070600, + "step": 386, + "time_per_iteration": 2.5976431369781494 + }, + { + "auxiliary_loss_clip": 0.01194388, + "auxiliary_loss_mlp": 0.01187703, + "balance_loss_clip": 1.00305116, + "balance_loss_mlp": 1.00154567, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 2.401723643207114, + "language_loss": 0.88087153, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90469241, + "num_input_tokens_seen": 8090680, + "step": 387, + "time_per_iteration": 2.577296257019043 + }, + { + "auxiliary_loss_clip": 0.01194625, + "auxiliary_loss_mlp": 0.01187235, + "balance_loss_clip": 1.00327206, + "balance_loss_mlp": 1.00107813, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.26999784120669, + "language_loss": 0.83319426, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85701287, + "num_input_tokens_seen": 8114610, + "step": 388, + "time_per_iteration": 2.6322860717773438 + }, + { + "auxiliary_loss_clip": 0.01194507, + "auxiliary_loss_mlp": 0.0118781, + "balance_loss_clip": 1.00306726, + "balance_loss_mlp": 1.00146198, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 2.5820150140535496, + "language_loss": 0.93459821, + "learning_rate": 3.839663581888206e-06, + "loss": 0.95842147, + "num_input_tokens_seen": 8133975, + "step": 389, + "time_per_iteration": 2.578867197036743 + }, + { + "auxiliary_loss_clip": 0.01194443, + "auxiliary_loss_mlp": 0.01187297, + "balance_loss_clip": 1.003124, + "balance_loss_mlp": 1.00123549, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.260858611841845, + "language_loss": 0.87789571, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90171313, + "num_input_tokens_seen": 8153570, + "step": 390, + "time_per_iteration": 2.5613667964935303 + }, + { + "auxiliary_loss_clip": 0.0119453, + "auxiliary_loss_mlp": 0.01187589, + "balance_loss_clip": 1.00314736, + "balance_loss_mlp": 1.0013361, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.246475049108291, + "language_loss": 0.89153349, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91535467, + "num_input_tokens_seen": 8170075, + "step": 391, + "time_per_iteration": 2.5653703212738037 + }, + { + "auxiliary_loss_clip": 0.01194575, + "auxiliary_loss_mlp": 0.0118756, + "balance_loss_clip": 1.00314093, + "balance_loss_mlp": 1.00140309, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.403138400891863, + "language_loss": 0.860502, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88432336, + "num_input_tokens_seen": 8190420, + "step": 392, + "time_per_iteration": 2.616203546524048 + }, + { + "auxiliary_loss_clip": 0.01194355, + "auxiliary_loss_mlp": 0.01187337, + "balance_loss_clip": 1.00314844, + "balance_loss_mlp": 1.00137067, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 1.892118619830298, + "language_loss": 0.88897884, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91279578, + "num_input_tokens_seen": 8208790, + "step": 393, + "time_per_iteration": 2.5264391899108887 + }, + { + "auxiliary_loss_clip": 0.01194393, + "auxiliary_loss_mlp": 0.01187963, + "balance_loss_clip": 1.00319672, + "balance_loss_mlp": 1.00199676, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.2144273094381677, + "language_loss": 0.81567067, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83949423, + "num_input_tokens_seen": 8226885, + "step": 394, + "time_per_iteration": 2.524017810821533 + }, + { + "auxiliary_loss_clip": 0.0119431, + "auxiliary_loss_mlp": 0.01187763, + "balance_loss_clip": 1.00302625, + "balance_loss_mlp": 1.00160599, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.74349747614728, + "language_loss": 0.8599087, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88372946, + "num_input_tokens_seen": 8246825, + "step": 395, + "time_per_iteration": 2.5748345851898193 + }, + { + "auxiliary_loss_clip": 0.01198269, + "auxiliary_loss_mlp": 0.0118563, + "balance_loss_clip": 1.00735068, + "balance_loss_mlp": 1.00023544, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9300604551216766, + "language_loss": 0.63805133, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66189033, + "num_input_tokens_seen": 8302835, + "step": 396, + "time_per_iteration": 3.012664318084717 + }, + { + "auxiliary_loss_clip": 0.01194405, + "auxiliary_loss_mlp": 0.01187441, + "balance_loss_clip": 1.00314784, + "balance_loss_mlp": 1.00128388, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.4262342719404564, + "language_loss": 0.83564293, + "learning_rate": 3.852770440269372e-06, + "loss": 0.85946137, + "num_input_tokens_seen": 8320745, + "step": 397, + "time_per_iteration": 2.56622576713562 + }, + { + "auxiliary_loss_clip": 0.01194514, + "auxiliary_loss_mlp": 0.01187717, + "balance_loss_clip": 1.00309229, + "balance_loss_mlp": 1.00155938, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.3556751901084705, + "language_loss": 0.84326291, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86708522, + "num_input_tokens_seen": 8339540, + "step": 398, + "time_per_iteration": 3.97367787361145 + }, + { + "auxiliary_loss_clip": 0.01194326, + "auxiliary_loss_mlp": 0.01187262, + "balance_loss_clip": 1.00298929, + "balance_loss_mlp": 1.00110483, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 3.7058576976336832, + "language_loss": 0.85682368, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88063955, + "num_input_tokens_seen": 8354890, + "step": 399, + "time_per_iteration": 5.23486590385437 + }, + { + "auxiliary_loss_clip": 0.01194371, + "auxiliary_loss_mlp": 0.01187578, + "balance_loss_clip": 1.00314391, + "balance_loss_mlp": 1.00161099, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.1037399685241214, + "language_loss": 0.86031485, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88413441, + "num_input_tokens_seen": 8375845, + "step": 400, + "time_per_iteration": 4.0091657638549805 + }, + { + "auxiliary_loss_clip": 0.01194332, + "auxiliary_loss_mlp": 0.01187376, + "balance_loss_clip": 1.00311685, + "balance_loss_mlp": 1.00150502, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 2.372685336241767, + "language_loss": 0.79371953, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81753659, + "num_input_tokens_seen": 8395240, + "step": 401, + "time_per_iteration": 2.567586660385132 + }, + { + "auxiliary_loss_clip": 0.01194258, + "auxiliary_loss_mlp": 0.01187532, + "balance_loss_clip": 1.00304246, + "balance_loss_mlp": 1.00146997, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 4.118269132250222, + "language_loss": 0.78308785, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80690575, + "num_input_tokens_seen": 8416950, + "step": 402, + "time_per_iteration": 2.6002116203308105 + }, + { + "auxiliary_loss_clip": 0.01194356, + "auxiliary_loss_mlp": 0.01187281, + "balance_loss_clip": 1.00304008, + "balance_loss_mlp": 1.001315, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.7382056579077307, + "language_loss": 0.94943595, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97325236, + "num_input_tokens_seen": 8433660, + "step": 403, + "time_per_iteration": 2.586451292037964 + }, + { + "auxiliary_loss_clip": 0.01194307, + "auxiliary_loss_mlp": 0.01187292, + "balance_loss_clip": 1.00297213, + "balance_loss_mlp": 1.00123024, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.824769943408209, + "language_loss": 0.99788892, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02170491, + "num_input_tokens_seen": 8450180, + "step": 404, + "time_per_iteration": 2.522629737854004 + }, + { + "auxiliary_loss_clip": 0.01194268, + "auxiliary_loss_mlp": 0.01187694, + "balance_loss_clip": 1.00301969, + "balance_loss_mlp": 1.00163233, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.205694984225642, + "language_loss": 0.87539196, + "learning_rate": 3.865615797668091e-06, + "loss": 0.89921153, + "num_input_tokens_seen": 8467775, + "step": 405, + "time_per_iteration": 2.518195390701294 + }, + { + "auxiliary_loss_clip": 0.01194697, + "auxiliary_loss_mlp": 0.01187884, + "balance_loss_clip": 1.00336528, + "balance_loss_mlp": 1.00163126, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 2.1164952957084333, + "language_loss": 0.93485177, + "learning_rate": 3.867203596705844e-06, + "loss": 0.95867753, + "num_input_tokens_seen": 8486765, + "step": 406, + "time_per_iteration": 2.552990198135376 + }, + { + "auxiliary_loss_clip": 0.01194218, + "auxiliary_loss_mlp": 0.01187406, + "balance_loss_clip": 1.00299215, + "balance_loss_mlp": 1.00143945, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.1268037291305837, + "language_loss": 0.87087893, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89469516, + "num_input_tokens_seen": 8506515, + "step": 407, + "time_per_iteration": 2.5421714782714844 + }, + { + "auxiliary_loss_clip": 0.01194429, + "auxiliary_loss_mlp": 0.01187149, + "balance_loss_clip": 1.00325036, + "balance_loss_mlp": 1.00137341, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.1218095714805427, + "language_loss": 0.7388947, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76271045, + "num_input_tokens_seen": 8528035, + "step": 408, + "time_per_iteration": 2.713914394378662 + }, + { + "auxiliary_loss_clip": 0.01194242, + "auxiliary_loss_mlp": 0.01187377, + "balance_loss_clip": 1.00301766, + "balance_loss_mlp": 1.00150633, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.4208340259165086, + "language_loss": 0.92439735, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94821358, + "num_input_tokens_seen": 8546455, + "step": 409, + "time_per_iteration": 2.618255376815796 + }, + { + "auxiliary_loss_clip": 0.01194241, + "auxiliary_loss_mlp": 0.01187095, + "balance_loss_clip": 1.00302935, + "balance_loss_mlp": 1.00131941, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 2.1968192147041754, + "language_loss": 0.82681155, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85062486, + "num_input_tokens_seen": 8568450, + "step": 410, + "time_per_iteration": 2.6962523460388184 + }, + { + "auxiliary_loss_clip": 0.01194384, + "auxiliary_loss_mlp": 0.01187723, + "balance_loss_clip": 1.00316024, + "balance_loss_mlp": 1.00166154, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 2.3013540560730825, + "language_loss": 0.77789164, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80171263, + "num_input_tokens_seen": 8589340, + "step": 411, + "time_per_iteration": 2.6070172786712646 + }, + { + "auxiliary_loss_clip": 0.01194135, + "auxiliary_loss_mlp": 0.0118762, + "balance_loss_clip": 1.00294971, + "balance_loss_mlp": 1.00174904, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.285222528795849, + "language_loss": 0.8652488, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88906634, + "num_input_tokens_seen": 8607150, + "step": 412, + "time_per_iteration": 2.576167583465576 + }, + { + "auxiliary_loss_clip": 0.01197907, + "auxiliary_loss_mlp": 0.01185585, + "balance_loss_clip": 1.00715756, + "balance_loss_mlp": 1.00019062, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8416587833736375, + "language_loss": 0.58505148, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60888636, + "num_input_tokens_seen": 8669865, + "step": 413, + "time_per_iteration": 3.238638162612915 + }, + { + "auxiliary_loss_clip": 0.01194183, + "auxiliary_loss_mlp": 0.01187398, + "balance_loss_clip": 1.00292504, + "balance_loss_mlp": 1.00162244, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.678650049436651, + "language_loss": 0.80376935, + "learning_rate": 3.879766964750006e-06, + "loss": 0.82758516, + "num_input_tokens_seen": 8690235, + "step": 414, + "time_per_iteration": 2.6816625595092773 + }, + { + "auxiliary_loss_clip": 0.01194146, + "auxiliary_loss_mlp": 0.01187551, + "balance_loss_clip": 1.00299764, + "balance_loss_mlp": 1.00168037, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 3.733754861451954, + "language_loss": 0.7988987, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82271564, + "num_input_tokens_seen": 8706295, + "step": 415, + "time_per_iteration": 2.5390524864196777 + }, + { + "auxiliary_loss_clip": 0.01194325, + "auxiliary_loss_mlp": 0.01187367, + "balance_loss_clip": 1.0030551, + "balance_loss_mlp": 1.00140059, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 4.54706319819422, + "language_loss": 0.96210086, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98591781, + "num_input_tokens_seen": 8724200, + "step": 416, + "time_per_iteration": 2.5595500469207764 + }, + { + "auxiliary_loss_clip": 0.01194163, + "auxiliary_loss_mlp": 0.01187127, + "balance_loss_clip": 1.00295186, + "balance_loss_mlp": 1.00125623, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 1.6366198234907516, + "language_loss": 0.77599502, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79980797, + "num_input_tokens_seen": 8744170, + "step": 417, + "time_per_iteration": 2.5929811000823975 + }, + { + "auxiliary_loss_clip": 0.01194314, + "auxiliary_loss_mlp": 0.01187325, + "balance_loss_clip": 1.00319099, + "balance_loss_mlp": 1.00154901, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 2.1202468009038453, + "language_loss": 0.76994061, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79375702, + "num_input_tokens_seen": 8765120, + "step": 418, + "time_per_iteration": 2.6480629444122314 + }, + { + "auxiliary_loss_clip": 0.01194154, + "auxiliary_loss_mlp": 0.01187223, + "balance_loss_clip": 1.00300467, + "balance_loss_mlp": 1.00144684, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.2394593630821786, + "language_loss": 0.81543428, + "learning_rate": 3.887496375507294e-06, + "loss": 0.839248, + "num_input_tokens_seen": 8783500, + "step": 419, + "time_per_iteration": 2.5463812351226807 + }, + { + "auxiliary_loss_clip": 0.01194067, + "auxiliary_loss_mlp": 0.01187389, + "balance_loss_clip": 1.00295758, + "balance_loss_mlp": 1.00161326, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 2.0736900274694627, + "language_loss": 0.73569036, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.75950497, + "num_input_tokens_seen": 8801175, + "step": 420, + "time_per_iteration": 2.593942642211914 + }, + { + "auxiliary_loss_clip": 0.01194091, + "auxiliary_loss_mlp": 0.01187737, + "balance_loss_clip": 1.00291085, + "balance_loss_mlp": 1.00205612, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.8455962394061434, + "language_loss": 0.7892704, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81308877, + "num_input_tokens_seen": 8820215, + "step": 421, + "time_per_iteration": 2.5852668285369873 + }, + { + "auxiliary_loss_clip": 0.01194025, + "auxiliary_loss_mlp": 0.01187333, + "balance_loss_clip": 1.00294185, + "balance_loss_mlp": 1.00155687, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.6438398763279105, + "language_loss": 0.81726515, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84107876, + "num_input_tokens_seen": 8839660, + "step": 422, + "time_per_iteration": 2.639103889465332 + }, + { + "auxiliary_loss_clip": 0.01194158, + "auxiliary_loss_mlp": 0.01187511, + "balance_loss_clip": 1.00298131, + "balance_loss_mlp": 1.00154424, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 3.031648677605352, + "language_loss": 0.83423048, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85804713, + "num_input_tokens_seen": 8859280, + "step": 423, + "time_per_iteration": 2.5700652599334717 + }, + { + "auxiliary_loss_clip": 0.01194145, + "auxiliary_loss_mlp": 0.01186953, + "balance_loss_clip": 1.00305319, + "balance_loss_mlp": 1.00117695, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.6551231613877184, + "language_loss": 0.74252129, + "learning_rate": 3.895134094768415e-06, + "loss": 0.76633227, + "num_input_tokens_seen": 8880560, + "step": 424, + "time_per_iteration": 2.6677489280700684 + }, + { + "auxiliary_loss_clip": 0.01193997, + "auxiliary_loss_mlp": 0.01187406, + "balance_loss_clip": 1.00293541, + "balance_loss_mlp": 1.00182152, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.9830414734292234, + "language_loss": 0.83033317, + "learning_rate": 3.896650826173015e-06, + "loss": 0.8541472, + "num_input_tokens_seen": 8899155, + "step": 425, + "time_per_iteration": 2.520667552947998 + }, + { + "auxiliary_loss_clip": 0.01193894, + "auxiliary_loss_mlp": 0.01186892, + "balance_loss_clip": 1.00278258, + "balance_loss_mlp": 1.00121188, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 3.5677169526561148, + "language_loss": 0.85339618, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87720406, + "num_input_tokens_seen": 8917890, + "step": 426, + "time_per_iteration": 2.561753273010254 + }, + { + "auxiliary_loss_clip": 0.01197605, + "auxiliary_loss_mlp": 0.0118555, + "balance_loss_clip": 1.00696886, + "balance_loss_mlp": 1.00015569, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8924153923027129, + "language_loss": 0.57224411, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59607565, + "num_input_tokens_seen": 8978260, + "step": 427, + "time_per_iteration": 3.216174364089966 + }, + { + "auxiliary_loss_clip": 0.01194281, + "auxiliary_loss_mlp": 0.01187641, + "balance_loss_clip": 1.00310433, + "balance_loss_mlp": 1.00186563, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 3.9014683619585213, + "language_loss": 0.88238323, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90620244, + "num_input_tokens_seen": 8994460, + "step": 428, + "time_per_iteration": 2.6016552448272705 + }, + { + "auxiliary_loss_clip": 0.01193869, + "auxiliary_loss_mlp": 0.01186803, + "balance_loss_clip": 1.00280213, + "balance_loss_mlp": 1.00131285, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.7735578591528138, + "language_loss": 0.85779452, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88160121, + "num_input_tokens_seen": 9016670, + "step": 429, + "time_per_iteration": 2.620094060897827 + }, + { + "auxiliary_loss_clip": 0.01193834, + "auxiliary_loss_mlp": 0.01186946, + "balance_loss_clip": 1.00274456, + "balance_loss_mlp": 1.00145674, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.2973892351244705, + "language_loss": 0.88172007, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90552783, + "num_input_tokens_seen": 9039720, + "step": 430, + "time_per_iteration": 2.688615083694458 + }, + { + "auxiliary_loss_clip": 0.01193883, + "auxiliary_loss_mlp": 0.01186606, + "balance_loss_clip": 1.00296998, + "balance_loss_mlp": 1.00121152, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 2.427240889717663, + "language_loss": 0.84012705, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86393195, + "num_input_tokens_seen": 9059850, + "step": 431, + "time_per_iteration": 2.7940099239349365 + }, + { + "auxiliary_loss_clip": 0.01194051, + "auxiliary_loss_mlp": 0.01186903, + "balance_loss_clip": 1.00290799, + "balance_loss_mlp": 1.00112712, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 3.429685759073978, + "language_loss": 0.86689276, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89070225, + "num_input_tokens_seen": 9077590, + "step": 432, + "time_per_iteration": 2.5527966022491455 + }, + { + "auxiliary_loss_clip": 0.01194098, + "auxiliary_loss_mlp": 0.01186819, + "balance_loss_clip": 1.00298882, + "balance_loss_mlp": 1.00132942, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 3.1723967330286764, + "language_loss": 0.75830328, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78211248, + "num_input_tokens_seen": 9099880, + "step": 433, + "time_per_iteration": 2.680771827697754 + }, + { + "auxiliary_loss_clip": 0.0119388, + "auxiliary_loss_mlp": 0.01186982, + "balance_loss_clip": 1.00281537, + "balance_loss_mlp": 1.00149226, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.0513762183646365, + "language_loss": 0.89772189, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92153049, + "num_input_tokens_seen": 9118620, + "step": 434, + "time_per_iteration": 2.53082013130188 + }, + { + "auxiliary_loss_clip": 0.01193825, + "auxiliary_loss_mlp": 0.01187089, + "balance_loss_clip": 1.00278974, + "balance_loss_mlp": 1.00179029, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 2.455457154470836, + "language_loss": 0.80286086, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82666999, + "num_input_tokens_seen": 9135655, + "step": 435, + "time_per_iteration": 2.5370559692382812 + }, + { + "auxiliary_loss_clip": 0.0119383, + "auxiliary_loss_mlp": 0.01187037, + "balance_loss_clip": 1.00275087, + "balance_loss_mlp": 1.00154757, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.5475380043405047, + "language_loss": 0.86412227, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88793093, + "num_input_tokens_seen": 9153520, + "step": 436, + "time_per_iteration": 3.9276351928710938 + }, + { + "auxiliary_loss_clip": 0.01194005, + "auxiliary_loss_mlp": 0.01187199, + "balance_loss_clip": 1.00295401, + "balance_loss_mlp": 1.00170898, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.499368293246353, + "language_loss": 0.74675858, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77057064, + "num_input_tokens_seen": 9170750, + "step": 437, + "time_per_iteration": 5.2701921463012695 + }, + { + "auxiliary_loss_clip": 0.01193875, + "auxiliary_loss_mlp": 0.01187022, + "balance_loss_clip": 1.00284576, + "balance_loss_mlp": 1.00162756, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.20512673751465, + "language_loss": 0.90732419, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93113321, + "num_input_tokens_seen": 9188430, + "step": 438, + "time_per_iteration": 3.9958479404449463 + }, + { + "auxiliary_loss_clip": 0.01197498, + "auxiliary_loss_mlp": 0.01185574, + "balance_loss_clip": 1.00701714, + "balance_loss_mlp": 1.00017977, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.9224844836368291, + "language_loss": 0.62616551, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64999616, + "num_input_tokens_seen": 9255835, + "step": 439, + "time_per_iteration": 3.2071070671081543 + }, + { + "auxiliary_loss_clip": 0.01194068, + "auxiliary_loss_mlp": 0.01187272, + "balance_loss_clip": 1.0030278, + "balance_loss_mlp": 1.00168717, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 2.1600927357975634, + "language_loss": 0.7550801, + "learning_rate": 3.918983198419573e-06, + "loss": 0.77889353, + "num_input_tokens_seen": 9276835, + "step": 440, + "time_per_iteration": 2.6089389324188232 + }, + { + "auxiliary_loss_clip": 0.01193978, + "auxiliary_loss_mlp": 0.01186721, + "balance_loss_clip": 1.00292552, + "balance_loss_mlp": 1.00123155, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 2.172192457336043, + "language_loss": 0.83119214, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85499913, + "num_input_tokens_seen": 9295075, + "step": 441, + "time_per_iteration": 2.569154739379883 + }, + { + "auxiliary_loss_clip": 0.01193886, + "auxiliary_loss_mlp": 0.01187123, + "balance_loss_clip": 1.00283873, + "balance_loss_mlp": 1.00172877, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.1353637678868727, + "language_loss": 0.78556132, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80937147, + "num_input_tokens_seen": 9314205, + "step": 442, + "time_per_iteration": 2.5640387535095215 + }, + { + "auxiliary_loss_clip": 0.01197322, + "auxiliary_loss_mlp": 0.01184803, + "balance_loss_clip": 1.00698292, + "balance_loss_mlp": 1.00017214, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 1.4934238745406998, + "language_loss": 0.64480788, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66862911, + "num_input_tokens_seen": 9367395, + "step": 443, + "time_per_iteration": 2.97015643119812 + }, + { + "auxiliary_loss_clip": 0.01193893, + "auxiliary_loss_mlp": 0.01187034, + "balance_loss_clip": 1.00293386, + "balance_loss_mlp": 1.00163996, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.3814954175836345, + "language_loss": 0.8263706, + "learning_rate": 3.924809954779425e-06, + "loss": 0.85017991, + "num_input_tokens_seen": 9385185, + "step": 444, + "time_per_iteration": 2.5409438610076904 + }, + { + "auxiliary_loss_clip": 0.01193885, + "auxiliary_loss_mlp": 0.01187048, + "balance_loss_clip": 1.00282168, + "balance_loss_mlp": 1.00146353, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.7255918374764994, + "language_loss": 0.95563853, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97944784, + "num_input_tokens_seen": 9403225, + "step": 445, + "time_per_iteration": 2.560044288635254 + }, + { + "auxiliary_loss_clip": 0.01193849, + "auxiliary_loss_mlp": 0.01187205, + "balance_loss_clip": 1.00292706, + "balance_loss_mlp": 1.00181103, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.4541238627207065, + "language_loss": 0.91483396, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.93864453, + "num_input_tokens_seen": 9420540, + "step": 446, + "time_per_iteration": 2.498408079147339 + }, + { + "auxiliary_loss_clip": 0.01193821, + "auxiliary_loss_mlp": 0.01186815, + "balance_loss_clip": 1.00292015, + "balance_loss_mlp": 1.00142074, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.5029149628574983, + "language_loss": 0.79779136, + "learning_rate": 3.92914567610317e-06, + "loss": 0.8215977, + "num_input_tokens_seen": 9438840, + "step": 447, + "time_per_iteration": 2.528641700744629 + }, + { + "auxiliary_loss_clip": 0.01193991, + "auxiliary_loss_mlp": 0.01186994, + "balance_loss_clip": 1.00296819, + "balance_loss_mlp": 1.00140882, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 3.0616390017300033, + "language_loss": 0.86151874, + "learning_rate": 3.930584452530952e-06, + "loss": 0.88532853, + "num_input_tokens_seen": 9457215, + "step": 448, + "time_per_iteration": 2.5573811531066895 + }, + { + "auxiliary_loss_clip": 0.01193705, + "auxiliary_loss_mlp": 0.01187268, + "balance_loss_clip": 1.00279737, + "balance_loss_mlp": 1.00196862, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 1.99352052769304, + "language_loss": 0.88426995, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.90807974, + "num_input_tokens_seen": 9475615, + "step": 449, + "time_per_iteration": 2.5453851222991943 + }, + { + "auxiliary_loss_clip": 0.01193819, + "auxiliary_loss_mlp": 0.01187037, + "balance_loss_clip": 1.00277185, + "balance_loss_mlp": 1.0016427, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 2.7372501068056114, + "language_loss": 0.80729932, + "learning_rate": 3.933452395729493e-06, + "loss": 0.83110785, + "num_input_tokens_seen": 9493975, + "step": 450, + "time_per_iteration": 2.5354537963867188 + }, + { + "auxiliary_loss_clip": 0.01193926, + "auxiliary_loss_mlp": 0.01187024, + "balance_loss_clip": 1.00304484, + "balance_loss_mlp": 1.00163007, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 1.7588131058330303, + "language_loss": 0.81420726, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83801675, + "num_input_tokens_seen": 9514810, + "step": 451, + "time_per_iteration": 2.5813803672790527 + }, + { + "auxiliary_loss_clip": 0.01193844, + "auxiliary_loss_mlp": 0.01186786, + "balance_loss_clip": 1.00300717, + "balance_loss_mlp": 1.00167799, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.6698582961398418, + "language_loss": 0.76959729, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79340363, + "num_input_tokens_seen": 9533635, + "step": 452, + "time_per_iteration": 2.675184488296509 + }, + { + "auxiliary_loss_clip": 0.01193671, + "auxiliary_loss_mlp": 0.01186933, + "balance_loss_clip": 1.00286722, + "balance_loss_mlp": 1.00163388, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 2.0665698010711524, + "language_loss": 0.73109162, + "learning_rate": 3.937730499067294e-06, + "loss": 0.75489771, + "num_input_tokens_seen": 9555420, + "step": 453, + "time_per_iteration": 2.6448023319244385 + }, + { + "auxiliary_loss_clip": 0.01193677, + "auxiliary_loss_mlp": 0.01186674, + "balance_loss_clip": 1.00284886, + "balance_loss_mlp": 1.00156617, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 2.1409749125357296, + "language_loss": 0.82282251, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84662598, + "num_input_tokens_seen": 9578950, + "step": 454, + "time_per_iteration": 2.7383108139038086 + }, + { + "auxiliary_loss_clip": 0.01193749, + "auxiliary_loss_mlp": 0.01186492, + "balance_loss_clip": 1.00294328, + "balance_loss_mlp": 1.00128865, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.394784484980745, + "language_loss": 0.75538856, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.77919096, + "num_input_tokens_seen": 9598160, + "step": 455, + "time_per_iteration": 2.565021514892578 + }, + { + "auxiliary_loss_clip": 0.0119373, + "auxiliary_loss_mlp": 0.01186698, + "balance_loss_clip": 1.00283313, + "balance_loss_mlp": 1.00158989, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.14870158574918, + "language_loss": 0.80851471, + "learning_rate": 3.941980363893499e-06, + "loss": 0.8323189, + "num_input_tokens_seen": 9616010, + "step": 456, + "time_per_iteration": 2.575594425201416 + }, + { + "auxiliary_loss_clip": 0.01193549, + "auxiliary_loss_mlp": 0.01186512, + "balance_loss_clip": 1.00270641, + "balance_loss_mlp": 1.00140405, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.0283847294817403, + "language_loss": 0.81831038, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.84211105, + "num_input_tokens_seen": 9634000, + "step": 457, + "time_per_iteration": 2.4886155128479004 + }, + { + "auxiliary_loss_clip": 0.01193626, + "auxiliary_loss_mlp": 0.01186948, + "balance_loss_clip": 1.00278676, + "balance_loss_mlp": 1.00174475, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.4783010917415815, + "language_loss": 0.94215846, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96596414, + "num_input_tokens_seen": 9653455, + "step": 458, + "time_per_iteration": 2.58109450340271 + }, + { + "auxiliary_loss_clip": 0.01193668, + "auxiliary_loss_mlp": 0.01186904, + "balance_loss_clip": 1.00285339, + "balance_loss_mlp": 1.0017004, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.64615253142959, + "language_loss": 0.79258919, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81639481, + "num_input_tokens_seen": 9669650, + "step": 459, + "time_per_iteration": 2.5095856189727783 + }, + { + "auxiliary_loss_clip": 0.01193776, + "auxiliary_loss_mlp": 0.01186611, + "balance_loss_clip": 1.00296724, + "balance_loss_mlp": 1.00140703, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 2.213388853818598, + "language_loss": 0.83472693, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85853088, + "num_input_tokens_seen": 9691415, + "step": 460, + "time_per_iteration": 2.6106064319610596 + }, + { + "auxiliary_loss_clip": 0.01197073, + "auxiliary_loss_mlp": 0.01184774, + "balance_loss_clip": 1.00690365, + "balance_loss_mlp": 1.00014257, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.6104160766157167, + "language_loss": 0.73697734, + "learning_rate": 3.949001722282675e-06, + "loss": 0.76079583, + "num_input_tokens_seen": 9755605, + "step": 461, + "time_per_iteration": 3.128793954849243 + }, + { + "auxiliary_loss_clip": 0.01193747, + "auxiliary_loss_mlp": 0.01186818, + "balance_loss_clip": 1.00307083, + "balance_loss_mlp": 1.00161421, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.6009892017741554, + "language_loss": 0.8100704, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83387607, + "num_input_tokens_seen": 9776270, + "step": 462, + "time_per_iteration": 2.61270809173584 + }, + { + "auxiliary_loss_clip": 0.01193636, + "auxiliary_loss_mlp": 0.0118666, + "balance_loss_clip": 1.00287819, + "balance_loss_mlp": 1.00145674, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2322296307795297, + "language_loss": 0.90338618, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92718911, + "num_input_tokens_seen": 9794465, + "step": 463, + "time_per_iteration": 2.5464179515838623 + }, + { + "auxiliary_loss_clip": 0.01196848, + "auxiliary_loss_mlp": 0.01184759, + "balance_loss_clip": 1.00674844, + "balance_loss_mlp": 1.00012803, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8865353905439521, + "language_loss": 0.59062278, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61443889, + "num_input_tokens_seen": 9849685, + "step": 464, + "time_per_iteration": 3.097794771194458 + }, + { + "auxiliary_loss_clip": 0.01193716, + "auxiliary_loss_mlp": 0.01186945, + "balance_loss_clip": 1.00294077, + "balance_loss_mlp": 1.0016458, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 3.2772996777670906, + "language_loss": 0.81395859, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83776516, + "num_input_tokens_seen": 9869505, + "step": 465, + "time_per_iteration": 2.5996170043945312 + }, + { + "auxiliary_loss_clip": 0.01193424, + "auxiliary_loss_mlp": 0.01186432, + "balance_loss_clip": 1.00267899, + "balance_loss_mlp": 1.00141966, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.236187537954738, + "language_loss": 0.78328723, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80708581, + "num_input_tokens_seen": 9890950, + "step": 466, + "time_per_iteration": 2.5733444690704346 + }, + { + "auxiliary_loss_clip": 0.01193502, + "auxiliary_loss_mlp": 0.01186567, + "balance_loss_clip": 1.00282168, + "balance_loss_mlp": 1.00164962, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 1.8163407511807939, + "language_loss": 0.87543941, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89924008, + "num_input_tokens_seen": 9911265, + "step": 467, + "time_per_iteration": 2.559368371963501 + }, + { + "auxiliary_loss_clip": 0.01193566, + "auxiliary_loss_mlp": 0.01186956, + "balance_loss_clip": 1.00286603, + "balance_loss_mlp": 1.00194323, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.334972258999128, + "language_loss": 0.86345017, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88725537, + "num_input_tokens_seen": 9929025, + "step": 468, + "time_per_iteration": 2.5495505332946777 + }, + { + "auxiliary_loss_clip": 0.01193385, + "auxiliary_loss_mlp": 0.01186677, + "balance_loss_clip": 1.00272405, + "balance_loss_mlp": 1.0014739, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 2.1519015360296287, + "language_loss": 0.91622984, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94003057, + "num_input_tokens_seen": 9945190, + "step": 469, + "time_per_iteration": 2.5525739192962646 + }, + { + "auxiliary_loss_clip": 0.0119362, + "auxiliary_loss_mlp": 0.01186475, + "balance_loss_clip": 1.0029366, + "balance_loss_mlp": 1.00155735, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 2.3145234437523667, + "language_loss": 0.81622678, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84002775, + "num_input_tokens_seen": 9962820, + "step": 470, + "time_per_iteration": 2.537224769592285 + }, + { + "auxiliary_loss_clip": 0.01193387, + "auxiliary_loss_mlp": 0.01186987, + "balance_loss_clip": 1.00271893, + "balance_loss_mlp": 1.00168824, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.7613996013476054, + "language_loss": 0.9309352, + "learning_rate": 3.962818822989861e-06, + "loss": 0.95473897, + "num_input_tokens_seen": 9982595, + "step": 471, + "time_per_iteration": 2.5300605297088623 + }, + { + "auxiliary_loss_clip": 0.01193316, + "auxiliary_loss_mlp": 0.01186506, + "balance_loss_clip": 1.00268233, + "balance_loss_mlp": 1.00139809, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 2.400658890552359, + "language_loss": 0.76108539, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78488374, + "num_input_tokens_seen": 10004645, + "step": 472, + "time_per_iteration": 2.6030752658843994 + }, + { + "auxiliary_loss_clip": 0.01193481, + "auxiliary_loss_mlp": 0.01186586, + "balance_loss_clip": 1.00270927, + "balance_loss_mlp": 1.0012871, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 1.8824289272434511, + "language_loss": 0.93283528, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95663601, + "num_input_tokens_seen": 10022555, + "step": 473, + "time_per_iteration": 2.5294957160949707 + }, + { + "auxiliary_loss_clip": 0.01193542, + "auxiliary_loss_mlp": 0.01187535, + "balance_loss_clip": 1.00279498, + "balance_loss_mlp": 1.00233197, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 2.6150060763462046, + "language_loss": 0.89002311, + "learning_rate": 3.96690678709433e-06, + "loss": 0.91383392, + "num_input_tokens_seen": 10041025, + "step": 474, + "time_per_iteration": 3.9971232414245605 + }, + { + "auxiliary_loss_clip": 0.01193346, + "auxiliary_loss_mlp": 0.0118654, + "balance_loss_clip": 1.00277317, + "balance_loss_mlp": 1.00152671, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.1031383314742134, + "language_loss": 0.78711009, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81090891, + "num_input_tokens_seen": 10060775, + "step": 475, + "time_per_iteration": 5.3713884353637695 + }, + { + "auxiliary_loss_clip": 0.01196498, + "auxiliary_loss_mlp": 0.01183962, + "balance_loss_clip": 1.00651157, + "balance_loss_mlp": 1.0000937, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9156852954864154, + "language_loss": 0.66953743, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69334209, + "num_input_tokens_seen": 10120225, + "step": 476, + "time_per_iteration": 4.584715127944946 + }, + { + "auxiliary_loss_clip": 0.01193418, + "auxiliary_loss_mlp": 0.01186502, + "balance_loss_clip": 1.00284171, + "balance_loss_mlp": 1.00139427, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.2362935048639354, + "language_loss": 0.8392176, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86301684, + "num_input_tokens_seen": 10137880, + "step": 477, + "time_per_iteration": 2.5434134006500244 + }, + { + "auxiliary_loss_clip": 0.0119346, + "auxiliary_loss_mlp": 0.01186244, + "balance_loss_clip": 1.00296891, + "balance_loss_mlp": 1.0015173, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.1106178759282774, + "language_loss": 0.82404995, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84784698, + "num_input_tokens_seen": 10156930, + "step": 478, + "time_per_iteration": 2.578498601913452 + }, + { + "auxiliary_loss_clip": 0.01193304, + "auxiliary_loss_mlp": 0.01186638, + "balance_loss_clip": 1.00268531, + "balance_loss_mlp": 1.00162518, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 1.8403203993567874, + "language_loss": 0.81207502, + "learning_rate": 3.973662905576082e-06, + "loss": 0.83587444, + "num_input_tokens_seen": 10176295, + "step": 479, + "time_per_iteration": 2.6189122200012207 + }, + { + "auxiliary_loss_clip": 0.01193296, + "auxiliary_loss_mlp": 0.01186584, + "balance_loss_clip": 1.00272202, + "balance_loss_mlp": 1.0015707, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.879630362730484, + "language_loss": 0.73177463, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75557345, + "num_input_tokens_seen": 10195790, + "step": 480, + "time_per_iteration": 2.5960519313812256 + }, + { + "auxiliary_loss_clip": 0.01193351, + "auxiliary_loss_mlp": 0.01186359, + "balance_loss_clip": 1.00275576, + "balance_loss_mlp": 1.00144148, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.8293772029329487, + "language_loss": 0.87805605, + "learning_rate": 3.976345626888605e-06, + "loss": 0.9018532, + "num_input_tokens_seen": 10218405, + "step": 481, + "time_per_iteration": 2.577554225921631 + }, + { + "auxiliary_loss_clip": 0.01196263, + "auxiliary_loss_mlp": 0.01183927, + "balance_loss_clip": 1.00636148, + "balance_loss_mlp": 1.00005841, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8218829923991628, + "language_loss": 0.6606642, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68446612, + "num_input_tokens_seen": 10271005, + "step": 482, + "time_per_iteration": 2.9033420085906982 + }, + { + "auxiliary_loss_clip": 0.01193449, + "auxiliary_loss_mlp": 0.01186134, + "balance_loss_clip": 1.00289106, + "balance_loss_mlp": 1.00140762, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.860819780295529, + "language_loss": 0.79399097, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81778681, + "num_input_tokens_seen": 10288405, + "step": 483, + "time_per_iteration": 2.513261079788208 + }, + { + "auxiliary_loss_clip": 0.01193349, + "auxiliary_loss_mlp": 0.01186934, + "balance_loss_clip": 1.00283504, + "balance_loss_mlp": 1.00192142, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.6601361624953266, + "language_loss": 0.75746143, + "learning_rate": 3.980348865796749e-06, + "loss": 0.78126431, + "num_input_tokens_seen": 10306875, + "step": 484, + "time_per_iteration": 2.6129069328308105 + }, + { + "auxiliary_loss_clip": 0.01193383, + "auxiliary_loss_mlp": 0.0118649, + "balance_loss_clip": 1.00279117, + "balance_loss_mlp": 1.00128639, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.0705024296179255, + "language_loss": 0.84083211, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86463082, + "num_input_tokens_seen": 10323965, + "step": 485, + "time_per_iteration": 2.547657012939453 + }, + { + "auxiliary_loss_clip": 0.011934, + "auxiliary_loss_mlp": 0.01186537, + "balance_loss_clip": 1.00284219, + "balance_loss_mlp": 1.0016191, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 2.2200275805243193, + "language_loss": 0.84485251, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86865187, + "num_input_tokens_seen": 10342620, + "step": 486, + "time_per_iteration": 2.580270290374756 + }, + { + "auxiliary_loss_clip": 0.01193148, + "auxiliary_loss_mlp": 0.01186658, + "balance_loss_clip": 1.00261283, + "balance_loss_mlp": 1.00183582, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.931513667700736, + "language_loss": 0.88874084, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91253889, + "num_input_tokens_seen": 10364610, + "step": 487, + "time_per_iteration": 2.6019699573516846 + }, + { + "auxiliary_loss_clip": 0.01193514, + "auxiliary_loss_mlp": 0.0118608, + "balance_loss_clip": 1.00301981, + "balance_loss_mlp": 1.00125778, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.538491663570541, + "language_loss": 0.88292241, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90671837, + "num_input_tokens_seen": 10380910, + "step": 488, + "time_per_iteration": 2.522681474685669 + }, + { + "auxiliary_loss_clip": 0.01193172, + "auxiliary_loss_mlp": 0.01186415, + "balance_loss_clip": 1.00269127, + "balance_loss_mlp": 1.00168872, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.4002942776845027, + "language_loss": 0.88731825, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91111422, + "num_input_tokens_seen": 10400665, + "step": 489, + "time_per_iteration": 2.562737464904785 + }, + { + "auxiliary_loss_clip": 0.01193063, + "auxiliary_loss_mlp": 0.01186171, + "balance_loss_clip": 1.00252402, + "balance_loss_mlp": 1.00144434, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 2.9865585085603574, + "language_loss": 0.88228762, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90608001, + "num_input_tokens_seen": 10420150, + "step": 490, + "time_per_iteration": 2.5472981929779053 + }, + { + "auxiliary_loss_clip": 0.01193027, + "auxiliary_loss_mlp": 0.01186956, + "balance_loss_clip": 1.00254416, + "balance_loss_mlp": 1.00194287, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.2560652522152194, + "language_loss": 0.91070259, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93450248, + "num_input_tokens_seen": 10438210, + "step": 491, + "time_per_iteration": 2.501002311706543 + }, + { + "auxiliary_loss_clip": 0.01192999, + "auxiliary_loss_mlp": 0.01186165, + "balance_loss_clip": 1.00257492, + "balance_loss_mlp": 1.00143862, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 10.200485720792308, + "language_loss": 0.85412908, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87792075, + "num_input_tokens_seen": 10455125, + "step": 492, + "time_per_iteration": 2.508326530456543 + }, + { + "auxiliary_loss_clip": 0.01193354, + "auxiliary_loss_mlp": 0.01186499, + "balance_loss_clip": 1.00287342, + "balance_loss_mlp": 1.00177228, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 2.546384065538379, + "language_loss": 0.84014565, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86394417, + "num_input_tokens_seen": 10470990, + "step": 493, + "time_per_iteration": 2.509181261062622 + }, + { + "auxiliary_loss_clip": 0.01193043, + "auxiliary_loss_mlp": 0.01186048, + "balance_loss_clip": 1.00253248, + "balance_loss_mlp": 1.00141668, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.7444742173079981, + "language_loss": 0.86625206, + "learning_rate": 3.99351603600268e-06, + "loss": 0.89004302, + "num_input_tokens_seen": 10490685, + "step": 494, + "time_per_iteration": 2.5651862621307373 + }, + { + "auxiliary_loss_clip": 0.01193201, + "auxiliary_loss_mlp": 0.01186512, + "balance_loss_clip": 1.00271165, + "balance_loss_mlp": 1.00159442, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 2.3137739155469905, + "language_loss": 0.86510563, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88890278, + "num_input_tokens_seen": 10509435, + "step": 495, + "time_per_iteration": 2.523216485977173 + }, + { + "auxiliary_loss_clip": 0.01193078, + "auxiliary_loss_mlp": 0.01186145, + "balance_loss_clip": 1.00263131, + "balance_loss_mlp": 1.00151408, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 3.416595433496731, + "language_loss": 0.62226486, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64605701, + "num_input_tokens_seen": 10530050, + "step": 496, + "time_per_iteration": 2.53601336479187 + }, + { + "auxiliary_loss_clip": 0.01193264, + "auxiliary_loss_mlp": 0.01186685, + "balance_loss_clip": 1.00282192, + "balance_loss_mlp": 1.00167191, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.9595744544578604, + "language_loss": 0.8865751, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91037464, + "num_input_tokens_seen": 10551370, + "step": 497, + "time_per_iteration": 2.599872589111328 + }, + { + "auxiliary_loss_clip": 0.01193278, + "auxiliary_loss_mlp": 0.01186276, + "balance_loss_clip": 1.00283253, + "balance_loss_mlp": 1.00173974, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 2.733788787501999, + "language_loss": 0.84890217, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87269771, + "num_input_tokens_seen": 10569225, + "step": 498, + "time_per_iteration": 2.5545239448547363 + }, + { + "auxiliary_loss_clip": 0.01193126, + "auxiliary_loss_mlp": 0.01186401, + "balance_loss_clip": 1.002707, + "balance_loss_mlp": 1.00157905, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 5.458797081810663, + "language_loss": 0.77948713, + "learning_rate": 4e-06, + "loss": 0.80328238, + "num_input_tokens_seen": 10586170, + "step": 499, + "time_per_iteration": 2.5197267532348633 + }, + { + "auxiliary_loss_clip": 0.01193207, + "auxiliary_loss_mlp": 0.01186648, + "balance_loss_clip": 1.00273156, + "balance_loss_mlp": 1.00182581, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 3.36066426224744, + "language_loss": 0.82563061, + "learning_rate": 3.9999999620799e-06, + "loss": 0.84942913, + "num_input_tokens_seen": 10606205, + "step": 500, + "time_per_iteration": 2.5526328086853027 + }, + { + "auxiliary_loss_clip": 0.01192996, + "auxiliary_loss_mlp": 0.01186363, + "balance_loss_clip": 1.0025897, + "balance_loss_mlp": 1.00163674, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.7221802982149415, + "language_loss": 0.87910855, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90290213, + "num_input_tokens_seen": 10625995, + "step": 501, + "time_per_iteration": 2.56299090385437 + }, + { + "auxiliary_loss_clip": 0.01193278, + "auxiliary_loss_mlp": 0.01186463, + "balance_loss_clip": 1.00273061, + "balance_loss_mlp": 1.00154555, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.694405179589237, + "language_loss": 0.86497867, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88877606, + "num_input_tokens_seen": 10644105, + "step": 502, + "time_per_iteration": 2.520559787750244 + }, + { + "auxiliary_loss_clip": 0.01193309, + "auxiliary_loss_mlp": 0.01186112, + "balance_loss_clip": 1.00286937, + "balance_loss_mlp": 1.00138497, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.893422816070595, + "language_loss": 0.84788322, + "learning_rate": 3.999999393278425e-06, + "loss": 0.8716774, + "num_input_tokens_seen": 10661090, + "step": 503, + "time_per_iteration": 2.491425037384033 + }, + { + "auxiliary_loss_clip": 0.01193189, + "auxiliary_loss_mlp": 0.01186243, + "balance_loss_clip": 1.00283933, + "balance_loss_mlp": 1.00180256, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 2.3415558585699205, + "language_loss": 0.88169324, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90548754, + "num_input_tokens_seen": 10682380, + "step": 504, + "time_per_iteration": 2.5751235485076904 + }, + { + "auxiliary_loss_clip": 0.01193109, + "auxiliary_loss_mlp": 0.01186349, + "balance_loss_clip": 1.00267792, + "balance_loss_mlp": 1.00171757, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 3.498300586728837, + "language_loss": 0.78328407, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80707866, + "num_input_tokens_seen": 10699925, + "step": 505, + "time_per_iteration": 2.510127544403076 + }, + { + "auxiliary_loss_clip": 0.01195852, + "auxiliary_loss_mlp": 0.01184336, + "balance_loss_clip": 1.00614858, + "balance_loss_mlp": 1.00046778, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8475150984637635, + "language_loss": 0.54995072, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57375264, + "num_input_tokens_seen": 10766525, + "step": 506, + "time_per_iteration": 3.2936017513275146 + }, + { + "auxiliary_loss_clip": 0.01192991, + "auxiliary_loss_mlp": 0.01186351, + "balance_loss_clip": 1.00257576, + "balance_loss_mlp": 1.00181508, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 2.1809437626805614, + "language_loss": 0.83226752, + "learning_rate": 3.999997573114069e-06, + "loss": 0.85606092, + "num_input_tokens_seen": 10786725, + "step": 507, + "time_per_iteration": 2.526658535003662 + }, + { + "auxiliary_loss_clip": 0.01193202, + "auxiliary_loss_mlp": 0.01186229, + "balance_loss_clip": 1.0027355, + "balance_loss_mlp": 1.00150251, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.4099464269758686, + "language_loss": 0.88853264, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91232705, + "num_input_tokens_seen": 10805390, + "step": 508, + "time_per_iteration": 2.5282938480377197 + }, + { + "auxiliary_loss_clip": 0.01193208, + "auxiliary_loss_mlp": 0.01185867, + "balance_loss_clip": 1.00278699, + "balance_loss_mlp": 1.00114012, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.382291876616468, + "language_loss": 0.71508062, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73887134, + "num_input_tokens_seen": 10828030, + "step": 509, + "time_per_iteration": 2.642592191696167 + }, + { + "auxiliary_loss_clip": 0.01193089, + "auxiliary_loss_mlp": 0.01186083, + "balance_loss_clip": 1.0027287, + "balance_loss_mlp": 1.00173759, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.1062243962465863, + "language_loss": 0.82295871, + "learning_rate": 3.999995411669614e-06, + "loss": 0.84675038, + "num_input_tokens_seen": 10845240, + "step": 510, + "time_per_iteration": 2.5750370025634766 + }, + { + "auxiliary_loss_clip": 0.0119311, + "auxiliary_loss_mlp": 0.01185979, + "balance_loss_clip": 1.00274098, + "balance_loss_mlp": 1.00144279, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 1.9494828302600875, + "language_loss": 0.8353411, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85913193, + "num_input_tokens_seen": 10864325, + "step": 511, + "time_per_iteration": 2.5442581176757812 + }, + { + "auxiliary_loss_clip": 0.01193109, + "auxiliary_loss_mlp": 0.01186167, + "balance_loss_clip": 1.00264597, + "balance_loss_mlp": 1.0016315, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.3094745663103016, + "language_loss": 0.81973618, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.84352893, + "num_input_tokens_seen": 10883860, + "step": 512, + "time_per_iteration": 3.9721477031707764 + }, + { + "auxiliary_loss_clip": 0.01192841, + "auxiliary_loss_mlp": 0.01186284, + "balance_loss_clip": 1.00247383, + "balance_loss_mlp": 1.00174761, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 2.0522493022694746, + "language_loss": 0.86944115, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89323235, + "num_input_tokens_seen": 10904555, + "step": 513, + "time_per_iteration": 3.9568331241607666 + }, + { + "auxiliary_loss_clip": 0.01193161, + "auxiliary_loss_mlp": 0.01185989, + "balance_loss_clip": 1.00272083, + "balance_loss_mlp": 1.00145328, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 1.9244136974095458, + "language_loss": 0.79051995, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81431144, + "num_input_tokens_seen": 10923700, + "step": 514, + "time_per_iteration": 5.344879627227783 + }, + { + "auxiliary_loss_clip": 0.01193113, + "auxiliary_loss_mlp": 0.01185652, + "balance_loss_clip": 1.00271058, + "balance_loss_mlp": 1.00130665, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.8188459676574786, + "language_loss": 0.77180874, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79559642, + "num_input_tokens_seen": 10942730, + "step": 515, + "time_per_iteration": 2.536494016647339 + }, + { + "auxiliary_loss_clip": 0.01192893, + "auxiliary_loss_mlp": 0.01185652, + "balance_loss_clip": 1.00251448, + "balance_loss_mlp": 1.00121176, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 2.4166318569640617, + "language_loss": 0.82658553, + "learning_rate": 3.999989041101011e-06, + "loss": 0.850371, + "num_input_tokens_seen": 10967120, + "step": 516, + "time_per_iteration": 2.731454372406006 + }, + { + "auxiliary_loss_clip": 0.01193012, + "auxiliary_loss_mlp": 0.01185875, + "balance_loss_clip": 1.00270081, + "balance_loss_mlp": 1.00152993, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 1.9127037908631692, + "language_loss": 0.78766406, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81145298, + "num_input_tokens_seen": 10986775, + "step": 517, + "time_per_iteration": 2.53739857673645 + }, + { + "auxiliary_loss_clip": 0.01193099, + "auxiliary_loss_mlp": 0.01186118, + "balance_loss_clip": 1.00272942, + "balance_loss_mlp": 1.00167704, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.5413285965697028, + "language_loss": 0.9051494, + "learning_rate": 3.999986310859396e-06, + "loss": 0.92894155, + "num_input_tokens_seen": 11011360, + "step": 518, + "time_per_iteration": 2.6084916591644287 + }, + { + "auxiliary_loss_clip": 0.01193284, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_clip": 1.00299406, + "balance_loss_mlp": 1.00201869, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 2.1999772350360387, + "language_loss": 0.8648088, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88860339, + "num_input_tokens_seen": 11030150, + "step": 519, + "time_per_iteration": 2.592416763305664 + }, + { + "auxiliary_loss_clip": 0.01192932, + "auxiliary_loss_mlp": 0.01186121, + "balance_loss_clip": 1.00261569, + "balance_loss_mlp": 1.00177562, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.196471322460866, + "language_loss": 0.87013775, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89392829, + "num_input_tokens_seen": 11049145, + "step": 520, + "time_per_iteration": 2.6149020195007324 + }, + { + "auxiliary_loss_clip": 0.01193101, + "auxiliary_loss_mlp": 0.01186434, + "balance_loss_clip": 1.00275123, + "balance_loss_mlp": 1.00180292, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.7941910077203362, + "language_loss": 0.89229381, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91608918, + "num_input_tokens_seen": 11068835, + "step": 521, + "time_per_iteration": 2.568763256072998 + }, + { + "auxiliary_loss_clip": 0.01193169, + "auxiliary_loss_mlp": 0.01186049, + "balance_loss_clip": 1.00285411, + "balance_loss_mlp": 1.00160861, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 6.563012702280141, + "language_loss": 0.71095353, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73474574, + "num_input_tokens_seen": 11088980, + "step": 522, + "time_per_iteration": 2.5862877368927 + }, + { + "auxiliary_loss_clip": 0.01193122, + "auxiliary_loss_mlp": 0.01186099, + "balance_loss_clip": 1.00273538, + "balance_loss_mlp": 1.00165892, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.2595664520659513, + "language_loss": 0.85702753, + "learning_rate": 3.999978158061963e-06, + "loss": 0.8808198, + "num_input_tokens_seen": 11104300, + "step": 523, + "time_per_iteration": 2.5079257488250732 + }, + { + "auxiliary_loss_clip": 0.01193007, + "auxiliary_loss_mlp": 0.01185777, + "balance_loss_clip": 1.00260448, + "balance_loss_mlp": 1.00133681, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 2.4464524624580637, + "language_loss": 0.90620047, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92998832, + "num_input_tokens_seen": 11123335, + "step": 524, + "time_per_iteration": 2.553341865539551 + }, + { + "auxiliary_loss_clip": 0.01193335, + "auxiliary_loss_mlp": 0.01185905, + "balance_loss_clip": 1.00297618, + "balance_loss_mlp": 1.0015595, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.6083586387442956, + "language_loss": 0.80308056, + "learning_rate": 3.999974366066933e-06, + "loss": 0.826873, + "num_input_tokens_seen": 11140880, + "step": 525, + "time_per_iteration": 2.5596377849578857 + }, + { + "auxiliary_loss_clip": 0.01193121, + "auxiliary_loss_mlp": 0.01186231, + "balance_loss_clip": 1.00278533, + "balance_loss_mlp": 1.00179029, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 3.367669276089353, + "language_loss": 0.80524993, + "learning_rate": 3.999972356310538e-06, + "loss": 0.82904351, + "num_input_tokens_seen": 11158710, + "step": 526, + "time_per_iteration": 2.513422727584839 + }, + { + "auxiliary_loss_clip": 0.01193192, + "auxiliary_loss_mlp": 0.01185936, + "balance_loss_clip": 1.00278234, + "balance_loss_mlp": 1.00149536, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 3.285566915057063, + "language_loss": 0.81388879, + "learning_rate": 3.999970270714991e-06, + "loss": 0.8376801, + "num_input_tokens_seen": 11177550, + "step": 527, + "time_per_iteration": 2.5244927406311035 + }, + { + "auxiliary_loss_clip": 0.01192952, + "auxiliary_loss_mlp": 0.01185791, + "balance_loss_clip": 1.00265193, + "balance_loss_mlp": 1.0015409, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.106686398576065, + "language_loss": 0.93968695, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96347439, + "num_input_tokens_seen": 11196230, + "step": 528, + "time_per_iteration": 2.517488718032837 + }, + { + "auxiliary_loss_clip": 0.011929, + "auxiliary_loss_mlp": 0.01186029, + "balance_loss_clip": 1.00260305, + "balance_loss_mlp": 1.00149286, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8504213274079393, + "language_loss": 0.84100759, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86479694, + "num_input_tokens_seen": 11214935, + "step": 529, + "time_per_iteration": 2.5365381240844727 + }, + { + "auxiliary_loss_clip": 0.011932, + "auxiliary_loss_mlp": 0.01185933, + "balance_loss_clip": 1.00295281, + "balance_loss_mlp": 1.00177872, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 1.8128250462919078, + "language_loss": 0.90161419, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92540556, + "num_input_tokens_seen": 11235310, + "step": 530, + "time_per_iteration": 2.563204288482666 + }, + { + "auxiliary_loss_clip": 0.01192831, + "auxiliary_loss_mlp": 0.01185627, + "balance_loss_clip": 1.00254464, + "balance_loss_mlp": 1.00137663, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.0032474138067555, + "language_loss": 0.76283622, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78662074, + "num_input_tokens_seen": 11254425, + "step": 531, + "time_per_iteration": 2.523595094680786 + }, + { + "auxiliary_loss_clip": 0.01192846, + "auxiliary_loss_mlp": 0.01185372, + "balance_loss_clip": 1.00260735, + "balance_loss_mlp": 1.001122, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 1.9755074848089331, + "language_loss": 0.90515995, + "learning_rate": 3.999958705152843e-06, + "loss": 0.9289422, + "num_input_tokens_seen": 11274595, + "step": 532, + "time_per_iteration": 2.585439682006836 + }, + { + "auxiliary_loss_clip": 0.01195851, + "auxiliary_loss_mlp": 0.011837, + "balance_loss_clip": 1.00626075, + "balance_loss_mlp": 1.0005945, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7335005371719829, + "language_loss": 0.57953042, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60332596, + "num_input_tokens_seen": 11336705, + "step": 533, + "time_per_iteration": 3.188664197921753 + }, + { + "auxiliary_loss_clip": 0.01192904, + "auxiliary_loss_mlp": 0.01185985, + "balance_loss_clip": 1.00267601, + "balance_loss_mlp": 1.00173497, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.6519356177713542, + "language_loss": 0.86472809, + "learning_rate": 3.999953548056907e-06, + "loss": 0.88851702, + "num_input_tokens_seen": 11356820, + "step": 534, + "time_per_iteration": 2.6178741455078125 + }, + { + "auxiliary_loss_clip": 0.01193049, + "auxiliary_loss_mlp": 0.01185087, + "balance_loss_clip": 1.00277519, + "balance_loss_mlp": 1.0010277, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.235238622745417, + "language_loss": 0.77107406, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79485542, + "num_input_tokens_seen": 11376645, + "step": 535, + "time_per_iteration": 2.5720317363739014 + }, + { + "auxiliary_loss_clip": 0.01192854, + "auxiliary_loss_mlp": 0.01186005, + "balance_loss_clip": 1.00265741, + "balance_loss_mlp": 1.00175524, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.133226207945335, + "language_loss": 0.80421984, + "learning_rate": 3.999948087607219e-06, + "loss": 0.82800841, + "num_input_tokens_seen": 11397310, + "step": 536, + "time_per_iteration": 2.5899569988250732 + }, + { + "auxiliary_loss_clip": 0.01192909, + "auxiliary_loss_mlp": 0.01185873, + "balance_loss_clip": 1.00274658, + "balance_loss_mlp": 1.00133741, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 1.885834653313583, + "language_loss": 0.70269924, + "learning_rate": 3.999945243624975e-06, + "loss": 0.7264871, + "num_input_tokens_seen": 11418475, + "step": 537, + "time_per_iteration": 2.619413137435913 + }, + { + "auxiliary_loss_clip": 0.01192977, + "auxiliary_loss_mlp": 0.01185876, + "balance_loss_clip": 1.00278068, + "balance_loss_mlp": 1.00181651, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.1603677524404334, + "language_loss": 0.83004045, + "learning_rate": 3.999942323804607e-06, + "loss": 0.85382903, + "num_input_tokens_seen": 11436630, + "step": 538, + "time_per_iteration": 2.5468568801879883 + }, + { + "auxiliary_loss_clip": 0.01193047, + "auxiliary_loss_mlp": 0.01185906, + "balance_loss_clip": 1.00273943, + "balance_loss_mlp": 1.00156081, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 2.0131223903669597, + "language_loss": 0.78900599, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81279546, + "num_input_tokens_seen": 11457275, + "step": 539, + "time_per_iteration": 2.5676889419555664 + }, + { + "auxiliary_loss_clip": 0.01192756, + "auxiliary_loss_mlp": 0.01185496, + "balance_loss_clip": 1.00263214, + "balance_loss_mlp": 1.00124621, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.430000057392391, + "language_loss": 0.7745775, + "learning_rate": 3.999936256649943e-06, + "loss": 0.79835999, + "num_input_tokens_seen": 11476925, + "step": 540, + "time_per_iteration": 2.6003451347351074 + }, + { + "auxiliary_loss_clip": 0.01193085, + "auxiliary_loss_mlp": 0.01186151, + "balance_loss_clip": 1.00286603, + "balance_loss_mlp": 1.0018059, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 4.555438120438963, + "language_loss": 0.85362464, + "learning_rate": 3.999933109315878e-06, + "loss": 0.87741709, + "num_input_tokens_seen": 11496830, + "step": 541, + "time_per_iteration": 2.5483593940734863 + }, + { + "auxiliary_loss_clip": 0.01192912, + "auxiliary_loss_mlp": 0.01185464, + "balance_loss_clip": 1.00278294, + "balance_loss_mlp": 1.00140524, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.2828312756226103, + "language_loss": 0.8912698, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91505349, + "num_input_tokens_seen": 11515605, + "step": 542, + "time_per_iteration": 2.5370588302612305 + }, + { + "auxiliary_loss_clip": 0.01192892, + "auxiliary_loss_mlp": 0.01185976, + "balance_loss_clip": 1.00267982, + "balance_loss_mlp": 1.00182104, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.0428560353183083, + "language_loss": 0.70715034, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73093903, + "num_input_tokens_seen": 11536230, + "step": 543, + "time_per_iteration": 2.5401360988616943 + }, + { + "auxiliary_loss_clip": 0.01192757, + "auxiliary_loss_mlp": 0.01185762, + "balance_loss_clip": 1.0026027, + "balance_loss_mlp": 1.00160778, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 3.032935768662334, + "language_loss": 0.91871536, + "learning_rate": 3.999923212288192e-06, + "loss": 0.94250059, + "num_input_tokens_seen": 11554715, + "step": 544, + "time_per_iteration": 2.555969715118408 + }, + { + "auxiliary_loss_clip": 0.01192887, + "auxiliary_loss_mlp": 0.01185766, + "balance_loss_clip": 1.00273848, + "balance_loss_mlp": 1.00161147, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 2.9927024295728777, + "language_loss": 0.66603255, + "learning_rate": 3.999919761604216e-06, + "loss": 0.6898191, + "num_input_tokens_seen": 11571370, + "step": 545, + "time_per_iteration": 2.5043914318084717 + }, + { + "auxiliary_loss_clip": 0.01192817, + "auxiliary_loss_mlp": 0.01185541, + "balance_loss_clip": 1.00261378, + "balance_loss_mlp": 1.00119543, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 3.095515135815454, + "language_loss": 0.9213171, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94510072, + "num_input_tokens_seen": 11588560, + "step": 546, + "time_per_iteration": 2.542353391647339 + }, + { + "auxiliary_loss_clip": 0.01192739, + "auxiliary_loss_mlp": 0.01185405, + "balance_loss_clip": 1.00258374, + "balance_loss_mlp": 1.00134563, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 3.1361624989038988, + "language_loss": 0.81836569, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84214711, + "num_input_tokens_seen": 11605685, + "step": 547, + "time_per_iteration": 2.5133256912231445 + }, + { + "auxiliary_loss_clip": 0.01192863, + "auxiliary_loss_mlp": 0.0118558, + "balance_loss_clip": 1.00264359, + "balance_loss_mlp": 1.00133014, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 2.3384076943026875, + "language_loss": 0.80859697, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83238143, + "num_input_tokens_seen": 11626290, + "step": 548, + "time_per_iteration": 2.5292890071868896 + }, + { + "auxiliary_loss_clip": 0.01192767, + "auxiliary_loss_mlp": 0.01185883, + "balance_loss_clip": 1.00260091, + "balance_loss_mlp": 1.00172877, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 2.8783776740242746, + "language_loss": 0.67425412, + "learning_rate": 3.999905200498087e-06, + "loss": 0.6980406, + "num_input_tokens_seen": 11643950, + "step": 549, + "time_per_iteration": 2.502318859100342 + }, + { + "auxiliary_loss_clip": 0.01192761, + "auxiliary_loss_mlp": 0.01185348, + "balance_loss_clip": 1.00269735, + "balance_loss_mlp": 1.00147963, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.0659967969447623, + "language_loss": 0.86119914, + "learning_rate": 3.999901370629689e-06, + "loss": 0.8849802, + "num_input_tokens_seen": 11662560, + "step": 550, + "time_per_iteration": 2.488830327987671 + }, + { + "auxiliary_loss_clip": 0.01192989, + "auxiliary_loss_mlp": 0.01185792, + "balance_loss_clip": 1.00292015, + "balance_loss_mlp": 1.00154257, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 3.7579483673653806, + "language_loss": 0.81252825, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83631611, + "num_input_tokens_seen": 11682265, + "step": 551, + "time_per_iteration": 3.8913636207580566 + }, + { + "auxiliary_loss_clip": 0.0119288, + "auxiliary_loss_mlp": 0.01185909, + "balance_loss_clip": 1.00267458, + "balance_loss_mlp": 1.00156415, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 8.380628119552089, + "language_loss": 0.86248982, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88627774, + "num_input_tokens_seen": 11699300, + "step": 552, + "time_per_iteration": 5.300330400466919 + }, + { + "auxiliary_loss_clip": 0.01192825, + "auxiliary_loss_mlp": 0.0118559, + "balance_loss_clip": 1.0027262, + "balance_loss_mlp": 1.00153112, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.4186752433386443, + "language_loss": 0.93307328, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95685738, + "num_input_tokens_seen": 11716955, + "step": 553, + "time_per_iteration": 3.959068536758423 + }, + { + "auxiliary_loss_clip": 0.01192824, + "auxiliary_loss_mlp": 0.01185469, + "balance_loss_clip": 1.00271821, + "balance_loss_mlp": 1.00140977, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.306733642864821, + "language_loss": 0.78932422, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81310713, + "num_input_tokens_seen": 11736130, + "step": 554, + "time_per_iteration": 2.596529483795166 + }, + { + "auxiliary_loss_clip": 0.01192825, + "auxiliary_loss_mlp": 0.0118571, + "balance_loss_clip": 1.00278461, + "balance_loss_mlp": 1.00174642, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.184737339736418, + "language_loss": 0.82041419, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84419948, + "num_input_tokens_seen": 11754425, + "step": 555, + "time_per_iteration": 2.5422959327697754 + }, + { + "auxiliary_loss_clip": 0.01192742, + "auxiliary_loss_mlp": 0.01185176, + "balance_loss_clip": 1.00263035, + "balance_loss_mlp": 1.00130785, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 3.8688779870633194, + "language_loss": 0.887043, + "learning_rate": 3.999876798858914e-06, + "loss": 0.91082215, + "num_input_tokens_seen": 11772845, + "step": 556, + "time_per_iteration": 2.571066379547119 + }, + { + "auxiliary_loss_clip": 0.01192797, + "auxiliary_loss_mlp": 0.01185528, + "balance_loss_clip": 1.00273371, + "balance_loss_mlp": 1.00146842, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.4724745051942394, + "language_loss": 0.8368814, + "learning_rate": 3.999872438138503e-06, + "loss": 0.86066467, + "num_input_tokens_seen": 11792850, + "step": 557, + "time_per_iteration": 2.5547072887420654 + }, + { + "auxiliary_loss_clip": 0.01192799, + "auxiliary_loss_mlp": 0.01185408, + "balance_loss_clip": 1.00270319, + "balance_loss_mlp": 1.00134873, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 3.2899379685735246, + "language_loss": 0.94327098, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96705306, + "num_input_tokens_seen": 11809670, + "step": 558, + "time_per_iteration": 2.5198049545288086 + }, + { + "auxiliary_loss_clip": 0.01192608, + "auxiliary_loss_mlp": 0.01185406, + "balance_loss_clip": 1.0026083, + "balance_loss_mlp": 1.00144207, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 3.3269540218391684, + "language_loss": 0.77145231, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79523247, + "num_input_tokens_seen": 11829665, + "step": 559, + "time_per_iteration": 2.533381223678589 + }, + { + "auxiliary_loss_clip": 0.01192708, + "auxiliary_loss_mlp": 0.01185561, + "balance_loss_clip": 1.00264776, + "balance_loss_mlp": 1.00178826, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.2414931629721586, + "language_loss": 0.87452978, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.89831245, + "num_input_tokens_seen": 11848190, + "step": 560, + "time_per_iteration": 2.54642915725708 + }, + { + "auxiliary_loss_clip": 0.01192946, + "auxiliary_loss_mlp": 0.01185216, + "balance_loss_clip": 1.00287461, + "balance_loss_mlp": 1.00125241, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.6687714836051644, + "language_loss": 0.81706929, + "learning_rate": 3.999854236904925e-06, + "loss": 0.84085095, + "num_input_tokens_seen": 11864795, + "step": 561, + "time_per_iteration": 2.5341145992279053 + }, + { + "auxiliary_loss_clip": 0.01192692, + "auxiliary_loss_mlp": 0.0118553, + "balance_loss_clip": 1.00267446, + "balance_loss_mlp": 1.0014708, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.7650815939207651, + "language_loss": 0.82284611, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84662837, + "num_input_tokens_seen": 11885275, + "step": 562, + "time_per_iteration": 2.6075327396392822 + }, + { + "auxiliary_loss_clip": 0.01192836, + "auxiliary_loss_mlp": 0.01185414, + "balance_loss_clip": 1.00275779, + "balance_loss_mlp": 1.00164127, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 1.9305082538595388, + "language_loss": 0.84411812, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86790061, + "num_input_tokens_seen": 11903595, + "step": 563, + "time_per_iteration": 2.508358955383301 + }, + { + "auxiliary_loss_clip": 0.01192882, + "auxiliary_loss_mlp": 0.01185423, + "balance_loss_clip": 1.00284457, + "balance_loss_mlp": 1.00145912, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.1860875538301583, + "language_loss": 0.94077408, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96455705, + "num_input_tokens_seen": 11917815, + "step": 564, + "time_per_iteration": 2.501941680908203 + }, + { + "auxiliary_loss_clip": 0.01192695, + "auxiliary_loss_mlp": 0.01185645, + "balance_loss_clip": 1.00260496, + "balance_loss_mlp": 1.00149083, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 2.5424814452589937, + "language_loss": 0.93994224, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96372569, + "num_input_tokens_seen": 11936305, + "step": 565, + "time_per_iteration": 2.5144424438476562 + }, + { + "auxiliary_loss_clip": 0.0119551, + "auxiliary_loss_mlp": 0.01183342, + "balance_loss_clip": 1.00618792, + "balance_loss_mlp": 1.00023651, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.082104664698442, + "language_loss": 0.54856396, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57235247, + "num_input_tokens_seen": 11998940, + "step": 566, + "time_per_iteration": 3.2046613693237305 + }, + { + "auxiliary_loss_clip": 0.01192729, + "auxiliary_loss_mlp": 0.0118531, + "balance_loss_clip": 1.00267899, + "balance_loss_mlp": 1.00134635, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.2835007212414498, + "language_loss": 0.76641434, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79019475, + "num_input_tokens_seen": 12018860, + "step": 567, + "time_per_iteration": 2.573190927505493 + }, + { + "auxiliary_loss_clip": 0.01192632, + "auxiliary_loss_mlp": 0.01185343, + "balance_loss_clip": 1.0027591, + "balance_loss_mlp": 1.00156987, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 2.1288888163222506, + "language_loss": 0.80614513, + "learning_rate": 3.999819465118447e-06, + "loss": 0.82992494, + "num_input_tokens_seen": 12039675, + "step": 568, + "time_per_iteration": 2.5926990509033203 + }, + { + "auxiliary_loss_clip": 0.01192682, + "auxiliary_loss_mlp": 0.01185376, + "balance_loss_clip": 1.0026871, + "balance_loss_mlp": 1.00160289, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.7962017414675517, + "language_loss": 0.86428773, + "learning_rate": 3.999814194385413e-06, + "loss": 0.88806832, + "num_input_tokens_seen": 12057680, + "step": 569, + "time_per_iteration": 2.5451107025146484 + }, + { + "auxiliary_loss_clip": 0.01192686, + "auxiliary_loss_mlp": 0.01185329, + "balance_loss_clip": 1.00268006, + "balance_loss_mlp": 1.0015564, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.8097175296104027, + "language_loss": 0.96109813, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.9848783, + "num_input_tokens_seen": 12076135, + "step": 570, + "time_per_iteration": 2.5360372066497803 + }, + { + "auxiliary_loss_clip": 0.0119254, + "auxiliary_loss_mlp": 0.01185434, + "balance_loss_clip": 1.00254512, + "balance_loss_mlp": 1.00166059, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.2445336717807205, + "language_loss": 0.8009783, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.82475799, + "num_input_tokens_seen": 12094785, + "step": 571, + "time_per_iteration": 2.574594497680664 + }, + { + "auxiliary_loss_clip": 0.01192681, + "auxiliary_loss_mlp": 0.0118532, + "balance_loss_clip": 1.00278163, + "balance_loss_mlp": 1.00154662, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.858119852674192, + "language_loss": 0.80265403, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82643408, + "num_input_tokens_seen": 12114590, + "step": 572, + "time_per_iteration": 2.580610990524292 + }, + { + "auxiliary_loss_clip": 0.01192853, + "auxiliary_loss_mlp": 0.01185399, + "balance_loss_clip": 1.00287914, + "balance_loss_mlp": 1.00153065, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 3.52032982713047, + "language_loss": 0.8469258, + "learning_rate": 3.999792353123774e-06, + "loss": 0.87070829, + "num_input_tokens_seen": 12132390, + "step": 573, + "time_per_iteration": 2.503916025161743 + }, + { + "auxiliary_loss_clip": 0.0119262, + "auxiliary_loss_mlp": 0.01185141, + "balance_loss_clip": 1.00260711, + "balance_loss_mlp": 1.0012722, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 2.3246468159915077, + "language_loss": 0.7650373, + "learning_rate": 3.999786703227023e-06, + "loss": 0.7888149, + "num_input_tokens_seen": 12149035, + "step": 574, + "time_per_iteration": 2.495530128479004 + }, + { + "auxiliary_loss_clip": 0.01192631, + "auxiliary_loss_mlp": 0.01184945, + "balance_loss_clip": 1.0027293, + "balance_loss_mlp": 1.00145817, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 3.3342607341132466, + "language_loss": 0.83506274, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.85883844, + "num_input_tokens_seen": 12167530, + "step": 575, + "time_per_iteration": 2.531052589416504 + }, + { + "auxiliary_loss_clip": 0.01192655, + "auxiliary_loss_mlp": 0.01185282, + "balance_loss_clip": 1.00282311, + "balance_loss_mlp": 1.00169969, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.599782378636754, + "language_loss": 0.84059894, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86437827, + "num_input_tokens_seen": 12186340, + "step": 576, + "time_per_iteration": 2.5363762378692627 + }, + { + "auxiliary_loss_clip": 0.01192758, + "auxiliary_loss_mlp": 0.01185325, + "balance_loss_clip": 1.0029577, + "balance_loss_mlp": 1.00164771, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.9107728190034443, + "language_loss": 0.86340702, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88718784, + "num_input_tokens_seen": 12204090, + "step": 577, + "time_per_iteration": 2.5731775760650635 + }, + { + "auxiliary_loss_clip": 0.0119268, + "auxiliary_loss_mlp": 0.01184949, + "balance_loss_clip": 1.00284719, + "balance_loss_mlp": 1.00127113, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 2.327296411922007, + "language_loss": 0.72039902, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74417531, + "num_input_tokens_seen": 12224850, + "step": 578, + "time_per_iteration": 2.5584073066711426 + }, + { + "auxiliary_loss_clip": 0.01192609, + "auxiliary_loss_mlp": 0.01185417, + "balance_loss_clip": 1.00270414, + "balance_loss_mlp": 1.00154805, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.9010308287810118, + "language_loss": 0.77803868, + "learning_rate": 3.999757316265973e-06, + "loss": 0.80181897, + "num_input_tokens_seen": 12244935, + "step": 579, + "time_per_iteration": 2.5642476081848145 + }, + { + "auxiliary_loss_clip": 0.01192629, + "auxiliary_loss_mlp": 0.01185027, + "balance_loss_clip": 1.00271046, + "balance_loss_mlp": 1.00154018, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 1.9065256052950326, + "language_loss": 0.86660933, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89038587, + "num_input_tokens_seen": 12262140, + "step": 580, + "time_per_iteration": 2.5445759296417236 + }, + { + "auxiliary_loss_clip": 0.01192649, + "auxiliary_loss_mlp": 0.0118507, + "balance_loss_clip": 1.00277996, + "balance_loss_mlp": 1.00129676, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 2.5795704835649493, + "language_loss": 0.82089055, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84466779, + "num_input_tokens_seen": 12280930, + "step": 581, + "time_per_iteration": 2.513507604598999 + }, + { + "auxiliary_loss_clip": 0.01192718, + "auxiliary_loss_mlp": 0.01185147, + "balance_loss_clip": 1.0028578, + "balance_loss_mlp": 1.00146949, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.44181404643169, + "language_loss": 0.7714839, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79526258, + "num_input_tokens_seen": 12299125, + "step": 582, + "time_per_iteration": 2.50319766998291 + }, + { + "auxiliary_loss_clip": 0.01192546, + "auxiliary_loss_mlp": 0.01185019, + "balance_loss_clip": 1.00277495, + "balance_loss_mlp": 1.00172281, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 1.9915101033364353, + "language_loss": 0.87316257, + "learning_rate": 3.999732441737877e-06, + "loss": 0.8969382, + "num_input_tokens_seen": 12316905, + "step": 583, + "time_per_iteration": 2.5293827056884766 + }, + { + "auxiliary_loss_clip": 0.01192688, + "auxiliary_loss_mlp": 0.01185358, + "balance_loss_clip": 1.00280416, + "balance_loss_mlp": 1.00177526, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.469752549534102, + "language_loss": 0.81105828, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83483869, + "num_input_tokens_seen": 12335070, + "step": 584, + "time_per_iteration": 2.5597550868988037 + }, + { + "auxiliary_loss_clip": 0.01192562, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_clip": 1.00268555, + "balance_loss_mlp": 1.00150681, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 2.986638784618159, + "language_loss": 0.92763293, + "learning_rate": 3.999719549492551e-06, + "loss": 0.95140851, + "num_input_tokens_seen": 12350315, + "step": 585, + "time_per_iteration": 2.503190279006958 + }, + { + "auxiliary_loss_clip": 0.01192553, + "auxiliary_loss_mlp": 0.0118502, + "balance_loss_clip": 1.00274491, + "balance_loss_mlp": 1.001724, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.27467049107232, + "language_loss": 0.87776428, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.90153992, + "num_input_tokens_seen": 12366030, + "step": 586, + "time_per_iteration": 2.543363094329834 + }, + { + "auxiliary_loss_clip": 0.01192572, + "auxiliary_loss_mlp": 0.01185173, + "balance_loss_clip": 1.00275755, + "balance_loss_mlp": 1.00159073, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 2.020500183889663, + "language_loss": 0.76428652, + "learning_rate": 3.999706353928965e-06, + "loss": 0.788064, + "num_input_tokens_seen": 12384895, + "step": 587, + "time_per_iteration": 2.5136187076568604 + }, + { + "auxiliary_loss_clip": 0.01192514, + "auxiliary_loss_mlp": 0.01185118, + "balance_loss_clip": 1.00267422, + "balance_loss_mlp": 1.00144053, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.7104267006311147, + "language_loss": 0.78591239, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80968875, + "num_input_tokens_seen": 12404980, + "step": 588, + "time_per_iteration": 2.544151544570923 + }, + { + "auxiliary_loss_clip": 0.01192513, + "auxiliary_loss_mlp": 0.01185154, + "balance_loss_clip": 1.00262284, + "balance_loss_mlp": 1.00157189, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.218470148542271, + "language_loss": 0.93596017, + "learning_rate": 3.99969285504912e-06, + "loss": 0.95973688, + "num_input_tokens_seen": 12423835, + "step": 589, + "time_per_iteration": 2.5702359676361084 + }, + { + "auxiliary_loss_clip": 0.01192471, + "auxiliary_loss_mlp": 0.01184943, + "balance_loss_clip": 1.00267708, + "balance_loss_mlp": 1.00136018, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.7190279318719055, + "language_loss": 0.84096277, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86473691, + "num_input_tokens_seen": 12443135, + "step": 590, + "time_per_iteration": 5.580706357955933 + }, + { + "auxiliary_loss_clip": 0.01192616, + "auxiliary_loss_mlp": 0.01184809, + "balance_loss_clip": 1.00278556, + "balance_loss_mlp": 1.00141752, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.0652081016476433, + "language_loss": 0.8692835, + "learning_rate": 3.999679052855065e-06, + "loss": 0.8930577, + "num_input_tokens_seen": 12462895, + "step": 591, + "time_per_iteration": 2.54657244682312 + }, + { + "auxiliary_loss_clip": 0.0119246, + "auxiliary_loss_mlp": 0.01184925, + "balance_loss_clip": 1.00260615, + "balance_loss_mlp": 1.00143766, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 2.046476430248932, + "language_loss": 0.82857943, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85235327, + "num_input_tokens_seen": 12481515, + "step": 592, + "time_per_iteration": 3.9158849716186523 + }, + { + "auxiliary_loss_clip": 0.01194875, + "auxiliary_loss_mlp": 0.01183119, + "balance_loss_clip": 1.00577879, + "balance_loss_mlp": 1.00077677, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.879141544997935, + "language_loss": 0.59861118, + "learning_rate": 3.999664947348893e-06, + "loss": 0.62239116, + "num_input_tokens_seen": 12548220, + "step": 593, + "time_per_iteration": 3.1720285415649414 + }, + { + "auxiliary_loss_clip": 0.01192568, + "auxiliary_loss_mlp": 0.01184692, + "balance_loss_clip": 1.00287306, + "balance_loss_mlp": 1.00120473, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.879296025644796, + "language_loss": 0.87285751, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89663011, + "num_input_tokens_seen": 12566105, + "step": 594, + "time_per_iteration": 2.543177604675293 + }, + { + "auxiliary_loss_clip": 0.01192458, + "auxiliary_loss_mlp": 0.0118481, + "balance_loss_clip": 1.0026679, + "balance_loss_mlp": 1.00170457, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.8462118329363424, + "language_loss": 0.83980846, + "learning_rate": 3.999650538532742e-06, + "loss": 0.86358118, + "num_input_tokens_seen": 12586680, + "step": 595, + "time_per_iteration": 2.5846915245056152 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01185292, + "balance_loss_clip": 1.00272548, + "balance_loss_mlp": 1.00199544, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.513444883443572, + "language_loss": 0.96327329, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98705029, + "num_input_tokens_seen": 12601605, + "step": 596, + "time_per_iteration": 2.5423972606658936 + }, + { + "auxiliary_loss_clip": 0.01192482, + "auxiliary_loss_mlp": 0.01184947, + "balance_loss_clip": 1.00274885, + "balance_loss_mlp": 1.00174665, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.995022421954565, + "language_loss": 0.82909071, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85286498, + "num_input_tokens_seen": 12620365, + "step": 597, + "time_per_iteration": 2.5513107776641846 + }, + { + "auxiliary_loss_clip": 0.01192371, + "auxiliary_loss_mlp": 0.01184518, + "balance_loss_clip": 1.002756, + "balance_loss_mlp": 1.00141287, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 1.6239650220031854, + "language_loss": 0.81143153, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83520037, + "num_input_tokens_seen": 12641140, + "step": 598, + "time_per_iteration": 2.5747363567352295 + }, + { + "auxiliary_loss_clip": 0.01192276, + "auxiliary_loss_mlp": 0.01184493, + "balance_loss_clip": 1.00266027, + "balance_loss_mlp": 1.00138748, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.8654271753919396, + "language_loss": 0.813402, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83716971, + "num_input_tokens_seen": 12661080, + "step": 599, + "time_per_iteration": 2.528698205947876 + }, + { + "auxiliary_loss_clip": 0.01192445, + "auxiliary_loss_mlp": 0.01184625, + "balance_loss_clip": 1.002666, + "balance_loss_mlp": 1.00123382, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.2084620360897986, + "language_loss": 0.86185497, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88562566, + "num_input_tokens_seen": 12678270, + "step": 600, + "time_per_iteration": 2.5700035095214844 + }, + { + "auxiliary_loss_clip": 0.01192102, + "auxiliary_loss_mlp": 0.01184564, + "balance_loss_clip": 1.00255573, + "balance_loss_mlp": 1.00155377, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 2.0758293538320896, + "language_loss": 0.81970602, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84347272, + "num_input_tokens_seen": 12697295, + "step": 601, + "time_per_iteration": 2.5085883140563965 + }, + { + "auxiliary_loss_clip": 0.01192288, + "auxiliary_loss_mlp": 0.01184536, + "balance_loss_clip": 1.00261855, + "balance_loss_mlp": 1.00152564, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.4973568244296347, + "language_loss": 0.75304776, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77681601, + "num_input_tokens_seen": 12716165, + "step": 602, + "time_per_iteration": 2.5426995754241943 + }, + { + "auxiliary_loss_clip": 0.01192301, + "auxiliary_loss_mlp": 0.01184266, + "balance_loss_clip": 1.00270033, + "balance_loss_mlp": 1.00106525, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 2.263671298074831, + "language_loss": 0.79745501, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82122064, + "num_input_tokens_seen": 12735475, + "step": 603, + "time_per_iteration": 2.597532033920288 + }, + { + "auxiliary_loss_clip": 0.011924, + "auxiliary_loss_mlp": 0.01184466, + "balance_loss_clip": 1.00286579, + "balance_loss_mlp": 1.0013603, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 2.3179244254817117, + "language_loss": 0.86839318, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89216191, + "num_input_tokens_seen": 12754540, + "step": 604, + "time_per_iteration": 2.5788660049438477 + }, + { + "auxiliary_loss_clip": 0.01192187, + "auxiliary_loss_mlp": 0.01184316, + "balance_loss_clip": 1.00261986, + "balance_loss_mlp": 1.00121093, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 4.765862750717775, + "language_loss": 0.80783528, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83160031, + "num_input_tokens_seen": 12773050, + "step": 605, + "time_per_iteration": 2.4972198009490967 + }, + { + "auxiliary_loss_clip": 0.01192327, + "auxiliary_loss_mlp": 0.01184563, + "balance_loss_clip": 1.0026381, + "balance_loss_mlp": 1.00155258, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.6833349190996763, + "language_loss": 0.85850406, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.88227296, + "num_input_tokens_seen": 12791240, + "step": 606, + "time_per_iteration": 2.5173871517181396 + }, + { + "auxiliary_loss_clip": 0.011923, + "auxiliary_loss_mlp": 0.01184458, + "balance_loss_clip": 1.0026741, + "balance_loss_mlp": 1.00144768, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.0770167848308088, + "language_loss": 0.82381135, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84757888, + "num_input_tokens_seen": 12812245, + "step": 607, + "time_per_iteration": 2.5343921184539795 + }, + { + "auxiliary_loss_clip": 0.01192349, + "auxiliary_loss_mlp": 0.01184382, + "balance_loss_clip": 1.00277579, + "balance_loss_mlp": 1.00127625, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 2.410446691959677, + "language_loss": 0.83686769, + "learning_rate": 3.999549488202358e-06, + "loss": 0.86063492, + "num_input_tokens_seen": 12831085, + "step": 608, + "time_per_iteration": 2.5490095615386963 + }, + { + "auxiliary_loss_clip": 0.01192281, + "auxiliary_loss_mlp": 0.01184032, + "balance_loss_clip": 1.00273633, + "balance_loss_mlp": 1.00111735, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.317054342120116, + "language_loss": 0.82430136, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84806454, + "num_input_tokens_seen": 12849115, + "step": 609, + "time_per_iteration": 2.507802724838257 + }, + { + "auxiliary_loss_clip": 0.01192552, + "auxiliary_loss_mlp": 0.01184583, + "balance_loss_clip": 1.00296164, + "balance_loss_mlp": 1.00157237, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.0390512066666013, + "language_loss": 0.78966677, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81343812, + "num_input_tokens_seen": 12868005, + "step": 610, + "time_per_iteration": 2.576590061187744 + }, + { + "auxiliary_loss_clip": 0.01192443, + "auxiliary_loss_mlp": 0.01184837, + "balance_loss_clip": 1.00282383, + "balance_loss_mlp": 1.00154042, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.0362297668226894, + "language_loss": 0.87609082, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.8998636, + "num_input_tokens_seen": 12886890, + "step": 611, + "time_per_iteration": 2.5273876190185547 + }, + { + "auxiliary_loss_clip": 0.01192357, + "auxiliary_loss_mlp": 0.01184767, + "balance_loss_clip": 1.00280762, + "balance_loss_mlp": 1.00185275, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 3.3911846682753715, + "language_loss": 0.72712064, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75089186, + "num_input_tokens_seen": 12906130, + "step": 612, + "time_per_iteration": 2.5937552452087402 + }, + { + "auxiliary_loss_clip": 0.01192302, + "auxiliary_loss_mlp": 0.01184543, + "balance_loss_clip": 1.00268507, + "balance_loss_mlp": 1.00134158, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 3.610988732787054, + "language_loss": 0.79146659, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81523502, + "num_input_tokens_seen": 12925260, + "step": 613, + "time_per_iteration": 2.520786762237549 + }, + { + "auxiliary_loss_clip": 0.01192234, + "auxiliary_loss_mlp": 0.01184572, + "balance_loss_clip": 1.00271583, + "balance_loss_mlp": 1.00146675, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 1.9668075379388592, + "language_loss": 0.93650568, + "learning_rate": 3.9994985276307e-06, + "loss": 0.96027374, + "num_input_tokens_seen": 12944590, + "step": 614, + "time_per_iteration": 2.521545886993408 + }, + { + "auxiliary_loss_clip": 0.01192382, + "auxiliary_loss_mlp": 0.01184491, + "balance_loss_clip": 1.00283396, + "balance_loss_mlp": 1.00138545, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.9689179030517034, + "language_loss": 0.73182273, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75559151, + "num_input_tokens_seen": 12964785, + "step": 615, + "time_per_iteration": 2.638417959213257 + }, + { + "auxiliary_loss_clip": 0.01192241, + "auxiliary_loss_mlp": 0.01184766, + "balance_loss_clip": 1.00263464, + "balance_loss_mlp": 1.00166035, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.8924518537437867, + "language_loss": 0.81780797, + "learning_rate": 3.999480934200528e-06, + "loss": 0.84157801, + "num_input_tokens_seen": 12986705, + "step": 616, + "time_per_iteration": 2.5923373699188232 + }, + { + "auxiliary_loss_clip": 0.01192379, + "auxiliary_loss_mlp": 0.01184295, + "balance_loss_clip": 1.00279236, + "balance_loss_mlp": 1.00118971, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 2.470781085229462, + "language_loss": 0.67849934, + "learning_rate": 3.999472023754499e-06, + "loss": 0.7022661, + "num_input_tokens_seen": 13010560, + "step": 617, + "time_per_iteration": 2.6275157928466797 + }, + { + "auxiliary_loss_clip": 0.01192476, + "auxiliary_loss_mlp": 0.01184082, + "balance_loss_clip": 1.00298142, + "balance_loss_mlp": 1.00107193, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 3.0388001797367754, + "language_loss": 0.80095488, + "learning_rate": 3.99946303748829e-06, + "loss": 0.82472044, + "num_input_tokens_seen": 13028935, + "step": 618, + "time_per_iteration": 2.550619602203369 + }, + { + "auxiliary_loss_clip": 0.01192288, + "auxiliary_loss_mlp": 0.01184657, + "balance_loss_clip": 1.00267816, + "balance_loss_mlp": 1.00155163, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.7576922042388663, + "language_loss": 0.91012996, + "learning_rate": 3.999453975402242e-06, + "loss": 0.9338994, + "num_input_tokens_seen": 13046000, + "step": 619, + "time_per_iteration": 2.5232114791870117 + }, + { + "auxiliary_loss_clip": 0.01192275, + "auxiliary_loss_mlp": 0.01184281, + "balance_loss_clip": 1.00277305, + "balance_loss_mlp": 1.00155663, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.155350774232013, + "language_loss": 0.94148266, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96524817, + "num_input_tokens_seen": 13062995, + "step": 620, + "time_per_iteration": 2.5931198596954346 + }, + { + "auxiliary_loss_clip": 0.01192226, + "auxiliary_loss_mlp": 0.01184817, + "balance_loss_clip": 1.00265646, + "balance_loss_mlp": 1.00161624, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.83257556836572, + "language_loss": 0.76919019, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79296064, + "num_input_tokens_seen": 13084120, + "step": 621, + "time_per_iteration": 2.5457592010498047 + }, + { + "auxiliary_loss_clip": 0.01192291, + "auxiliary_loss_mlp": 0.01184356, + "balance_loss_clip": 1.00279915, + "balance_loss_mlp": 1.00125003, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 3.6696359898629116, + "language_loss": 0.86839628, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89216274, + "num_input_tokens_seen": 13100035, + "step": 622, + "time_per_iteration": 2.5269296169281006 + }, + { + "auxiliary_loss_clip": 0.01192178, + "auxiliary_loss_mlp": 0.01184478, + "balance_loss_clip": 1.00266659, + "balance_loss_mlp": 1.00146747, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.2725476958018938, + "language_loss": 0.90157557, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92534208, + "num_input_tokens_seen": 13118070, + "step": 623, + "time_per_iteration": 2.5419254302978516 + }, + { + "auxiliary_loss_clip": 0.01192343, + "auxiliary_loss_mlp": 0.01184766, + "balance_loss_clip": 1.00284934, + "balance_loss_mlp": 1.0017556, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9009146471609357, + "language_loss": 0.84106678, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86483788, + "num_input_tokens_seen": 13136355, + "step": 624, + "time_per_iteration": 2.631134033203125 + }, + { + "auxiliary_loss_clip": 0.01192148, + "auxiliary_loss_mlp": 0.01184188, + "balance_loss_clip": 1.00267267, + "balance_loss_mlp": 1.00117803, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.687568238555422, + "language_loss": 0.66600537, + "learning_rate": 3.999398010688788e-06, + "loss": 0.68976879, + "num_input_tokens_seen": 13155435, + "step": 625, + "time_per_iteration": 2.6682467460632324 + }, + { + "auxiliary_loss_clip": 0.01192307, + "auxiliary_loss_mlp": 0.01184409, + "balance_loss_clip": 1.00275946, + "balance_loss_mlp": 1.00139904, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.3322509195721883, + "language_loss": 0.7727831, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79655027, + "num_input_tokens_seen": 13174295, + "step": 626, + "time_per_iteration": 2.614034414291382 + }, + { + "auxiliary_loss_clip": 0.01192334, + "auxiliary_loss_mlp": 0.01184277, + "balance_loss_clip": 1.0028019, + "balance_loss_mlp": 1.00136256, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 1.8089438258506538, + "language_loss": 0.8144964, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83826256, + "num_input_tokens_seen": 13192500, + "step": 627, + "time_per_iteration": 2.50581693649292 + }, + { + "auxiliary_loss_clip": 0.01192572, + "auxiliary_loss_mlp": 0.01184459, + "balance_loss_clip": 1.00294852, + "balance_loss_mlp": 1.00144863, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 1.8099880067995238, + "language_loss": 0.88567907, + "learning_rate": 3.999369004792719e-06, + "loss": 0.90944934, + "num_input_tokens_seen": 13213470, + "step": 628, + "time_per_iteration": 4.028088092803955 + }, + { + "auxiliary_loss_clip": 0.01192113, + "auxiliary_loss_mlp": 0.01184701, + "balance_loss_clip": 1.00264978, + "balance_loss_mlp": 1.0016911, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.2617430878298017, + "language_loss": 0.79542923, + "learning_rate": 3.999359184527658e-06, + "loss": 0.8191973, + "num_input_tokens_seen": 13232365, + "step": 629, + "time_per_iteration": 3.99271559715271 + }, + { + "auxiliary_loss_clip": 0.01192294, + "auxiliary_loss_mlp": 0.01184386, + "balance_loss_clip": 1.0027318, + "balance_loss_mlp": 1.00128031, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.8543913377258736, + "language_loss": 0.76703924, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79080606, + "num_input_tokens_seen": 13251920, + "step": 630, + "time_per_iteration": 3.961516857147217 + }, + { + "auxiliary_loss_clip": 0.01192355, + "auxiliary_loss_mlp": 0.01184475, + "balance_loss_clip": 1.00280368, + "balance_loss_mlp": 1.00146461, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 3.0844292994917715, + "language_loss": 0.91639793, + "learning_rate": 3.99933931655021e-06, + "loss": 0.94016618, + "num_input_tokens_seen": 13267440, + "step": 631, + "time_per_iteration": 2.4968342781066895 + }, + { + "auxiliary_loss_clip": 0.01192138, + "auxiliary_loss_mlp": 0.01184428, + "balance_loss_clip": 1.00273991, + "balance_loss_mlp": 1.00189471, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.7022685033907112, + "language_loss": 0.92248744, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94625312, + "num_input_tokens_seen": 13287850, + "step": 632, + "time_per_iteration": 2.553475856781006 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01184385, + "balance_loss_clip": 1.00284004, + "balance_loss_mlp": 1.00127971, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.4064658262856296, + "language_loss": 0.83130813, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85507607, + "num_input_tokens_seen": 13307760, + "step": 633, + "time_per_iteration": 2.5436830520629883 + }, + { + "auxiliary_loss_clip": 0.01192255, + "auxiliary_loss_mlp": 0.01184314, + "balance_loss_clip": 1.00278592, + "balance_loss_mlp": 1.00120819, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 2.198836736499147, + "language_loss": 0.69626474, + "learning_rate": 3.999308945971392e-06, + "loss": 0.72003043, + "num_input_tokens_seen": 13331230, + "step": 634, + "time_per_iteration": 2.614654064178467 + }, + { + "auxiliary_loss_clip": 0.01193774, + "auxiliary_loss_mlp": 0.01182152, + "balance_loss_clip": 1.00511587, + "balance_loss_mlp": 1.0005722, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8788399091809642, + "language_loss": 0.61606944, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63982868, + "num_input_tokens_seen": 13394760, + "step": 635, + "time_per_iteration": 3.2228035926818848 + }, + { + "auxiliary_loss_clip": 0.01192146, + "auxiliary_loss_mlp": 0.01184246, + "balance_loss_clip": 1.00274277, + "balance_loss_mlp": 1.00142622, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.5607785817495654, + "language_loss": 0.83682954, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86059344, + "num_input_tokens_seen": 13412775, + "step": 636, + "time_per_iteration": 2.6145219802856445 + }, + { + "auxiliary_loss_clip": 0.01192246, + "auxiliary_loss_mlp": 0.01184338, + "balance_loss_clip": 1.00276136, + "balance_loss_mlp": 1.00132823, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.341653960264864, + "language_loss": 0.79433203, + "learning_rate": 3.999277893066632e-06, + "loss": 0.81809789, + "num_input_tokens_seen": 13427835, + "step": 637, + "time_per_iteration": 2.5535600185394287 + }, + { + "auxiliary_loss_clip": 0.01192124, + "auxiliary_loss_mlp": 0.01184414, + "balance_loss_clip": 1.00263572, + "balance_loss_mlp": 1.00169039, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.0100355128845364, + "language_loss": 0.83948612, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86325157, + "num_input_tokens_seen": 13447295, + "step": 638, + "time_per_iteration": 2.5672876834869385 + }, + { + "auxiliary_loss_clip": 0.01192254, + "auxiliary_loss_mlp": 0.01184598, + "balance_loss_clip": 1.00269008, + "balance_loss_mlp": 1.00149226, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.3214594586925212, + "language_loss": 0.69514382, + "learning_rate": 3.999256812065381e-06, + "loss": 0.71891236, + "num_input_tokens_seen": 13468455, + "step": 639, + "time_per_iteration": 2.5946898460388184 + }, + { + "auxiliary_loss_clip": 0.01192191, + "auxiliary_loss_mlp": 0.01184275, + "balance_loss_clip": 1.00269425, + "balance_loss_mlp": 1.0013597, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 7.6229325988037235, + "language_loss": 0.85473442, + "learning_rate": 3.999246157846526e-06, + "loss": 0.87849915, + "num_input_tokens_seen": 13489085, + "step": 640, + "time_per_iteration": 2.5825796127319336 + }, + { + "auxiliary_loss_clip": 0.01192212, + "auxiliary_loss_mlp": 0.01184555, + "balance_loss_clip": 1.00273824, + "balance_loss_mlp": 1.00164044, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.521827650692488, + "language_loss": 0.81979179, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84355944, + "num_input_tokens_seen": 13509120, + "step": 641, + "time_per_iteration": 2.575768232345581 + }, + { + "auxiliary_loss_clip": 0.01193586, + "auxiliary_loss_mlp": 0.01181972, + "balance_loss_clip": 1.00504112, + "balance_loss_mlp": 1.0003922, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9018825987193773, + "language_loss": 0.65434897, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67810452, + "num_input_tokens_seen": 13562005, + "step": 642, + "time_per_iteration": 3.118234872817993 + }, + { + "auxiliary_loss_clip": 0.01192043, + "auxiliary_loss_mlp": 0.01184039, + "balance_loss_clip": 1.00263131, + "balance_loss_mlp": 1.00121999, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 2.483975236641942, + "language_loss": 0.79588026, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81964105, + "num_input_tokens_seen": 13582185, + "step": 643, + "time_per_iteration": 2.618208169937134 + }, + { + "auxiliary_loss_clip": 0.01192052, + "auxiliary_loss_mlp": 0.01184298, + "balance_loss_clip": 1.00270569, + "balance_loss_mlp": 1.00157356, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 1.923505758149206, + "language_loss": 0.82922912, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85299265, + "num_input_tokens_seen": 13599555, + "step": 644, + "time_per_iteration": 2.5322749614715576 + }, + { + "auxiliary_loss_clip": 0.01192072, + "auxiliary_loss_mlp": 0.01184044, + "balance_loss_clip": 1.00271094, + "balance_loss_mlp": 1.00122464, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 1.9765499992516578, + "language_loss": 0.82433939, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84810054, + "num_input_tokens_seen": 13621160, + "step": 645, + "time_per_iteration": 2.6718788146972656 + }, + { + "auxiliary_loss_clip": 0.01192085, + "auxiliary_loss_mlp": 0.01184421, + "balance_loss_clip": 1.0026567, + "balance_loss_mlp": 1.00150657, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.318989955126233, + "language_loss": 0.81937081, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84313583, + "num_input_tokens_seen": 13641915, + "step": 646, + "time_per_iteration": 2.5907206535339355 + }, + { + "auxiliary_loss_clip": 0.01192313, + "auxiliary_loss_mlp": 0.01184539, + "balance_loss_clip": 1.00285482, + "balance_loss_mlp": 1.00152874, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.9258335801791062, + "language_loss": 0.8199321, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84370059, + "num_input_tokens_seen": 13661410, + "step": 647, + "time_per_iteration": 2.541327476501465 + }, + { + "auxiliary_loss_clip": 0.01192157, + "auxiliary_loss_mlp": 0.01184111, + "balance_loss_clip": 1.00276756, + "balance_loss_mlp": 1.00148273, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 2.7554558143375942, + "language_loss": 0.84508228, + "learning_rate": 3.999158194912106e-06, + "loss": 0.86884499, + "num_input_tokens_seen": 13681705, + "step": 648, + "time_per_iteration": 2.6055359840393066 + }, + { + "auxiliary_loss_clip": 0.01192165, + "auxiliary_loss_mlp": 0.01184296, + "balance_loss_clip": 1.00272155, + "balance_loss_mlp": 1.00128579, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 3.7905372915862596, + "language_loss": 0.84494668, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86871129, + "num_input_tokens_seen": 13700400, + "step": 649, + "time_per_iteration": 2.5284178256988525 + }, + { + "auxiliary_loss_clip": 0.01192313, + "auxiliary_loss_mlp": 0.01184558, + "balance_loss_clip": 1.00280547, + "balance_loss_mlp": 1.00154805, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.9715650802221172, + "language_loss": 0.79689157, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82066035, + "num_input_tokens_seen": 13720145, + "step": 650, + "time_per_iteration": 2.545870780944824 + }, + { + "auxiliary_loss_clip": 0.01192075, + "auxiliary_loss_mlp": 0.01184228, + "balance_loss_clip": 1.00266278, + "balance_loss_mlp": 1.00131321, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 3.091067179411207, + "language_loss": 0.78883559, + "learning_rate": 3.9991239579635e-06, + "loss": 0.81259865, + "num_input_tokens_seen": 13737500, + "step": 651, + "time_per_iteration": 2.5418283939361572 + }, + { + "auxiliary_loss_clip": 0.01191975, + "auxiliary_loss_mlp": 0.01184219, + "balance_loss_clip": 1.00264466, + "balance_loss_mlp": 1.00130391, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.8551966849650587, + "language_loss": 0.87595117, + "learning_rate": 3.999112394032757e-06, + "loss": 0.8997131, + "num_input_tokens_seen": 13754750, + "step": 652, + "time_per_iteration": 2.5188982486724854 + }, + { + "auxiliary_loss_clip": 0.01192042, + "auxiliary_loss_mlp": 0.01184302, + "balance_loss_clip": 1.0026772, + "balance_loss_mlp": 1.0011965, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 2.6480395672356494, + "language_loss": 0.7931751, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81693852, + "num_input_tokens_seen": 13771990, + "step": 653, + "time_per_iteration": 2.6250505447387695 + }, + { + "auxiliary_loss_clip": 0.01192176, + "auxiliary_loss_mlp": 0.01184369, + "balance_loss_clip": 1.00271761, + "balance_loss_mlp": 1.00135911, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 5.469116535523941, + "language_loss": 0.85841787, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88218331, + "num_input_tokens_seen": 13792750, + "step": 654, + "time_per_iteration": 2.5831148624420166 + }, + { + "auxiliary_loss_clip": 0.01193329, + "auxiliary_loss_mlp": 0.011819, + "balance_loss_clip": 1.00491095, + "balance_loss_mlp": 1.00031996, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7189978598093875, + "language_loss": 0.49909809, + "learning_rate": 3.999077247403041e-06, + "loss": 0.52285039, + "num_input_tokens_seen": 13858570, + "step": 655, + "time_per_iteration": 3.1961097717285156 + }, + { + "auxiliary_loss_clip": 0.01192081, + "auxiliary_loss_mlp": 0.01184234, + "balance_loss_clip": 1.00272882, + "balance_loss_mlp": 1.0015099, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.1464265986274262, + "language_loss": 0.81131458, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83507776, + "num_input_tokens_seen": 13876335, + "step": 656, + "time_per_iteration": 2.5519518852233887 + }, + { + "auxiliary_loss_clip": 0.01192027, + "auxiliary_loss_mlp": 0.01184601, + "balance_loss_clip": 1.00258446, + "balance_loss_mlp": 1.0016861, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.3654468281023617, + "language_loss": 0.76148266, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78524899, + "num_input_tokens_seen": 13892640, + "step": 657, + "time_per_iteration": 2.5010061264038086 + }, + { + "auxiliary_loss_clip": 0.01191963, + "auxiliary_loss_mlp": 0.01184029, + "balance_loss_clip": 1.00266552, + "balance_loss_mlp": 1.00120926, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.222123741663678, + "language_loss": 0.81735921, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84111905, + "num_input_tokens_seen": 13910085, + "step": 658, + "time_per_iteration": 2.578784227371216 + }, + { + "auxiliary_loss_clip": 0.01191961, + "auxiliary_loss_mlp": 0.01184597, + "balance_loss_clip": 1.00260735, + "balance_loss_mlp": 1.00149202, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.166844909645752, + "language_loss": 0.91233253, + "learning_rate": 3.999029323959287e-06, + "loss": 0.9360981, + "num_input_tokens_seen": 13928800, + "step": 659, + "time_per_iteration": 2.555777072906494 + }, + { + "auxiliary_loss_clip": 0.01192095, + "auxiliary_loss_mlp": 0.01184408, + "balance_loss_clip": 1.00267088, + "balance_loss_mlp": 1.00139832, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.1295834790287906, + "language_loss": 0.79263306, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81639802, + "num_input_tokens_seen": 13948325, + "step": 660, + "time_per_iteration": 2.564380407333374 + }, + { + "auxiliary_loss_clip": 0.01192043, + "auxiliary_loss_mlp": 0.01183921, + "balance_loss_clip": 1.0027405, + "balance_loss_mlp": 1.00129271, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6107237115600375, + "language_loss": 0.81681609, + "learning_rate": 3.999004907415231e-06, + "loss": 0.8405757, + "num_input_tokens_seen": 13969090, + "step": 661, + "time_per_iteration": 2.5899758338928223 + }, + { + "auxiliary_loss_clip": 0.01193074, + "auxiliary_loss_mlp": 0.01181827, + "balance_loss_clip": 1.00468445, + "balance_loss_mlp": 1.0002476, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9080861020594403, + "language_loss": 0.69347572, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71722484, + "num_input_tokens_seen": 14037555, + "step": 662, + "time_per_iteration": 3.2866108417510986 + }, + { + "auxiliary_loss_clip": 0.01192075, + "auxiliary_loss_mlp": 0.0118398, + "balance_loss_clip": 1.00269282, + "balance_loss_mlp": 1.00135136, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 1.936941085854616, + "language_loss": 0.82870376, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85246426, + "num_input_tokens_seen": 14055765, + "step": 663, + "time_per_iteration": 2.527754068374634 + }, + { + "auxiliary_loss_clip": 0.01191988, + "auxiliary_loss_mlp": 0.01184319, + "balance_loss_clip": 1.00264835, + "balance_loss_mlp": 1.00130904, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.2399082011983187, + "language_loss": 0.86898208, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89274514, + "num_input_tokens_seen": 14074195, + "step": 664, + "time_per_iteration": 2.6106908321380615 + }, + { + "auxiliary_loss_clip": 0.01191891, + "auxiliary_loss_mlp": 0.0118376, + "balance_loss_clip": 1.00273168, + "balance_loss_mlp": 1.0011313, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.0743215240653896, + "language_loss": 0.85071003, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87446654, + "num_input_tokens_seen": 14090215, + "step": 665, + "time_per_iteration": 2.542322874069214 + }, + { + "auxiliary_loss_clip": 0.01192191, + "auxiliary_loss_mlp": 0.01184811, + "balance_loss_clip": 1.00280154, + "balance_loss_mlp": 1.00199127, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 3.3021306400688326, + "language_loss": 0.81706476, + "learning_rate": 3.998942539520158e-06, + "loss": 0.84083486, + "num_input_tokens_seen": 14112150, + "step": 666, + "time_per_iteration": 5.548087120056152 + }, + { + "auxiliary_loss_clip": 0.01191953, + "auxiliary_loss_mlp": 0.01183885, + "balance_loss_clip": 1.00263321, + "balance_loss_mlp": 1.00125647, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.6590102675330036, + "language_loss": 0.87064481, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89440322, + "num_input_tokens_seen": 14131475, + "step": 667, + "time_per_iteration": 2.701120615005493 + }, + { + "auxiliary_loss_clip": 0.01191973, + "auxiliary_loss_mlp": 0.01183839, + "balance_loss_clip": 1.00276983, + "balance_loss_mlp": 1.00130582, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 3.1787858883415594, + "language_loss": 0.80574656, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82950461, + "num_input_tokens_seen": 14146165, + "step": 668, + "time_per_iteration": 2.500929832458496 + }, + { + "auxiliary_loss_clip": 0.01192982, + "auxiliary_loss_mlp": 0.01181914, + "balance_loss_clip": 1.00460684, + "balance_loss_mlp": 1.00033402, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7880522644951437, + "language_loss": 0.60034227, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62409115, + "num_input_tokens_seen": 14215005, + "step": 669, + "time_per_iteration": 4.676196098327637 + }, + { + "auxiliary_loss_clip": 0.01191995, + "auxiliary_loss_mlp": 0.01184321, + "balance_loss_clip": 1.00271678, + "balance_loss_mlp": 1.0015974, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.8326974400898957, + "language_loss": 0.8619886, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88575172, + "num_input_tokens_seen": 14235510, + "step": 670, + "time_per_iteration": 2.6537845134735107 + }, + { + "auxiliary_loss_clip": 0.01191915, + "auxiliary_loss_mlp": 0.01184074, + "balance_loss_clip": 1.00268972, + "balance_loss_mlp": 1.00154066, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.87675016514542, + "language_loss": 0.75232017, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77608013, + "num_input_tokens_seen": 14254565, + "step": 671, + "time_per_iteration": 2.518322467803955 + }, + { + "auxiliary_loss_clip": 0.01192023, + "auxiliary_loss_mlp": 0.01184315, + "balance_loss_clip": 1.00270104, + "balance_loss_mlp": 1.00168657, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.069505922265828, + "language_loss": 0.92166555, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94542897, + "num_input_tokens_seen": 14271885, + "step": 672, + "time_per_iteration": 2.5181095600128174 + }, + { + "auxiliary_loss_clip": 0.01191925, + "auxiliary_loss_mlp": 0.01184241, + "balance_loss_clip": 1.00267863, + "balance_loss_mlp": 1.00161278, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 2.2494044924019128, + "language_loss": 0.89991689, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92367852, + "num_input_tokens_seen": 14289670, + "step": 673, + "time_per_iteration": 2.51347279548645 + }, + { + "auxiliary_loss_clip": 0.0119175, + "auxiliary_loss_mlp": 0.01184518, + "balance_loss_clip": 1.00254822, + "balance_loss_mlp": 1.00179434, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.0271111904220676, + "language_loss": 0.74847829, + "learning_rate": 3.998838809308334e-06, + "loss": 0.772241, + "num_input_tokens_seen": 14309285, + "step": 674, + "time_per_iteration": 2.5673210620880127 + }, + { + "auxiliary_loss_clip": 0.01191777, + "auxiliary_loss_mlp": 0.01183947, + "balance_loss_clip": 1.00252557, + "balance_loss_mlp": 1.00131893, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 6.625130040581424, + "language_loss": 0.77716923, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80092645, + "num_input_tokens_seen": 14328300, + "step": 675, + "time_per_iteration": 2.550079345703125 + }, + { + "auxiliary_loss_clip": 0.01191902, + "auxiliary_loss_mlp": 0.01184295, + "balance_loss_clip": 1.00267851, + "balance_loss_mlp": 1.00157082, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.7569232423005943, + "language_loss": 0.76963061, + "learning_rate": 3.998812118783757e-06, + "loss": 0.79339254, + "num_input_tokens_seen": 14346395, + "step": 676, + "time_per_iteration": 2.554842948913574 + }, + { + "auxiliary_loss_clip": 0.01192027, + "auxiliary_loss_mlp": 0.01184462, + "balance_loss_clip": 1.00270998, + "balance_loss_mlp": 1.00173795, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.636885136701618, + "language_loss": 0.85456347, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87832838, + "num_input_tokens_seen": 14364605, + "step": 677, + "time_per_iteration": 2.5388824939727783 + }, + { + "auxiliary_loss_clip": 0.01191961, + "auxiliary_loss_mlp": 0.01183902, + "balance_loss_clip": 1.00269055, + "balance_loss_mlp": 1.00117826, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 2.7991960652152166, + "language_loss": 0.76445192, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78821051, + "num_input_tokens_seen": 14385265, + "step": 678, + "time_per_iteration": 2.6123623847961426 + }, + { + "auxiliary_loss_clip": 0.01191981, + "auxiliary_loss_mlp": 0.01184328, + "balance_loss_clip": 1.0026629, + "balance_loss_mlp": 1.0016036, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.939674994412451, + "language_loss": 0.82236075, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84612387, + "num_input_tokens_seen": 14406090, + "step": 679, + "time_per_iteration": 2.647230625152588 + }, + { + "auxiliary_loss_clip": 0.01192051, + "auxiliary_loss_mlp": 0.01183929, + "balance_loss_clip": 1.00281549, + "balance_loss_mlp": 1.00139618, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.8071797295394798, + "language_loss": 0.76247519, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78623497, + "num_input_tokens_seen": 14425130, + "step": 680, + "time_per_iteration": 2.596386671066284 + }, + { + "auxiliary_loss_clip": 0.01191867, + "auxiliary_loss_mlp": 0.01184079, + "balance_loss_clip": 1.00258946, + "balance_loss_mlp": 1.00135529, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 2.279291663033507, + "language_loss": 0.83196211, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.85572153, + "num_input_tokens_seen": 14447355, + "step": 681, + "time_per_iteration": 2.616393804550171 + }, + { + "auxiliary_loss_clip": 0.01191916, + "auxiliary_loss_mlp": 0.0118397, + "balance_loss_clip": 1.00267029, + "balance_loss_mlp": 1.00124609, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.8899847753316568, + "language_loss": 0.71345216, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73721105, + "num_input_tokens_seen": 14466790, + "step": 682, + "time_per_iteration": 2.5490965843200684 + }, + { + "auxiliary_loss_clip": 0.0119172, + "auxiliary_loss_mlp": 0.01183905, + "balance_loss_clip": 1.00251627, + "balance_loss_mlp": 1.00127602, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.653676574488069, + "language_loss": 0.72758985, + "learning_rate": 3.998716314427333e-06, + "loss": 0.75134605, + "num_input_tokens_seen": 14485195, + "step": 683, + "time_per_iteration": 2.5258617401123047 + }, + { + "auxiliary_loss_clip": 0.01191964, + "auxiliary_loss_mlp": 0.01184367, + "balance_loss_clip": 1.00274229, + "balance_loss_mlp": 1.00173795, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.147438739620943, + "language_loss": 0.81802583, + "learning_rate": 3.998702324920417e-06, + "loss": 0.84178913, + "num_input_tokens_seen": 14503370, + "step": 684, + "time_per_iteration": 2.5368196964263916 + }, + { + "auxiliary_loss_clip": 0.0119188, + "auxiliary_loss_mlp": 0.01184189, + "balance_loss_clip": 1.00268698, + "balance_loss_mlp": 1.00146484, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.9586979359992163, + "language_loss": 0.90730852, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.93106925, + "num_input_tokens_seen": 14526415, + "step": 685, + "time_per_iteration": 2.625584125518799 + }, + { + "auxiliary_loss_clip": 0.01191856, + "auxiliary_loss_mlp": 0.0118393, + "balance_loss_clip": 1.00262809, + "balance_loss_mlp": 1.00139701, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.062101080864675, + "language_loss": 0.87717307, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90093088, + "num_input_tokens_seen": 14546595, + "step": 686, + "time_per_iteration": 2.6270670890808105 + }, + { + "auxiliary_loss_clip": 0.01191826, + "auxiliary_loss_mlp": 0.01184161, + "balance_loss_clip": 1.0026046, + "balance_loss_mlp": 1.00143695, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 2.2584914038772315, + "language_loss": 0.71826065, + "learning_rate": 3.998659901655851e-06, + "loss": 0.74202055, + "num_input_tokens_seen": 14566590, + "step": 687, + "time_per_iteration": 2.6060738563537598 + }, + { + "auxiliary_loss_clip": 0.01191835, + "auxiliary_loss_mlp": 0.01183989, + "balance_loss_clip": 1.00272286, + "balance_loss_mlp": 1.001647, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 1.5936676758452688, + "language_loss": 0.86181998, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88557827, + "num_input_tokens_seen": 14585965, + "step": 688, + "time_per_iteration": 2.5857350826263428 + }, + { + "auxiliary_loss_clip": 0.01191823, + "auxiliary_loss_mlp": 0.01183935, + "balance_loss_clip": 1.00269365, + "balance_loss_mlp": 1.00159216, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.0844846720916737, + "language_loss": 0.83233964, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85609722, + "num_input_tokens_seen": 14606015, + "step": 689, + "time_per_iteration": 2.576704978942871 + }, + { + "auxiliary_loss_clip": 0.01191754, + "auxiliary_loss_mlp": 0.01183865, + "balance_loss_clip": 1.00262332, + "balance_loss_mlp": 1.00161791, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.9481906682366583, + "language_loss": 0.68139088, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70514709, + "num_input_tokens_seen": 14629955, + "step": 690, + "time_per_iteration": 2.5969913005828857 + }, + { + "auxiliary_loss_clip": 0.01191781, + "auxiliary_loss_mlp": 0.01184164, + "balance_loss_clip": 1.00256109, + "balance_loss_mlp": 1.00163102, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.6676027204807173, + "language_loss": 0.75023448, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77399397, + "num_input_tokens_seen": 14648000, + "step": 691, + "time_per_iteration": 2.5177431106567383 + }, + { + "auxiliary_loss_clip": 0.01191587, + "auxiliary_loss_mlp": 0.01184059, + "balance_loss_clip": 1.00255823, + "balance_loss_mlp": 1.00162148, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.0807850524509317, + "language_loss": 0.84625173, + "learning_rate": 3.998587680434526e-06, + "loss": 0.87000811, + "num_input_tokens_seen": 14662235, + "step": 692, + "time_per_iteration": 2.544404983520508 + }, + { + "auxiliary_loss_clip": 0.01191761, + "auxiliary_loss_mlp": 0.01184024, + "balance_loss_clip": 1.00248992, + "balance_loss_mlp": 1.00149083, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 2.7470674464283866, + "language_loss": 0.88973129, + "learning_rate": 3.99857300882812e-06, + "loss": 0.9134891, + "num_input_tokens_seen": 14676065, + "step": 693, + "time_per_iteration": 2.489104747772217 + }, + { + "auxiliary_loss_clip": 0.01191832, + "auxiliary_loss_mlp": 0.01183522, + "balance_loss_clip": 1.00269389, + "balance_loss_mlp": 1.00117958, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.3612118367091504, + "language_loss": 0.82290989, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84666342, + "num_input_tokens_seen": 14694955, + "step": 694, + "time_per_iteration": 2.5791015625 + }, + { + "auxiliary_loss_clip": 0.01191793, + "auxiliary_loss_mlp": 0.01183887, + "balance_loss_clip": 1.00254869, + "balance_loss_mlp": 1.00125861, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.6767768891091017, + "language_loss": 0.83317721, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85693407, + "num_input_tokens_seen": 14715510, + "step": 695, + "time_per_iteration": 2.56427001953125 + }, + { + "auxiliary_loss_clip": 0.01191683, + "auxiliary_loss_mlp": 0.01183914, + "balance_loss_clip": 1.00262189, + "balance_loss_mlp": 1.00147617, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.433639479010764, + "language_loss": 0.84587836, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86963427, + "num_input_tokens_seen": 14731755, + "step": 696, + "time_per_iteration": 2.494817018508911 + }, + { + "auxiliary_loss_clip": 0.01191681, + "auxiliary_loss_mlp": 0.01183674, + "balance_loss_clip": 1.00263119, + "balance_loss_mlp": 1.00152254, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.429795740304447, + "language_loss": 0.92758417, + "learning_rate": 3.998513564547216e-06, + "loss": 0.9513377, + "num_input_tokens_seen": 14750810, + "step": 697, + "time_per_iteration": 2.5280227661132812 + }, + { + "auxiliary_loss_clip": 0.01191751, + "auxiliary_loss_mlp": 0.01183723, + "balance_loss_clip": 1.0027225, + "balance_loss_mlp": 1.0015713, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.0824983936513113, + "language_loss": 0.83939672, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86315149, + "num_input_tokens_seen": 14768435, + "step": 698, + "time_per_iteration": 2.513054847717285 + }, + { + "auxiliary_loss_clip": 0.01191801, + "auxiliary_loss_mlp": 0.01184087, + "balance_loss_clip": 1.00271535, + "balance_loss_mlp": 1.00193477, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 2.2179012967753162, + "language_loss": 0.91226202, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93602085, + "num_input_tokens_seen": 14786690, + "step": 699, + "time_per_iteration": 2.5714218616485596 + }, + { + "auxiliary_loss_clip": 0.01192873, + "auxiliary_loss_mlp": 0.01181137, + "balance_loss_clip": 1.00479472, + "balance_loss_mlp": 1.00032008, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.9204297377974942, + "language_loss": 0.67897737, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70271748, + "num_input_tokens_seen": 14853840, + "step": 700, + "time_per_iteration": 3.1907215118408203 + }, + { + "auxiliary_loss_clip": 0.01191779, + "auxiliary_loss_mlp": 0.01183539, + "balance_loss_clip": 1.00270081, + "balance_loss_mlp": 1.00148225, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.4476832721490993, + "language_loss": 0.88785863, + "learning_rate": 3.998452907725016e-06, + "loss": 0.9116118, + "num_input_tokens_seen": 14869580, + "step": 701, + "time_per_iteration": 2.5162458419799805 + }, + { + "auxiliary_loss_clip": 0.01191804, + "auxiliary_loss_mlp": 0.01183883, + "balance_loss_clip": 1.00274348, + "balance_loss_mlp": 1.00154066, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.2951083773592043, + "language_loss": 0.67238998, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69614691, + "num_input_tokens_seen": 14891065, + "step": 702, + "time_per_iteration": 2.6566710472106934 + }, + { + "auxiliary_loss_clip": 0.01192943, + "auxiliary_loss_mlp": 0.01181166, + "balance_loss_clip": 1.00483584, + "balance_loss_mlp": 1.00034893, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8433906690385113, + "language_loss": 0.60856462, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.63230574, + "num_input_tokens_seen": 14954815, + "step": 703, + "time_per_iteration": 3.1864535808563232 + }, + { + "auxiliary_loss_clip": 0.0119283, + "auxiliary_loss_mlp": 0.01181083, + "balance_loss_clip": 1.00476575, + "balance_loss_mlp": 1.00026608, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.251402090904576, + "language_loss": 0.57703912, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.60077828, + "num_input_tokens_seen": 15003050, + "step": 704, + "time_per_iteration": 5.753992557525635 + }, + { + "auxiliary_loss_clip": 0.01191826, + "auxiliary_loss_mlp": 0.0118356, + "balance_loss_clip": 1.00277781, + "balance_loss_mlp": 1.00159883, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.6156826882286284, + "language_loss": 0.87514353, + "learning_rate": 3.998391038398319e-06, + "loss": 0.89889735, + "num_input_tokens_seen": 15021990, + "step": 705, + "time_per_iteration": 2.5607848167419434 + }, + { + "auxiliary_loss_clip": 0.01191428, + "auxiliary_loss_mlp": 0.01183611, + "balance_loss_clip": 1.00250709, + "balance_loss_mlp": 1.00155485, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.8000638666416215, + "language_loss": 0.71392465, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73767501, + "num_input_tokens_seen": 15040700, + "step": 706, + "time_per_iteration": 3.976032018661499 + }, + { + "auxiliary_loss_clip": 0.01191587, + "auxiliary_loss_mlp": 0.01183174, + "balance_loss_clip": 1.00252461, + "balance_loss_mlp": 1.00111794, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.1166578087394914, + "language_loss": 0.93320477, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95695245, + "num_input_tokens_seen": 15056725, + "step": 707, + "time_per_iteration": 2.5896120071411133 + }, + { + "auxiliary_loss_clip": 0.01191441, + "auxiliary_loss_mlp": 0.01183222, + "balance_loss_clip": 1.00249481, + "balance_loss_mlp": 1.00107038, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 1.8737333488809618, + "language_loss": 0.81105649, + "learning_rate": 3.998343840719776e-06, + "loss": 0.8348031, + "num_input_tokens_seen": 15077550, + "step": 708, + "time_per_iteration": 2.637038230895996 + }, + { + "auxiliary_loss_clip": 0.01191598, + "auxiliary_loss_mlp": 0.01183698, + "balance_loss_clip": 1.00257301, + "balance_loss_mlp": 1.00135493, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 6.665609229591075, + "language_loss": 0.8220337, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84578669, + "num_input_tokens_seen": 15094955, + "step": 709, + "time_per_iteration": 2.531679153442383 + }, + { + "auxiliary_loss_clip": 0.01191687, + "auxiliary_loss_mlp": 0.0118356, + "balance_loss_clip": 1.00269854, + "balance_loss_mlp": 1.00121737, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.9123088236240546, + "language_loss": 0.85219061, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87594306, + "num_input_tokens_seen": 15113395, + "step": 710, + "time_per_iteration": 2.525120973587036 + }, + { + "auxiliary_loss_clip": 0.01191665, + "auxiliary_loss_mlp": 0.01183394, + "balance_loss_clip": 1.00266838, + "balance_loss_mlp": 1.0014329, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 3.200731736112906, + "language_loss": 0.85023415, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87398481, + "num_input_tokens_seen": 15132920, + "step": 711, + "time_per_iteration": 2.5104737281799316 + }, + { + "auxiliary_loss_clip": 0.01191542, + "auxiliary_loss_mlp": 0.01183271, + "balance_loss_clip": 1.00255537, + "balance_loss_mlp": 1.00121427, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 3.060776427170861, + "language_loss": 0.85494488, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87869298, + "num_input_tokens_seen": 15153115, + "step": 712, + "time_per_iteration": 2.5508477687835693 + }, + { + "auxiliary_loss_clip": 0.0119168, + "auxiliary_loss_mlp": 0.01183435, + "balance_loss_clip": 1.00265121, + "balance_loss_mlp": 1.00128365, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.8321705891907256, + "language_loss": 0.9112283, + "learning_rate": 3.998263662382328e-06, + "loss": 0.93497944, + "num_input_tokens_seen": 15172770, + "step": 713, + "time_per_iteration": 2.5401611328125 + }, + { + "auxiliary_loss_clip": 0.01192503, + "auxiliary_loss_mlp": 0.01181052, + "balance_loss_clip": 1.00453782, + "balance_loss_mlp": 1.0002358, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.886406763631746, + "language_loss": 0.6376732, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.66140878, + "num_input_tokens_seen": 15240055, + "step": 714, + "time_per_iteration": 3.253676414489746 + }, + { + "auxiliary_loss_clip": 0.01191526, + "auxiliary_loss_mlp": 0.01183476, + "balance_loss_clip": 1.00268388, + "balance_loss_mlp": 1.00141954, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 2.6025443409263933, + "language_loss": 0.74660921, + "learning_rate": 3.998231060622563e-06, + "loss": 0.77035916, + "num_input_tokens_seen": 15261585, + "step": 715, + "time_per_iteration": 2.6411046981811523 + }, + { + "auxiliary_loss_clip": 0.01191706, + "auxiliary_loss_mlp": 0.01183471, + "balance_loss_clip": 1.00272346, + "balance_loss_mlp": 1.00122356, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 5.408294078824762, + "language_loss": 0.72736102, + "learning_rate": 3.998214646082688e-06, + "loss": 0.75111282, + "num_input_tokens_seen": 15281160, + "step": 716, + "time_per_iteration": 2.6323487758636475 + }, + { + "auxiliary_loss_clip": 0.01192365, + "auxiliary_loss_mlp": 0.01181061, + "balance_loss_clip": 1.00447035, + "balance_loss_mlp": 1.00024414, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9051268306503057, + "language_loss": 0.65616989, + "learning_rate": 3.998198155770314e-06, + "loss": 0.6799041, + "num_input_tokens_seen": 15344505, + "step": 717, + "time_per_iteration": 3.1378016471862793 + }, + { + "auxiliary_loss_clip": 0.0119228, + "auxiliary_loss_mlp": 0.01181, + "balance_loss_clip": 1.00441337, + "balance_loss_mlp": 1.00018311, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9874128622273562, + "language_loss": 0.58828723, + "learning_rate": 3.998181589686065e-06, + "loss": 0.61202002, + "num_input_tokens_seen": 15404050, + "step": 718, + "time_per_iteration": 2.9194350242614746 + }, + { + "auxiliary_loss_clip": 0.01191577, + "auxiliary_loss_mlp": 0.01183294, + "balance_loss_clip": 1.00268269, + "balance_loss_mlp": 1.00114226, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 2.897463772506804, + "language_loss": 0.9104259, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93417466, + "num_input_tokens_seen": 15424190, + "step": 719, + "time_per_iteration": 2.5489113330841064 + }, + { + "auxiliary_loss_clip": 0.0119146, + "auxiliary_loss_mlp": 0.01183408, + "balance_loss_clip": 1.002496, + "balance_loss_mlp": 1.00135148, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.6041625485551243, + "language_loss": 0.66794473, + "learning_rate": 3.99814823020446e-06, + "loss": 0.69169343, + "num_input_tokens_seen": 15446500, + "step": 720, + "time_per_iteration": 2.6720871925354004 + }, + { + "auxiliary_loss_clip": 0.01191562, + "auxiliary_loss_mlp": 0.01183457, + "balance_loss_clip": 1.00268006, + "balance_loss_mlp": 1.0014956, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 2.0596437834158845, + "language_loss": 0.77400583, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.79775596, + "num_input_tokens_seen": 15465830, + "step": 721, + "time_per_iteration": 2.5458240509033203 + }, + { + "auxiliary_loss_clip": 0.01191455, + "auxiliary_loss_mlp": 0.01183379, + "balance_loss_clip": 1.00258195, + "balance_loss_mlp": 1.00160885, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 2.888763708976196, + "language_loss": 0.88221568, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90596402, + "num_input_tokens_seen": 15479985, + "step": 722, + "time_per_iteration": 2.5185415744781494 + }, + { + "auxiliary_loss_clip": 0.01191614, + "auxiliary_loss_mlp": 0.01183906, + "balance_loss_clip": 1.00266981, + "balance_loss_mlp": 1.00156331, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 2.1213788430690736, + "language_loss": 0.84287161, + "learning_rate": 3.998097622708792e-06, + "loss": 0.8666268, + "num_input_tokens_seen": 15501545, + "step": 723, + "time_per_iteration": 2.6553778648376465 + }, + { + "auxiliary_loss_clip": 0.01191441, + "auxiliary_loss_mlp": 0.01183252, + "balance_loss_clip": 1.00267386, + "balance_loss_mlp": 1.00138628, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.8990553845690894, + "language_loss": 0.82891583, + "learning_rate": 3.99808060200659e-06, + "loss": 0.8526628, + "num_input_tokens_seen": 15521725, + "step": 724, + "time_per_iteration": 2.6411807537078857 + }, + { + "auxiliary_loss_clip": 0.01191573, + "auxiliary_loss_mlp": 0.01183588, + "balance_loss_clip": 1.00272465, + "balance_loss_mlp": 1.00162649, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.9219123845153234, + "language_loss": 0.79389155, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81764317, + "num_input_tokens_seen": 15540910, + "step": 725, + "time_per_iteration": 2.543869733810425 + }, + { + "auxiliary_loss_clip": 0.01191539, + "auxiliary_loss_mlp": 0.01183215, + "balance_loss_clip": 1.00263524, + "balance_loss_mlp": 1.00125432, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.324599947745039, + "language_loss": 0.8707251, + "learning_rate": 3.998046333300584e-06, + "loss": 0.8944726, + "num_input_tokens_seen": 15558640, + "step": 726, + "time_per_iteration": 2.5214147567749023 + }, + { + "auxiliary_loss_clip": 0.01192018, + "auxiliary_loss_mlp": 0.01180208, + "balance_loss_clip": 1.00422192, + "balance_loss_mlp": 1.00015473, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9049507977583267, + "language_loss": 0.55876267, + "learning_rate": 3.998029085298079e-06, + "loss": 0.5824849, + "num_input_tokens_seen": 15612975, + "step": 727, + "time_per_iteration": 3.2394351959228516 + }, + { + "auxiliary_loss_clip": 0.01191416, + "auxiliary_loss_mlp": 0.01183037, + "balance_loss_clip": 1.00261497, + "balance_loss_mlp": 1.00136244, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.2720423372865453, + "language_loss": 0.82046151, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84420609, + "num_input_tokens_seen": 15631070, + "step": 728, + "time_per_iteration": 2.5288352966308594 + }, + { + "auxiliary_loss_clip": 0.01191456, + "auxiliary_loss_mlp": 0.01182999, + "balance_loss_clip": 1.00265682, + "balance_loss_mlp": 1.00122869, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 4.13212977197626, + "language_loss": 0.76541936, + "learning_rate": 3.997994361997338e-06, + "loss": 0.78916395, + "num_input_tokens_seen": 15647825, + "step": 729, + "time_per_iteration": 2.602471113204956 + }, + { + "auxiliary_loss_clip": 0.01191499, + "auxiliary_loss_mlp": 0.01183123, + "balance_loss_clip": 1.00263643, + "balance_loss_mlp": 1.00135303, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.1658017686146747, + "language_loss": 0.94964159, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97338784, + "num_input_tokens_seen": 15668260, + "step": 730, + "time_per_iteration": 2.568411111831665 + }, + { + "auxiliary_loss_clip": 0.01191449, + "auxiliary_loss_mlp": 0.01183094, + "balance_loss_clip": 1.00251901, + "balance_loss_mlp": 1.00094223, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 1.9925407623275897, + "language_loss": 0.87974322, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90348858, + "num_input_tokens_seen": 15685630, + "step": 731, + "time_per_iteration": 2.5799062252044678 + }, + { + "auxiliary_loss_clip": 0.0119152, + "auxiliary_loss_mlp": 0.01183073, + "balance_loss_clip": 1.00277686, + "balance_loss_mlp": 1.00130308, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.4407726001098506, + "language_loss": 0.89273608, + "learning_rate": 3.997941708816791e-06, + "loss": 0.91648197, + "num_input_tokens_seen": 15698645, + "step": 732, + "time_per_iteration": 2.5132381916046143 + }, + { + "auxiliary_loss_clip": 0.01191445, + "auxiliary_loss_mlp": 0.01183118, + "balance_loss_clip": 1.00263298, + "balance_loss_mlp": 1.00144267, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.19208852811034, + "language_loss": 0.85917795, + "learning_rate": 3.997924006231419e-06, + "loss": 0.8829236, + "num_input_tokens_seen": 15716775, + "step": 733, + "time_per_iteration": 2.5380566120147705 + }, + { + "auxiliary_loss_clip": 0.01191499, + "auxiliary_loss_mlp": 0.01183402, + "balance_loss_clip": 1.00272417, + "balance_loss_mlp": 1.0013454, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.448699619812791, + "language_loss": 0.91343397, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93718302, + "num_input_tokens_seen": 15733320, + "step": 734, + "time_per_iteration": 2.512233018875122 + }, + { + "auxiliary_loss_clip": 0.0119135, + "auxiliary_loss_mlp": 0.01182904, + "balance_loss_clip": 1.0027132, + "balance_loss_mlp": 1.00122929, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.9658738499802488, + "language_loss": 0.77632403, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.80006659, + "num_input_tokens_seen": 15752705, + "step": 735, + "time_per_iteration": 2.574355125427246 + }, + { + "auxiliary_loss_clip": 0.01191281, + "auxiliary_loss_mlp": 0.01182992, + "balance_loss_clip": 1.00251877, + "balance_loss_mlp": 1.00112653, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.6853201224570222, + "language_loss": 0.88533151, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90907419, + "num_input_tokens_seen": 15772800, + "step": 736, + "time_per_iteration": 2.6211137771606445 + }, + { + "auxiliary_loss_clip": 0.01191358, + "auxiliary_loss_mlp": 0.01183164, + "balance_loss_clip": 1.00267768, + "balance_loss_mlp": 1.00177526, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.8603102090919665, + "language_loss": 0.84692621, + "learning_rate": 3.997852438281901e-06, + "loss": 0.87067139, + "num_input_tokens_seen": 15793665, + "step": 737, + "time_per_iteration": 2.5406737327575684 + }, + { + "auxiliary_loss_clip": 0.01191456, + "auxiliary_loss_mlp": 0.01183098, + "balance_loss_clip": 1.00279427, + "balance_loss_mlp": 1.00123262, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.8288807726470546, + "language_loss": 0.8481673, + "learning_rate": 3.997834356895906e-06, + "loss": 0.8719129, + "num_input_tokens_seen": 15813175, + "step": 738, + "time_per_iteration": 2.653902053833008 + }, + { + "auxiliary_loss_clip": 0.01191632, + "auxiliary_loss_mlp": 0.01180122, + "balance_loss_clip": 1.00394893, + "balance_loss_mlp": 1.00006795, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8617266511452337, + "language_loss": 0.59131581, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61503339, + "num_input_tokens_seen": 15872050, + "step": 739, + "time_per_iteration": 3.100430488586426 + }, + { + "auxiliary_loss_clip": 0.01191313, + "auxiliary_loss_mlp": 0.01182676, + "balance_loss_clip": 1.00268817, + "balance_loss_mlp": 1.00100124, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.556685744060503, + "language_loss": 0.91510606, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93884593, + "num_input_tokens_seen": 15891085, + "step": 740, + "time_per_iteration": 2.613633155822754 + }, + { + "auxiliary_loss_clip": 0.01191481, + "auxiliary_loss_mlp": 0.01183391, + "balance_loss_clip": 1.00278938, + "balance_loss_mlp": 1.00162029, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.18635515968851, + "language_loss": 0.71843016, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74217892, + "num_input_tokens_seen": 15914225, + "step": 741, + "time_per_iteration": 2.676384925842285 + }, + { + "auxiliary_loss_clip": 0.01191225, + "auxiliary_loss_mlp": 0.01183036, + "balance_loss_clip": 1.0026921, + "balance_loss_mlp": 1.00145602, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.8913287759991682, + "language_loss": 0.88840437, + "learning_rate": 3.997761273778037e-06, + "loss": 0.91214699, + "num_input_tokens_seen": 15934540, + "step": 742, + "time_per_iteration": 4.005162954330444 + }, + { + "auxiliary_loss_clip": 0.01191243, + "auxiliary_loss_mlp": 0.01182515, + "balance_loss_clip": 1.00261998, + "balance_loss_mlp": 1.00103068, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.057566590965422, + "language_loss": 0.83982301, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86356056, + "num_input_tokens_seen": 15952560, + "step": 743, + "time_per_iteration": 5.278865098953247 + }, + { + "auxiliary_loss_clip": 0.01191385, + "auxiliary_loss_mlp": 0.01182668, + "balance_loss_clip": 1.00275338, + "balance_loss_mlp": 1.00118351, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 2.5000214175779054, + "language_loss": 0.80068207, + "learning_rate": 3.997724277684479e-06, + "loss": 0.8244226, + "num_input_tokens_seen": 15970620, + "step": 744, + "time_per_iteration": 3.9679360389709473 + }, + { + "auxiliary_loss_clip": 0.01191228, + "auxiliary_loss_mlp": 0.01182619, + "balance_loss_clip": 1.00269389, + "balance_loss_mlp": 1.00123012, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.4448844248271997, + "language_loss": 0.85524607, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87898457, + "num_input_tokens_seen": 15987325, + "step": 745, + "time_per_iteration": 2.5200514793395996 + }, + { + "auxiliary_loss_clip": 0.0119112, + "auxiliary_loss_mlp": 0.01182744, + "balance_loss_clip": 1.00251293, + "balance_loss_mlp": 1.0011642, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 2.208100073759295, + "language_loss": 0.69139135, + "learning_rate": 3.997686978575302e-06, + "loss": 0.71512997, + "num_input_tokens_seen": 16008310, + "step": 746, + "time_per_iteration": 2.6716558933258057 + }, + { + "auxiliary_loss_clip": 0.01191468, + "auxiliary_loss_mlp": 0.01183083, + "balance_loss_clip": 1.00289607, + "balance_loss_mlp": 1.00159931, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.413618231098392, + "language_loss": 0.68795168, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71169722, + "num_input_tokens_seen": 16029620, + "step": 747, + "time_per_iteration": 2.57293438911438 + }, + { + "auxiliary_loss_clip": 0.0119132, + "auxiliary_loss_mlp": 0.01182926, + "balance_loss_clip": 1.00270796, + "balance_loss_mlp": 1.00144172, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.8085395914683078, + "language_loss": 0.66793722, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69167966, + "num_input_tokens_seen": 16049065, + "step": 748, + "time_per_iteration": 2.6407032012939453 + }, + { + "auxiliary_loss_clip": 0.0119144, + "auxiliary_loss_mlp": 0.01182882, + "balance_loss_clip": 1.00280643, + "balance_loss_mlp": 1.00158882, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 4.54184097352371, + "language_loss": 0.76292586, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78666908, + "num_input_tokens_seen": 16066765, + "step": 749, + "time_per_iteration": 2.503067970275879 + }, + { + "auxiliary_loss_clip": 0.01191286, + "auxiliary_loss_mlp": 0.01183056, + "balance_loss_clip": 1.00264084, + "balance_loss_mlp": 1.00157166, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.498119456472093, + "language_loss": 0.88980901, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91355246, + "num_input_tokens_seen": 16085980, + "step": 750, + "time_per_iteration": 2.5178728103637695 + }, + { + "auxiliary_loss_clip": 0.01191169, + "auxiliary_loss_mlp": 0.01182966, + "balance_loss_clip": 1.00254333, + "balance_loss_mlp": 1.00138605, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 7.37447157424689, + "language_loss": 0.74922734, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77296865, + "num_input_tokens_seen": 16106260, + "step": 751, + "time_per_iteration": 2.5591955184936523 + }, + { + "auxiliary_loss_clip": 0.01191305, + "auxiliary_loss_mlp": 0.01182582, + "balance_loss_clip": 1.0026722, + "balance_loss_mlp": 1.00119364, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.158858801368437, + "language_loss": 0.69590008, + "learning_rate": 3.997573263210883e-06, + "loss": 0.71963894, + "num_input_tokens_seen": 16123475, + "step": 752, + "time_per_iteration": 2.5130796432495117 + }, + { + "auxiliary_loss_clip": 0.01191223, + "auxiliary_loss_mlp": 0.01182227, + "balance_loss_clip": 1.00268698, + "balance_loss_mlp": 1.00093389, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.5460621099218024, + "language_loss": 0.92205977, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94579434, + "num_input_tokens_seen": 16138335, + "step": 753, + "time_per_iteration": 2.4888052940368652 + }, + { + "auxiliary_loss_clip": 0.01191322, + "auxiliary_loss_mlp": 0.01183236, + "balance_loss_clip": 1.00275946, + "balance_loss_mlp": 1.00165689, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.254693277058465, + "language_loss": 0.91012174, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93386734, + "num_input_tokens_seen": 16157110, + "step": 754, + "time_per_iteration": 2.5469489097595215 + }, + { + "auxiliary_loss_clip": 0.01191052, + "auxiliary_loss_mlp": 0.01182572, + "balance_loss_clip": 1.00257957, + "balance_loss_mlp": 1.00137413, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.601303512657712, + "language_loss": 0.78494394, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80868018, + "num_input_tokens_seen": 16174155, + "step": 755, + "time_per_iteration": 2.4847123622894287 + }, + { + "auxiliary_loss_clip": 0.01191355, + "auxiliary_loss_mlp": 0.0118302, + "balance_loss_clip": 1.00269473, + "balance_loss_mlp": 1.00153613, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 2.0952258204046297, + "language_loss": 0.78916955, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.81291324, + "num_input_tokens_seen": 16192240, + "step": 756, + "time_per_iteration": 2.49680233001709 + }, + { + "auxiliary_loss_clip": 0.01191156, + "auxiliary_loss_mlp": 0.01178658, + "balance_loss_clip": 1.00369859, + "balance_loss_mlp": 1.00013018, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8038959960550259, + "language_loss": 0.62742448, + "learning_rate": 3.997476417325827e-06, + "loss": 0.65112257, + "num_input_tokens_seen": 16255775, + "step": 757, + "time_per_iteration": 3.1569976806640625 + }, + { + "auxiliary_loss_clip": 0.01191323, + "auxiliary_loss_mlp": 0.0118302, + "balance_loss_clip": 1.00273025, + "balance_loss_mlp": 1.00144076, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.605099359044113, + "language_loss": 0.84342384, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86716723, + "num_input_tokens_seen": 16277015, + "step": 758, + "time_per_iteration": 2.5424084663391113 + }, + { + "auxiliary_loss_clip": 0.01191054, + "auxiliary_loss_mlp": 0.01182329, + "balance_loss_clip": 1.00257921, + "balance_loss_mlp": 1.0011307, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 1.8041255443274755, + "language_loss": 0.88140738, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90514123, + "num_input_tokens_seen": 16296005, + "step": 759, + "time_per_iteration": 2.548691749572754 + }, + { + "auxiliary_loss_clip": 0.01191321, + "auxiliary_loss_mlp": 0.01182509, + "balance_loss_clip": 1.0026772, + "balance_loss_mlp": 1.00131047, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 3.5515370302694125, + "language_loss": 0.73444688, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75818515, + "num_input_tokens_seen": 16315300, + "step": 760, + "time_per_iteration": 2.5554747581481934 + }, + { + "auxiliary_loss_clip": 0.01191241, + "auxiliary_loss_mlp": 0.01182767, + "balance_loss_clip": 1.00275111, + "balance_loss_mlp": 1.00176001, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.0839480724986252, + "language_loss": 0.82773334, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.85147339, + "num_input_tokens_seen": 16333820, + "step": 761, + "time_per_iteration": 2.5241355895996094 + }, + { + "auxiliary_loss_clip": 0.01191161, + "auxiliary_loss_mlp": 0.01182527, + "balance_loss_clip": 1.00267231, + "balance_loss_mlp": 1.00132942, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7377099533656384, + "language_loss": 0.80034566, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82408249, + "num_input_tokens_seen": 16355290, + "step": 762, + "time_per_iteration": 2.5887482166290283 + }, + { + "auxiliary_loss_clip": 0.01191063, + "auxiliary_loss_mlp": 0.01178702, + "balance_loss_clip": 1.00363445, + "balance_loss_mlp": 1.0001744, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.0104414921230922, + "language_loss": 0.58701444, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.61071205, + "num_input_tokens_seen": 16415995, + "step": 763, + "time_per_iteration": 3.2375924587249756 + }, + { + "auxiliary_loss_clip": 0.01191135, + "auxiliary_loss_mlp": 0.01182843, + "balance_loss_clip": 1.00263584, + "balance_loss_mlp": 1.00154984, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.6097713209561513, + "language_loss": 0.87750566, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.90124547, + "num_input_tokens_seen": 16433120, + "step": 764, + "time_per_iteration": 2.527886152267456 + }, + { + "auxiliary_loss_clip": 0.01191165, + "auxiliary_loss_mlp": 0.01182848, + "balance_loss_clip": 1.00274897, + "balance_loss_mlp": 1.00165033, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 3.1682209965556574, + "language_loss": 0.85791987, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88165998, + "num_input_tokens_seen": 16453360, + "step": 765, + "time_per_iteration": 2.620896339416504 + }, + { + "auxiliary_loss_clip": 0.01191227, + "auxiliary_loss_mlp": 0.01182668, + "balance_loss_clip": 1.00269222, + "balance_loss_mlp": 1.00127971, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 2.5247206631932735, + "language_loss": 0.87839395, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90213287, + "num_input_tokens_seen": 16471160, + "step": 766, + "time_per_iteration": 2.547398090362549 + }, + { + "auxiliary_loss_clip": 0.01191162, + "auxiliary_loss_mlp": 0.01182764, + "balance_loss_clip": 1.00268602, + "balance_loss_mlp": 1.00147057, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.196689191370766, + "language_loss": 0.84269285, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86643207, + "num_input_tokens_seen": 16488940, + "step": 767, + "time_per_iteration": 2.5470268726348877 + }, + { + "auxiliary_loss_clip": 0.0119124, + "auxiliary_loss_mlp": 0.01182355, + "balance_loss_clip": 1.00279713, + "balance_loss_mlp": 1.00115728, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.559064487304541, + "language_loss": 0.86800939, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89174533, + "num_input_tokens_seen": 16509505, + "step": 768, + "time_per_iteration": 2.7044951915740967 + }, + { + "auxiliary_loss_clip": 0.01191047, + "auxiliary_loss_mlp": 0.01182473, + "balance_loss_clip": 1.00257993, + "balance_loss_mlp": 1.00127447, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.2758800932188112, + "language_loss": 0.75408947, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.7778247, + "num_input_tokens_seen": 16528840, + "step": 769, + "time_per_iteration": 2.574126720428467 + }, + { + "auxiliary_loss_clip": 0.01191048, + "auxiliary_loss_mlp": 0.01182258, + "balance_loss_clip": 1.00265825, + "balance_loss_mlp": 1.00134635, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.8914857599790589, + "language_loss": 0.86482811, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88856113, + "num_input_tokens_seen": 16548335, + "step": 770, + "time_per_iteration": 2.590031147003174 + }, + { + "auxiliary_loss_clip": 0.01191073, + "auxiliary_loss_mlp": 0.011824, + "balance_loss_clip": 1.00261974, + "balance_loss_mlp": 1.00120234, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.313088889287727, + "language_loss": 0.86813343, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89186811, + "num_input_tokens_seen": 16567725, + "step": 771, + "time_per_iteration": 2.530567169189453 + }, + { + "auxiliary_loss_clip": 0.01191102, + "auxiliary_loss_mlp": 0.01182653, + "balance_loss_clip": 1.00263309, + "balance_loss_mlp": 1.00154984, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.5803972947391633, + "language_loss": 0.83683652, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86057401, + "num_input_tokens_seen": 16588175, + "step": 772, + "time_per_iteration": 2.5636210441589355 + }, + { + "auxiliary_loss_clip": 0.01191089, + "auxiliary_loss_mlp": 0.01182416, + "balance_loss_clip": 1.00269914, + "balance_loss_mlp": 1.00140846, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 1.9026788581124892, + "language_loss": 0.74021471, + "learning_rate": 3.997153785881557e-06, + "loss": 0.76394975, + "num_input_tokens_seen": 16607735, + "step": 773, + "time_per_iteration": 2.5651638507843018 + }, + { + "auxiliary_loss_clip": 0.01190958, + "auxiliary_loss_mlp": 0.01182227, + "balance_loss_clip": 1.00261664, + "balance_loss_mlp": 1.00131536, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.7184461456700237, + "language_loss": 0.78735161, + "learning_rate": 3.997132977658996e-06, + "loss": 0.81108344, + "num_input_tokens_seen": 16627225, + "step": 774, + "time_per_iteration": 2.568791389465332 + }, + { + "auxiliary_loss_clip": 0.01191196, + "auxiliary_loss_mlp": 0.01182278, + "balance_loss_clip": 1.00273919, + "balance_loss_mlp": 1.00136554, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.3002852608752726, + "language_loss": 0.7341224, + "learning_rate": 3.997112093704952e-06, + "loss": 0.7578572, + "num_input_tokens_seen": 16647785, + "step": 775, + "time_per_iteration": 2.637694835662842 + }, + { + "auxiliary_loss_clip": 0.01190953, + "auxiliary_loss_mlp": 0.01181896, + "balance_loss_clip": 1.00250697, + "balance_loss_mlp": 1.00088835, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.7413210399513805, + "language_loss": 0.77089131, + "learning_rate": 3.997091134020217e-06, + "loss": 0.7946198, + "num_input_tokens_seen": 16667555, + "step": 776, + "time_per_iteration": 2.5173492431640625 + }, + { + "auxiliary_loss_clip": 0.01191041, + "auxiliary_loss_mlp": 0.01182262, + "balance_loss_clip": 1.00263715, + "balance_loss_mlp": 1.00125492, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.9851296796404556, + "language_loss": 0.717978, + "learning_rate": 3.997070098605585e-06, + "loss": 0.74171108, + "num_input_tokens_seen": 16686875, + "step": 777, + "time_per_iteration": 2.5676207542419434 + }, + { + "auxiliary_loss_clip": 0.01190957, + "auxiliary_loss_mlp": 0.01182454, + "balance_loss_clip": 1.00263643, + "balance_loss_mlp": 1.00154161, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 2.106632278654723, + "language_loss": 0.76613629, + "learning_rate": 3.997048987461856e-06, + "loss": 0.78987038, + "num_input_tokens_seen": 16706420, + "step": 778, + "time_per_iteration": 2.610123634338379 + }, + { + "auxiliary_loss_clip": 0.01191029, + "auxiliary_loss_mlp": 0.01182035, + "balance_loss_clip": 1.00261688, + "balance_loss_mlp": 1.0012188, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.332926215373546, + "language_loss": 0.79289258, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81662321, + "num_input_tokens_seen": 16726390, + "step": 779, + "time_per_iteration": 2.5476505756378174 + }, + { + "auxiliary_loss_clip": 0.01190867, + "auxiliary_loss_mlp": 0.01182232, + "balance_loss_clip": 1.00255656, + "balance_loss_mlp": 1.00131965, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.6427130917471118, + "language_loss": 0.77130622, + "learning_rate": 3.997006537990308e-06, + "loss": 0.79503721, + "num_input_tokens_seen": 16748965, + "step": 780, + "time_per_iteration": 3.986131191253662 + }, + { + "auxiliary_loss_clip": 0.01191099, + "auxiliary_loss_mlp": 0.01182109, + "balance_loss_clip": 1.0028193, + "balance_loss_mlp": 1.0013876, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 2.3033112954857424, + "language_loss": 0.76623154, + "learning_rate": 3.996985199664099e-06, + "loss": 0.7899636, + "num_input_tokens_seen": 16768620, + "step": 781, + "time_per_iteration": 5.348784446716309 + }, + { + "auxiliary_loss_clip": 0.01191118, + "auxiliary_loss_mlp": 0.01182626, + "balance_loss_clip": 1.00270951, + "balance_loss_mlp": 1.00161862, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.8957180554470896, + "language_loss": 0.7373364, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76107383, + "num_input_tokens_seen": 16789755, + "step": 782, + "time_per_iteration": 4.118903398513794 + }, + { + "auxiliary_loss_clip": 0.01190886, + "auxiliary_loss_mlp": 0.0118208, + "balance_loss_clip": 1.00258124, + "balance_loss_mlp": 1.00135899, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 2.2032019742523885, + "language_loss": 0.8035695, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82729912, + "num_input_tokens_seen": 16807585, + "step": 783, + "time_per_iteration": 2.5197930335998535 + }, + { + "auxiliary_loss_clip": 0.01190834, + "auxiliary_loss_mlp": 0.01181848, + "balance_loss_clip": 1.00263488, + "balance_loss_mlp": 1.00122201, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.1436950149169203, + "language_loss": 0.81936872, + "learning_rate": 3.996920730333448e-06, + "loss": 0.84309554, + "num_input_tokens_seen": 16827220, + "step": 784, + "time_per_iteration": 2.553246021270752 + }, + { + "auxiliary_loss_clip": 0.01190748, + "auxiliary_loss_mlp": 0.01182348, + "balance_loss_clip": 1.00243962, + "balance_loss_mlp": 1.00143635, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.3388248714902042, + "language_loss": 0.80895722, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83268809, + "num_input_tokens_seen": 16846230, + "step": 785, + "time_per_iteration": 2.523282527923584 + }, + { + "auxiliary_loss_clip": 0.01191104, + "auxiliary_loss_mlp": 0.01182409, + "balance_loss_clip": 1.00279713, + "balance_loss_mlp": 1.00149679, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 1.8686249558505523, + "language_loss": 0.8993907, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92312586, + "num_input_tokens_seen": 16865325, + "step": 786, + "time_per_iteration": 2.536513328552246 + }, + { + "auxiliary_loss_clip": 0.0119083, + "auxiliary_loss_mlp": 0.01182081, + "balance_loss_clip": 1.00239944, + "balance_loss_mlp": 1.00135994, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.5087799592365325, + "language_loss": 0.76438153, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78811061, + "num_input_tokens_seen": 16882930, + "step": 787, + "time_per_iteration": 2.51611590385437 + }, + { + "auxiliary_loss_clip": 0.01190884, + "auxiliary_loss_mlp": 0.01181833, + "balance_loss_clip": 1.0026871, + "balance_loss_mlp": 1.00130296, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.3809137268699376, + "language_loss": 0.81152451, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83525169, + "num_input_tokens_seen": 16900710, + "step": 788, + "time_per_iteration": 2.579232931137085 + }, + { + "auxiliary_loss_clip": 0.0119088, + "auxiliary_loss_mlp": 0.01181889, + "balance_loss_clip": 1.00269485, + "balance_loss_mlp": 1.00126266, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.1506622943591376, + "language_loss": 0.84486336, + "learning_rate": 3.996811766991355e-06, + "loss": 0.86859107, + "num_input_tokens_seen": 16919210, + "step": 789, + "time_per_iteration": 2.572117567062378 + }, + { + "auxiliary_loss_clip": 0.01190925, + "auxiliary_loss_mlp": 0.01182146, + "balance_loss_clip": 1.00276828, + "balance_loss_mlp": 1.00161529, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 2.289883772754309, + "language_loss": 0.82160306, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84533376, + "num_input_tokens_seen": 16937125, + "step": 790, + "time_per_iteration": 2.5215017795562744 + }, + { + "auxiliary_loss_clip": 0.01190589, + "auxiliary_loss_mlp": 0.01182044, + "balance_loss_clip": 1.00241876, + "balance_loss_mlp": 1.00141788, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.1628623256981006, + "language_loss": 0.8842175, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90794384, + "num_input_tokens_seen": 16958610, + "step": 791, + "time_per_iteration": 2.673675775527954 + }, + { + "auxiliary_loss_clip": 0.01190819, + "auxiliary_loss_mlp": 0.01182025, + "balance_loss_clip": 1.00264812, + "balance_loss_mlp": 1.00139952, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.277656337435829, + "language_loss": 0.9084034, + "learning_rate": 3.996745480347854e-06, + "loss": 0.93213183, + "num_input_tokens_seen": 16977300, + "step": 792, + "time_per_iteration": 2.5345067977905273 + }, + { + "auxiliary_loss_clip": 0.01190783, + "auxiliary_loss_mlp": 0.01182237, + "balance_loss_clip": 1.00263703, + "balance_loss_mlp": 1.00170648, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 2.1082714286617796, + "language_loss": 0.73464215, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75837231, + "num_input_tokens_seen": 16994950, + "step": 793, + "time_per_iteration": 2.501579761505127 + }, + { + "auxiliary_loss_clip": 0.01190891, + "auxiliary_loss_mlp": 0.01182058, + "balance_loss_clip": 1.00263381, + "balance_loss_mlp": 1.00114572, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.478463850045645, + "language_loss": 0.86311138, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88684088, + "num_input_tokens_seen": 17014760, + "step": 794, + "time_per_iteration": 2.547175407409668 + }, + { + "auxiliary_loss_clip": 0.01190665, + "auxiliary_loss_mlp": 0.01182462, + "balance_loss_clip": 1.00247705, + "balance_loss_mlp": 1.00164533, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.6913418350462357, + "language_loss": 0.69963467, + "learning_rate": 3.996678512253272e-06, + "loss": 0.7233659, + "num_input_tokens_seen": 17032715, + "step": 795, + "time_per_iteration": 2.5361030101776123 + }, + { + "auxiliary_loss_clip": 0.01190637, + "auxiliary_loss_mlp": 0.01181888, + "balance_loss_clip": 1.00256133, + "balance_loss_mlp": 1.00145268, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 1.786820055611672, + "language_loss": 0.80921078, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83293605, + "num_input_tokens_seen": 17052215, + "step": 796, + "time_per_iteration": 2.5358903408050537 + }, + { + "auxiliary_loss_clip": 0.01190633, + "auxiliary_loss_mlp": 0.01181714, + "balance_loss_clip": 1.00255597, + "balance_loss_mlp": 1.00146997, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.195642035542276, + "language_loss": 0.81862867, + "learning_rate": 3.996633488284228e-06, + "loss": 0.84235215, + "num_input_tokens_seen": 17069225, + "step": 797, + "time_per_iteration": 2.4892256259918213 + }, + { + "auxiliary_loss_clip": 0.01191038, + "auxiliary_loss_mlp": 0.01177871, + "balance_loss_clip": 1.00408959, + "balance_loss_mlp": 1.0001061, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9208103827385075, + "language_loss": 0.6449104, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66859949, + "num_input_tokens_seen": 17126680, + "step": 798, + "time_per_iteration": 3.050361156463623 + }, + { + "auxiliary_loss_clip": 0.01190673, + "auxiliary_loss_mlp": 0.01182031, + "balance_loss_clip": 1.00251639, + "balance_loss_mlp": 1.00140524, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 1.9655500326049709, + "language_loss": 0.91060925, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93433625, + "num_input_tokens_seen": 17144835, + "step": 799, + "time_per_iteration": 2.532944679260254 + }, + { + "auxiliary_loss_clip": 0.01190855, + "auxiliary_loss_mlp": 0.01182123, + "balance_loss_clip": 1.00272477, + "balance_loss_mlp": 1.00149655, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.0287726291684103, + "language_loss": 0.86672354, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89045334, + "num_input_tokens_seen": 17165030, + "step": 800, + "time_per_iteration": 2.5441548824310303 + }, + { + "auxiliary_loss_clip": 0.01190764, + "auxiliary_loss_mlp": 0.01182063, + "balance_loss_clip": 1.00257623, + "balance_loss_mlp": 1.00153255, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 2.7642878529697734, + "language_loss": 0.84223628, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86596459, + "num_input_tokens_seen": 17184895, + "step": 801, + "time_per_iteration": 2.558583974838257 + }, + { + "auxiliary_loss_clip": 0.01190766, + "auxiliary_loss_mlp": 0.01182232, + "balance_loss_clip": 1.00267875, + "balance_loss_mlp": 1.00151098, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 2.0137728271255524, + "language_loss": 0.79914391, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82287389, + "num_input_tokens_seen": 17208225, + "step": 802, + "time_per_iteration": 2.710300922393799 + }, + { + "auxiliary_loss_clip": 0.01190765, + "auxiliary_loss_mlp": 0.01182001, + "balance_loss_clip": 1.00271297, + "balance_loss_mlp": 1.00137532, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.2827974008324494, + "language_loss": 0.8645153, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88824296, + "num_input_tokens_seen": 17226305, + "step": 803, + "time_per_iteration": 2.5116899013519287 + }, + { + "auxiliary_loss_clip": 0.01190672, + "auxiliary_loss_mlp": 0.01181845, + "balance_loss_clip": 1.00258064, + "balance_loss_mlp": 1.00121915, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.6716884960684384, + "language_loss": 0.85455382, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87827897, + "num_input_tokens_seen": 17244545, + "step": 804, + "time_per_iteration": 2.583244800567627 + }, + { + "auxiliary_loss_clip": 0.01190635, + "auxiliary_loss_mlp": 0.01181896, + "balance_loss_clip": 1.00262392, + "balance_loss_mlp": 1.00155592, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 1.9719750328105166, + "language_loss": 0.85929722, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88302255, + "num_input_tokens_seen": 17265730, + "step": 805, + "time_per_iteration": 2.5569772720336914 + }, + { + "auxiliary_loss_clip": 0.01190598, + "auxiliary_loss_mlp": 0.01181981, + "balance_loss_clip": 1.00251889, + "balance_loss_mlp": 1.00154567, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 2.1196080181985733, + "language_loss": 0.68135899, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.7050848, + "num_input_tokens_seen": 17284820, + "step": 806, + "time_per_iteration": 2.5774800777435303 + }, + { + "auxiliary_loss_clip": 0.01190528, + "auxiliary_loss_mlp": 0.01181571, + "balance_loss_clip": 1.00256622, + "balance_loss_mlp": 1.0013268, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.204250801338769, + "language_loss": 0.76654339, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79026437, + "num_input_tokens_seen": 17305085, + "step": 807, + "time_per_iteration": 2.5672504901885986 + }, + { + "auxiliary_loss_clip": 0.01190482, + "auxiliary_loss_mlp": 0.01182208, + "balance_loss_clip": 1.0025413, + "balance_loss_mlp": 1.00186789, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.437030810356074, + "language_loss": 0.86678731, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.89051419, + "num_input_tokens_seen": 17322715, + "step": 808, + "time_per_iteration": 2.540316104888916 + }, + { + "auxiliary_loss_clip": 0.01190667, + "auxiliary_loss_mlp": 0.01181948, + "balance_loss_clip": 1.00255823, + "balance_loss_mlp": 1.00151324, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 2.3614823547933095, + "language_loss": 0.90036321, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92408931, + "num_input_tokens_seen": 17341455, + "step": 809, + "time_per_iteration": 2.6020727157592773 + }, + { + "auxiliary_loss_clip": 0.01190665, + "auxiliary_loss_mlp": 0.01182161, + "balance_loss_clip": 1.00265205, + "balance_loss_mlp": 1.0015347, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.1892928334453323, + "language_loss": 0.84618843, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86991668, + "num_input_tokens_seen": 17360765, + "step": 810, + "time_per_iteration": 2.571666717529297 + }, + { + "auxiliary_loss_clip": 0.01190737, + "auxiliary_loss_mlp": 0.01182115, + "balance_loss_clip": 1.00251317, + "balance_loss_mlp": 1.00148857, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.8080233618873889, + "language_loss": 0.80739951, + "learning_rate": 3.99630984108452e-06, + "loss": 0.831128, + "num_input_tokens_seen": 17380625, + "step": 811, + "time_per_iteration": 2.568657398223877 + }, + { + "auxiliary_loss_clip": 0.01190594, + "auxiliary_loss_mlp": 0.01182065, + "balance_loss_clip": 1.00254917, + "balance_loss_mlp": 1.00153482, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 1.8997085917608993, + "language_loss": 0.74270785, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.76643443, + "num_input_tokens_seen": 17399355, + "step": 812, + "time_per_iteration": 2.5186240673065186 + }, + { + "auxiliary_loss_clip": 0.0119063, + "auxiliary_loss_mlp": 0.01181451, + "balance_loss_clip": 1.00265574, + "balance_loss_mlp": 1.00139761, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 2.1148574779732954, + "language_loss": 0.90175712, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92547786, + "num_input_tokens_seen": 17418240, + "step": 813, + "time_per_iteration": 2.5196776390075684 + }, + { + "auxiliary_loss_clip": 0.01190583, + "auxiliary_loss_mlp": 0.0118175, + "balance_loss_clip": 1.0025754, + "balance_loss_mlp": 1.00131488, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.1005966010920094, + "language_loss": 0.74636483, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.77008814, + "num_input_tokens_seen": 17436250, + "step": 814, + "time_per_iteration": 2.5462489128112793 + }, + { + "auxiliary_loss_clip": 0.01190551, + "auxiliary_loss_mlp": 0.01181749, + "balance_loss_clip": 1.0025444, + "balance_loss_mlp": 1.0015049, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.270731512992109, + "language_loss": 0.83381188, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.85753489, + "num_input_tokens_seen": 17455750, + "step": 815, + "time_per_iteration": 2.5467336177825928 + }, + { + "auxiliary_loss_clip": 0.01190642, + "auxiliary_loss_mlp": 0.01181941, + "balance_loss_clip": 1.00257409, + "balance_loss_mlp": 1.00131488, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.2702988025001325, + "language_loss": 0.90897, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93269581, + "num_input_tokens_seen": 17474995, + "step": 816, + "time_per_iteration": 2.5833725929260254 + }, + { + "auxiliary_loss_clip": 0.01190736, + "auxiliary_loss_mlp": 0.01181837, + "balance_loss_clip": 1.00266099, + "balance_loss_mlp": 1.00111568, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 2.670581169347298, + "language_loss": 0.79981428, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82354003, + "num_input_tokens_seen": 17493395, + "step": 817, + "time_per_iteration": 2.5544826984405518 + }, + { + "auxiliary_loss_clip": 0.01190713, + "auxiliary_loss_mlp": 0.01182519, + "balance_loss_clip": 1.00266469, + "balance_loss_mlp": 1.00198829, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.8799966032022972, + "language_loss": 0.85006189, + "learning_rate": 3.996142453363656e-06, + "loss": 0.87379426, + "num_input_tokens_seen": 17514565, + "step": 818, + "time_per_iteration": 3.971522092819214 + }, + { + "auxiliary_loss_clip": 0.0119078, + "auxiliary_loss_mlp": 0.01181962, + "balance_loss_clip": 1.00265741, + "balance_loss_mlp": 1.00152659, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.063329566611359, + "language_loss": 0.75895405, + "learning_rate": 3.996118238049124e-06, + "loss": 0.78268147, + "num_input_tokens_seen": 17534590, + "step": 819, + "time_per_iteration": 4.014476776123047 + }, + { + "auxiliary_loss_clip": 0.01190808, + "auxiliary_loss_mlp": 0.01182126, + "balance_loss_clip": 1.00277746, + "balance_loss_mlp": 1.00178576, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.247093501956014, + "language_loss": 0.84750032, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87122971, + "num_input_tokens_seen": 17551900, + "step": 820, + "time_per_iteration": 2.505462408065796 + }, + { + "auxiliary_loss_clip": 0.01190702, + "auxiliary_loss_mlp": 0.01181854, + "balance_loss_clip": 1.00260901, + "balance_loss_mlp": 1.00141907, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.8984463298628005, + "language_loss": 0.90689373, + "learning_rate": 3.996069580341966e-06, + "loss": 0.93061924, + "num_input_tokens_seen": 17571485, + "step": 821, + "time_per_iteration": 4.08094596862793 + }, + { + "auxiliary_loss_clip": 0.01190539, + "auxiliary_loss_mlp": 0.01181958, + "balance_loss_clip": 1.00256407, + "balance_loss_mlp": 1.00180912, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 2.2257708914449017, + "language_loss": 0.89715248, + "learning_rate": 3.996045137951188e-06, + "loss": 0.92087746, + "num_input_tokens_seen": 17591410, + "step": 822, + "time_per_iteration": 2.5448966026306152 + }, + { + "auxiliary_loss_clip": 0.01190588, + "auxiliary_loss_mlp": 0.01181583, + "balance_loss_clip": 1.00253367, + "balance_loss_mlp": 1.00133848, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 1.9454505744918273, + "language_loss": 0.67470574, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69842744, + "num_input_tokens_seen": 17612010, + "step": 823, + "time_per_iteration": 2.6052608489990234 + }, + { + "auxiliary_loss_clip": 0.01191455, + "auxiliary_loss_mlp": 0.01178192, + "balance_loss_clip": 1.00445032, + "balance_loss_mlp": 1.00042713, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3259651138157806, + "language_loss": 0.62289131, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64658785, + "num_input_tokens_seen": 17673430, + "step": 824, + "time_per_iteration": 3.2437734603881836 + }, + { + "auxiliary_loss_clip": 0.01190661, + "auxiliary_loss_mlp": 0.01181885, + "balance_loss_clip": 1.00260329, + "balance_loss_mlp": 1.00135398, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 1.9244893518523296, + "language_loss": 0.90678215, + "learning_rate": 3.995971356641185e-06, + "loss": 0.9305076, + "num_input_tokens_seen": 17689545, + "step": 825, + "time_per_iteration": 2.54756760597229 + }, + { + "auxiliary_loss_clip": 0.01190598, + "auxiliary_loss_mlp": 0.01181539, + "balance_loss_clip": 1.0025934, + "balance_loss_mlp": 1.0011034, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 2.078778060801701, + "language_loss": 0.66857028, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69229174, + "num_input_tokens_seen": 17705965, + "step": 826, + "time_per_iteration": 2.5636444091796875 + }, + { + "auxiliary_loss_clip": 0.01190601, + "auxiliary_loss_mlp": 0.01181671, + "balance_loss_clip": 1.00256991, + "balance_loss_mlp": 1.00142646, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 3.690231501161982, + "language_loss": 0.7845583, + "learning_rate": 3.995921790662459e-06, + "loss": 0.808281, + "num_input_tokens_seen": 17724580, + "step": 827, + "time_per_iteration": 2.5364863872528076 + }, + { + "auxiliary_loss_clip": 0.01190593, + "auxiliary_loss_mlp": 0.0118198, + "balance_loss_clip": 1.00253773, + "balance_loss_mlp": 1.00154483, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 1.7466721522036688, + "language_loss": 0.78610897, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80983472, + "num_input_tokens_seen": 17747755, + "step": 828, + "time_per_iteration": 2.714789867401123 + }, + { + "auxiliary_loss_clip": 0.01190537, + "auxiliary_loss_mlp": 0.01180955, + "balance_loss_clip": 1.00254118, + "balance_loss_mlp": 1.0011878, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 1.8303845074164093, + "language_loss": 0.83563042, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85934532, + "num_input_tokens_seen": 17768550, + "step": 829, + "time_per_iteration": 2.587345600128174 + }, + { + "auxiliary_loss_clip": 0.01190561, + "auxiliary_loss_mlp": 0.011819, + "balance_loss_clip": 1.00252295, + "balance_loss_mlp": 1.00146532, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 1.9570605990734236, + "language_loss": 0.75463247, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77835715, + "num_input_tokens_seen": 17786080, + "step": 830, + "time_per_iteration": 2.5089797973632812 + }, + { + "auxiliary_loss_clip": 0.01190508, + "auxiliary_loss_mlp": 0.01181638, + "balance_loss_clip": 1.00252509, + "balance_loss_mlp": 1.00148928, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 2.037751582112102, + "language_loss": 0.79642761, + "learning_rate": 3.995821750485929e-06, + "loss": 0.82014906, + "num_input_tokens_seen": 17803635, + "step": 831, + "time_per_iteration": 2.4932870864868164 + }, + { + "auxiliary_loss_clip": 0.01125103, + "auxiliary_loss_mlp": 0.01181184, + "balance_loss_clip": 1.00233316, + "balance_loss_mlp": 1.00132084, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.4628426684986446, + "language_loss": 0.91814578, + "learning_rate": 3.995796551235016e-06, + "loss": 0.9412086, + "num_input_tokens_seen": 17822190, + "step": 832, + "time_per_iteration": 2.706544876098633 + }, + { + "auxiliary_loss_clip": 0.01157413, + "auxiliary_loss_mlp": 0.01181718, + "balance_loss_clip": 1.00241947, + "balance_loss_mlp": 1.00147367, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 2.225970279679003, + "language_loss": 0.83346057, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85685188, + "num_input_tokens_seen": 17846915, + "step": 833, + "time_per_iteration": 2.8143765926361084 + }, + { + "auxiliary_loss_clip": 0.01157353, + "auxiliary_loss_mlp": 0.01181049, + "balance_loss_clip": 1.00236058, + "balance_loss_mlp": 1.00109065, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.1118775667757865, + "language_loss": 0.81895888, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84234285, + "num_input_tokens_seen": 17867270, + "step": 834, + "time_per_iteration": 2.748124122619629 + }, + { + "auxiliary_loss_clip": 0.01173847, + "auxiliary_loss_mlp": 0.01181745, + "balance_loss_clip": 1.00246811, + "balance_loss_mlp": 1.00130963, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.4213369781681435, + "language_loss": 0.91921437, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94277024, + "num_input_tokens_seen": 17884880, + "step": 835, + "time_per_iteration": 2.606901168823242 + }, + { + "auxiliary_loss_clip": 0.01190355, + "auxiliary_loss_mlp": 0.01182171, + "balance_loss_clip": 1.00241256, + "balance_loss_mlp": 1.00183153, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.7576018434850686, + "language_loss": 0.76332295, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78704822, + "num_input_tokens_seen": 17903695, + "step": 836, + "time_per_iteration": 2.5410027503967285 + }, + { + "auxiliary_loss_clip": 0.01174, + "auxiliary_loss_mlp": 0.01181416, + "balance_loss_clip": 1.00249934, + "balance_loss_mlp": 1.00164866, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.1643955939507338, + "language_loss": 0.84085983, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.86441404, + "num_input_tokens_seen": 17920745, + "step": 837, + "time_per_iteration": 2.6465003490448 + }, + { + "auxiliary_loss_clip": 0.01157395, + "auxiliary_loss_mlp": 0.00749857, + "balance_loss_clip": 1.00242424, + "balance_loss_mlp": 1.00077116, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.3602465976133313, + "language_loss": 0.73571002, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75478256, + "num_input_tokens_seen": 17938220, + "step": 838, + "time_per_iteration": 2.6066839694976807 + }, + { + "auxiliary_loss_clip": 0.01157642, + "auxiliary_loss_mlp": 0.01181459, + "balance_loss_clip": 1.00223279, + "balance_loss_mlp": 1.00159645, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.6061821354859505, + "language_loss": 0.83574295, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85913396, + "num_input_tokens_seen": 17957325, + "step": 839, + "time_per_iteration": 2.648468255996704 + }, + { + "auxiliary_loss_clip": 0.01190253, + "auxiliary_loss_mlp": 0.01181555, + "balance_loss_clip": 1.00247991, + "balance_loss_mlp": 1.00169241, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.1219042214466333, + "language_loss": 0.85623336, + "learning_rate": 3.995592232799595e-06, + "loss": 0.87995148, + "num_input_tokens_seen": 17975875, + "step": 840, + "time_per_iteration": 2.546494245529175 + }, + { + "auxiliary_loss_clip": 0.01157632, + "auxiliary_loss_mlp": 0.01181652, + "balance_loss_clip": 1.0023731, + "balance_loss_mlp": 1.00150275, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.226431607339735, + "language_loss": 0.94655764, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96995044, + "num_input_tokens_seen": 17994340, + "step": 841, + "time_per_iteration": 2.698024272918701 + }, + { + "auxiliary_loss_clip": 0.01190398, + "auxiliary_loss_mlp": 0.01181836, + "balance_loss_clip": 1.00265002, + "balance_loss_mlp": 1.00187814, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.105372389445403, + "language_loss": 0.7731716, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79689395, + "num_input_tokens_seen": 18015260, + "step": 842, + "time_per_iteration": 2.6690917015075684 + }, + { + "auxiliary_loss_clip": 0.01174075, + "auxiliary_loss_mlp": 0.01182103, + "balance_loss_clip": 1.00258636, + "balance_loss_mlp": 1.00166774, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.0587828158614374, + "language_loss": 0.7862401, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80980194, + "num_input_tokens_seen": 18033960, + "step": 843, + "time_per_iteration": 2.6521434783935547 + }, + { + "auxiliary_loss_clip": 0.01173883, + "auxiliary_loss_mlp": 0.01181371, + "balance_loss_clip": 1.00260305, + "balance_loss_mlp": 1.00131714, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.5904012397789464, + "language_loss": 0.83055413, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85410666, + "num_input_tokens_seen": 18056700, + "step": 844, + "time_per_iteration": 2.7152490615844727 + }, + { + "auxiliary_loss_clip": 0.01174016, + "auxiliary_loss_mlp": 0.01181388, + "balance_loss_clip": 1.00241888, + "balance_loss_mlp": 1.0014298, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 4.037140084050482, + "language_loss": 0.76133108, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78488505, + "num_input_tokens_seen": 18075815, + "step": 845, + "time_per_iteration": 2.616469144821167 + }, + { + "auxiliary_loss_clip": 0.01173591, + "auxiliary_loss_mlp": 0.01181059, + "balance_loss_clip": 1.00230384, + "balance_loss_mlp": 1.0013864, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 2.9460157139881757, + "language_loss": 0.87984192, + "learning_rate": 3.99543581567769e-06, + "loss": 0.90338838, + "num_input_tokens_seen": 18095095, + "step": 846, + "time_per_iteration": 2.595440149307251 + }, + { + "auxiliary_loss_clip": 0.01157932, + "auxiliary_loss_mlp": 0.01181609, + "balance_loss_clip": 1.00244141, + "balance_loss_mlp": 1.00136471, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.6514357576615548, + "language_loss": 0.87586486, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89926028, + "num_input_tokens_seen": 18112675, + "step": 847, + "time_per_iteration": 2.6849958896636963 + }, + { + "auxiliary_loss_clip": 0.01124444, + "auxiliary_loss_mlp": 0.0118103, + "balance_loss_clip": 1.00209582, + "balance_loss_mlp": 1.00154901, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.589224164010786, + "language_loss": 0.82294846, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84600317, + "num_input_tokens_seen": 18130745, + "step": 848, + "time_per_iteration": 2.711721897125244 + }, + { + "auxiliary_loss_clip": 0.01190431, + "auxiliary_loss_mlp": 0.01181522, + "balance_loss_clip": 1.00265825, + "balance_loss_mlp": 1.0017544, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.8261810204825522, + "language_loss": 0.87395847, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89767802, + "num_input_tokens_seen": 18152410, + "step": 849, + "time_per_iteration": 2.6845524311065674 + }, + { + "auxiliary_loss_clip": 0.01190326, + "auxiliary_loss_mlp": 0.01180921, + "balance_loss_clip": 1.00257611, + "balance_loss_mlp": 1.00105798, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.739546032582966, + "language_loss": 0.83031201, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85402453, + "num_input_tokens_seen": 18170870, + "step": 850, + "time_per_iteration": 2.5682713985443115 + }, + { + "auxiliary_loss_clip": 0.01173631, + "auxiliary_loss_mlp": 0.0118138, + "balance_loss_clip": 1.00242352, + "balance_loss_mlp": 1.00123107, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.0281801027912225, + "language_loss": 0.65055382, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67410398, + "num_input_tokens_seen": 18191555, + "step": 851, + "time_per_iteration": 2.7522547245025635 + }, + { + "auxiliary_loss_clip": 0.01173815, + "auxiliary_loss_mlp": 0.01181613, + "balance_loss_clip": 1.00247908, + "balance_loss_mlp": 1.00155902, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.5571608554174237, + "language_loss": 0.83423096, + "learning_rate": 3.995276674539547e-06, + "loss": 0.85778522, + "num_input_tokens_seen": 18208620, + "step": 852, + "time_per_iteration": 2.5769104957580566 + }, + { + "auxiliary_loss_clip": 0.01157115, + "auxiliary_loss_mlp": 0.01181774, + "balance_loss_clip": 1.00225806, + "balance_loss_mlp": 1.00143385, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 2.0722039255841658, + "language_loss": 0.80542499, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82881391, + "num_input_tokens_seen": 18226370, + "step": 853, + "time_per_iteration": 2.6383543014526367 + }, + { + "auxiliary_loss_clip": 0.01190356, + "auxiliary_loss_mlp": 0.01181376, + "balance_loss_clip": 1.00261474, + "balance_loss_mlp": 1.00199032, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 2.3510258086568587, + "language_loss": 0.75705129, + "learning_rate": 3.995223022193999e-06, + "loss": 0.78076863, + "num_input_tokens_seen": 18247075, + "step": 854, + "time_per_iteration": 2.6188104152679443 + }, + { + "auxiliary_loss_clip": 0.01157479, + "auxiliary_loss_mlp": 0.01181644, + "balance_loss_clip": 1.00250733, + "balance_loss_mlp": 1.00159061, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.095763287115062, + "language_loss": 0.81478477, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83817601, + "num_input_tokens_seen": 18265680, + "step": 855, + "time_per_iteration": 2.710845470428467 + }, + { + "auxiliary_loss_clip": 0.01141953, + "auxiliary_loss_mlp": 0.00750003, + "balance_loss_clip": 1.00443399, + "balance_loss_mlp": 1.0010829, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 0.9830881019088409, + "language_loss": 0.65661663, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67553622, + "num_input_tokens_seen": 18327015, + "step": 856, + "time_per_iteration": 4.591958045959473 + }, + { + "auxiliary_loss_clip": 0.01157282, + "auxiliary_loss_mlp": 0.01181377, + "balance_loss_clip": 1.00235856, + "balance_loss_mlp": 1.00141931, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.9198001390600263, + "language_loss": 0.77199697, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79538363, + "num_input_tokens_seen": 18345235, + "step": 857, + "time_per_iteration": 4.150082588195801 + }, + { + "auxiliary_loss_clip": 0.01140872, + "auxiliary_loss_mlp": 0.01181361, + "balance_loss_clip": 1.00217211, + "balance_loss_mlp": 1.00140285, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.14980777608374, + "language_loss": 0.89574337, + "learning_rate": 3.995114809602412e-06, + "loss": 0.9189657, + "num_input_tokens_seen": 18362350, + "step": 858, + "time_per_iteration": 2.733795404434204 + }, + { + "auxiliary_loss_clip": 0.01157534, + "auxiliary_loss_mlp": 0.01181168, + "balance_loss_clip": 1.00249481, + "balance_loss_mlp": 1.00139999, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.8440146667876571, + "language_loss": 0.75445396, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77784097, + "num_input_tokens_seen": 18383390, + "step": 859, + "time_per_iteration": 4.158409833908081 + }, + { + "auxiliary_loss_clip": 0.01157863, + "auxiliary_loss_mlp": 0.01181657, + "balance_loss_clip": 1.00242937, + "balance_loss_mlp": 1.00150836, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 4.310556972153138, + "language_loss": 0.90666193, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93005717, + "num_input_tokens_seen": 18399220, + "step": 860, + "time_per_iteration": 2.601961612701416 + }, + { + "auxiliary_loss_clip": 0.01190365, + "auxiliary_loss_mlp": 0.01181694, + "balance_loss_clip": 1.0026443, + "balance_loss_mlp": 1.00145006, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.7506672577004723, + "language_loss": 0.82242966, + "learning_rate": 3.99503285577813e-06, + "loss": 0.84615028, + "num_input_tokens_seen": 18419005, + "step": 861, + "time_per_iteration": 2.60994029045105 + }, + { + "auxiliary_loss_clip": 0.01157174, + "auxiliary_loss_mlp": 0.01181583, + "balance_loss_clip": 1.00230122, + "balance_loss_mlp": 1.00143385, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 1.708096301629598, + "language_loss": 0.78345388, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80684143, + "num_input_tokens_seen": 18440550, + "step": 862, + "time_per_iteration": 2.6726279258728027 + }, + { + "auxiliary_loss_clip": 0.01157788, + "auxiliary_loss_mlp": 0.01181297, + "balance_loss_clip": 1.00248814, + "balance_loss_mlp": 1.00152934, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.9405102210769496, + "language_loss": 0.8902775, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91366833, + "num_input_tokens_seen": 18461950, + "step": 863, + "time_per_iteration": 2.6444220542907715 + }, + { + "auxiliary_loss_clip": 0.01157346, + "auxiliary_loss_mlp": 0.0118111, + "balance_loss_clip": 1.00248575, + "balance_loss_mlp": 1.00153327, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.0522094669143813, + "language_loss": 0.75636506, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.77974963, + "num_input_tokens_seen": 18480555, + "step": 864, + "time_per_iteration": 2.668527603149414 + }, + { + "auxiliary_loss_clip": 0.01157786, + "auxiliary_loss_mlp": 0.01181388, + "balance_loss_clip": 1.00254929, + "balance_loss_mlp": 1.0014298, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 2.4608741080206977, + "language_loss": 0.78916502, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81255674, + "num_input_tokens_seen": 18499645, + "step": 865, + "time_per_iteration": 2.629690647125244 + }, + { + "auxiliary_loss_clip": 0.01173503, + "auxiliary_loss_mlp": 0.01181343, + "balance_loss_clip": 1.00235939, + "balance_loss_mlp": 1.00157571, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.332342005256619, + "language_loss": 0.86000937, + "learning_rate": 3.994894753048032e-06, + "loss": 0.8835578, + "num_input_tokens_seen": 18516810, + "step": 866, + "time_per_iteration": 2.590214729309082 + }, + { + "auxiliary_loss_clip": 0.01140786, + "auxiliary_loss_mlp": 0.01181448, + "balance_loss_clip": 1.00255013, + "balance_loss_mlp": 1.00168002, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 4.562129247985369, + "language_loss": 0.87676835, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89999068, + "num_input_tokens_seen": 18532510, + "step": 867, + "time_per_iteration": 2.684229612350464 + }, + { + "auxiliary_loss_clip": 0.01140834, + "auxiliary_loss_mlp": 0.01181084, + "balance_loss_clip": 1.00239229, + "balance_loss_mlp": 1.00179291, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.5879331200357378, + "language_loss": 0.63862419, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.66184342, + "num_input_tokens_seen": 18557380, + "step": 868, + "time_per_iteration": 2.777616024017334 + }, + { + "auxiliary_loss_clip": 0.0119014, + "auxiliary_loss_mlp": 0.01181424, + "balance_loss_clip": 1.00245643, + "balance_loss_mlp": 1.00137055, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.4023265767779485, + "language_loss": 0.83334053, + "learning_rate": 3.994810983642281e-06, + "loss": 0.8570562, + "num_input_tokens_seen": 18575720, + "step": 869, + "time_per_iteration": 2.5663812160491943 + }, + { + "auxiliary_loss_clip": 0.01173668, + "auxiliary_loss_mlp": 0.01181224, + "balance_loss_clip": 1.00248706, + "balance_loss_mlp": 1.00126529, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.0449673244975894, + "language_loss": 0.87641013, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89995909, + "num_input_tokens_seen": 18592185, + "step": 870, + "time_per_iteration": 2.5693893432617188 + }, + { + "auxiliary_loss_clip": 0.01190347, + "auxiliary_loss_mlp": 0.01181266, + "balance_loss_clip": 1.00269389, + "balance_loss_mlp": 1.00159407, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 3.045203135724274, + "language_loss": 0.8069731, + "learning_rate": 3.994754759152854e-06, + "loss": 0.83068919, + "num_input_tokens_seen": 18609560, + "step": 871, + "time_per_iteration": 2.5766468048095703 + }, + { + "auxiliary_loss_clip": 0.01157239, + "auxiliary_loss_mlp": 0.01181132, + "balance_loss_clip": 1.00249183, + "balance_loss_mlp": 1.00155473, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.8806052181204767, + "language_loss": 0.81163824, + "learning_rate": 3.994726533445656e-06, + "loss": 0.83502197, + "num_input_tokens_seen": 18629405, + "step": 872, + "time_per_iteration": 2.6529266834259033 + }, + { + "auxiliary_loss_clip": 0.01159089, + "auxiliary_loss_mlp": 0.01178496, + "balance_loss_clip": 1.00456452, + "balance_loss_mlp": 1.00073135, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8772898399339752, + "language_loss": 0.61651611, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63989204, + "num_input_tokens_seen": 18681480, + "step": 873, + "time_per_iteration": 3.0889761447906494 + }, + { + "auxiliary_loss_clip": 0.01157407, + "auxiliary_loss_mlp": 0.01181009, + "balance_loss_clip": 1.00238049, + "balance_loss_mlp": 1.00133634, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.951266051111707, + "language_loss": 0.88876945, + "learning_rate": 3.994669855111643e-06, + "loss": 0.9121536, + "num_input_tokens_seen": 18700390, + "step": 874, + "time_per_iteration": 2.62870192527771 + }, + { + "auxiliary_loss_clip": 0.01157105, + "auxiliary_loss_mlp": 0.01181285, + "balance_loss_clip": 1.00232792, + "balance_loss_mlp": 1.00161242, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 1.704292860156135, + "language_loss": 0.74791849, + "learning_rate": 3.994641402486977e-06, + "loss": 0.7713024, + "num_input_tokens_seen": 18721280, + "step": 875, + "time_per_iteration": 2.6867189407348633 + }, + { + "auxiliary_loss_clip": 0.01173705, + "auxiliary_loss_mlp": 0.01180333, + "balance_loss_clip": 1.00248075, + "balance_loss_mlp": 1.00123334, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 1.7767302577713364, + "language_loss": 0.92515278, + "learning_rate": 3.99461287422531e-06, + "loss": 0.94869316, + "num_input_tokens_seen": 18741545, + "step": 876, + "time_per_iteration": 2.6327195167541504 + }, + { + "auxiliary_loss_clip": 0.01191261, + "auxiliary_loss_mlp": 0.01177659, + "balance_loss_clip": 1.00475073, + "balance_loss_mlp": 1.00065696, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8161074725510618, + "language_loss": 0.62893987, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65262902, + "num_input_tokens_seen": 18801400, + "step": 877, + "time_per_iteration": 3.1351022720336914 + }, + { + "auxiliary_loss_clip": 0.01157029, + "auxiliary_loss_mlp": 0.01180644, + "balance_loss_clip": 1.00235522, + "balance_loss_mlp": 1.00154424, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.4244941284723214, + "language_loss": 0.85630774, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87968451, + "num_input_tokens_seen": 18819670, + "step": 878, + "time_per_iteration": 2.6383676528930664 + }, + { + "auxiliary_loss_clip": 0.01190149, + "auxiliary_loss_mlp": 0.0118078, + "balance_loss_clip": 1.00257874, + "balance_loss_mlp": 1.00139415, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 1.8945176667942407, + "language_loss": 0.83050466, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85421395, + "num_input_tokens_seen": 18840580, + "step": 879, + "time_per_iteration": 2.605893611907959 + }, + { + "auxiliary_loss_clip": 0.0115684, + "auxiliary_loss_mlp": 0.01181043, + "balance_loss_clip": 1.00223279, + "balance_loss_mlp": 1.00156116, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 2.0741840536870138, + "language_loss": 0.84332746, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86670631, + "num_input_tokens_seen": 18859295, + "step": 880, + "time_per_iteration": 2.627963066101074 + }, + { + "auxiliary_loss_clip": 0.01140958, + "auxiliary_loss_mlp": 0.01181203, + "balance_loss_clip": 1.00245905, + "balance_loss_mlp": 1.00153112, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 2.6435294812506056, + "language_loss": 0.8662312, + "learning_rate": 3.994469098399906e-06, + "loss": 0.88945287, + "num_input_tokens_seen": 18877485, + "step": 881, + "time_per_iteration": 2.668600559234619 + }, + { + "auxiliary_loss_clip": 0.01173604, + "auxiliary_loss_mlp": 0.0118089, + "balance_loss_clip": 1.00245774, + "balance_loss_mlp": 1.00140822, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 8.611970231545865, + "language_loss": 0.88037109, + "learning_rate": 3.994440116339046e-06, + "loss": 0.90391612, + "num_input_tokens_seen": 18898275, + "step": 882, + "time_per_iteration": 2.7268078327178955 + }, + { + "auxiliary_loss_clip": 0.01190047, + "auxiliary_loss_mlp": 0.01181234, + "balance_loss_clip": 1.00250149, + "balance_loss_mlp": 1.0015614, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 4.1051149953010615, + "language_loss": 0.6934765, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71718931, + "num_input_tokens_seen": 18920665, + "step": 883, + "time_per_iteration": 2.6902916431427 + }, + { + "auxiliary_loss_clip": 0.01140871, + "auxiliary_loss_mlp": 0.01180641, + "balance_loss_clip": 1.00243008, + "balance_loss_mlp": 1.0013504, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 2.5935691067528626, + "language_loss": 0.76186383, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78507888, + "num_input_tokens_seen": 18939835, + "step": 884, + "time_per_iteration": 2.6771814823150635 + }, + { + "auxiliary_loss_clip": 0.01124255, + "auxiliary_loss_mlp": 0.01180463, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.0013628, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.6446355173165235, + "language_loss": 0.85593379, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87898099, + "num_input_tokens_seen": 18958405, + "step": 885, + "time_per_iteration": 2.781881093978882 + }, + { + "auxiliary_loss_clip": 0.01157566, + "auxiliary_loss_mlp": 0.01180717, + "balance_loss_clip": 1.00242472, + "balance_loss_mlp": 1.00142646, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.6638011093197345, + "language_loss": 0.85856289, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88194573, + "num_input_tokens_seen": 18975445, + "step": 886, + "time_per_iteration": 2.6333816051483154 + }, + { + "auxiliary_loss_clip": 0.01140582, + "auxiliary_loss_mlp": 0.01180643, + "balance_loss_clip": 1.00223851, + "balance_loss_mlp": 1.00163841, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.1226574252098094, + "language_loss": 0.89150608, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91471845, + "num_input_tokens_seen": 18991930, + "step": 887, + "time_per_iteration": 2.727969169616699 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01181083, + "balance_loss_clip": 1.00227177, + "balance_loss_mlp": 1.00141072, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 1.9275270302648024, + "language_loss": 0.75182343, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77487963, + "num_input_tokens_seen": 19009790, + "step": 888, + "time_per_iteration": 2.751300096511841 + }, + { + "auxiliary_loss_clip": 0.01123946, + "auxiliary_loss_mlp": 0.01180425, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.00161159, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.0771855638237926, + "language_loss": 0.88320118, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90624487, + "num_input_tokens_seen": 19030170, + "step": 889, + "time_per_iteration": 2.7728068828582764 + }, + { + "auxiliary_loss_clip": 0.01189927, + "auxiliary_loss_mlp": 0.01180213, + "balance_loss_clip": 1.00255466, + "balance_loss_mlp": 1.00111306, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.8550723386173718, + "language_loss": 0.88826782, + "learning_rate": 3.994205537287791e-06, + "loss": 0.91196918, + "num_input_tokens_seen": 19048075, + "step": 890, + "time_per_iteration": 2.539376974105835 + }, + { + "auxiliary_loss_clip": 0.01157275, + "auxiliary_loss_mlp": 0.01180543, + "balance_loss_clip": 1.00241923, + "balance_loss_mlp": 1.00163364, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.486954928626339, + "language_loss": 0.93323028, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95660847, + "num_input_tokens_seen": 19067465, + "step": 891, + "time_per_iteration": 2.7177023887634277 + }, + { + "auxiliary_loss_clip": 0.01173995, + "auxiliary_loss_mlp": 0.01180703, + "balance_loss_clip": 1.00253487, + "balance_loss_mlp": 1.00131691, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.095068549412315, + "language_loss": 0.7208128, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74435973, + "num_input_tokens_seen": 19085505, + "step": 892, + "time_per_iteration": 2.5683557987213135 + }, + { + "auxiliary_loss_clip": 0.01157317, + "auxiliary_loss_mlp": 0.00749829, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00058103, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 1.7653274159202224, + "language_loss": 0.82253826, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84160966, + "num_input_tokens_seen": 19104360, + "step": 893, + "time_per_iteration": 2.7009716033935547 + }, + { + "auxiliary_loss_clip": 0.01156931, + "auxiliary_loss_mlp": 0.01180039, + "balance_loss_clip": 1.00234616, + "balance_loss_mlp": 1.00122547, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.2483579105387594, + "language_loss": 0.82090521, + "learning_rate": 3.994086432835114e-06, + "loss": 0.84427494, + "num_input_tokens_seen": 19124680, + "step": 894, + "time_per_iteration": 4.1823039054870605 + }, + { + "auxiliary_loss_clip": 0.01173967, + "auxiliary_loss_mlp": 0.01180153, + "balance_loss_clip": 1.00259781, + "balance_loss_mlp": 1.00133896, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 3.7355384591388408, + "language_loss": 0.7579546, + "learning_rate": 3.994056467679221e-06, + "loss": 0.78149575, + "num_input_tokens_seen": 19142895, + "step": 895, + "time_per_iteration": 5.307438850402832 + }, + { + "auxiliary_loss_clip": 0.01157128, + "auxiliary_loss_mlp": 0.01180443, + "balance_loss_clip": 1.00258946, + "balance_loss_mlp": 1.0014385, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.5288560215598133, + "language_loss": 0.86577821, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88915396, + "num_input_tokens_seen": 19163125, + "step": 896, + "time_per_iteration": 4.096049070358276 + }, + { + "auxiliary_loss_clip": 0.0119001, + "auxiliary_loss_mlp": 0.00749795, + "balance_loss_clip": 1.00261998, + "balance_loss_mlp": 1.00054765, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 3.3070464595747224, + "language_loss": 0.87640291, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.89580095, + "num_input_tokens_seen": 19179385, + "step": 897, + "time_per_iteration": 2.5493457317352295 + }, + { + "auxiliary_loss_clip": 0.0117341, + "auxiliary_loss_mlp": 0.01180055, + "balance_loss_clip": 1.00250435, + "balance_loss_mlp": 1.00133681, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.8083290245859538, + "language_loss": 0.90213531, + "learning_rate": 3.993966118527175e-06, + "loss": 0.92567003, + "num_input_tokens_seen": 19198725, + "step": 898, + "time_per_iteration": 2.560724973678589 + }, + { + "auxiliary_loss_clip": 0.01157148, + "auxiliary_loss_mlp": 0.01180752, + "balance_loss_clip": 1.00243545, + "balance_loss_mlp": 1.00174713, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 2.909967853760895, + "language_loss": 0.92145342, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94483238, + "num_input_tokens_seen": 19212380, + "step": 899, + "time_per_iteration": 2.5827744007110596 + }, + { + "auxiliary_loss_clip": 0.01157322, + "auxiliary_loss_mlp": 0.01180125, + "balance_loss_clip": 1.00247025, + "balance_loss_mlp": 1.00159752, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.244391441103912, + "language_loss": 0.75457847, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77795291, + "num_input_tokens_seen": 19232235, + "step": 900, + "time_per_iteration": 2.6498804092407227 + }, + { + "auxiliary_loss_clip": 0.01173708, + "auxiliary_loss_mlp": 0.01180044, + "balance_loss_clip": 1.0025034, + "balance_loss_mlp": 1.00123048, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.5178888606871697, + "language_loss": 0.73837996, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76191753, + "num_input_tokens_seen": 19251460, + "step": 901, + "time_per_iteration": 2.5751609802246094 + }, + { + "auxiliary_loss_clip": 0.01140524, + "auxiliary_loss_mlp": 0.011799, + "balance_loss_clip": 1.00239396, + "balance_loss_mlp": 1.00156307, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.398702034485752, + "language_loss": 0.84852707, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87173134, + "num_input_tokens_seen": 19269060, + "step": 902, + "time_per_iteration": 2.6493325233459473 + }, + { + "auxiliary_loss_clip": 0.01124471, + "auxiliary_loss_mlp": 0.01179983, + "balance_loss_clip": 1.00225389, + "balance_loss_mlp": 1.00145578, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 2.951725684182845, + "language_loss": 0.86385047, + "learning_rate": 3.993814024394569e-06, + "loss": 0.886895, + "num_input_tokens_seen": 19288620, + "step": 903, + "time_per_iteration": 2.7997467517852783 + }, + { + "auxiliary_loss_clip": 0.0117368, + "auxiliary_loss_mlp": 0.0117978, + "balance_loss_clip": 1.00252211, + "balance_loss_mlp": 1.00134706, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.9027736845807226, + "language_loss": 0.75328648, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77682114, + "num_input_tokens_seen": 19306615, + "step": 904, + "time_per_iteration": 2.6763200759887695 + }, + { + "auxiliary_loss_clip": 0.01175248, + "auxiliary_loss_mlp": 0.01180278, + "balance_loss_clip": 1.00252366, + "balance_loss_mlp": 1.00155926, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 3.029684224771264, + "language_loss": 0.8572228, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88077807, + "num_input_tokens_seen": 19321680, + "step": 905, + "time_per_iteration": 2.5843067169189453 + }, + { + "auxiliary_loss_clip": 0.01156933, + "auxiliary_loss_mlp": 0.01180356, + "balance_loss_clip": 1.00245237, + "balance_loss_mlp": 1.00192368, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.8080401359295017, + "language_loss": 0.74403012, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76740301, + "num_input_tokens_seen": 19339760, + "step": 906, + "time_per_iteration": 2.5954763889312744 + }, + { + "auxiliary_loss_clip": 0.01156746, + "auxiliary_loss_mlp": 0.01179809, + "balance_loss_clip": 1.00239563, + "balance_loss_mlp": 1.00156736, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.0450507181225426, + "language_loss": 0.87438411, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89774966, + "num_input_tokens_seen": 19359585, + "step": 907, + "time_per_iteration": 2.6455509662628174 + }, + { + "auxiliary_loss_clip": 0.01173397, + "auxiliary_loss_mlp": 0.01180519, + "balance_loss_clip": 1.00248611, + "balance_loss_mlp": 1.00170541, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.7555251237218792, + "language_loss": 0.87020475, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89374399, + "num_input_tokens_seen": 19378590, + "step": 908, + "time_per_iteration": 2.5553951263427734 + }, + { + "auxiliary_loss_clip": 0.01157166, + "auxiliary_loss_mlp": 0.01180144, + "balance_loss_clip": 1.00247169, + "balance_loss_mlp": 1.00171113, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.46282413438134, + "language_loss": 0.8973887, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.9207617, + "num_input_tokens_seen": 19397910, + "step": 909, + "time_per_iteration": 2.6231439113616943 + }, + { + "auxiliary_loss_clip": 0.01157026, + "auxiliary_loss_mlp": 0.01180259, + "balance_loss_clip": 1.0024308, + "balance_loss_mlp": 1.00154042, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.416279664390293, + "language_loss": 0.7137084, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73708117, + "num_input_tokens_seen": 19415950, + "step": 910, + "time_per_iteration": 2.7074410915374756 + }, + { + "auxiliary_loss_clip": 0.01156996, + "auxiliary_loss_mlp": 0.01179437, + "balance_loss_clip": 1.00248075, + "balance_loss_mlp": 1.00110006, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 1.6570283931342247, + "language_loss": 0.83301514, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85637945, + "num_input_tokens_seen": 19435275, + "step": 911, + "time_per_iteration": 2.6428439617156982 + }, + { + "auxiliary_loss_clip": 0.01173786, + "auxiliary_loss_mlp": 0.01180245, + "balance_loss_clip": 1.00248122, + "balance_loss_mlp": 1.00152636, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.3005761215128584, + "language_loss": 0.76186991, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78541017, + "num_input_tokens_seen": 19452090, + "step": 912, + "time_per_iteration": 2.579620361328125 + }, + { + "auxiliary_loss_clip": 0.01156756, + "auxiliary_loss_mlp": 0.01179619, + "balance_loss_clip": 1.00236368, + "balance_loss_mlp": 1.00118685, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.889564492488545, + "language_loss": 0.8283546, + "learning_rate": 3.993504165853694e-06, + "loss": 0.85171843, + "num_input_tokens_seen": 19470865, + "step": 913, + "time_per_iteration": 2.6205689907073975 + }, + { + "auxiliary_loss_clip": 0.01173769, + "auxiliary_loss_mlp": 0.01179934, + "balance_loss_clip": 1.00273299, + "balance_loss_mlp": 1.00140643, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.63267496194166, + "language_loss": 0.83543563, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85897267, + "num_input_tokens_seen": 19492145, + "step": 914, + "time_per_iteration": 2.609640121459961 + }, + { + "auxiliary_loss_clip": 0.01173511, + "auxiliary_loss_mlp": 0.00749822, + "balance_loss_clip": 1.00247049, + "balance_loss_mlp": 1.00062346, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.5973565209768523, + "language_loss": 0.89945674, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.91869003, + "num_input_tokens_seen": 19511015, + "step": 915, + "time_per_iteration": 2.659541606903076 + }, + { + "auxiliary_loss_clip": 0.01173141, + "auxiliary_loss_mlp": 0.01179625, + "balance_loss_clip": 1.00236309, + "balance_loss_mlp": 1.00119269, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 2.200515796702361, + "language_loss": 0.89461029, + "learning_rate": 3.993409734157064e-06, + "loss": 0.91813791, + "num_input_tokens_seen": 19529040, + "step": 916, + "time_per_iteration": 2.5540192127227783 + }, + { + "auxiliary_loss_clip": 0.01142144, + "auxiliary_loss_mlp": 0.01180386, + "balance_loss_clip": 1.00231338, + "balance_loss_mlp": 1.00147629, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.8379910673723088, + "language_loss": 0.80035329, + "learning_rate": 3.993378105742666e-06, + "loss": 0.8235786, + "num_input_tokens_seen": 19549540, + "step": 917, + "time_per_iteration": 2.705808639526367 + }, + { + "auxiliary_loss_clip": 0.01108258, + "auxiliary_loss_mlp": 0.01180324, + "balance_loss_clip": 1.00232601, + "balance_loss_mlp": 1.00151014, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.8315736422884308, + "language_loss": 0.79736507, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.82025081, + "num_input_tokens_seen": 19567570, + "step": 918, + "time_per_iteration": 2.730137586593628 + }, + { + "auxiliary_loss_clip": 0.0117311, + "auxiliary_loss_mlp": 0.01179992, + "balance_loss_clip": 1.0022614, + "balance_loss_mlp": 1.00136876, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 2.2311901792832427, + "language_loss": 0.889238, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91276908, + "num_input_tokens_seen": 19585330, + "step": 919, + "time_per_iteration": 2.586395025253296 + }, + { + "auxiliary_loss_clip": 0.0118975, + "auxiliary_loss_mlp": 0.01180113, + "balance_loss_clip": 1.00251913, + "balance_loss_mlp": 1.00168025, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.295116745941819, + "language_loss": 0.87457937, + "learning_rate": 3.993282766969699e-06, + "loss": 0.898278, + "num_input_tokens_seen": 19604970, + "step": 920, + "time_per_iteration": 2.5792644023895264 + }, + { + "auxiliary_loss_clip": 0.0115693, + "auxiliary_loss_mlp": 0.01179383, + "balance_loss_clip": 1.00240827, + "balance_loss_mlp": 1.00095022, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.1939210983882775, + "language_loss": 0.66262084, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68598396, + "num_input_tokens_seen": 19626235, + "step": 921, + "time_per_iteration": 2.721972942352295 + }, + { + "auxiliary_loss_clip": 0.0117357, + "auxiliary_loss_mlp": 0.01180049, + "balance_loss_clip": 1.00252771, + "balance_loss_mlp": 1.00142622, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.5263639217945384, + "language_loss": 0.71803904, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74157512, + "num_input_tokens_seen": 19644305, + "step": 922, + "time_per_iteration": 2.5832486152648926 + }, + { + "auxiliary_loss_clip": 0.01157264, + "auxiliary_loss_mlp": 0.01179523, + "balance_loss_clip": 1.00238168, + "balance_loss_mlp": 1.00118601, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.6884910220004694, + "language_loss": 0.81838995, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84175789, + "num_input_tokens_seen": 19662130, + "step": 923, + "time_per_iteration": 2.642988443374634 + }, + { + "auxiliary_loss_clip": 0.01173163, + "auxiliary_loss_mlp": 0.01179523, + "balance_loss_clip": 1.00239146, + "balance_loss_mlp": 1.00128102, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 1.8351872650198566, + "language_loss": 0.78794479, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81147158, + "num_input_tokens_seen": 19680715, + "step": 924, + "time_per_iteration": 2.571085214614868 + }, + { + "auxiliary_loss_clip": 0.01139824, + "auxiliary_loss_mlp": 0.01179621, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00118804, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 2.009075992901827, + "language_loss": 1.02242851, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04562294, + "num_input_tokens_seen": 19700535, + "step": 925, + "time_per_iteration": 2.660656452178955 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.01179222, + "balance_loss_clip": 1.00232697, + "balance_loss_mlp": 1.00088501, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.1744680985575213, + "language_loss": 0.8099972, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83303833, + "num_input_tokens_seen": 19718825, + "step": 926, + "time_per_iteration": 2.690293788909912 + }, + { + "auxiliary_loss_clip": 0.01173439, + "auxiliary_loss_mlp": 0.01180153, + "balance_loss_clip": 1.00251579, + "balance_loss_mlp": 1.00143445, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.6978279867174284, + "language_loss": 0.73470068, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75823665, + "num_input_tokens_seen": 19739080, + "step": 927, + "time_per_iteration": 2.653663158416748 + }, + { + "auxiliary_loss_clip": 0.01190385, + "auxiliary_loss_mlp": 0.01177367, + "balance_loss_clip": 1.00431275, + "balance_loss_mlp": 1.00036526, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7787569918619953, + "language_loss": 0.5986613, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62233877, + "num_input_tokens_seen": 19802960, + "step": 928, + "time_per_iteration": 3.1742937564849854 + }, + { + "auxiliary_loss_clip": 0.01172991, + "auxiliary_loss_mlp": 0.01179534, + "balance_loss_clip": 1.00244236, + "balance_loss_mlp": 1.00148344, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.172014866354548, + "language_loss": 0.94895381, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97247905, + "num_input_tokens_seen": 19822765, + "step": 929, + "time_per_iteration": 2.6218760013580322 + }, + { + "auxiliary_loss_clip": 0.0114047, + "auxiliary_loss_mlp": 0.01179933, + "balance_loss_clip": 1.00229895, + "balance_loss_mlp": 1.00140476, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.0304410310241643, + "language_loss": 0.72051597, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74372, + "num_input_tokens_seen": 19843590, + "step": 930, + "time_per_iteration": 2.7697384357452393 + }, + { + "auxiliary_loss_clip": 0.01156416, + "auxiliary_loss_mlp": 0.01179684, + "balance_loss_clip": 1.00236022, + "balance_loss_mlp": 1.00134683, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.62351235632954, + "language_loss": 0.85415822, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87751925, + "num_input_tokens_seen": 19860230, + "step": 931, + "time_per_iteration": 2.6344971656799316 + }, + { + "auxiliary_loss_clip": 0.01175096, + "auxiliary_loss_mlp": 0.0074982, + "balance_loss_clip": 1.00262737, + "balance_loss_mlp": 1.0006634, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.7295109051598643, + "language_loss": 0.83386844, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.8531177, + "num_input_tokens_seen": 19880795, + "step": 932, + "time_per_iteration": 5.421313047409058 + }, + { + "auxiliary_loss_clip": 0.01173169, + "auxiliary_loss_mlp": 0.01179651, + "balance_loss_clip": 1.00251102, + "balance_loss_mlp": 1.00150442, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.0516207687712518, + "language_loss": 0.73664117, + "learning_rate": 3.992861771819365e-06, + "loss": 0.76016939, + "num_input_tokens_seen": 19897960, + "step": 933, + "time_per_iteration": 3.9280314445495605 + }, + { + "auxiliary_loss_clip": 0.01124186, + "auxiliary_loss_mlp": 0.01179753, + "balance_loss_clip": 1.00203133, + "balance_loss_mlp": 1.00151157, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 3.8747174374523, + "language_loss": 0.8662743, + "learning_rate": 3.99282885855576e-06, + "loss": 0.8893137, + "num_input_tokens_seen": 19913315, + "step": 934, + "time_per_iteration": 2.697197914123535 + }, + { + "auxiliary_loss_clip": 0.01124015, + "auxiliary_loss_mlp": 0.01179067, + "balance_loss_clip": 1.00223494, + "balance_loss_mlp": 1.001302, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.2636714775710796, + "language_loss": 0.80335683, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82638764, + "num_input_tokens_seen": 19928790, + "step": 935, + "time_per_iteration": 4.131223440170288 + }, + { + "auxiliary_loss_clip": 0.01190239, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_clip": 1.00423694, + "balance_loss_mlp": 1.00037372, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8121217551517687, + "language_loss": 0.69143057, + "learning_rate": 3.99276280532499e-06, + "loss": 0.7150991, + "num_input_tokens_seen": 19988785, + "step": 936, + "time_per_iteration": 3.044518232345581 + }, + { + "auxiliary_loss_clip": 0.01189706, + "auxiliary_loss_mlp": 0.0117947, + "balance_loss_clip": 1.00265718, + "balance_loss_mlp": 1.00122786, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.8294438791404772, + "language_loss": 0.75708055, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78077239, + "num_input_tokens_seen": 20007685, + "step": 937, + "time_per_iteration": 2.5530171394348145 + }, + { + "auxiliary_loss_clip": 0.01173766, + "auxiliary_loss_mlp": 0.01176611, + "balance_loss_clip": 1.00412846, + "balance_loss_mlp": 1.00037193, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8647342436852886, + "language_loss": 0.64267009, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66617388, + "num_input_tokens_seen": 20072750, + "step": 938, + "time_per_iteration": 3.0837390422821045 + }, + { + "auxiliary_loss_clip": 0.01140963, + "auxiliary_loss_mlp": 0.01179833, + "balance_loss_clip": 1.00235367, + "balance_loss_mlp": 1.00120997, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 2.7759243427473286, + "language_loss": 0.79474497, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81795299, + "num_input_tokens_seen": 20089070, + "step": 939, + "time_per_iteration": 2.6394927501678467 + }, + { + "auxiliary_loss_clip": 0.01157572, + "auxiliary_loss_mlp": 0.01179858, + "balance_loss_clip": 1.00257683, + "balance_loss_mlp": 1.0014255, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.6302162416074062, + "language_loss": 0.74180102, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76517528, + "num_input_tokens_seen": 20108790, + "step": 940, + "time_per_iteration": 2.6421701908111572 + }, + { + "auxiliary_loss_clip": 0.01173167, + "auxiliary_loss_mlp": 0.01179487, + "balance_loss_clip": 1.00266075, + "balance_loss_mlp": 1.00134015, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 2.28033914190252, + "language_loss": 0.70347887, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72700542, + "num_input_tokens_seen": 20128455, + "step": 941, + "time_per_iteration": 2.6270365715026855 + }, + { + "auxiliary_loss_clip": 0.01124394, + "auxiliary_loss_mlp": 0.01179602, + "balance_loss_clip": 1.00225806, + "balance_loss_mlp": 1.00145543, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 1.9726723662487757, + "language_loss": 0.8061049, + "learning_rate": 3.992562832094637e-06, + "loss": 0.82914484, + "num_input_tokens_seen": 20145775, + "step": 942, + "time_per_iteration": 2.685526132583618 + }, + { + "auxiliary_loss_clip": 0.01173258, + "auxiliary_loss_mlp": 0.01179237, + "balance_loss_clip": 1.00244164, + "balance_loss_mlp": 1.00137711, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.144230264823132, + "language_loss": 0.8895185, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.91304338, + "num_input_tokens_seen": 20164315, + "step": 943, + "time_per_iteration": 2.578176498413086 + }, + { + "auxiliary_loss_clip": 0.01172945, + "auxiliary_loss_mlp": 0.01179365, + "balance_loss_clip": 1.00251484, + "balance_loss_mlp": 1.00131452, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 3.859741847389023, + "language_loss": 0.74712062, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77064371, + "num_input_tokens_seen": 20182760, + "step": 944, + "time_per_iteration": 2.5667643547058105 + }, + { + "auxiliary_loss_clip": 0.01173115, + "auxiliary_loss_mlp": 0.01179399, + "balance_loss_clip": 1.00242472, + "balance_loss_mlp": 1.00134778, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.6369217088399581, + "language_loss": 0.79636192, + "learning_rate": 3.992461825426906e-06, + "loss": 0.81988704, + "num_input_tokens_seen": 20203830, + "step": 945, + "time_per_iteration": 2.5993030071258545 + }, + { + "auxiliary_loss_clip": 0.01172994, + "auxiliary_loss_mlp": 0.01178932, + "balance_loss_clip": 1.00243998, + "balance_loss_mlp": 1.00107217, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 3.03677433544533, + "language_loss": 0.82531673, + "learning_rate": 3.992428005427252e-06, + "loss": 0.848836, + "num_input_tokens_seen": 20220365, + "step": 946, + "time_per_iteration": 2.562394142150879 + }, + { + "auxiliary_loss_clip": 0.01189665, + "auxiliary_loss_mlp": 0.01179451, + "balance_loss_clip": 1.00261581, + "balance_loss_mlp": 1.0012095, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.3290153384197363, + "language_loss": 0.79140472, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81509584, + "num_input_tokens_seen": 20238640, + "step": 947, + "time_per_iteration": 2.509735107421875 + }, + { + "auxiliary_loss_clip": 0.01157313, + "auxiliary_loss_mlp": 0.01179339, + "balance_loss_clip": 1.00245428, + "balance_loss_mlp": 1.00119305, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 3.9325738682313256, + "language_loss": 0.85659391, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.87996042, + "num_input_tokens_seen": 20251025, + "step": 948, + "time_per_iteration": 2.6319758892059326 + }, + { + "auxiliary_loss_clip": 0.01189575, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_clip": 1.00255084, + "balance_loss_mlp": 1.00151992, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.865579430365743, + "language_loss": 0.87090588, + "learning_rate": 3.992326092115019e-06, + "loss": 0.8945964, + "num_input_tokens_seen": 20269775, + "step": 949, + "time_per_iteration": 2.505869150161743 + }, + { + "auxiliary_loss_clip": 0.01172864, + "auxiliary_loss_mlp": 0.01179068, + "balance_loss_clip": 1.00247407, + "balance_loss_mlp": 1.00120735, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 1.9171734068699449, + "language_loss": 0.79222822, + "learning_rate": 3.992291969910811e-06, + "loss": 0.8157475, + "num_input_tokens_seen": 20287715, + "step": 950, + "time_per_iteration": 2.5477848052978516 + }, + { + "auxiliary_loss_clip": 0.01157061, + "auxiliary_loss_mlp": 0.01179326, + "balance_loss_clip": 1.00227165, + "balance_loss_mlp": 1.00136995, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 3.3052741301593267, + "language_loss": 0.8246305, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84799433, + "num_input_tokens_seen": 20307070, + "step": 951, + "time_per_iteration": 2.7418975830078125 + }, + { + "auxiliary_loss_clip": 0.01157465, + "auxiliary_loss_mlp": 0.01179129, + "balance_loss_clip": 1.00234699, + "balance_loss_mlp": 1.00107777, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.4412917983941056, + "language_loss": 0.87318242, + "learning_rate": 3.992223498859958e-06, + "loss": 0.89654839, + "num_input_tokens_seen": 20324945, + "step": 952, + "time_per_iteration": 2.63435959815979 + }, + { + "auxiliary_loss_clip": 0.01156655, + "auxiliary_loss_mlp": 0.01179115, + "balance_loss_clip": 1.00234854, + "balance_loss_mlp": 1.00106454, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.9051047073365577, + "language_loss": 0.79298496, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81634271, + "num_input_tokens_seen": 20346135, + "step": 953, + "time_per_iteration": 2.664764404296875 + }, + { + "auxiliary_loss_clip": 0.01157096, + "auxiliary_loss_mlp": 0.01179501, + "balance_loss_clip": 1.00244331, + "balance_loss_mlp": 1.00164068, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 10.042453905650131, + "language_loss": 0.86850727, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89187324, + "num_input_tokens_seen": 20364450, + "step": 954, + "time_per_iteration": 2.6422533988952637 + }, + { + "auxiliary_loss_clip": 0.01173155, + "auxiliary_loss_mlp": 0.01179318, + "balance_loss_clip": 1.00247836, + "balance_loss_mlp": 1.00117195, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.1141424015160832, + "language_loss": 0.88355088, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90707564, + "num_input_tokens_seen": 20383500, + "step": 955, + "time_per_iteration": 2.5902252197265625 + }, + { + "auxiliary_loss_clip": 0.01157052, + "auxiliary_loss_mlp": 0.01179269, + "balance_loss_clip": 1.00248003, + "balance_loss_mlp": 1.00131369, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 2.0983036924440617, + "language_loss": 0.89381295, + "learning_rate": 3.992085650224914e-06, + "loss": 0.91717619, + "num_input_tokens_seen": 20400295, + "step": 956, + "time_per_iteration": 2.613284111022949 + }, + { + "auxiliary_loss_clip": 0.01139976, + "auxiliary_loss_mlp": 0.01178987, + "balance_loss_clip": 1.0023452, + "balance_loss_mlp": 1.00112665, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.6586836996461944, + "language_loss": 0.75736696, + "learning_rate": 3.99205099921266e-06, + "loss": 0.78055656, + "num_input_tokens_seen": 20419085, + "step": 957, + "time_per_iteration": 2.6756231784820557 + }, + { + "auxiliary_loss_clip": 0.01140439, + "auxiliary_loss_mlp": 0.01178939, + "balance_loss_clip": 1.00229931, + "balance_loss_mlp": 1.00136518, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 1.8363201094282728, + "language_loss": 0.80145299, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82464671, + "num_input_tokens_seen": 20437465, + "step": 958, + "time_per_iteration": 2.660407066345215 + }, + { + "auxiliary_loss_clip": 0.01155908, + "auxiliary_loss_mlp": 0.01179085, + "balance_loss_clip": 1.00224805, + "balance_loss_mlp": 1.00122523, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.8095624183088037, + "language_loss": 0.8837738, + "learning_rate": 3.99198147057315e-06, + "loss": 0.9071238, + "num_input_tokens_seen": 20456235, + "step": 959, + "time_per_iteration": 2.6431357860565186 + }, + { + "auxiliary_loss_clip": 0.01140177, + "auxiliary_loss_mlp": 0.01179115, + "balance_loss_clip": 1.00238371, + "balance_loss_mlp": 1.00125444, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.2113414970062184, + "language_loss": 0.78644174, + "learning_rate": 3.991946592948529e-06, + "loss": 0.80963469, + "num_input_tokens_seen": 20476825, + "step": 960, + "time_per_iteration": 2.7398409843444824 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.01179526, + "balance_loss_clip": 1.00216079, + "balance_loss_mlp": 1.00147462, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 2.1073979769312756, + "language_loss": 0.92911363, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95198607, + "num_input_tokens_seen": 20496965, + "step": 961, + "time_per_iteration": 2.766339063644409 + }, + { + "auxiliary_loss_clip": 0.01156677, + "auxiliary_loss_mlp": 0.0117919, + "balance_loss_clip": 1.0023396, + "balance_loss_mlp": 1.00142527, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 4.772693468441663, + "language_loss": 0.68322515, + "learning_rate": 3.991876611096169e-06, + "loss": 0.7065838, + "num_input_tokens_seen": 20518035, + "step": 962, + "time_per_iteration": 2.687714099884033 + }, + { + "auxiliary_loss_clip": 0.01140347, + "auxiliary_loss_mlp": 0.01179381, + "balance_loss_clip": 1.00224471, + "balance_loss_mlp": 1.0016166, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.3159160837413824, + "language_loss": 0.88345861, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90665591, + "num_input_tokens_seen": 20534740, + "step": 963, + "time_per_iteration": 2.655391216278076 + }, + { + "auxiliary_loss_clip": 0.01157147, + "auxiliary_loss_mlp": 0.01179444, + "balance_loss_clip": 1.00263572, + "balance_loss_mlp": 1.00139332, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 4.106618593019819, + "language_loss": 0.84905231, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87241817, + "num_input_tokens_seen": 20553485, + "step": 964, + "time_per_iteration": 2.6709296703338623 + }, + { + "auxiliary_loss_clip": 0.01156108, + "auxiliary_loss_mlp": 0.01179177, + "balance_loss_clip": 1.00227737, + "balance_loss_mlp": 1.00141251, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 4.406095641255849, + "language_loss": 0.77854842, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80190134, + "num_input_tokens_seen": 20572155, + "step": 965, + "time_per_iteration": 2.6063220500946045 + }, + { + "auxiliary_loss_clip": 0.01142156, + "auxiliary_loss_mlp": 0.01178989, + "balance_loss_clip": 1.00263274, + "balance_loss_mlp": 1.00141454, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 1.9214017961245617, + "language_loss": 0.81473422, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83794558, + "num_input_tokens_seen": 20590395, + "step": 966, + "time_per_iteration": 2.6330182552337646 + }, + { + "auxiliary_loss_clip": 0.01172952, + "auxiliary_loss_mlp": 0.01179122, + "balance_loss_clip": 1.00254524, + "balance_loss_mlp": 1.00135708, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.9084168027651232, + "language_loss": 0.76496994, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78849065, + "num_input_tokens_seen": 20608435, + "step": 967, + "time_per_iteration": 2.5814149379730225 + }, + { + "auxiliary_loss_clip": 0.01173339, + "auxiliary_loss_mlp": 0.01175581, + "balance_loss_clip": 1.00387788, + "balance_loss_mlp": 1.0001049, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.7899679142026519, + "language_loss": 0.57344508, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59693432, + "num_input_tokens_seen": 20668575, + "step": 968, + "time_per_iteration": 3.096327543258667 + }, + { + "auxiliary_loss_clip": 0.01156435, + "auxiliary_loss_mlp": 0.01178939, + "balance_loss_clip": 1.00242257, + "balance_loss_mlp": 1.00117373, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.2456042754015475, + "language_loss": 0.8243351, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84768879, + "num_input_tokens_seen": 20687355, + "step": 969, + "time_per_iteration": 4.005632162094116 + }, + { + "auxiliary_loss_clip": 0.01172917, + "auxiliary_loss_mlp": 0.00749813, + "balance_loss_clip": 1.00247753, + "balance_loss_mlp": 1.00062418, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.11369782237391, + "language_loss": 0.77945536, + "learning_rate": 3.991593662507167e-06, + "loss": 0.79868257, + "num_input_tokens_seen": 20705710, + "step": 970, + "time_per_iteration": 5.482975244522095 + }, + { + "auxiliary_loss_clip": 0.01140187, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_clip": 1.00214422, + "balance_loss_mlp": 1.00127506, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.747148397726227, + "language_loss": 0.92086565, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94405788, + "num_input_tokens_seen": 20722405, + "step": 971, + "time_per_iteration": 2.6546213626861572 + }, + { + "auxiliary_loss_clip": 0.01156261, + "auxiliary_loss_mlp": 0.0117888, + "balance_loss_clip": 1.00221753, + "balance_loss_mlp": 1.00130558, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 1.6499788475166508, + "language_loss": 0.86023188, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88358319, + "num_input_tokens_seen": 20741480, + "step": 972, + "time_per_iteration": 4.05697226524353 + }, + { + "auxiliary_loss_clip": 0.01139676, + "auxiliary_loss_mlp": 0.01179189, + "balance_loss_clip": 1.00211477, + "balance_loss_mlp": 1.00151992, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 3.0567178450108585, + "language_loss": 0.87483323, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89802194, + "num_input_tokens_seen": 20759685, + "step": 973, + "time_per_iteration": 2.716549873352051 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.00749828, + "balance_loss_clip": 1.00248003, + "balance_loss_mlp": 1.0007143, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 2.0731548424593447, + "language_loss": 0.74918693, + "learning_rate": 3.991450375655301e-06, + "loss": 0.7684145, + "num_input_tokens_seen": 20778180, + "step": 974, + "time_per_iteration": 2.5526561737060547 + }, + { + "auxiliary_loss_clip": 0.01172673, + "auxiliary_loss_mlp": 0.00749799, + "balance_loss_clip": 1.00257492, + "balance_loss_mlp": 1.00066209, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 5.102398452772525, + "language_loss": 0.76929355, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78851831, + "num_input_tokens_seen": 20802705, + "step": 975, + "time_per_iteration": 2.7494099140167236 + }, + { + "auxiliary_loss_clip": 0.0118938, + "auxiliary_loss_mlp": 0.01178914, + "balance_loss_clip": 1.00261891, + "balance_loss_mlp": 1.00105381, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 1.979959024381428, + "language_loss": 0.76684284, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79052579, + "num_input_tokens_seen": 20822540, + "step": 976, + "time_per_iteration": 2.552699327468872 + }, + { + "auxiliary_loss_clip": 0.01156347, + "auxiliary_loss_mlp": 0.01179199, + "balance_loss_clip": 1.00232077, + "balance_loss_mlp": 1.00162482, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8070682599098142, + "language_loss": 0.87541908, + "learning_rate": 3.991342117593679e-06, + "loss": 0.8987745, + "num_input_tokens_seen": 20844175, + "step": 977, + "time_per_iteration": 2.71610951423645 + }, + { + "auxiliary_loss_clip": 0.01156482, + "auxiliary_loss_mlp": 0.01178753, + "balance_loss_clip": 1.00231659, + "balance_loss_mlp": 1.00127423, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.5234291838750846, + "language_loss": 0.79402268, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81737506, + "num_input_tokens_seen": 20864730, + "step": 978, + "time_per_iteration": 2.6440818309783936 + }, + { + "auxiliary_loss_clip": 0.01091993, + "auxiliary_loss_mlp": 0.0117941, + "balance_loss_clip": 1.00181711, + "balance_loss_mlp": 1.00164461, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.371206189769331, + "language_loss": 0.80650479, + "learning_rate": 3.991269567990855e-06, + "loss": 0.82921875, + "num_input_tokens_seen": 20885200, + "step": 979, + "time_per_iteration": 3.143779993057251 + }, + { + "auxiliary_loss_clip": 0.01157508, + "auxiliary_loss_mlp": 0.01175527, + "balance_loss_clip": 1.00391936, + "balance_loss_mlp": 1.00005138, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.9244844064373116, + "language_loss": 0.5902406, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61357093, + "num_input_tokens_seen": 20940325, + "step": 980, + "time_per_iteration": 3.463784694671631 + }, + { + "auxiliary_loss_clip": 0.01189318, + "auxiliary_loss_mlp": 0.01179001, + "balance_loss_clip": 1.00267029, + "balance_loss_mlp": 1.00161743, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 5.302424991736767, + "language_loss": 0.86778212, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.89146531, + "num_input_tokens_seen": 20958220, + "step": 981, + "time_per_iteration": 2.5451595783233643 + }, + { + "auxiliary_loss_clip": 0.01156434, + "auxiliary_loss_mlp": 0.01178957, + "balance_loss_clip": 1.00237489, + "balance_loss_mlp": 1.0013833, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 2.196485632937207, + "language_loss": 0.79759729, + "learning_rate": 3.991160177271513e-06, + "loss": 0.82095122, + "num_input_tokens_seen": 20978920, + "step": 982, + "time_per_iteration": 2.6343300342559814 + }, + { + "auxiliary_loss_clip": 0.01158189, + "auxiliary_loss_mlp": 0.01179346, + "balance_loss_clip": 1.00229979, + "balance_loss_mlp": 1.00158155, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 3.378962244810084, + "language_loss": 0.84308225, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86645758, + "num_input_tokens_seen": 20999490, + "step": 983, + "time_per_iteration": 2.6847593784332275 + }, + { + "auxiliary_loss_clip": 0.01172868, + "auxiliary_loss_mlp": 0.01178795, + "balance_loss_clip": 1.00246239, + "balance_loss_mlp": 1.00150692, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.856609764514785, + "language_loss": 0.84821904, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.87173569, + "num_input_tokens_seen": 21017865, + "step": 984, + "time_per_iteration": 2.5482285022735596 + }, + { + "auxiliary_loss_clip": 0.0117302, + "auxiliary_loss_mlp": 0.01178718, + "balance_loss_clip": 1.00256526, + "balance_loss_mlp": 1.00142992, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.7823159864674856, + "language_loss": 0.77680153, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.80031896, + "num_input_tokens_seen": 21035900, + "step": 985, + "time_per_iteration": 2.6052753925323486 + }, + { + "auxiliary_loss_clip": 0.0112439, + "auxiliary_loss_mlp": 0.01178679, + "balance_loss_clip": 1.00236082, + "balance_loss_mlp": 1.00120056, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 1.8978776095441676, + "language_loss": 0.90516055, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92819124, + "num_input_tokens_seen": 21053235, + "step": 986, + "time_per_iteration": 2.6879677772521973 + }, + { + "auxiliary_loss_clip": 0.01172985, + "auxiliary_loss_mlp": 0.01178568, + "balance_loss_clip": 1.00245428, + "balance_loss_mlp": 1.00108945, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 2.3128103984269592, + "language_loss": 0.75917661, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78269219, + "num_input_tokens_seen": 21073090, + "step": 987, + "time_per_iteration": 2.650019407272339 + }, + { + "auxiliary_loss_clip": 0.0117278, + "auxiliary_loss_mlp": 0.0117897, + "balance_loss_clip": 1.00238442, + "balance_loss_mlp": 1.00139618, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.3852463892418525, + "language_loss": 0.71389186, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73740935, + "num_input_tokens_seen": 21094895, + "step": 988, + "time_per_iteration": 2.737992525100708 + }, + { + "auxiliary_loss_clip": 0.01139532, + "auxiliary_loss_mlp": 0.01175522, + "balance_loss_clip": 1.0033102, + "balance_loss_mlp": 1.00004625, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9377979276774521, + "language_loss": 0.71154249, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73469305, + "num_input_tokens_seen": 21147555, + "step": 989, + "time_per_iteration": 3.1097359657287598 + }, + { + "auxiliary_loss_clip": 0.01124221, + "auxiliary_loss_mlp": 0.01178882, + "balance_loss_clip": 1.00218225, + "balance_loss_mlp": 1.0013082, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.882205001991567, + "language_loss": 0.78524709, + "learning_rate": 3.990865146569105e-06, + "loss": 0.8082782, + "num_input_tokens_seen": 21167845, + "step": 990, + "time_per_iteration": 2.690406560897827 + }, + { + "auxiliary_loss_clip": 0.01172683, + "auxiliary_loss_mlp": 0.01178667, + "balance_loss_clip": 1.00236917, + "balance_loss_mlp": 1.00118852, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 1.9593183097414701, + "language_loss": 0.85923111, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88274467, + "num_input_tokens_seen": 21185085, + "step": 991, + "time_per_iteration": 2.6136302947998047 + }, + { + "auxiliary_loss_clip": 0.01189154, + "auxiliary_loss_mlp": 0.01179199, + "balance_loss_clip": 1.00243497, + "balance_loss_mlp": 1.00133884, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.5847777768739, + "language_loss": 0.77140659, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.7950902, + "num_input_tokens_seen": 21204230, + "step": 992, + "time_per_iteration": 2.603383779525757 + }, + { + "auxiliary_loss_clip": 0.01123486, + "auxiliary_loss_mlp": 0.01179049, + "balance_loss_clip": 1.00221872, + "balance_loss_mlp": 1.00128388, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.389108198925134, + "language_loss": 0.75063622, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.77366155, + "num_input_tokens_seen": 21222655, + "step": 993, + "time_per_iteration": 2.7044715881347656 + }, + { + "auxiliary_loss_clip": 0.01140274, + "auxiliary_loss_mlp": 0.0117903, + "balance_loss_clip": 1.00245881, + "balance_loss_mlp": 1.0014559, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 1.8065643214584535, + "language_loss": 0.78515911, + "learning_rate": 3.990715819321712e-06, + "loss": 0.80835223, + "num_input_tokens_seen": 21242310, + "step": 994, + "time_per_iteration": 2.7382123470306396 + }, + { + "auxiliary_loss_clip": 0.01189127, + "auxiliary_loss_mlp": 0.01179132, + "balance_loss_clip": 1.00254631, + "balance_loss_mlp": 1.00174892, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.439707815619212, + "language_loss": 0.79836148, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82204407, + "num_input_tokens_seen": 21261410, + "step": 995, + "time_per_iteration": 2.535489082336426 + }, + { + "auxiliary_loss_clip": 0.01139541, + "auxiliary_loss_mlp": 0.01178649, + "balance_loss_clip": 1.00215816, + "balance_loss_mlp": 1.00107455, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 8.76761155555849, + "language_loss": 0.86834198, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89152396, + "num_input_tokens_seen": 21280080, + "step": 996, + "time_per_iteration": 2.6860973834991455 + }, + { + "auxiliary_loss_clip": 0.01140543, + "auxiliary_loss_mlp": 0.01179071, + "balance_loss_clip": 1.00241375, + "balance_loss_mlp": 1.00168705, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.2495623683294195, + "language_loss": 0.88318765, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90638369, + "num_input_tokens_seen": 21296765, + "step": 997, + "time_per_iteration": 2.665494203567505 + }, + { + "auxiliary_loss_clip": 0.01159016, + "auxiliary_loss_mlp": 0.01175584, + "balance_loss_clip": 1.00387573, + "balance_loss_mlp": 1.000108, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.027275803521941, + "language_loss": 0.7536869, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77703285, + "num_input_tokens_seen": 21363345, + "step": 998, + "time_per_iteration": 3.299713611602783 + }, + { + "auxiliary_loss_clip": 0.0114021, + "auxiliary_loss_mlp": 0.01178716, + "balance_loss_clip": 1.00231934, + "balance_loss_mlp": 1.00152349, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 2.1772041203201975, + "language_loss": 0.75704801, + "learning_rate": 3.990527461790013e-06, + "loss": 0.78023732, + "num_input_tokens_seen": 21385290, + "step": 999, + "time_per_iteration": 2.6994965076446533 + }, + { + "auxiliary_loss_clip": 0.01173023, + "auxiliary_loss_mlp": 0.01178467, + "balance_loss_clip": 1.00235534, + "balance_loss_mlp": 1.00098777, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.79043581699158, + "language_loss": 0.82736027, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85087514, + "num_input_tokens_seen": 21407625, + "step": 1000, + "time_per_iteration": 2.632370948791504 + }, + { + "auxiliary_loss_clip": 0.01156874, + "auxiliary_loss_mlp": 0.01178747, + "balance_loss_clip": 1.00259566, + "balance_loss_mlp": 1.00136364, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 2.3849759549720013, + "language_loss": 0.86429608, + "learning_rate": 3.990451590400309e-06, + "loss": 0.88765228, + "num_input_tokens_seen": 21426835, + "step": 1001, + "time_per_iteration": 2.668161630630493 + }, + { + "auxiliary_loss_clip": 0.01156518, + "auxiliary_loss_mlp": 0.01178484, + "balance_loss_clip": 1.00243568, + "balance_loss_mlp": 1.00129104, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.088673182938162, + "language_loss": 0.74259633, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76594639, + "num_input_tokens_seen": 21444920, + "step": 1002, + "time_per_iteration": 2.6344757080078125 + }, + { + "auxiliary_loss_clip": 0.01188985, + "auxiliary_loss_mlp": 0.01178787, + "balance_loss_clip": 1.00251365, + "balance_loss_mlp": 1.00140357, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 2.396598663436807, + "language_loss": 0.75742662, + "learning_rate": 3.990375417098112e-06, + "loss": 0.78110433, + "num_input_tokens_seen": 21463555, + "step": 1003, + "time_per_iteration": 2.6569161415100098 + }, + { + "auxiliary_loss_clip": 0.01156246, + "auxiliary_loss_mlp": 0.01178371, + "balance_loss_clip": 1.00235295, + "balance_loss_mlp": 1.00127339, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.3035417809345087, + "language_loss": 0.70387006, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72721624, + "num_input_tokens_seen": 21481990, + "step": 1004, + "time_per_iteration": 2.6083786487579346 + }, + { + "auxiliary_loss_clip": 0.01173054, + "auxiliary_loss_mlp": 0.01179043, + "balance_loss_clip": 1.0026083, + "balance_loss_mlp": 1.00165975, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.12979213419192, + "language_loss": 0.83741254, + "learning_rate": 3.990298941894976e-06, + "loss": 0.86093354, + "num_input_tokens_seen": 21500385, + "step": 1005, + "time_per_iteration": 2.578552007675171 + }, + { + "auxiliary_loss_clip": 0.01172417, + "auxiliary_loss_mlp": 0.0117554, + "balance_loss_clip": 1.00369763, + "balance_loss_mlp": 1.00006449, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.8992694537669946, + "language_loss": 0.59048957, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61396915, + "num_input_tokens_seen": 21561040, + "step": 1006, + "time_per_iteration": 4.567206621170044 + }, + { + "auxiliary_loss_clip": 0.01156758, + "auxiliary_loss_mlp": 0.01178638, + "balance_loss_clip": 1.00238431, + "balance_loss_mlp": 1.00115943, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.0984211208172727, + "language_loss": 0.7468375, + "learning_rate": 3.990222164802503e-06, + "loss": 0.77019143, + "num_input_tokens_seen": 21580655, + "step": 1007, + "time_per_iteration": 2.5989089012145996 + }, + { + "auxiliary_loss_clip": 0.01156513, + "auxiliary_loss_mlp": 0.01178663, + "balance_loss_clip": 1.00247121, + "balance_loss_mlp": 1.00127947, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 2.4841488847097937, + "language_loss": 0.80799955, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83135134, + "num_input_tokens_seen": 21599650, + "step": 1008, + "time_per_iteration": 5.526517868041992 + }, + { + "auxiliary_loss_clip": 0.01141431, + "auxiliary_loss_mlp": 0.01178549, + "balance_loss_clip": 1.00234222, + "balance_loss_mlp": 1.001261, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 10.304432808268706, + "language_loss": 0.78527576, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80847555, + "num_input_tokens_seen": 21617550, + "step": 1009, + "time_per_iteration": 4.042831897735596 + }, + { + "auxiliary_loss_clip": 0.01172897, + "auxiliary_loss_mlp": 0.0117834, + "balance_loss_clip": 1.00259006, + "balance_loss_mlp": 1.00114703, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 1.8255407544771858, + "language_loss": 0.9269594, + "learning_rate": 3.990106433146769e-06, + "loss": 0.9504717, + "num_input_tokens_seen": 21635865, + "step": 1010, + "time_per_iteration": 2.596938133239746 + }, + { + "auxiliary_loss_clip": 0.01107485, + "auxiliary_loss_mlp": 0.00749924, + "balance_loss_clip": 1.0022856, + "balance_loss_mlp": 1.00095665, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 2.982665069234647, + "language_loss": 0.71347028, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.73204434, + "num_input_tokens_seen": 21653945, + "step": 1011, + "time_per_iteration": 2.7419869899749756 + }, + { + "auxiliary_loss_clip": 0.01172986, + "auxiliary_loss_mlp": 0.01178843, + "balance_loss_clip": 1.0026803, + "balance_loss_mlp": 1.00145912, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 3.5040837774657767, + "language_loss": 0.87381375, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89733201, + "num_input_tokens_seen": 21671230, + "step": 1012, + "time_per_iteration": 2.570791482925415 + }, + { + "auxiliary_loss_clip": 0.01173013, + "auxiliary_loss_mlp": 0.01178599, + "balance_loss_clip": 1.00247955, + "balance_loss_mlp": 1.00131106, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 3.4657147349144792, + "language_loss": 0.7723428, + "learning_rate": 3.989990022305734e-06, + "loss": 0.79585898, + "num_input_tokens_seen": 21691155, + "step": 1013, + "time_per_iteration": 2.5937623977661133 + }, + { + "auxiliary_loss_clip": 0.01172516, + "auxiliary_loss_mlp": 0.00749836, + "balance_loss_clip": 1.00261402, + "balance_loss_mlp": 1.000808, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.503459817758103, + "language_loss": 0.85899794, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87822145, + "num_input_tokens_seen": 21707405, + "step": 1014, + "time_per_iteration": 2.560631513595581 + }, + { + "auxiliary_loss_clip": 0.01172707, + "auxiliary_loss_mlp": 0.0117916, + "balance_loss_clip": 1.00247228, + "balance_loss_mlp": 1.00168109, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.0243163751931097, + "language_loss": 0.73464459, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75816327, + "num_input_tokens_seen": 21728090, + "step": 1015, + "time_per_iteration": 2.6012983322143555 + }, + { + "auxiliary_loss_clip": 0.01155853, + "auxiliary_loss_mlp": 0.01178431, + "balance_loss_clip": 1.0024606, + "balance_loss_mlp": 1.00142884, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.7296194176499788, + "language_loss": 0.7927379, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81608069, + "num_input_tokens_seen": 21747950, + "step": 1016, + "time_per_iteration": 2.653326988220215 + }, + { + "auxiliary_loss_clip": 0.01139828, + "auxiliary_loss_mlp": 0.01178433, + "balance_loss_clip": 1.00229955, + "balance_loss_mlp": 1.00133538, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.9659847039870206, + "language_loss": 0.7636342, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78681684, + "num_input_tokens_seen": 21767900, + "step": 1017, + "time_per_iteration": 2.6725974082946777 + }, + { + "auxiliary_loss_clip": 0.01156827, + "auxiliary_loss_mlp": 0.01179353, + "balance_loss_clip": 1.00256419, + "balance_loss_mlp": 1.00187397, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 1.835337816523148, + "language_loss": 0.85968363, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88304538, + "num_input_tokens_seen": 21787375, + "step": 1018, + "time_per_iteration": 2.6059021949768066 + }, + { + "auxiliary_loss_clip": 0.01141518, + "auxiliary_loss_mlp": 0.01178726, + "balance_loss_clip": 1.00237632, + "balance_loss_mlp": 1.00153375, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 3.2996793822680712, + "language_loss": 0.77077234, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79397476, + "num_input_tokens_seen": 21806275, + "step": 1019, + "time_per_iteration": 2.618093967437744 + }, + { + "auxiliary_loss_clip": 0.01123122, + "auxiliary_loss_mlp": 0.01178388, + "balance_loss_clip": 1.00229752, + "balance_loss_mlp": 1.00119543, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.9287444620641838, + "language_loss": 0.84270632, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86572146, + "num_input_tokens_seen": 21826430, + "step": 1020, + "time_per_iteration": 2.7369518280029297 + }, + { + "auxiliary_loss_clip": 0.01172776, + "auxiliary_loss_mlp": 0.01178555, + "balance_loss_clip": 1.00255704, + "balance_loss_mlp": 1.00145781, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.1174492974085335, + "language_loss": 0.79364562, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81715894, + "num_input_tokens_seen": 21847800, + "step": 1021, + "time_per_iteration": 2.710829496383667 + }, + { + "auxiliary_loss_clip": 0.01156864, + "auxiliary_loss_mlp": 0.01179023, + "balance_loss_clip": 1.0024693, + "balance_loss_mlp": 1.00163984, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 1.9408386864350748, + "language_loss": 0.87505722, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89841604, + "num_input_tokens_seen": 21863385, + "step": 1022, + "time_per_iteration": 2.6257357597351074 + }, + { + "auxiliary_loss_clip": 0.01158365, + "auxiliary_loss_mlp": 0.01178673, + "balance_loss_clip": 1.00265157, + "balance_loss_mlp": 1.00157547, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 2.095012918825352, + "language_loss": 0.82975346, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85312378, + "num_input_tokens_seen": 21881880, + "step": 1023, + "time_per_iteration": 2.6299169063568115 + }, + { + "auxiliary_loss_clip": 0.01188666, + "auxiliary_loss_mlp": 0.01174832, + "balance_loss_clip": 1.00372386, + "balance_loss_mlp": 1.00011849, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8945563435445498, + "language_loss": 0.65072769, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67436266, + "num_input_tokens_seen": 21940550, + "step": 1024, + "time_per_iteration": 3.180811643600464 + }, + { + "auxiliary_loss_clip": 0.01156928, + "auxiliary_loss_mlp": 0.01178981, + "balance_loss_clip": 1.00256658, + "balance_loss_mlp": 1.00150251, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 1.8811377512490222, + "language_loss": 0.87960547, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90296459, + "num_input_tokens_seen": 21958390, + "step": 1025, + "time_per_iteration": 2.6292238235473633 + }, + { + "auxiliary_loss_clip": 0.01156483, + "auxiliary_loss_mlp": 0.01178887, + "balance_loss_clip": 1.00254154, + "balance_loss_mlp": 1.00140786, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.7058790737289893, + "language_loss": 0.84573638, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86909008, + "num_input_tokens_seen": 21978625, + "step": 1026, + "time_per_iteration": 2.648756265640259 + }, + { + "auxiliary_loss_clip": 0.01140448, + "auxiliary_loss_mlp": 0.01178617, + "balance_loss_clip": 1.00239134, + "balance_loss_mlp": 1.00132942, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.9151564651624455, + "language_loss": 0.8245281, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84771883, + "num_input_tokens_seen": 21996035, + "step": 1027, + "time_per_iteration": 2.6222026348114014 + }, + { + "auxiliary_loss_clip": 0.01123006, + "auxiliary_loss_mlp": 0.01178628, + "balance_loss_clip": 1.00229168, + "balance_loss_mlp": 1.00124502, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.244740642953027, + "language_loss": 0.84357977, + "learning_rate": 3.989397781719663e-06, + "loss": 0.8665961, + "num_input_tokens_seen": 22011625, + "step": 1028, + "time_per_iteration": 2.6805269718170166 + }, + { + "auxiliary_loss_clip": 0.01155471, + "auxiliary_loss_mlp": 0.01174795, + "balance_loss_clip": 1.00358701, + "balance_loss_mlp": 1.00008202, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.943819222865045, + "language_loss": 0.60446692, + "learning_rate": 3.989357695452323e-06, + "loss": 0.62776959, + "num_input_tokens_seen": 22066035, + "step": 1029, + "time_per_iteration": 3.008068799972534 + }, + { + "auxiliary_loss_clip": 0.01156103, + "auxiliary_loss_mlp": 0.0117858, + "balance_loss_clip": 1.00249791, + "balance_loss_mlp": 1.0013876, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 3.656077693652529, + "language_loss": 0.82613021, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84947705, + "num_input_tokens_seen": 22085015, + "step": 1030, + "time_per_iteration": 2.619642972946167 + }, + { + "auxiliary_loss_clip": 0.01188927, + "auxiliary_loss_mlp": 0.01178551, + "balance_loss_clip": 1.0026865, + "balance_loss_mlp": 1.00135851, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.479526337928104, + "language_loss": 0.79989445, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82356918, + "num_input_tokens_seen": 22102775, + "step": 1031, + "time_per_iteration": 2.5278079509735107 + }, + { + "auxiliary_loss_clip": 0.01155993, + "auxiliary_loss_mlp": 0.01178478, + "balance_loss_clip": 1.00247002, + "balance_loss_mlp": 1.00166643, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.9961182953576588, + "language_loss": 0.77745539, + "learning_rate": 3.98923698403654e-06, + "loss": 0.80080014, + "num_input_tokens_seen": 22121680, + "step": 1032, + "time_per_iteration": 2.612236261367798 + }, + { + "auxiliary_loss_clip": 0.01172751, + "auxiliary_loss_mlp": 0.01178421, + "balance_loss_clip": 1.00244415, + "balance_loss_mlp": 1.0013237, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 2.1956800552622275, + "language_loss": 0.89556825, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91908002, + "num_input_tokens_seen": 22138155, + "step": 1033, + "time_per_iteration": 2.5406486988067627 + }, + { + "auxiliary_loss_clip": 0.01172595, + "auxiliary_loss_mlp": 0.01178497, + "balance_loss_clip": 1.00243568, + "balance_loss_mlp": 1.00140011, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.0987265811871945, + "language_loss": 0.85058165, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87409258, + "num_input_tokens_seen": 22157420, + "step": 1034, + "time_per_iteration": 2.580322027206421 + }, + { + "auxiliary_loss_clip": 0.0115608, + "auxiliary_loss_mlp": 0.01178397, + "balance_loss_clip": 1.00244284, + "balance_loss_mlp": 1.00158548, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 1.7585786945474973, + "language_loss": 0.809062, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83240664, + "num_input_tokens_seen": 22178620, + "step": 1035, + "time_per_iteration": 2.693734884262085 + }, + { + "auxiliary_loss_clip": 0.01123114, + "auxiliary_loss_mlp": 0.01178383, + "balance_loss_clip": 1.00223899, + "balance_loss_mlp": 1.00138092, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.032343535488329, + "language_loss": 0.78521335, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80822825, + "num_input_tokens_seen": 22197125, + "step": 1036, + "time_per_iteration": 2.720520257949829 + }, + { + "auxiliary_loss_clip": 0.01172346, + "auxiliary_loss_mlp": 0.01178592, + "balance_loss_clip": 1.00240195, + "balance_loss_mlp": 1.00159013, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.8029306180878004, + "language_loss": 0.8658433, + "learning_rate": 3.989034289722739e-06, + "loss": 0.88935268, + "num_input_tokens_seen": 22217575, + "step": 1037, + "time_per_iteration": 2.576186180114746 + }, + { + "auxiliary_loss_clip": 0.01172348, + "auxiliary_loss_mlp": 0.01178175, + "balance_loss_clip": 1.0025444, + "balance_loss_mlp": 1.00126815, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.358936067861055, + "language_loss": 0.81367731, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83718252, + "num_input_tokens_seen": 22236840, + "step": 1038, + "time_per_iteration": 2.6063456535339355 + }, + { + "auxiliary_loss_clip": 0.01140022, + "auxiliary_loss_mlp": 0.01178387, + "balance_loss_clip": 1.00245547, + "balance_loss_mlp": 1.00167155, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 5.328639928993333, + "language_loss": 0.85363257, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87681663, + "num_input_tokens_seen": 22256465, + "step": 1039, + "time_per_iteration": 2.6332995891571045 + }, + { + "auxiliary_loss_clip": 0.01156185, + "auxiliary_loss_mlp": 0.01178201, + "balance_loss_clip": 1.00254536, + "balance_loss_mlp": 1.00129461, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 2.0064692116153835, + "language_loss": 0.80735677, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83070058, + "num_input_tokens_seen": 22274025, + "step": 1040, + "time_per_iteration": 2.583491802215576 + }, + { + "auxiliary_loss_clip": 0.01189193, + "auxiliary_loss_mlp": 0.01178231, + "balance_loss_clip": 1.00298095, + "balance_loss_mlp": 1.00122881, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.2331133449634306, + "language_loss": 0.70055425, + "learning_rate": 3.988870776623685e-06, + "loss": 0.7242285, + "num_input_tokens_seen": 22292245, + "step": 1041, + "time_per_iteration": 2.575428009033203 + }, + { + "auxiliary_loss_clip": 0.01188658, + "auxiliary_loss_mlp": 0.01177937, + "balance_loss_clip": 1.00249231, + "balance_loss_mlp": 1.0012207, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 2.0725247074655564, + "language_loss": 0.81266415, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83633012, + "num_input_tokens_seen": 22311455, + "step": 1042, + "time_per_iteration": 2.5589091777801514 + }, + { + "auxiliary_loss_clip": 0.01188758, + "auxiliary_loss_mlp": 0.01177744, + "balance_loss_clip": 1.0026623, + "balance_loss_mlp": 1.00112367, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 2.4092965594749782, + "language_loss": 0.7580297, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78169477, + "num_input_tokens_seen": 22333750, + "step": 1043, + "time_per_iteration": 2.687042474746704 + }, + { + "auxiliary_loss_clip": 0.01172088, + "auxiliary_loss_mlp": 0.01178122, + "balance_loss_clip": 1.00249171, + "balance_loss_mlp": 1.00159729, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.9970695992982417, + "language_loss": 0.92223024, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94573241, + "num_input_tokens_seen": 22351940, + "step": 1044, + "time_per_iteration": 4.028000116348267 + }, + { + "auxiliary_loss_clip": 0.01171948, + "auxiliary_loss_mlp": 0.01178217, + "balance_loss_clip": 1.00232148, + "balance_loss_mlp": 1.0015012, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 2.30172428179067, + "language_loss": 0.85890394, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88240564, + "num_input_tokens_seen": 22372085, + "step": 1045, + "time_per_iteration": 4.054581642150879 + }, + { + "auxiliary_loss_clip": 0.01155382, + "auxiliary_loss_mlp": 0.01178073, + "balance_loss_clip": 1.00220251, + "balance_loss_mlp": 1.00145221, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.293241002113271, + "language_loss": 0.78236032, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80569482, + "num_input_tokens_seen": 22392020, + "step": 1046, + "time_per_iteration": 2.781778573989868 + }, + { + "auxiliary_loss_clip": 0.01172206, + "auxiliary_loss_mlp": 0.01178318, + "balance_loss_clip": 1.00262833, + "balance_loss_mlp": 1.00160241, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.19468610627808, + "language_loss": 0.77329624, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79680145, + "num_input_tokens_seen": 22411180, + "step": 1047, + "time_per_iteration": 5.471964359283447 + }, + { + "auxiliary_loss_clip": 0.01172378, + "auxiliary_loss_mlp": 0.01178133, + "balance_loss_clip": 1.00245118, + "balance_loss_mlp": 1.00132155, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.1451463978145426, + "language_loss": 0.77243793, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79594302, + "num_input_tokens_seen": 22435105, + "step": 1048, + "time_per_iteration": 2.771275520324707 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.0117832, + "balance_loss_clip": 1.00236559, + "balance_loss_mlp": 1.00160432, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.1347068694838676, + "language_loss": 0.78027463, + "learning_rate": 3.988540130453087e-06, + "loss": 0.80361325, + "num_input_tokens_seen": 22452710, + "step": 1049, + "time_per_iteration": 2.628445863723755 + }, + { + "auxiliary_loss_clip": 0.01172097, + "auxiliary_loss_mlp": 0.01178183, + "balance_loss_clip": 1.00244927, + "balance_loss_mlp": 1.00127673, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.616443291049237, + "language_loss": 0.83144605, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85494888, + "num_input_tokens_seen": 22470175, + "step": 1050, + "time_per_iteration": 2.551180601119995 + }, + { + "auxiliary_loss_clip": 0.01188682, + "auxiliary_loss_mlp": 0.01178186, + "balance_loss_clip": 1.00272107, + "balance_loss_mlp": 1.00147057, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 1.968393742197769, + "language_loss": 0.76865351, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79232216, + "num_input_tokens_seen": 22490020, + "step": 1051, + "time_per_iteration": 2.5277349948883057 + }, + { + "auxiliary_loss_clip": 0.01155826, + "auxiliary_loss_mlp": 0.01178368, + "balance_loss_clip": 1.00249028, + "balance_loss_mlp": 1.00165224, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 3.1341902767736296, + "language_loss": 0.79983598, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82317793, + "num_input_tokens_seen": 22509685, + "step": 1052, + "time_per_iteration": 2.643990993499756 + }, + { + "auxiliary_loss_clip": 0.01188909, + "auxiliary_loss_mlp": 0.01177812, + "balance_loss_clip": 1.00281858, + "balance_loss_mlp": 1.00119138, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.523558539133835, + "language_loss": 0.78043008, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80409729, + "num_input_tokens_seen": 22527905, + "step": 1053, + "time_per_iteration": 2.50258469581604 + }, + { + "auxiliary_loss_clip": 0.01155775, + "auxiliary_loss_mlp": 0.007498, + "balance_loss_clip": 1.00247085, + "balance_loss_mlp": 1.00087237, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 2.045870494064391, + "language_loss": 0.84574318, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8647989, + "num_input_tokens_seen": 22546335, + "step": 1054, + "time_per_iteration": 2.6586387157440186 + }, + { + "auxiliary_loss_clip": 0.01156403, + "auxiliary_loss_mlp": 0.01177737, + "balance_loss_clip": 1.00258899, + "balance_loss_mlp": 1.00130773, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 3.5345166631761558, + "language_loss": 0.85502195, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87836337, + "num_input_tokens_seen": 22563885, + "step": 1055, + "time_per_iteration": 2.58966326713562 + }, + { + "auxiliary_loss_clip": 0.01139466, + "auxiliary_loss_mlp": 0.01178091, + "balance_loss_clip": 1.0023433, + "balance_loss_mlp": 1.00118434, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 3.052662331743328, + "language_loss": 0.81338763, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83656323, + "num_input_tokens_seen": 22583035, + "step": 1056, + "time_per_iteration": 2.690483570098877 + }, + { + "auxiliary_loss_clip": 0.01106788, + "auxiliary_loss_mlp": 0.01177847, + "balance_loss_clip": 1.00215745, + "balance_loss_mlp": 1.0011313, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 4.608554778190267, + "language_loss": 0.8126694, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83551574, + "num_input_tokens_seen": 22605055, + "step": 1057, + "time_per_iteration": 2.754131555557251 + }, + { + "auxiliary_loss_clip": 0.01142258, + "auxiliary_loss_mlp": 0.01177699, + "balance_loss_clip": 1.00251269, + "balance_loss_mlp": 1.00155556, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 1.7616695037669556, + "language_loss": 0.83522946, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85842907, + "num_input_tokens_seen": 22623760, + "step": 1058, + "time_per_iteration": 2.7036125659942627 + }, + { + "auxiliary_loss_clip": 0.01155805, + "auxiliary_loss_mlp": 0.0117792, + "balance_loss_clip": 1.00245976, + "balance_loss_mlp": 1.00129938, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 2.322996232706264, + "language_loss": 0.87456232, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89789957, + "num_input_tokens_seen": 22643000, + "step": 1059, + "time_per_iteration": 2.6141929626464844 + }, + { + "auxiliary_loss_clip": 0.01139578, + "auxiliary_loss_mlp": 0.01178066, + "balance_loss_clip": 1.00244641, + "balance_loss_mlp": 1.00125456, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.1943549675748146, + "language_loss": 0.9132275, + "learning_rate": 3.988077612246394e-06, + "loss": 0.93640393, + "num_input_tokens_seen": 22660460, + "step": 1060, + "time_per_iteration": 2.6298792362213135 + }, + { + "auxiliary_loss_clip": 0.01156079, + "auxiliary_loss_mlp": 0.01177893, + "balance_loss_clip": 1.00237203, + "balance_loss_mlp": 1.00127268, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 2.080804322490593, + "language_loss": 0.8725915, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89593124, + "num_input_tokens_seen": 22679270, + "step": 1061, + "time_per_iteration": 2.584869861602783 + }, + { + "auxiliary_loss_clip": 0.01155532, + "auxiliary_loss_mlp": 0.01177664, + "balance_loss_clip": 1.00240517, + "balance_loss_mlp": 1.00113845, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.260560437802618, + "language_loss": 0.77261239, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79594433, + "num_input_tokens_seen": 22699330, + "step": 1062, + "time_per_iteration": 2.716048002243042 + }, + { + "auxiliary_loss_clip": 0.01138976, + "auxiliary_loss_mlp": 0.01177773, + "balance_loss_clip": 1.00214314, + "balance_loss_mlp": 1.00143874, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.428198440644362, + "language_loss": 0.86418468, + "learning_rate": 3.987949887677459e-06, + "loss": 0.88735217, + "num_input_tokens_seen": 22717945, + "step": 1063, + "time_per_iteration": 2.667886972427368 + }, + { + "auxiliary_loss_clip": 0.01188512, + "auxiliary_loss_mlp": 0.01178002, + "balance_loss_clip": 1.00244665, + "balance_loss_mlp": 1.00147724, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.4053857508647134, + "language_loss": 0.80225492, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.8259201, + "num_input_tokens_seen": 22736790, + "step": 1064, + "time_per_iteration": 2.583479642868042 + }, + { + "auxiliary_loss_clip": 0.01172418, + "auxiliary_loss_mlp": 0.01178044, + "balance_loss_clip": 1.00262475, + "balance_loss_mlp": 1.00161433, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.2450298732236704, + "language_loss": 0.8414346, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86493921, + "num_input_tokens_seen": 22754745, + "step": 1065, + "time_per_iteration": 2.575141429901123 + }, + { + "auxiliary_loss_clip": 0.01139453, + "auxiliary_loss_mlp": 0.01177896, + "balance_loss_clip": 1.00228894, + "balance_loss_mlp": 1.00137043, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.5478538048962094, + "language_loss": 0.68784189, + "learning_rate": 3.987821484659211e-06, + "loss": 0.71101534, + "num_input_tokens_seen": 22776780, + "step": 1066, + "time_per_iteration": 2.8448994159698486 + }, + { + "auxiliary_loss_clip": 0.01188504, + "auxiliary_loss_mlp": 0.01177918, + "balance_loss_clip": 1.00254655, + "balance_loss_mlp": 1.00158358, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 2.265226206995748, + "language_loss": 0.90167248, + "learning_rate": 3.987778532894181e-06, + "loss": 0.92533672, + "num_input_tokens_seen": 22793915, + "step": 1067, + "time_per_iteration": 2.559548854827881 + }, + { + "auxiliary_loss_clip": 0.01155681, + "auxiliary_loss_mlp": 0.01177724, + "balance_loss_clip": 1.00247765, + "balance_loss_mlp": 1.0013901, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 3.5224615274364646, + "language_loss": 0.83551073, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85884476, + "num_input_tokens_seen": 22812670, + "step": 1068, + "time_per_iteration": 2.642524003982544 + }, + { + "auxiliary_loss_clip": 0.01156054, + "auxiliary_loss_mlp": 0.0117798, + "balance_loss_clip": 1.00257671, + "balance_loss_mlp": 1.00164521, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.423875885059745, + "language_loss": 0.89546466, + "learning_rate": 3.987692403235471e-06, + "loss": 0.918805, + "num_input_tokens_seen": 22832440, + "step": 1069, + "time_per_iteration": 2.685324192047119 + }, + { + "auxiliary_loss_clip": 0.01157453, + "auxiliary_loss_mlp": 0.01177943, + "balance_loss_clip": 1.00221062, + "balance_loss_mlp": 1.00170386, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.5162877864610516, + "language_loss": 0.95878267, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98213655, + "num_input_tokens_seen": 22845495, + "step": 1070, + "time_per_iteration": 2.564091682434082 + }, + { + "auxiliary_loss_clip": 0.01123046, + "auxiliary_loss_mlp": 0.0117773, + "balance_loss_clip": 1.00222194, + "balance_loss_mlp": 1.00130033, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 2.0655414123707185, + "language_loss": 0.88215601, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90516376, + "num_input_tokens_seen": 22865390, + "step": 1071, + "time_per_iteration": 2.6722750663757324 + }, + { + "auxiliary_loss_clip": 0.01139599, + "auxiliary_loss_mlp": 0.0117785, + "balance_loss_clip": 1.00225902, + "balance_loss_mlp": 1.00161147, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.722109477089185, + "language_loss": 0.76049972, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78367418, + "num_input_tokens_seen": 22885495, + "step": 1072, + "time_per_iteration": 2.6625077724456787 + }, + { + "auxiliary_loss_clip": 0.01156181, + "auxiliary_loss_mlp": 0.01177615, + "balance_loss_clip": 1.00252497, + "balance_loss_mlp": 1.00128102, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 2.138440299785174, + "language_loss": 0.80739015, + "learning_rate": 3.987519239449226e-06, + "loss": 0.83072805, + "num_input_tokens_seen": 22904845, + "step": 1073, + "time_per_iteration": 2.627183198928833 + }, + { + "auxiliary_loss_clip": 0.01171787, + "auxiliary_loss_mlp": 0.01177494, + "balance_loss_clip": 1.00247121, + "balance_loss_mlp": 1.00135016, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7378541590644865, + "language_loss": 0.80434763, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82784045, + "num_input_tokens_seen": 22925940, + "step": 1074, + "time_per_iteration": 2.6160194873809814 + }, + { + "auxiliary_loss_clip": 0.01156053, + "auxiliary_loss_mlp": 0.01177576, + "balance_loss_clip": 1.00242269, + "balance_loss_mlp": 1.00143218, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 1.60939894651421, + "language_loss": 0.7905249, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81386119, + "num_input_tokens_seen": 22944375, + "step": 1075, + "time_per_iteration": 2.602132797241211 + }, + { + "auxiliary_loss_clip": 0.01156486, + "auxiliary_loss_mlp": 0.0117744, + "balance_loss_clip": 1.00260353, + "balance_loss_mlp": 1.00139177, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 3.432519957160789, + "language_loss": 0.87453157, + "learning_rate": 3.987388575251055e-06, + "loss": 0.89787078, + "num_input_tokens_seen": 22959145, + "step": 1076, + "time_per_iteration": 2.6451165676116943 + }, + { + "auxiliary_loss_clip": 0.01172328, + "auxiliary_loss_mlp": 0.01177341, + "balance_loss_clip": 1.00256848, + "balance_loss_mlp": 1.00119781, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 1.8772938680002373, + "language_loss": 0.80580449, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82930124, + "num_input_tokens_seen": 22978100, + "step": 1077, + "time_per_iteration": 2.5385146141052246 + }, + { + "auxiliary_loss_clip": 0.01156017, + "auxiliary_loss_mlp": 0.01177597, + "balance_loss_clip": 1.00251627, + "balance_loss_mlp": 1.00126266, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.045570502900855, + "language_loss": 0.9146663, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93800247, + "num_input_tokens_seen": 22997285, + "step": 1078, + "time_per_iteration": 2.645472764968872 + }, + { + "auxiliary_loss_clip": 0.01188596, + "auxiliary_loss_mlp": 0.01177722, + "balance_loss_clip": 1.00263143, + "balance_loss_mlp": 1.00129223, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.378790682027337, + "language_loss": 0.7873646, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81102777, + "num_input_tokens_seen": 23016285, + "step": 1079, + "time_per_iteration": 2.5165886878967285 + }, + { + "auxiliary_loss_clip": 0.01139233, + "auxiliary_loss_mlp": 0.01177632, + "balance_loss_clip": 1.00225842, + "balance_loss_mlp": 1.00139332, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.0748382405677703, + "language_loss": 0.6939106, + "learning_rate": 3.987213301260294e-06, + "loss": 0.71707922, + "num_input_tokens_seen": 23036420, + "step": 1080, + "time_per_iteration": 2.725353479385376 + }, + { + "auxiliary_loss_clip": 0.01122748, + "auxiliary_loss_mlp": 0.01177448, + "balance_loss_clip": 1.00211239, + "balance_loss_mlp": 1.00149488, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 2.872298172601235, + "language_loss": 0.72054237, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74354434, + "num_input_tokens_seen": 23056945, + "step": 1081, + "time_per_iteration": 2.7172763347625732 + }, + { + "auxiliary_loss_clip": 0.01125489, + "auxiliary_loss_mlp": 0.01177324, + "balance_loss_clip": 1.00231862, + "balance_loss_mlp": 1.00137162, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.786921116087925, + "language_loss": 0.84222806, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86525619, + "num_input_tokens_seen": 23074940, + "step": 1082, + "time_per_iteration": 4.052046060562134 + }, + { + "auxiliary_loss_clip": 0.01172361, + "auxiliary_loss_mlp": 0.01177775, + "balance_loss_clip": 1.00244546, + "balance_loss_mlp": 1.00144053, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.7836311578804773, + "language_loss": 0.82373929, + "learning_rate": 3.987081054530478e-06, + "loss": 0.84724057, + "num_input_tokens_seen": 23093420, + "step": 1083, + "time_per_iteration": 3.9526257514953613 + }, + { + "auxiliary_loss_clip": 0.01139059, + "auxiliary_loss_mlp": 0.0117749, + "balance_loss_clip": 1.00220919, + "balance_loss_mlp": 1.00125098, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.613110337526892, + "language_loss": 0.79107946, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81424493, + "num_input_tokens_seen": 23111550, + "step": 1084, + "time_per_iteration": 4.125804424285889 + }, + { + "auxiliary_loss_clip": 0.01156359, + "auxiliary_loss_mlp": 0.01177225, + "balance_loss_clip": 1.00247288, + "balance_loss_mlp": 1.00108171, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.7762498704964464, + "language_loss": 0.66800022, + "learning_rate": 3.986992513289584e-06, + "loss": 0.69133604, + "num_input_tokens_seen": 23130335, + "step": 1085, + "time_per_iteration": 4.078576326370239 + }, + { + "auxiliary_loss_clip": 0.01155921, + "auxiliary_loss_mlp": 0.01177713, + "balance_loss_clip": 1.00247526, + "balance_loss_mlp": 1.00166428, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.8868352315072523, + "language_loss": 0.76793683, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79127318, + "num_input_tokens_seen": 23152380, + "step": 1086, + "time_per_iteration": 2.682666301727295 + }, + { + "auxiliary_loss_clip": 0.01171678, + "auxiliary_loss_mlp": 0.01177173, + "balance_loss_clip": 1.00242221, + "balance_loss_mlp": 1.00112534, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.3853794480989357, + "language_loss": 0.85796297, + "learning_rate": 3.986903670660872e-06, + "loss": 0.88145149, + "num_input_tokens_seen": 23171630, + "step": 1087, + "time_per_iteration": 2.535092830657959 + }, + { + "auxiliary_loss_clip": 0.01155141, + "auxiliary_loss_mlp": 0.01177722, + "balance_loss_clip": 1.0023191, + "balance_loss_mlp": 1.00138772, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 2.062439056518037, + "language_loss": 0.78100991, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80433851, + "num_input_tokens_seen": 23192520, + "step": 1088, + "time_per_iteration": 2.6260290145874023 + }, + { + "auxiliary_loss_clip": 0.01155673, + "auxiliary_loss_mlp": 0.01177462, + "balance_loss_clip": 1.00242305, + "balance_loss_mlp": 1.0014137, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 2.0358007893126815, + "language_loss": 0.71205497, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73538631, + "num_input_tokens_seen": 23210710, + "step": 1089, + "time_per_iteration": 2.5915579795837402 + }, + { + "auxiliary_loss_clip": 0.01156017, + "auxiliary_loss_mlp": 0.00749837, + "balance_loss_clip": 1.0024693, + "balance_loss_mlp": 1.00084257, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.7648362139619203, + "language_loss": 0.85623276, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87529135, + "num_input_tokens_seen": 23230305, + "step": 1090, + "time_per_iteration": 2.837444305419922 + }, + { + "auxiliary_loss_clip": 0.01188409, + "auxiliary_loss_mlp": 0.01177437, + "balance_loss_clip": 1.00261593, + "balance_loss_mlp": 1.00138843, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 1.8511059329686421, + "language_loss": 0.72125828, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74491674, + "num_input_tokens_seen": 23249015, + "step": 1091, + "time_per_iteration": 2.565906286239624 + }, + { + "auxiliary_loss_clip": 0.01089591, + "auxiliary_loss_mlp": 0.01177526, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00138283, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 21.410085893837454, + "language_loss": 0.82680464, + "learning_rate": 3.986680245605936e-06, + "loss": 0.8494758, + "num_input_tokens_seen": 23265105, + "step": 1092, + "time_per_iteration": 2.985450267791748 + }, + { + "auxiliary_loss_clip": 0.01188171, + "auxiliary_loss_mlp": 0.01177438, + "balance_loss_clip": 1.00236177, + "balance_loss_mlp": 1.00129485, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 3.1071779979061707, + "language_loss": 0.71190655, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73556268, + "num_input_tokens_seen": 23283950, + "step": 1093, + "time_per_iteration": 2.778775691986084 + }, + { + "auxiliary_loss_clip": 0.01155051, + "auxiliary_loss_mlp": 0.01177585, + "balance_loss_clip": 1.00233054, + "balance_loss_mlp": 1.00125015, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.5411523357912082, + "language_loss": 0.88175952, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90508592, + "num_input_tokens_seen": 23305005, + "step": 1094, + "time_per_iteration": 2.6501708030700684 + }, + { + "auxiliary_loss_clip": 0.01155579, + "auxiliary_loss_mlp": 0.01177416, + "balance_loss_clip": 1.00248027, + "balance_loss_mlp": 1.00136781, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.5124546352321768, + "language_loss": 0.8140927, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83742267, + "num_input_tokens_seen": 23323220, + "step": 1095, + "time_per_iteration": 2.646559476852417 + }, + { + "auxiliary_loss_clip": 0.01138489, + "auxiliary_loss_mlp": 0.01177087, + "balance_loss_clip": 1.00207424, + "balance_loss_mlp": 1.00122929, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.4300588846741364, + "language_loss": 0.69986236, + "learning_rate": 3.986500149519811e-06, + "loss": 0.72301817, + "num_input_tokens_seen": 23342235, + "step": 1096, + "time_per_iteration": 2.6557388305664062 + }, + { + "auxiliary_loss_clip": 0.01171815, + "auxiliary_loss_mlp": 0.01177862, + "balance_loss_clip": 1.00249243, + "balance_loss_mlp": 1.00181365, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 2.155748583644344, + "language_loss": 0.77701402, + "learning_rate": 3.986454937173292e-06, + "loss": 0.80051076, + "num_input_tokens_seen": 23363680, + "step": 1097, + "time_per_iteration": 2.588132381439209 + }, + { + "auxiliary_loss_clip": 0.01188341, + "auxiliary_loss_mlp": 0.01177515, + "balance_loss_clip": 1.00256014, + "balance_loss_mlp": 1.00146651, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 3.669456185312556, + "language_loss": 0.78641033, + "learning_rate": 3.986409649500203e-06, + "loss": 0.81006885, + "num_input_tokens_seen": 23385590, + "step": 1098, + "time_per_iteration": 2.6372859477996826 + }, + { + "auxiliary_loss_clip": 0.01173912, + "auxiliary_loss_mlp": 0.0117792, + "balance_loss_clip": 1.00246501, + "balance_loss_mlp": 1.00187135, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 2.591916986228224, + "language_loss": 0.81903297, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84255123, + "num_input_tokens_seen": 23402945, + "step": 1099, + "time_per_iteration": 2.5458168983459473 + }, + { + "auxiliary_loss_clip": 0.01172252, + "auxiliary_loss_mlp": 0.01177214, + "balance_loss_clip": 1.0024364, + "balance_loss_mlp": 1.00116551, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 1.8965855466002783, + "language_loss": 0.83039814, + "learning_rate": 3.986318848181186e-06, + "loss": 0.85389286, + "num_input_tokens_seen": 23421410, + "step": 1100, + "time_per_iteration": 2.545569896697998 + }, + { + "auxiliary_loss_clip": 0.01155454, + "auxiliary_loss_mlp": 0.0117758, + "balance_loss_clip": 1.00245583, + "balance_loss_mlp": 1.00153196, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 5.134985635503988, + "language_loss": 0.73205888, + "learning_rate": 3.986273334538702e-06, + "loss": 0.75538921, + "num_input_tokens_seen": 23438870, + "step": 1101, + "time_per_iteration": 2.5875022411346436 + }, + { + "auxiliary_loss_clip": 0.01172093, + "auxiliary_loss_mlp": 0.01177426, + "balance_loss_clip": 1.00245464, + "balance_loss_mlp": 1.00147295, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 10.146819803847958, + "language_loss": 0.86618996, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88968515, + "num_input_tokens_seen": 23456975, + "step": 1102, + "time_per_iteration": 2.565279960632324 + }, + { + "auxiliary_loss_clip": 0.01155237, + "auxiliary_loss_mlp": 0.01177308, + "balance_loss_clip": 1.00234532, + "balance_loss_mlp": 1.00125957, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.0997616907505465, + "language_loss": 0.81517035, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83849573, + "num_input_tokens_seen": 23473440, + "step": 1103, + "time_per_iteration": 2.587153911590576 + }, + { + "auxiliary_loss_clip": 0.01171945, + "auxiliary_loss_mlp": 0.0074979, + "balance_loss_clip": 1.00264573, + "balance_loss_mlp": 1.00084639, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 2.2693550066756956, + "language_loss": 0.82170093, + "learning_rate": 3.986136341700063e-06, + "loss": 0.8409183, + "num_input_tokens_seen": 23493880, + "step": 1104, + "time_per_iteration": 2.593675136566162 + }, + { + "auxiliary_loss_clip": 0.01138362, + "auxiliary_loss_mlp": 0.01177306, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.00135255, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.911432262076515, + "language_loss": 0.80542696, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82858366, + "num_input_tokens_seen": 23514920, + "step": 1105, + "time_per_iteration": 2.6766409873962402 + }, + { + "auxiliary_loss_clip": 0.01154985, + "auxiliary_loss_mlp": 0.01177225, + "balance_loss_clip": 1.00229979, + "balance_loss_mlp": 1.00146294, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.8245656675286053, + "language_loss": 0.96375453, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98707658, + "num_input_tokens_seen": 23531635, + "step": 1106, + "time_per_iteration": 2.571629285812378 + }, + { + "auxiliary_loss_clip": 0.011715, + "auxiliary_loss_mlp": 0.01177093, + "balance_loss_clip": 1.00232911, + "balance_loss_mlp": 1.00114036, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 2.3993031780098955, + "language_loss": 0.82935679, + "learning_rate": 3.985998671031039e-06, + "loss": 0.85284269, + "num_input_tokens_seen": 23551020, + "step": 1107, + "time_per_iteration": 2.5893893241882324 + }, + { + "auxiliary_loss_clip": 0.01173904, + "auxiliary_loss_mlp": 0.01173547, + "balance_loss_clip": 1.0036689, + "balance_loss_mlp": 1.00036013, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8521368821024186, + "language_loss": 0.56739306, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.59086752, + "num_input_tokens_seen": 23610675, + "step": 1108, + "time_per_iteration": 3.140296220779419 + }, + { + "auxiliary_loss_clip": 0.01155775, + "auxiliary_loss_mlp": 0.01177027, + "balance_loss_clip": 1.00234401, + "balance_loss_mlp": 1.00116992, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 3.335446924246397, + "language_loss": 0.72950065, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.75282866, + "num_input_tokens_seen": 23628710, + "step": 1109, + "time_per_iteration": 2.6223530769348145 + }, + { + "auxiliary_loss_clip": 0.01122445, + "auxiliary_loss_mlp": 0.01177742, + "balance_loss_clip": 1.00219715, + "balance_loss_mlp": 1.00178862, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 2.4384308949215576, + "language_loss": 0.78359294, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80659485, + "num_input_tokens_seen": 23649160, + "step": 1110, + "time_per_iteration": 2.684497594833374 + }, + { + "auxiliary_loss_clip": 0.01139064, + "auxiliary_loss_mlp": 0.01177198, + "balance_loss_clip": 1.00226116, + "balance_loss_mlp": 1.00143623, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.3563819485880613, + "language_loss": 0.71686852, + "learning_rate": 3.985814055817427e-06, + "loss": 0.74003112, + "num_input_tokens_seen": 23671995, + "step": 1111, + "time_per_iteration": 2.6990761756896973 + }, + { + "auxiliary_loss_clip": 0.01138896, + "auxiliary_loss_mlp": 0.01177641, + "balance_loss_clip": 1.00236273, + "balance_loss_mlp": 1.00159287, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 2.927206857894407, + "language_loss": 0.78224409, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80540949, + "num_input_tokens_seen": 23690705, + "step": 1112, + "time_per_iteration": 2.631333827972412 + }, + { + "auxiliary_loss_clip": 0.01139657, + "auxiliary_loss_mlp": 0.01177138, + "balance_loss_clip": 1.00242317, + "balance_loss_mlp": 1.00128067, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.6426271701672657, + "language_loss": 0.78862762, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81179559, + "num_input_tokens_seen": 23709990, + "step": 1113, + "time_per_iteration": 2.6515069007873535 + }, + { + "auxiliary_loss_clip": 0.01123398, + "auxiliary_loss_mlp": 0.0117643, + "balance_loss_clip": 1.00232697, + "balance_loss_mlp": 1.00104904, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 2.251433463867893, + "language_loss": 0.82716835, + "learning_rate": 3.985674803727289e-06, + "loss": 0.85016656, + "num_input_tokens_seen": 23728485, + "step": 1114, + "time_per_iteration": 2.669224500656128 + }, + { + "auxiliary_loss_clip": 0.0112248, + "auxiliary_loss_mlp": 0.01173507, + "balance_loss_clip": 1.00347757, + "balance_loss_mlp": 1.00031996, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8317554478158524, + "language_loss": 0.58099037, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60395026, + "num_input_tokens_seen": 23786650, + "step": 1115, + "time_per_iteration": 3.205538034439087 + }, + { + "auxiliary_loss_clip": 0.01155131, + "auxiliary_loss_mlp": 0.01176972, + "balance_loss_clip": 1.00229645, + "balance_loss_mlp": 1.00140023, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.756150409363434, + "language_loss": 0.91379327, + "learning_rate": 3.985581592512658e-06, + "loss": 0.9371143, + "num_input_tokens_seen": 23802555, + "step": 1116, + "time_per_iteration": 2.589667320251465 + }, + { + "auxiliary_loss_clip": 0.01138781, + "auxiliary_loss_mlp": 0.00749815, + "balance_loss_clip": 1.0022949, + "balance_loss_mlp": 1.00086546, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 2.069633907394224, + "language_loss": 0.87195659, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89084256, + "num_input_tokens_seen": 23822945, + "step": 1117, + "time_per_iteration": 2.706587076187134 + }, + { + "auxiliary_loss_clip": 0.01171517, + "auxiliary_loss_mlp": 0.01173474, + "balance_loss_clip": 1.00370133, + "balance_loss_mlp": 1.00028694, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.8584517117637993, + "language_loss": 0.59783822, + "learning_rate": 3.985488080124218e-06, + "loss": 0.62128818, + "num_input_tokens_seen": 23874075, + "step": 1118, + "time_per_iteration": 3.0826656818389893 + }, + { + "auxiliary_loss_clip": 0.01155976, + "auxiliary_loss_mlp": 0.01177022, + "balance_loss_clip": 1.00230074, + "balance_loss_mlp": 1.00116491, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.6721747099364936, + "language_loss": 0.83341098, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85674095, + "num_input_tokens_seen": 23889720, + "step": 1119, + "time_per_iteration": 2.6101956367492676 + }, + { + "auxiliary_loss_clip": 0.01154893, + "auxiliary_loss_mlp": 0.01177365, + "balance_loss_clip": 1.00232744, + "balance_loss_mlp": 1.00169849, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.9614094684534742, + "language_loss": 0.85015404, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87347668, + "num_input_tokens_seen": 23909385, + "step": 1120, + "time_per_iteration": 5.442663192749023 + }, + { + "auxiliary_loss_clip": 0.01188061, + "auxiliary_loss_mlp": 0.01177461, + "balance_loss_clip": 1.0025568, + "balance_loss_mlp": 1.00179458, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 1.9423233409346536, + "language_loss": 0.78964001, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81329519, + "num_input_tokens_seen": 23926830, + "step": 1121, + "time_per_iteration": 2.489612579345703 + }, + { + "auxiliary_loss_clip": 0.01171292, + "auxiliary_loss_mlp": 0.01173285, + "balance_loss_clip": 1.00395513, + "balance_loss_mlp": 1.00009751, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7555009242480243, + "language_loss": 0.58385921, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60730505, + "num_input_tokens_seen": 23992640, + "step": 1122, + "time_per_iteration": 5.95762300491333 + }, + { + "auxiliary_loss_clip": 0.01122642, + "auxiliary_loss_mlp": 0.01177067, + "balance_loss_clip": 1.00208437, + "balance_loss_mlp": 1.00149524, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 1.898167854396152, + "language_loss": 0.72077703, + "learning_rate": 3.985252981610901e-06, + "loss": 0.74377406, + "num_input_tokens_seen": 24011135, + "step": 1123, + "time_per_iteration": 2.6959621906280518 + }, + { + "auxiliary_loss_clip": 0.01122231, + "auxiliary_loss_mlp": 0.01177168, + "balance_loss_clip": 1.00214469, + "balance_loss_mlp": 1.00131047, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 2.303773861924269, + "language_loss": 0.78909045, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81208444, + "num_input_tokens_seen": 24030695, + "step": 1124, + "time_per_iteration": 2.6847827434539795 + }, + { + "auxiliary_loss_clip": 0.01171523, + "auxiliary_loss_mlp": 0.01177041, + "balance_loss_clip": 1.00233054, + "balance_loss_mlp": 1.00137448, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.451655151334093, + "language_loss": 0.71734333, + "learning_rate": 3.985158415226128e-06, + "loss": 0.74082893, + "num_input_tokens_seen": 24050680, + "step": 1125, + "time_per_iteration": 2.577479124069214 + }, + { + "auxiliary_loss_clip": 0.01138834, + "auxiliary_loss_mlp": 0.01177257, + "balance_loss_clip": 1.00234926, + "balance_loss_mlp": 1.00168514, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.4262932434436824, + "language_loss": 0.81213874, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83529961, + "num_input_tokens_seen": 24067205, + "step": 1126, + "time_per_iteration": 2.6726326942443848 + }, + { + "auxiliary_loss_clip": 0.0115491, + "auxiliary_loss_mlp": 0.01173292, + "balance_loss_clip": 1.00363159, + "balance_loss_mlp": 1.00010478, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7813218207841504, + "language_loss": 0.59804487, + "learning_rate": 3.985063547731735e-06, + "loss": 0.62132692, + "num_input_tokens_seen": 24131320, + "step": 1127, + "time_per_iteration": 3.1708903312683105 + }, + { + "auxiliary_loss_clip": 0.01188117, + "auxiliary_loss_mlp": 0.01177059, + "balance_loss_clip": 1.00267839, + "balance_loss_mlp": 1.001297, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.289967750313152, + "language_loss": 0.81682491, + "learning_rate": 3.985016001072925e-06, + "loss": 0.84047669, + "num_input_tokens_seen": 24149930, + "step": 1128, + "time_per_iteration": 2.630620002746582 + }, + { + "auxiliary_loss_clip": 0.01139729, + "auxiliary_loss_mlp": 0.01176902, + "balance_loss_clip": 1.00239539, + "balance_loss_mlp": 1.00133109, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 2.301225373363794, + "language_loss": 0.75190341, + "learning_rate": 3.984968379142109e-06, + "loss": 0.77506983, + "num_input_tokens_seen": 24169590, + "step": 1129, + "time_per_iteration": 2.6452155113220215 + }, + { + "auxiliary_loss_clip": 0.01089701, + "auxiliary_loss_mlp": 0.01176722, + "balance_loss_clip": 1.00207436, + "balance_loss_mlp": 1.00124609, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 2.0208748871653888, + "language_loss": 0.72158968, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74425393, + "num_input_tokens_seen": 24189965, + "step": 1130, + "time_per_iteration": 3.0850329399108887 + }, + { + "auxiliary_loss_clip": 0.0112234, + "auxiliary_loss_mlp": 0.01177142, + "balance_loss_clip": 1.00212002, + "balance_loss_mlp": 1.0015707, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 1.9677802712809571, + "language_loss": 0.80671704, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82971185, + "num_input_tokens_seen": 24208045, + "step": 1131, + "time_per_iteration": 2.9755473136901855 + }, + { + "auxiliary_loss_clip": 0.01171797, + "auxiliary_loss_mlp": 0.01176735, + "balance_loss_clip": 1.00241399, + "balance_loss_mlp": 1.00164068, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.1753177944181594, + "language_loss": 0.80132306, + "learning_rate": 3.984825061735701e-06, + "loss": 0.82480836, + "num_input_tokens_seen": 24223805, + "step": 1132, + "time_per_iteration": 2.5249974727630615 + }, + { + "auxiliary_loss_clip": 0.01155011, + "auxiliary_loss_mlp": 0.01176956, + "balance_loss_clip": 1.00221848, + "balance_loss_mlp": 1.0014801, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.4840198964406588, + "language_loss": 0.63367689, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65699649, + "num_input_tokens_seen": 24249475, + "step": 1133, + "time_per_iteration": 2.8763768672943115 + }, + { + "auxiliary_loss_clip": 0.01106211, + "auxiliary_loss_mlp": 0.01176631, + "balance_loss_clip": 1.00210726, + "balance_loss_mlp": 1.00105965, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.647629723913723, + "language_loss": 0.74723244, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.7700609, + "num_input_tokens_seen": 24267980, + "step": 1134, + "time_per_iteration": 2.7265679836273193 + }, + { + "auxiliary_loss_clip": 0.01138953, + "auxiliary_loss_mlp": 0.00749824, + "balance_loss_clip": 1.00243521, + "balance_loss_mlp": 1.000916, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 1.7247534763694015, + "language_loss": 0.87067676, + "learning_rate": 3.984681066946423e-06, + "loss": 0.88956451, + "num_input_tokens_seen": 24286805, + "step": 1135, + "time_per_iteration": 2.722839117050171 + }, + { + "auxiliary_loss_clip": 0.01155534, + "auxiliary_loss_mlp": 0.00749765, + "balance_loss_clip": 1.0023452, + "balance_loss_mlp": 1.00088298, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 5.4730231982268736, + "language_loss": 0.78009403, + "learning_rate": 3.984632918162291e-06, + "loss": 0.79914701, + "num_input_tokens_seen": 24305855, + "step": 1136, + "time_per_iteration": 2.65826678276062 + }, + { + "auxiliary_loss_clip": 0.01155283, + "auxiliary_loss_mlp": 0.0117705, + "balance_loss_clip": 1.00249195, + "balance_loss_mlp": 1.0015744, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 2.153300524725046, + "language_loss": 0.84125835, + "learning_rate": 3.984584694120679e-06, + "loss": 0.8645817, + "num_input_tokens_seen": 24326535, + "step": 1137, + "time_per_iteration": 2.73335862159729 + }, + { + "auxiliary_loss_clip": 0.01139301, + "auxiliary_loss_mlp": 0.01176868, + "balance_loss_clip": 1.00237489, + "balance_loss_mlp": 1.00139165, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.35948939246938, + "language_loss": 0.79095608, + "learning_rate": 3.984536394823418e-06, + "loss": 0.81411779, + "num_input_tokens_seen": 24345810, + "step": 1138, + "time_per_iteration": 2.6798858642578125 + }, + { + "auxiliary_loss_clip": 0.01188039, + "auxiliary_loss_mlp": 0.01176773, + "balance_loss_clip": 1.00260222, + "balance_loss_mlp": 1.00129724, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.0170437541444164, + "language_loss": 0.8572942, + "learning_rate": 3.984488020272336e-06, + "loss": 0.88094234, + "num_input_tokens_seen": 24366095, + "step": 1139, + "time_per_iteration": 2.5973100662231445 + }, + { + "auxiliary_loss_clip": 0.01139411, + "auxiliary_loss_mlp": 0.011769, + "balance_loss_clip": 1.00236952, + "balance_loss_mlp": 1.00142407, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.8117298667595516, + "language_loss": 0.74977332, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77293646, + "num_input_tokens_seen": 24388665, + "step": 1140, + "time_per_iteration": 2.859100818634033 + }, + { + "auxiliary_loss_clip": 0.01155352, + "auxiliary_loss_mlp": 0.00749788, + "balance_loss_clip": 1.00237548, + "balance_loss_mlp": 1.00096071, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.179337346909626, + "language_loss": 0.68103224, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70008361, + "num_input_tokens_seen": 24407705, + "step": 1141, + "time_per_iteration": 2.8276891708374023 + }, + { + "auxiliary_loss_clip": 0.01171729, + "auxiliary_loss_mlp": 0.01177064, + "balance_loss_clip": 1.00255203, + "balance_loss_mlp": 1.00139701, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 2.8164532978378354, + "language_loss": 0.79156196, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81504995, + "num_input_tokens_seen": 24428390, + "step": 1142, + "time_per_iteration": 2.614448308944702 + }, + { + "auxiliary_loss_clip": 0.01154983, + "auxiliary_loss_mlp": 0.01176929, + "balance_loss_clip": 1.0023756, + "balance_loss_mlp": 1.00135803, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 2.056806203861203, + "language_loss": 0.68817389, + "learning_rate": 3.984293769566553e-06, + "loss": 0.71149302, + "num_input_tokens_seen": 24450810, + "step": 1143, + "time_per_iteration": 2.6648917198181152 + }, + { + "auxiliary_loss_clip": 0.01171744, + "auxiliary_loss_mlp": 0.01176584, + "balance_loss_clip": 1.00248206, + "balance_loss_mlp": 1.00158453, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 2.0753818985680685, + "language_loss": 0.74550402, + "learning_rate": 3.98424501877395e-06, + "loss": 0.7689873, + "num_input_tokens_seen": 24469965, + "step": 1144, + "time_per_iteration": 2.5943729877471924 + }, + { + "auxiliary_loss_clip": 0.01171881, + "auxiliary_loss_mlp": 0.01177244, + "balance_loss_clip": 1.00247383, + "balance_loss_mlp": 1.00148165, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.105560500102454, + "language_loss": 0.91929984, + "learning_rate": 3.984196192738577e-06, + "loss": 0.9427911, + "num_input_tokens_seen": 24486370, + "step": 1145, + "time_per_iteration": 2.5557706356048584 + }, + { + "auxiliary_loss_clip": 0.01188115, + "auxiliary_loss_mlp": 0.0117728, + "balance_loss_clip": 1.00263286, + "balance_loss_mlp": 1.0016135, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.6034700831181876, + "language_loss": 0.81849384, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84214777, + "num_input_tokens_seen": 24503780, + "step": 1146, + "time_per_iteration": 2.537123203277588 + }, + { + "auxiliary_loss_clip": 0.01187923, + "auxiliary_loss_mlp": 0.01176452, + "balance_loss_clip": 1.00263822, + "balance_loss_mlp": 1.00135708, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 1.8428608467195275, + "language_loss": 0.84916902, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87281281, + "num_input_tokens_seen": 24522320, + "step": 1147, + "time_per_iteration": 2.5437569618225098 + }, + { + "auxiliary_loss_clip": 0.01122138, + "auxiliary_loss_mlp": 0.0117689, + "balance_loss_clip": 1.00216651, + "balance_loss_mlp": 1.00160503, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 1.8683818039380502, + "language_loss": 0.86161596, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88460624, + "num_input_tokens_seen": 24540445, + "step": 1148, + "time_per_iteration": 2.674618721008301 + }, + { + "auxiliary_loss_clip": 0.01155236, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_clip": 1.00236952, + "balance_loss_mlp": 1.00132799, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.1725163140872144, + "language_loss": 0.69406617, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71738458, + "num_input_tokens_seen": 24557105, + "step": 1149, + "time_per_iteration": 2.6291403770446777 + }, + { + "auxiliary_loss_clip": 0.01187856, + "auxiliary_loss_mlp": 0.01176495, + "balance_loss_clip": 1.00249207, + "balance_loss_mlp": 1.00101876, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 1.8077141528024068, + "language_loss": 0.83916742, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86281097, + "num_input_tokens_seen": 24578240, + "step": 1150, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.01154716, + "auxiliary_loss_mlp": 0.01176892, + "balance_loss_clip": 1.00241709, + "balance_loss_mlp": 1.00151134, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.2216730308737604, + "language_loss": 0.81601632, + "learning_rate": 3.983901656532052e-06, + "loss": 0.83933246, + "num_input_tokens_seen": 24593585, + "step": 1151, + "time_per_iteration": 2.639953851699829 + }, + { + "auxiliary_loss_clip": 0.01187929, + "auxiliary_loss_mlp": 0.01176893, + "balance_loss_clip": 1.00262809, + "balance_loss_mlp": 1.0016073, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 2.3209456745570596, + "language_loss": 0.85767847, + "learning_rate": 3.983852303849291e-06, + "loss": 0.88132668, + "num_input_tokens_seen": 24613110, + "step": 1152, + "time_per_iteration": 2.571396827697754 + }, + { + "auxiliary_loss_clip": 0.01171218, + "auxiliary_loss_mlp": 0.01176778, + "balance_loss_clip": 1.00236201, + "balance_loss_mlp": 1.00158834, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1291076442320187, + "language_loss": 0.9133544, + "learning_rate": 3.983802875938651e-06, + "loss": 0.93683434, + "num_input_tokens_seen": 24628795, + "step": 1153, + "time_per_iteration": 2.5150961875915527 + }, + { + "auxiliary_loss_clip": 0.0115535, + "auxiliary_loss_mlp": 0.01176564, + "balance_loss_clip": 1.00250101, + "balance_loss_mlp": 1.00146937, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.370107657411429, + "language_loss": 0.81723356, + "learning_rate": 3.983753372802008e-06, + "loss": 0.84055269, + "num_input_tokens_seen": 24645480, + "step": 1154, + "time_per_iteration": 2.634766101837158 + }, + { + "auxiliary_loss_clip": 0.01155385, + "auxiliary_loss_mlp": 0.01176818, + "balance_loss_clip": 1.00257611, + "balance_loss_mlp": 1.00153255, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 2.138984195105098, + "language_loss": 0.75132346, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77464545, + "num_input_tokens_seen": 24664630, + "step": 1155, + "time_per_iteration": 2.655421495437622 + }, + { + "auxiliary_loss_clip": 0.01171537, + "auxiliary_loss_mlp": 0.00749756, + "balance_loss_clip": 1.00241685, + "balance_loss_mlp": 1.00084019, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.8136805785285528, + "language_loss": 0.71194363, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73115659, + "num_input_tokens_seen": 24684210, + "step": 1156, + "time_per_iteration": 2.6013312339782715 + }, + { + "auxiliary_loss_clip": 0.01157265, + "auxiliary_loss_mlp": 0.00749798, + "balance_loss_clip": 1.00276971, + "balance_loss_mlp": 1.0009104, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 1.9074769150247064, + "language_loss": 0.7532962, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77236682, + "num_input_tokens_seen": 24702490, + "step": 1157, + "time_per_iteration": 3.9868528842926025 + }, + { + "auxiliary_loss_clip": 0.01154703, + "auxiliary_loss_mlp": 0.01176756, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.0013752, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.6198994353519736, + "language_loss": 0.72034931, + "learning_rate": 3.983554608032982e-06, + "loss": 0.74366391, + "num_input_tokens_seen": 24724340, + "step": 1158, + "time_per_iteration": 4.06138277053833 + }, + { + "auxiliary_loss_clip": 0.01187668, + "auxiliary_loss_mlp": 0.01176525, + "balance_loss_clip": 1.0024327, + "balance_loss_mlp": 1.00143075, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.8676608240098114, + "language_loss": 0.79653454, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82017642, + "num_input_tokens_seen": 24745550, + "step": 1159, + "time_per_iteration": 2.590846538543701 + }, + { + "auxiliary_loss_clip": 0.01187875, + "auxiliary_loss_mlp": 0.01176868, + "balance_loss_clip": 1.00267637, + "balance_loss_mlp": 1.00139189, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 2.4737591204815956, + "language_loss": 0.80581498, + "learning_rate": 3.983454774341387e-06, + "loss": 0.82946241, + "num_input_tokens_seen": 24762575, + "step": 1160, + "time_per_iteration": 4.041468381881714 + }, + { + "auxiliary_loss_clip": 0.01173619, + "auxiliary_loss_mlp": 0.01176529, + "balance_loss_clip": 1.00257373, + "balance_loss_mlp": 1.00133896, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.67671890501679, + "language_loss": 0.76115608, + "learning_rate": 3.983404744675437e-06, + "loss": 0.7846576, + "num_input_tokens_seen": 24782605, + "step": 1161, + "time_per_iteration": 2.606379747390747 + }, + { + "auxiliary_loss_clip": 0.01154252, + "auxiliary_loss_mlp": 0.01176838, + "balance_loss_clip": 1.00209594, + "balance_loss_mlp": 1.00164771, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.805891040483644, + "language_loss": 0.83013797, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85344887, + "num_input_tokens_seen": 24802910, + "step": 1162, + "time_per_iteration": 2.5954701900482178 + }, + { + "auxiliary_loss_clip": 0.0117153, + "auxiliary_loss_mlp": 0.01176411, + "balance_loss_clip": 1.00242162, + "balance_loss_mlp": 1.00112581, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 3.008617871985331, + "language_loss": 0.79924393, + "learning_rate": 3.983304459712716e-06, + "loss": 0.82272327, + "num_input_tokens_seen": 24823305, + "step": 1163, + "time_per_iteration": 2.613374710083008 + }, + { + "auxiliary_loss_clip": 0.01171022, + "auxiliary_loss_mlp": 0.01176825, + "balance_loss_clip": 1.00230229, + "balance_loss_mlp": 1.00153995, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 1.8998830785628926, + "language_loss": 0.79144835, + "learning_rate": 3.983254204419749e-06, + "loss": 0.8149268, + "num_input_tokens_seen": 24842155, + "step": 1164, + "time_per_iteration": 2.5391674041748047 + }, + { + "auxiliary_loss_clip": 0.01122171, + "auxiliary_loss_mlp": 0.01176895, + "balance_loss_clip": 1.00223827, + "balance_loss_mlp": 1.00170517, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.4503284612753367, + "language_loss": 0.73193979, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75493044, + "num_input_tokens_seen": 24862080, + "step": 1165, + "time_per_iteration": 2.7042605876922607 + }, + { + "auxiliary_loss_clip": 0.01154979, + "auxiliary_loss_mlp": 0.01176482, + "balance_loss_clip": 1.00233197, + "balance_loss_mlp": 1.00129199, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.6317284384722452, + "language_loss": 0.81099075, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83430541, + "num_input_tokens_seen": 24886165, + "step": 1166, + "time_per_iteration": 2.6836297512054443 + }, + { + "auxiliary_loss_clip": 0.01155092, + "auxiliary_loss_mlp": 0.01176122, + "balance_loss_clip": 1.00228763, + "balance_loss_mlp": 1.00112295, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 1.9879980349388693, + "language_loss": 0.84211969, + "learning_rate": 3.983102987317295e-06, + "loss": 0.8654319, + "num_input_tokens_seen": 24905775, + "step": 1167, + "time_per_iteration": 2.630908966064453 + }, + { + "auxiliary_loss_clip": 0.01171125, + "auxiliary_loss_mlp": 0.01176972, + "balance_loss_clip": 1.00232601, + "balance_loss_mlp": 1.00178182, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 6.6110803855811575, + "language_loss": 0.89739835, + "learning_rate": 3.983052431214997e-06, + "loss": 0.9208793, + "num_input_tokens_seen": 24924295, + "step": 1168, + "time_per_iteration": 2.5459461212158203 + }, + { + "auxiliary_loss_clip": 0.01154898, + "auxiliary_loss_mlp": 0.01176774, + "balance_loss_clip": 1.00232422, + "balance_loss_mlp": 1.00129795, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.0975712211574105, + "language_loss": 0.88911104, + "learning_rate": 3.983001799915153e-06, + "loss": 0.91242766, + "num_input_tokens_seen": 24943210, + "step": 1169, + "time_per_iteration": 2.5873496532440186 + }, + { + "auxiliary_loss_clip": 0.01187689, + "auxiliary_loss_mlp": 0.01176854, + "balance_loss_clip": 1.00254059, + "balance_loss_mlp": 1.00156879, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.071186463422984, + "language_loss": 0.84368849, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86733389, + "num_input_tokens_seen": 24960360, + "step": 1170, + "time_per_iteration": 2.54988431930542 + }, + { + "auxiliary_loss_clip": 0.01156917, + "auxiliary_loss_mlp": 0.00749735, + "balance_loss_clip": 1.00234437, + "balance_loss_mlp": 1.00080657, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.9593876386414764, + "language_loss": 0.75911438, + "learning_rate": 3.982900311730506e-06, + "loss": 0.7781809, + "num_input_tokens_seen": 24978290, + "step": 1171, + "time_per_iteration": 2.5972936153411865 + }, + { + "auxiliary_loss_clip": 0.01154891, + "auxiliary_loss_mlp": 0.01176946, + "balance_loss_clip": 1.00229979, + "balance_loss_mlp": 1.0016607, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 4.805626014961117, + "language_loss": 0.89232695, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91564536, + "num_input_tokens_seen": 24997055, + "step": 1172, + "time_per_iteration": 2.6224029064178467 + }, + { + "auxiliary_loss_clip": 0.01155008, + "auxiliary_loss_mlp": 0.01176948, + "balance_loss_clip": 1.00235116, + "balance_loss_mlp": 1.00166249, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.6754651140648398, + "language_loss": 0.82444954, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84776908, + "num_input_tokens_seen": 25017490, + "step": 1173, + "time_per_iteration": 2.635951519012451 + }, + { + "auxiliary_loss_clip": 0.01171605, + "auxiliary_loss_mlp": 0.01176491, + "balance_loss_clip": 1.00250506, + "balance_loss_mlp": 1.00130129, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 1.8591126199027201, + "language_loss": 0.82388008, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.84736103, + "num_input_tokens_seen": 25035660, + "step": 1174, + "time_per_iteration": 2.551521062850952 + }, + { + "auxiliary_loss_clip": 0.01154704, + "auxiliary_loss_mlp": 0.01176599, + "balance_loss_clip": 1.00223446, + "balance_loss_mlp": 1.00131333, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.9314751169503799, + "language_loss": 0.8524248, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87573791, + "num_input_tokens_seen": 25054785, + "step": 1175, + "time_per_iteration": 2.624044179916382 + }, + { + "auxiliary_loss_clip": 0.01171104, + "auxiliary_loss_mlp": 0.01176911, + "balance_loss_clip": 1.00249147, + "balance_loss_mlp": 1.00153089, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 2.606487352541376, + "language_loss": 0.83285987, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85634005, + "num_input_tokens_seen": 25075180, + "step": 1176, + "time_per_iteration": 2.5836732387542725 + }, + { + "auxiliary_loss_clip": 0.01138636, + "auxiliary_loss_mlp": 0.01176407, + "balance_loss_clip": 1.0021385, + "balance_loss_mlp": 1.00131214, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.394231354228447, + "language_loss": 0.74457741, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76772785, + "num_input_tokens_seen": 25093035, + "step": 1177, + "time_per_iteration": 2.6195504665374756 + }, + { + "auxiliary_loss_clip": 0.01154875, + "auxiliary_loss_mlp": 0.01176726, + "balance_loss_clip": 1.00244057, + "balance_loss_mlp": 1.00134552, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 1.6539881427913097, + "language_loss": 0.85947502, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88279104, + "num_input_tokens_seen": 25112520, + "step": 1178, + "time_per_iteration": 2.6672275066375732 + }, + { + "auxiliary_loss_clip": 0.01154622, + "auxiliary_loss_mlp": 0.01172509, + "balance_loss_clip": 1.00375891, + "balance_loss_mlp": 1.00008512, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8465272607405894, + "language_loss": 0.63309145, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65636271, + "num_input_tokens_seen": 25177760, + "step": 1179, + "time_per_iteration": 3.341245651245117 + }, + { + "auxiliary_loss_clip": 0.01171361, + "auxiliary_loss_mlp": 0.01176484, + "balance_loss_clip": 1.00245643, + "balance_loss_mlp": 1.00119913, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.6586626965088622, + "language_loss": 0.83488309, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85836154, + "num_input_tokens_seen": 25195260, + "step": 1180, + "time_per_iteration": 2.5615053176879883 + }, + { + "auxiliary_loss_clip": 0.01154877, + "auxiliary_loss_mlp": 0.01176559, + "balance_loss_clip": 1.00233281, + "balance_loss_mlp": 1.00156021, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.1487448428803813, + "language_loss": 0.88653195, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90984631, + "num_input_tokens_seen": 25212740, + "step": 1181, + "time_per_iteration": 2.6285648345947266 + }, + { + "auxiliary_loss_clip": 0.01155218, + "auxiliary_loss_mlp": 0.01176501, + "balance_loss_clip": 1.00246906, + "balance_loss_mlp": 1.00140691, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 1.8850952553336608, + "language_loss": 0.8347578, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85807496, + "num_input_tokens_seen": 25236420, + "step": 1182, + "time_per_iteration": 2.84100079536438 + }, + { + "auxiliary_loss_clip": 0.01171247, + "auxiliary_loss_mlp": 0.01176362, + "balance_loss_clip": 1.00256538, + "balance_loss_mlp": 1.00136328, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.131790413685024, + "language_loss": 0.79130542, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81478155, + "num_input_tokens_seen": 25255120, + "step": 1183, + "time_per_iteration": 2.6064705848693848 + }, + { + "auxiliary_loss_clip": 0.01187576, + "auxiliary_loss_mlp": 0.01176356, + "balance_loss_clip": 1.00243104, + "balance_loss_mlp": 1.00116587, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 1.9476858537489397, + "language_loss": 0.78977686, + "learning_rate": 3.982233308024204e-06, + "loss": 0.81341612, + "num_input_tokens_seen": 25275150, + "step": 1184, + "time_per_iteration": 2.7000279426574707 + }, + { + "auxiliary_loss_clip": 0.01121179, + "auxiliary_loss_mlp": 0.01176056, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00115228, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.029242047907811, + "language_loss": 0.77093995, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79391229, + "num_input_tokens_seen": 25293680, + "step": 1185, + "time_per_iteration": 2.7193756103515625 + }, + { + "auxiliary_loss_clip": 0.01187622, + "auxiliary_loss_mlp": 0.01176268, + "balance_loss_clip": 1.00255585, + "balance_loss_mlp": 1.0015552, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 4.2397358254048, + "language_loss": 0.65290463, + "learning_rate": 3.982129564464596e-06, + "loss": 0.67654353, + "num_input_tokens_seen": 25310050, + "step": 1186, + "time_per_iteration": 2.4960412979125977 + }, + { + "auxiliary_loss_clip": 0.01170868, + "auxiliary_loss_mlp": 0.01176232, + "balance_loss_clip": 1.00241101, + "balance_loss_mlp": 1.00123262, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 5.386867128742226, + "language_loss": 0.69903511, + "learning_rate": 3.98207757993998e-06, + "loss": 0.72250617, + "num_input_tokens_seen": 25331020, + "step": 1187, + "time_per_iteration": 2.6097991466522217 + }, + { + "auxiliary_loss_clip": 0.01138434, + "auxiliary_loss_mlp": 0.01176027, + "balance_loss_clip": 1.00235796, + "balance_loss_mlp": 1.00131404, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 4.004608490880106, + "language_loss": 0.78740144, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.81054604, + "num_input_tokens_seen": 25347875, + "step": 1188, + "time_per_iteration": 2.6029653549194336 + }, + { + "auxiliary_loss_clip": 0.01187513, + "auxiliary_loss_mlp": 0.01175955, + "balance_loss_clip": 1.00261712, + "balance_loss_mlp": 1.00105095, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 1.883600120723503, + "language_loss": 0.84929049, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87292522, + "num_input_tokens_seen": 25366715, + "step": 1189, + "time_per_iteration": 2.5549230575561523 + }, + { + "auxiliary_loss_clip": 0.01138568, + "auxiliary_loss_mlp": 0.00749732, + "balance_loss_clip": 1.0023284, + "balance_loss_mlp": 1.00083113, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.918329576496452, + "language_loss": 0.76900971, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.7878927, + "num_input_tokens_seen": 25385450, + "step": 1190, + "time_per_iteration": 2.691769599914551 + }, + { + "auxiliary_loss_clip": 0.01187596, + "auxiliary_loss_mlp": 0.01176708, + "balance_loss_clip": 1.00258458, + "balance_loss_mlp": 1.00170863, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 4.593291164225093, + "language_loss": 0.75730348, + "learning_rate": 3.981868890255468e-06, + "loss": 0.78094655, + "num_input_tokens_seen": 25403940, + "step": 1191, + "time_per_iteration": 2.56103253364563 + }, + { + "auxiliary_loss_clip": 0.01139307, + "auxiliary_loss_mlp": 0.01176282, + "balance_loss_clip": 1.00240779, + "balance_loss_mlp": 1.00109231, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 2.391712085068518, + "language_loss": 0.73820627, + "learning_rate": 3.981816529947719e-06, + "loss": 0.7613622, + "num_input_tokens_seen": 25420410, + "step": 1192, + "time_per_iteration": 2.617394208908081 + }, + { + "auxiliary_loss_clip": 0.01187382, + "auxiliary_loss_mlp": 0.01175968, + "balance_loss_clip": 1.00246131, + "balance_loss_mlp": 1.00115955, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.398149069873053, + "language_loss": 0.78040981, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80404329, + "num_input_tokens_seen": 25439415, + "step": 1193, + "time_per_iteration": 2.5530972480773926 + }, + { + "auxiliary_loss_clip": 0.01154583, + "auxiliary_loss_mlp": 0.0117621, + "balance_loss_clip": 1.00234127, + "balance_loss_mlp": 1.0011158, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9340606984043185, + "language_loss": 0.85738081, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88068873, + "num_input_tokens_seen": 25458715, + "step": 1194, + "time_per_iteration": 4.0184006690979 + }, + { + "auxiliary_loss_clip": 0.01171442, + "auxiliary_loss_mlp": 0.01176347, + "balance_loss_clip": 1.00251365, + "balance_loss_mlp": 1.00153899, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 2.616156295643807, + "language_loss": 0.81434411, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83782196, + "num_input_tokens_seen": 25477985, + "step": 1195, + "time_per_iteration": 2.6379952430725098 + }, + { + "auxiliary_loss_clip": 0.01155311, + "auxiliary_loss_mlp": 0.01176198, + "balance_loss_clip": 1.00256503, + "balance_loss_mlp": 1.0013895, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 2.4633504740874326, + "language_loss": 0.7969746, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82028973, + "num_input_tokens_seen": 25497110, + "step": 1196, + "time_per_iteration": 4.015820741653442 + }, + { + "auxiliary_loss_clip": 0.01138648, + "auxiliary_loss_mlp": 0.00749726, + "balance_loss_clip": 1.00229061, + "balance_loss_mlp": 1.00074041, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 2.479548884882908, + "language_loss": 0.71575665, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73464036, + "num_input_tokens_seen": 25516555, + "step": 1197, + "time_per_iteration": 4.145668983459473 + }, + { + "auxiliary_loss_clip": 0.01121395, + "auxiliary_loss_mlp": 0.01175888, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.0010798, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0995254720607823, + "language_loss": 0.86053598, + "learning_rate": 3.98150079000661e-06, + "loss": 0.8835088, + "num_input_tokens_seen": 25533895, + "step": 1198, + "time_per_iteration": 4.02533221244812 + }, + { + "auxiliary_loss_clip": 0.01121914, + "auxiliary_loss_mlp": 0.01176211, + "balance_loss_clip": 1.00231862, + "balance_loss_mlp": 1.00130677, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 3.193416418049068, + "language_loss": 0.8411094, + "learning_rate": 3.981447903685947e-06, + "loss": 0.86409068, + "num_input_tokens_seen": 25554195, + "step": 1199, + "time_per_iteration": 2.69704532623291 + }, + { + "auxiliary_loss_clip": 0.01187629, + "auxiliary_loss_mlp": 0.01176397, + "balance_loss_clip": 1.00271368, + "balance_loss_mlp": 1.00130248, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.007044726806272, + "language_loss": 0.76593089, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78957111, + "num_input_tokens_seen": 25574155, + "step": 1200, + "time_per_iteration": 2.6087472438812256 + }, + { + "auxiliary_loss_clip": 0.01171174, + "auxiliary_loss_mlp": 0.01176348, + "balance_loss_clip": 1.0024904, + "balance_loss_mlp": 1.00163448, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 1.8449253041156517, + "language_loss": 0.82884055, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85231578, + "num_input_tokens_seen": 25592735, + "step": 1201, + "time_per_iteration": 2.6013388633728027 + }, + { + "auxiliary_loss_clip": 0.01171084, + "auxiliary_loss_mlp": 0.01176298, + "balance_loss_clip": 1.00250733, + "balance_loss_mlp": 1.0013938, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.982782620583394, + "language_loss": 0.68937141, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71284533, + "num_input_tokens_seen": 25611510, + "step": 1202, + "time_per_iteration": 2.5732405185699463 + }, + { + "auxiliary_loss_clip": 0.01154445, + "auxiliary_loss_mlp": 0.00749751, + "balance_loss_clip": 1.00242496, + "balance_loss_mlp": 1.00092673, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9366287675159595, + "language_loss": 0.87612838, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89517033, + "num_input_tokens_seen": 25629560, + "step": 1203, + "time_per_iteration": 2.623119592666626 + }, + { + "auxiliary_loss_clip": 0.01121794, + "auxiliary_loss_mlp": 0.01176487, + "balance_loss_clip": 1.00210631, + "balance_loss_mlp": 1.00148726, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 2.7142825241489805, + "language_loss": 0.78062844, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80361128, + "num_input_tokens_seen": 25648330, + "step": 1204, + "time_per_iteration": 2.6761832237243652 + }, + { + "auxiliary_loss_clip": 0.01170777, + "auxiliary_loss_mlp": 0.01176088, + "balance_loss_clip": 1.00248778, + "balance_loss_mlp": 1.00147069, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.5951344072869582, + "language_loss": 0.82139659, + "learning_rate": 3.981129007961593e-06, + "loss": 0.8448652, + "num_input_tokens_seen": 25669470, + "step": 1205, + "time_per_iteration": 2.6672418117523193 + }, + { + "auxiliary_loss_clip": 0.01154702, + "auxiliary_loss_mlp": 0.00749736, + "balance_loss_clip": 1.00244451, + "balance_loss_mlp": 1.00089419, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 1.953024173212616, + "language_loss": 0.76322675, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78227115, + "num_input_tokens_seen": 25690470, + "step": 1206, + "time_per_iteration": 2.6595301628112793 + }, + { + "auxiliary_loss_clip": 0.01170952, + "auxiliary_loss_mlp": 0.01176, + "balance_loss_clip": 1.0023793, + "balance_loss_mlp": 1.00138211, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.7909643944878821, + "language_loss": 0.77336466, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79683417, + "num_input_tokens_seen": 25709205, + "step": 1207, + "time_per_iteration": 2.5915615558624268 + }, + { + "auxiliary_loss_clip": 0.01171517, + "auxiliary_loss_mlp": 0.01175966, + "balance_loss_clip": 1.00258017, + "balance_loss_mlp": 1.00144398, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 2.060949850064788, + "language_loss": 0.80142438, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.82489926, + "num_input_tokens_seen": 25728485, + "step": 1208, + "time_per_iteration": 2.6258177757263184 + }, + { + "auxiliary_loss_clip": 0.0117136, + "auxiliary_loss_mlp": 0.01176104, + "balance_loss_clip": 1.00248456, + "balance_loss_mlp": 1.00139105, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 2.0510185541224866, + "language_loss": 0.78885818, + "learning_rate": 3.980914908292955e-06, + "loss": 0.81233287, + "num_input_tokens_seen": 25747730, + "step": 1209, + "time_per_iteration": 2.607074499130249 + }, + { + "auxiliary_loss_clip": 0.01170992, + "auxiliary_loss_mlp": 0.01176047, + "balance_loss_clip": 1.00253105, + "balance_loss_mlp": 1.00142968, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.4180887982246624, + "language_loss": 0.80919838, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83266872, + "num_input_tokens_seen": 25768050, + "step": 1210, + "time_per_iteration": 2.587778091430664 + }, + { + "auxiliary_loss_clip": 0.01154292, + "auxiliary_loss_mlp": 0.01175932, + "balance_loss_clip": 1.00248218, + "balance_loss_mlp": 1.00160027, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 1.762689446393406, + "language_loss": 0.84428579, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86758804, + "num_input_tokens_seen": 25787985, + "step": 1211, + "time_per_iteration": 2.6570022106170654 + }, + { + "auxiliary_loss_clip": 0.01155196, + "auxiliary_loss_mlp": 0.01175565, + "balance_loss_clip": 1.00252962, + "balance_loss_mlp": 1.0011375, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 3.0162519223911133, + "language_loss": 0.90853691, + "learning_rate": 3.98075354481122e-06, + "loss": 0.93184453, + "num_input_tokens_seen": 25803620, + "step": 1212, + "time_per_iteration": 2.559631824493408 + }, + { + "auxiliary_loss_clip": 0.01187207, + "auxiliary_loss_mlp": 0.01175673, + "balance_loss_clip": 1.00251567, + "balance_loss_mlp": 1.00134158, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 1.7428315336291638, + "language_loss": 0.72567165, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74930048, + "num_input_tokens_seen": 25823315, + "step": 1213, + "time_per_iteration": 2.545675277709961 + }, + { + "auxiliary_loss_clip": 0.01121534, + "auxiliary_loss_mlp": 0.01175426, + "balance_loss_clip": 1.00213432, + "balance_loss_mlp": 1.0010941, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.829703974068523, + "language_loss": 0.8424927, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86546224, + "num_input_tokens_seen": 25842605, + "step": 1214, + "time_per_iteration": 2.683262825012207 + }, + { + "auxiliary_loss_clip": 0.01187092, + "auxiliary_loss_mlp": 0.01176032, + "balance_loss_clip": 1.00241065, + "balance_loss_mlp": 1.00131893, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.274881278817425, + "language_loss": 0.84154296, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86517417, + "num_input_tokens_seen": 25863030, + "step": 1215, + "time_per_iteration": 2.57981538772583 + }, + { + "auxiliary_loss_clip": 0.01141274, + "auxiliary_loss_mlp": 0.0117588, + "balance_loss_clip": 1.00239134, + "balance_loss_mlp": 1.00154877, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.7467430934374026, + "language_loss": 0.81353378, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83670545, + "num_input_tokens_seen": 25888015, + "step": 1216, + "time_per_iteration": 2.7590174674987793 + }, + { + "auxiliary_loss_clip": 0.0115522, + "auxiliary_loss_mlp": 0.01175839, + "balance_loss_clip": 1.00262284, + "balance_loss_mlp": 1.00131667, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.428472873470714, + "language_loss": 0.7635352, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78684574, + "num_input_tokens_seen": 25908660, + "step": 1217, + "time_per_iteration": 2.7104909420013428 + }, + { + "auxiliary_loss_clip": 0.01137795, + "auxiliary_loss_mlp": 0.01175875, + "balance_loss_clip": 1.00226295, + "balance_loss_mlp": 1.00144839, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.020619558503532, + "language_loss": 0.86559618, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88873291, + "num_input_tokens_seen": 25927215, + "step": 1218, + "time_per_iteration": 2.643023729324341 + }, + { + "auxiliary_loss_clip": 0.01170718, + "auxiliary_loss_mlp": 0.01176259, + "balance_loss_clip": 1.00242162, + "balance_loss_mlp": 1.001737, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 1.8289268037223818, + "language_loss": 0.86445117, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.88792098, + "num_input_tokens_seen": 25945500, + "step": 1219, + "time_per_iteration": 2.6079349517822266 + }, + { + "auxiliary_loss_clip": 0.01187068, + "auxiliary_loss_mlp": 0.01175396, + "balance_loss_clip": 1.00254941, + "balance_loss_mlp": 1.00144601, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.6811254769191426, + "language_loss": 0.84790587, + "learning_rate": 3.980319937487235e-06, + "loss": 0.87153053, + "num_input_tokens_seen": 25963105, + "step": 1220, + "time_per_iteration": 2.496316909790039 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01175964, + "balance_loss_clip": 1.00231004, + "balance_loss_mlp": 1.00144148, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.4104798379224976, + "language_loss": 0.76907891, + "learning_rate": 3.98026539862741e-06, + "loss": 0.7922284, + "num_input_tokens_seen": 25981690, + "step": 1221, + "time_per_iteration": 2.662949323654175 + }, + { + "auxiliary_loss_clip": 0.01121686, + "auxiliary_loss_mlp": 0.01176069, + "balance_loss_clip": 1.00229573, + "balance_loss_mlp": 1.00164199, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 1.876345193775788, + "language_loss": 0.91860062, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94157815, + "num_input_tokens_seen": 25999890, + "step": 1222, + "time_per_iteration": 2.6625819206237793 + }, + { + "auxiliary_loss_clip": 0.01105128, + "auxiliary_loss_mlp": 0.0117597, + "balance_loss_clip": 1.00206673, + "balance_loss_mlp": 1.00154316, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.45123131703818, + "language_loss": 0.90952671, + "learning_rate": 3.980156095634242e-06, + "loss": 0.93233764, + "num_input_tokens_seen": 26016445, + "step": 1223, + "time_per_iteration": 2.7421875 + }, + { + "auxiliary_loss_clip": 0.01187284, + "auxiliary_loss_mlp": 0.01176051, + "balance_loss_clip": 1.00274277, + "balance_loss_mlp": 1.00190997, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 3.118316243590315, + "language_loss": 0.82071245, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84434581, + "num_input_tokens_seen": 26036080, + "step": 1224, + "time_per_iteration": 2.5762994289398193 + }, + { + "auxiliary_loss_clip": 0.01187111, + "auxiliary_loss_mlp": 0.01175937, + "balance_loss_clip": 1.00255179, + "balance_loss_mlp": 1.00160575, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 1.7938468173372195, + "language_loss": 0.83215594, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.85578644, + "num_input_tokens_seen": 26055805, + "step": 1225, + "time_per_iteration": 2.5616512298583984 + }, + { + "auxiliary_loss_clip": 0.01138444, + "auxiliary_loss_mlp": 0.01175979, + "balance_loss_clip": 1.00241756, + "balance_loss_mlp": 1.00155234, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 1.9453593333311108, + "language_loss": 0.90378582, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92693013, + "num_input_tokens_seen": 26073905, + "step": 1226, + "time_per_iteration": 2.6299333572387695 + }, + { + "auxiliary_loss_clip": 0.01187283, + "auxiliary_loss_mlp": 0.01175938, + "balance_loss_clip": 1.00255489, + "balance_loss_mlp": 1.00141525, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 3.252308544417047, + "language_loss": 0.76813328, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79176545, + "num_input_tokens_seen": 26091700, + "step": 1227, + "time_per_iteration": 2.5745456218719482 + }, + { + "auxiliary_loss_clip": 0.01170548, + "auxiliary_loss_mlp": 0.01175329, + "balance_loss_clip": 1.00252247, + "balance_loss_mlp": 1.00118756, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.580849252334613, + "language_loss": 0.85861349, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.88207221, + "num_input_tokens_seen": 26114105, + "step": 1228, + "time_per_iteration": 2.636101722717285 + }, + { + "auxiliary_loss_clip": 0.01170744, + "auxiliary_loss_mlp": 0.01175287, + "balance_loss_clip": 1.00249958, + "balance_loss_mlp": 1.00105095, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 1.9610206989772245, + "language_loss": 0.79355705, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81701732, + "num_input_tokens_seen": 26131165, + "step": 1229, + "time_per_iteration": 2.5979790687561035 + }, + { + "auxiliary_loss_clip": 0.01170887, + "auxiliary_loss_mlp": 0.0074958, + "balance_loss_clip": 1.00262904, + "balance_loss_mlp": 1.00057685, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 1.9689118852155776, + "language_loss": 0.78239679, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80160141, + "num_input_tokens_seen": 26150040, + "step": 1230, + "time_per_iteration": 2.6065359115600586 + }, + { + "auxiliary_loss_clip": 0.01187255, + "auxiliary_loss_mlp": 0.01175242, + "balance_loss_clip": 1.00274181, + "balance_loss_mlp": 1.00119627, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.906732276952044, + "language_loss": 0.81632519, + "learning_rate": 3.979715880319372e-06, + "loss": 0.8399502, + "num_input_tokens_seen": 26169380, + "step": 1231, + "time_per_iteration": 2.5507233142852783 + }, + { + "auxiliary_loss_clip": 0.01156682, + "auxiliary_loss_mlp": 0.01176032, + "balance_loss_clip": 1.00239348, + "balance_loss_mlp": 1.00189114, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.0674110915441752, + "language_loss": 0.94882894, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97215605, + "num_input_tokens_seen": 26189420, + "step": 1232, + "time_per_iteration": 4.016552448272705 + }, + { + "auxiliary_loss_clip": 0.0117057, + "auxiliary_loss_mlp": 0.01176057, + "balance_loss_clip": 1.00251341, + "balance_loss_mlp": 1.00201178, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 1.8526577814969967, + "language_loss": 0.80341077, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82687706, + "num_input_tokens_seen": 26209300, + "step": 1233, + "time_per_iteration": 2.5663959980010986 + }, + { + "auxiliary_loss_clip": 0.01187186, + "auxiliary_loss_mlp": 0.01175791, + "balance_loss_clip": 1.00261092, + "balance_loss_mlp": 1.00136435, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.3755615599330486, + "language_loss": 0.70888168, + "learning_rate": 3.979549560846883e-06, + "loss": 0.7325114, + "num_input_tokens_seen": 26228110, + "step": 1234, + "time_per_iteration": 3.984334945678711 + }, + { + "auxiliary_loss_clip": 0.01154968, + "auxiliary_loss_mlp": 0.01175617, + "balance_loss_clip": 1.0025785, + "balance_loss_mlp": 1.00138044, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 1.710964517738778, + "language_loss": 0.77122295, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79452878, + "num_input_tokens_seen": 26247020, + "step": 1235, + "time_per_iteration": 5.5621497631073 + }, + { + "auxiliary_loss_clip": 0.01186905, + "auxiliary_loss_mlp": 0.01175897, + "balance_loss_clip": 1.00249982, + "balance_loss_mlp": 1.00166059, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 2.3468901080064506, + "language_loss": 0.83000594, + "learning_rate": 3.979438305871464e-06, + "loss": 0.85363394, + "num_input_tokens_seen": 26265750, + "step": 1236, + "time_per_iteration": 2.576596736907959 + }, + { + "auxiliary_loss_clip": 0.01138536, + "auxiliary_loss_mlp": 0.00749596, + "balance_loss_clip": 1.00226188, + "balance_loss_mlp": 1.00053596, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 1.8447060176252192, + "language_loss": 0.75857413, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77745545, + "num_input_tokens_seen": 26287905, + "step": 1237, + "time_per_iteration": 2.7040274143218994 + }, + { + "auxiliary_loss_clip": 0.01121163, + "auxiliary_loss_mlp": 0.00749586, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00051689, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.9410196837469782, + "language_loss": 0.77552384, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79423136, + "num_input_tokens_seen": 26311795, + "step": 1238, + "time_per_iteration": 2.788825273513794 + }, + { + "auxiliary_loss_clip": 0.01153916, + "auxiliary_loss_mlp": 0.01175835, + "balance_loss_clip": 1.00233197, + "balance_loss_mlp": 1.00150359, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.187203988924027, + "language_loss": 0.86399853, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88729596, + "num_input_tokens_seen": 26330330, + "step": 1239, + "time_per_iteration": 2.6278634071350098 + }, + { + "auxiliary_loss_clip": 0.01138087, + "auxiliary_loss_mlp": 0.01175358, + "balance_loss_clip": 1.0023638, + "balance_loss_mlp": 1.00102687, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 1.918678472649905, + "language_loss": 0.89095092, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91408539, + "num_input_tokens_seen": 26348865, + "step": 1240, + "time_per_iteration": 2.652484178543091 + }, + { + "auxiliary_loss_clip": 0.01154495, + "auxiliary_loss_mlp": 0.01175658, + "balance_loss_clip": 1.00247908, + "balance_loss_mlp": 1.00132632, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 1.855173268649673, + "language_loss": 0.88737303, + "learning_rate": 3.979158854911225e-06, + "loss": 0.91067457, + "num_input_tokens_seen": 26368210, + "step": 1241, + "time_per_iteration": 2.675041675567627 + }, + { + "auxiliary_loss_clip": 0.01154308, + "auxiliary_loss_mlp": 0.01171111, + "balance_loss_clip": 1.00406337, + "balance_loss_mlp": 1.00021219, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.8922301874149583, + "language_loss": 0.63107961, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65433383, + "num_input_tokens_seen": 26424890, + "step": 1242, + "time_per_iteration": 3.2416305541992188 + }, + { + "auxiliary_loss_clip": 0.01138413, + "auxiliary_loss_mlp": 0.01176048, + "balance_loss_clip": 1.0023191, + "balance_loss_mlp": 1.00143051, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.4209925574001736, + "language_loss": 0.62601089, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.6491555, + "num_input_tokens_seen": 26446405, + "step": 1243, + "time_per_iteration": 2.7209181785583496 + }, + { + "auxiliary_loss_clip": 0.01170763, + "auxiliary_loss_mlp": 0.01175272, + "balance_loss_clip": 1.00257826, + "balance_loss_mlp": 1.00132203, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 1.847923307530036, + "language_loss": 0.76052046, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78398073, + "num_input_tokens_seen": 26466070, + "step": 1244, + "time_per_iteration": 2.622448682785034 + }, + { + "auxiliary_loss_clip": 0.01154182, + "auxiliary_loss_mlp": 0.00749612, + "balance_loss_clip": 1.00237298, + "balance_loss_mlp": 1.0005492, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 3.118866063996812, + "language_loss": 0.70120472, + "learning_rate": 3.978933943232123e-06, + "loss": 0.72024262, + "num_input_tokens_seen": 26479350, + "step": 1245, + "time_per_iteration": 2.6126797199249268 + }, + { + "auxiliary_loss_clip": 0.01187008, + "auxiliary_loss_mlp": 0.01175727, + "balance_loss_clip": 1.00252676, + "balance_loss_mlp": 1.00139558, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.8276541340934016, + "language_loss": 0.88897514, + "learning_rate": 3.978877527703576e-06, + "loss": 0.91260242, + "num_input_tokens_seen": 26498255, + "step": 1246, + "time_per_iteration": 2.562458038330078 + }, + { + "auxiliary_loss_clip": 0.01187083, + "auxiliary_loss_mlp": 0.0117642, + "balance_loss_clip": 1.00240195, + "balance_loss_mlp": 1.00189757, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 3.0761458655882445, + "language_loss": 0.87933946, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90297449, + "num_input_tokens_seen": 26515375, + "step": 1247, + "time_per_iteration": 2.503633737564087 + }, + { + "auxiliary_loss_clip": 0.01154138, + "auxiliary_loss_mlp": 0.01175707, + "balance_loss_clip": 1.00236773, + "balance_loss_mlp": 1.00175714, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.2398655529469966, + "language_loss": 0.6461904, + "learning_rate": 3.978764471530921e-06, + "loss": 0.66948891, + "num_input_tokens_seen": 26533595, + "step": 1248, + "time_per_iteration": 2.59066104888916 + }, + { + "auxiliary_loss_clip": 0.01170927, + "auxiliary_loss_mlp": 0.00749584, + "balance_loss_clip": 1.00258601, + "balance_loss_mlp": 1.00061297, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.7332333936331863, + "language_loss": 0.74792111, + "learning_rate": 3.978707830891102e-06, + "loss": 0.7671262, + "num_input_tokens_seen": 26549405, + "step": 1249, + "time_per_iteration": 2.576230525970459 + }, + { + "auxiliary_loss_clip": 0.01154753, + "auxiliary_loss_mlp": 0.01176322, + "balance_loss_clip": 1.00253034, + "balance_loss_mlp": 1.00179994, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.8218653919201224, + "language_loss": 0.82263374, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84594452, + "num_input_tokens_seen": 26567200, + "step": 1250, + "time_per_iteration": 2.6093409061431885 + }, + { + "auxiliary_loss_clip": 0.01122378, + "auxiliary_loss_mlp": 0.01175628, + "balance_loss_clip": 1.00227022, + "balance_loss_mlp": 1.00148749, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.0131775106870182, + "language_loss": 0.66886443, + "learning_rate": 3.978594324515215e-06, + "loss": 0.69184452, + "num_input_tokens_seen": 26586190, + "step": 1251, + "time_per_iteration": 2.7359700202941895 + }, + { + "auxiliary_loss_clip": 0.01138039, + "auxiliary_loss_mlp": 0.01170928, + "balance_loss_clip": 1.00364184, + "balance_loss_mlp": 1.00003004, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.8968509233865782, + "language_loss": 0.70407647, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72716618, + "num_input_tokens_seen": 26650710, + "step": 1252, + "time_per_iteration": 3.3855462074279785 + }, + { + "auxiliary_loss_clip": 0.0118692, + "auxiliary_loss_mlp": 0.01175871, + "balance_loss_clip": 1.00244069, + "balance_loss_mlp": 1.00172973, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.5240486149565453, + "language_loss": 0.79552418, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81915212, + "num_input_tokens_seen": 26669000, + "step": 1253, + "time_per_iteration": 2.766517162322998 + }, + { + "auxiliary_loss_clip": 0.01138201, + "auxiliary_loss_mlp": 0.01175654, + "balance_loss_clip": 1.00239503, + "balance_loss_mlp": 1.00151324, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.4674565919377365, + "language_loss": 0.93237221, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95551074, + "num_input_tokens_seen": 26683075, + "step": 1254, + "time_per_iteration": 2.626396417617798 + }, + { + "auxiliary_loss_clip": 0.01154152, + "auxiliary_loss_mlp": 0.0117596, + "balance_loss_clip": 1.0022881, + "balance_loss_mlp": 1.00181866, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.90282333363367, + "language_loss": 0.87752593, + "learning_rate": 3.97836641143877e-06, + "loss": 0.90082705, + "num_input_tokens_seen": 26701875, + "step": 1255, + "time_per_iteration": 2.6618525981903076 + }, + { + "auxiliary_loss_clip": 0.01186791, + "auxiliary_loss_mlp": 0.01175722, + "balance_loss_clip": 1.00243199, + "balance_loss_mlp": 1.00177205, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.8462906763503026, + "language_loss": 0.79853153, + "learning_rate": 3.978309245614618e-06, + "loss": 0.82215667, + "num_input_tokens_seen": 26719050, + "step": 1256, + "time_per_iteration": 2.5195674896240234 + }, + { + "auxiliary_loss_clip": 0.01154536, + "auxiliary_loss_mlp": 0.01170919, + "balance_loss_clip": 1.00378215, + "balance_loss_mlp": 1.00002098, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.7918509515343137, + "language_loss": 0.57971239, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.6029669, + "num_input_tokens_seen": 26780650, + "step": 1257, + "time_per_iteration": 3.2827916145324707 + }, + { + "auxiliary_loss_clip": 0.01121492, + "auxiliary_loss_mlp": 0.01175801, + "balance_loss_clip": 1.00208569, + "balance_loss_mlp": 1.0015651, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 2.0713752874241997, + "language_loss": 0.8968153, + "learning_rate": 3.978194688915432e-06, + "loss": 0.91978824, + "num_input_tokens_seen": 26798725, + "step": 1258, + "time_per_iteration": 2.6972696781158447 + }, + { + "auxiliary_loss_clip": 0.011542, + "auxiliary_loss_mlp": 0.01175336, + "balance_loss_clip": 1.00242865, + "balance_loss_mlp": 1.00148153, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 2.0051858795974367, + "language_loss": 0.81205058, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83534592, + "num_input_tokens_seen": 26817005, + "step": 1259, + "time_per_iteration": 2.5982396602630615 + }, + { + "auxiliary_loss_clip": 0.01170905, + "auxiliary_loss_mlp": 0.01175644, + "balance_loss_clip": 1.00247967, + "balance_loss_mlp": 1.00159836, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.8662741462331516, + "language_loss": 0.76102984, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78449529, + "num_input_tokens_seen": 26836655, + "step": 1260, + "time_per_iteration": 2.5719399452209473 + }, + { + "auxiliary_loss_clip": 0.01140277, + "auxiliary_loss_mlp": 0.01175572, + "balance_loss_clip": 1.00233662, + "balance_loss_mlp": 1.00162137, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.8635843165829946, + "language_loss": 0.84863651, + "learning_rate": 3.978022291272044e-06, + "loss": 0.871795, + "num_input_tokens_seen": 26854925, + "step": 1261, + "time_per_iteration": 2.6446592807769775 + }, + { + "auxiliary_loss_clip": 0.01186838, + "auxiliary_loss_mlp": 0.01176016, + "balance_loss_clip": 1.00250387, + "balance_loss_mlp": 1.00206566, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.0638670581616925, + "language_loss": 0.82538009, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84900868, + "num_input_tokens_seen": 26876170, + "step": 1262, + "time_per_iteration": 2.601377248764038 + }, + { + "auxiliary_loss_clip": 0.01186896, + "auxiliary_loss_mlp": 0.01175927, + "balance_loss_clip": 1.00249124, + "balance_loss_mlp": 1.00178647, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.368397280019095, + "language_loss": 0.83098614, + "learning_rate": 3.977906984472136e-06, + "loss": 0.85461438, + "num_input_tokens_seen": 26895005, + "step": 1263, + "time_per_iteration": 2.527379035949707 + }, + { + "auxiliary_loss_clip": 0.01121233, + "auxiliary_loss_mlp": 0.01175343, + "balance_loss_clip": 1.00222349, + "balance_loss_mlp": 1.0013926, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 3.3546915920179337, + "language_loss": 0.76243997, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78540576, + "num_input_tokens_seen": 26913930, + "step": 1264, + "time_per_iteration": 2.6925413608551025 + }, + { + "auxiliary_loss_clip": 0.01156453, + "auxiliary_loss_mlp": 0.01175632, + "balance_loss_clip": 1.0024184, + "balance_loss_mlp": 1.00158644, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 3.2454327241650494, + "language_loss": 0.81198764, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83530843, + "num_input_tokens_seen": 26931485, + "step": 1265, + "time_per_iteration": 2.5774104595184326 + }, + { + "auxiliary_loss_clip": 0.01138312, + "auxiliary_loss_mlp": 0.0117603, + "balance_loss_clip": 1.0022831, + "balance_loss_mlp": 1.00188935, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.0109507039259316, + "language_loss": 0.65494287, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67808628, + "num_input_tokens_seen": 26951670, + "step": 1266, + "time_per_iteration": 2.739201307296753 + }, + { + "auxiliary_loss_clip": 0.01137174, + "auxiliary_loss_mlp": 0.01175375, + "balance_loss_clip": 1.00212955, + "balance_loss_mlp": 1.0013293, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.1042994130850947, + "language_loss": 0.79642898, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81955445, + "num_input_tokens_seen": 26970335, + "step": 1267, + "time_per_iteration": 2.6576435565948486 + }, + { + "auxiliary_loss_clip": 0.01153898, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_clip": 1.00225353, + "balance_loss_mlp": 1.00153613, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.8990221317434006, + "language_loss": 0.73369205, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75698483, + "num_input_tokens_seen": 26986025, + "step": 1268, + "time_per_iteration": 2.6017093658447266 + }, + { + "auxiliary_loss_clip": 0.01170126, + "auxiliary_loss_mlp": 0.01175474, + "balance_loss_clip": 1.00236368, + "balance_loss_mlp": 1.0018096, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.283018729806448, + "language_loss": 0.82515752, + "learning_rate": 3.977559264084269e-06, + "loss": 0.8486135, + "num_input_tokens_seen": 27004045, + "step": 1269, + "time_per_iteration": 2.5582025051116943 + }, + { + "auxiliary_loss_clip": 0.01170285, + "auxiliary_loss_mlp": 0.0117609, + "balance_loss_clip": 1.00241995, + "balance_loss_mlp": 1.00194919, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 2.1398194543291122, + "language_loss": 0.88773274, + "learning_rate": 3.977501048211088e-06, + "loss": 0.91119647, + "num_input_tokens_seen": 27022070, + "step": 1270, + "time_per_iteration": 3.9268507957458496 + }, + { + "auxiliary_loss_clip": 0.01170327, + "auxiliary_loss_mlp": 0.01176163, + "balance_loss_clip": 1.00234008, + "balance_loss_mlp": 1.00173569, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.5797929108573685, + "language_loss": 0.71019316, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73365808, + "num_input_tokens_seen": 27041755, + "step": 1271, + "time_per_iteration": 4.02025032043457 + }, + { + "auxiliary_loss_clip": 0.01136951, + "auxiliary_loss_mlp": 0.01175854, + "balance_loss_clip": 1.00221634, + "balance_loss_mlp": 1.00218952, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.726401830827268, + "language_loss": 0.82556319, + "learning_rate": 3.977384391505823e-06, + "loss": 0.84869123, + "num_input_tokens_seen": 27061540, + "step": 1272, + "time_per_iteration": 4.099025249481201 + }, + { + "auxiliary_loss_clip": 0.01154344, + "auxiliary_loss_mlp": 0.00749585, + "balance_loss_clip": 1.00231862, + "balance_loss_mlp": 1.00046372, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 1.7015967569465145, + "language_loss": 0.8004325, + "learning_rate": 3.977325950678162e-06, + "loss": 0.81947184, + "num_input_tokens_seen": 27081395, + "step": 1273, + "time_per_iteration": 2.6333956718444824 + }, + { + "auxiliary_loss_clip": 0.01153785, + "auxiliary_loss_mlp": 0.01175593, + "balance_loss_clip": 1.00228047, + "balance_loss_mlp": 1.00145268, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.7008188177668322, + "language_loss": 0.81059813, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83389199, + "num_input_tokens_seen": 27101175, + "step": 1274, + "time_per_iteration": 4.01666259765625 + }, + { + "auxiliary_loss_clip": 0.01153731, + "auxiliary_loss_mlp": 0.01175822, + "balance_loss_clip": 1.00233579, + "balance_loss_mlp": 1.00196719, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 2.430410692883263, + "language_loss": 0.72966069, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75295627, + "num_input_tokens_seen": 27124505, + "step": 1275, + "time_per_iteration": 2.7049944400787354 + }, + { + "auxiliary_loss_clip": 0.01187006, + "auxiliary_loss_mlp": 0.01175787, + "balance_loss_clip": 1.00255477, + "balance_loss_mlp": 1.00183725, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.1768299482986326, + "language_loss": 0.79715145, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.82077938, + "num_input_tokens_seen": 27140960, + "step": 1276, + "time_per_iteration": 2.4856836795806885 + }, + { + "auxiliary_loss_clip": 0.01172829, + "auxiliary_loss_mlp": 0.01175619, + "balance_loss_clip": 1.00234413, + "balance_loss_mlp": 1.00166917, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.1664298293714537, + "language_loss": 0.5970996, + "learning_rate": 3.97709143758574e-06, + "loss": 0.62058413, + "num_input_tokens_seen": 27160985, + "step": 1277, + "time_per_iteration": 2.694110631942749 + }, + { + "auxiliary_loss_clip": 0.0117031, + "auxiliary_loss_mlp": 0.01175692, + "balance_loss_clip": 1.00240779, + "balance_loss_mlp": 1.00164664, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 3.0179581766214056, + "language_loss": 0.74847746, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77193749, + "num_input_tokens_seen": 27178390, + "step": 1278, + "time_per_iteration": 2.5411205291748047 + }, + { + "auxiliary_loss_clip": 0.01154492, + "auxiliary_loss_mlp": 0.01175412, + "balance_loss_clip": 1.00220776, + "balance_loss_mlp": 1.00136673, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 2.6628588391598793, + "language_loss": 0.88948178, + "learning_rate": 3.976973731201596e-06, + "loss": 0.91278088, + "num_input_tokens_seen": 27197505, + "step": 1279, + "time_per_iteration": 2.6319291591644287 + }, + { + "auxiliary_loss_clip": 0.01154743, + "auxiliary_loss_mlp": 0.01175258, + "balance_loss_clip": 1.00242686, + "balance_loss_mlp": 1.00140285, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 2.6067873595059723, + "language_loss": 0.82572675, + "learning_rate": 3.976914765557845e-06, + "loss": 0.84902674, + "num_input_tokens_seen": 27214260, + "step": 1280, + "time_per_iteration": 2.624206066131592 + }, + { + "auxiliary_loss_clip": 0.01170112, + "auxiliary_loss_mlp": 0.01175212, + "balance_loss_clip": 1.00241804, + "balance_loss_mlp": 1.00145221, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 2.2256943960741666, + "language_loss": 0.75646329, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.77991652, + "num_input_tokens_seen": 27232525, + "step": 1281, + "time_per_iteration": 2.5313644409179688 + }, + { + "auxiliary_loss_clip": 0.01137001, + "auxiliary_loss_mlp": 0.01175248, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.0014888, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8799815490813714, + "language_loss": 0.75026894, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77339143, + "num_input_tokens_seen": 27249800, + "step": 1282, + "time_per_iteration": 2.650611639022827 + }, + { + "auxiliary_loss_clip": 0.01186751, + "auxiliary_loss_mlp": 0.01175858, + "balance_loss_clip": 1.00247848, + "balance_loss_mlp": 1.00209832, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 1.986468470516477, + "language_loss": 0.83929116, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86291718, + "num_input_tokens_seen": 27268895, + "step": 1283, + "time_per_iteration": 2.5283730030059814 + }, + { + "auxiliary_loss_clip": 0.01170218, + "auxiliary_loss_mlp": 0.01175513, + "balance_loss_clip": 1.00242627, + "balance_loss_mlp": 1.00184894, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.0077748064221015, + "language_loss": 0.7518295, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77528685, + "num_input_tokens_seen": 27288180, + "step": 1284, + "time_per_iteration": 2.5490541458129883 + }, + { + "auxiliary_loss_clip": 0.01170856, + "auxiliary_loss_mlp": 0.01175359, + "balance_loss_clip": 1.00238621, + "balance_loss_mlp": 1.00188541, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.6674057474350508, + "language_loss": 0.76026827, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78373039, + "num_input_tokens_seen": 27311815, + "step": 1285, + "time_per_iteration": 2.752289056777954 + }, + { + "auxiliary_loss_clip": 0.01186711, + "auxiliary_loss_mlp": 0.01175548, + "balance_loss_clip": 1.00257277, + "balance_loss_mlp": 1.00169301, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 5.928507942326579, + "language_loss": 0.84141946, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86504209, + "num_input_tokens_seen": 27331890, + "step": 1286, + "time_per_iteration": 2.5964624881744385 + }, + { + "auxiliary_loss_clip": 0.01153725, + "auxiliary_loss_mlp": 0.0117563, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00158465, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 2.4046250195627197, + "language_loss": 0.76420707, + "learning_rate": 3.97649990716259e-06, + "loss": 0.78750062, + "num_input_tokens_seen": 27348320, + "step": 1287, + "time_per_iteration": 2.534118175506592 + }, + { + "auxiliary_loss_clip": 0.01153934, + "auxiliary_loss_mlp": 0.01175337, + "balance_loss_clip": 1.00228643, + "balance_loss_mlp": 1.00157762, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.681081974269305, + "language_loss": 0.8479932, + "learning_rate": 3.976440341863237e-06, + "loss": 0.87128592, + "num_input_tokens_seen": 27367670, + "step": 1288, + "time_per_iteration": 2.6622350215911865 + }, + { + "auxiliary_loss_clip": 0.01186739, + "auxiliary_loss_mlp": 0.01175358, + "balance_loss_clip": 1.00240266, + "balance_loss_mlp": 1.00159812, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.0184754977052126, + "language_loss": 0.85552025, + "learning_rate": 3.976380701617068e-06, + "loss": 0.87914121, + "num_input_tokens_seen": 27385485, + "step": 1289, + "time_per_iteration": 2.5238466262817383 + }, + { + "auxiliary_loss_clip": 0.01186746, + "auxiliary_loss_mlp": 0.01175291, + "balance_loss_clip": 1.00249076, + "balance_loss_mlp": 1.00143635, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.9063019906813392, + "language_loss": 0.85283607, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87645644, + "num_input_tokens_seen": 27405110, + "step": 1290, + "time_per_iteration": 2.5660080909729004 + }, + { + "auxiliary_loss_clip": 0.0115354, + "auxiliary_loss_mlp": 0.01175314, + "balance_loss_clip": 1.00222051, + "balance_loss_mlp": 1.00145948, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 4.189439016093268, + "language_loss": 0.91125679, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93454528, + "num_input_tokens_seen": 27422855, + "step": 1291, + "time_per_iteration": 2.577850580215454 + }, + { + "auxiliary_loss_clip": 0.01153978, + "auxiliary_loss_mlp": 0.01170215, + "balance_loss_clip": 1.00318801, + "balance_loss_mlp": 1.00007904, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.8886179198208725, + "language_loss": 0.65080154, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67404342, + "num_input_tokens_seen": 27487190, + "step": 1292, + "time_per_iteration": 3.2981784343719482 + }, + { + "auxiliary_loss_clip": 0.01170194, + "auxiliary_loss_mlp": 0.01175028, + "balance_loss_clip": 1.00239372, + "balance_loss_mlp": 1.00136447, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.8733457326827367, + "language_loss": 0.87769586, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.90114808, + "num_input_tokens_seen": 27510465, + "step": 1293, + "time_per_iteration": 2.6225240230560303 + }, + { + "auxiliary_loss_clip": 0.01105427, + "auxiliary_loss_mlp": 0.01175618, + "balance_loss_clip": 1.00213599, + "balance_loss_mlp": 1.00185823, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 3.351365424656732, + "language_loss": 0.85364836, + "learning_rate": 3.976081376263239e-06, + "loss": 0.87645876, + "num_input_tokens_seen": 27528645, + "step": 1294, + "time_per_iteration": 2.7966086864471436 + }, + { + "auxiliary_loss_clip": 0.01138076, + "auxiliary_loss_mlp": 0.01175103, + "balance_loss_clip": 1.00238514, + "balance_loss_mlp": 1.00124824, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.813802234858633, + "language_loss": 0.79741657, + "learning_rate": 3.976021286383768e-06, + "loss": 0.82054842, + "num_input_tokens_seen": 27546165, + "step": 1295, + "time_per_iteration": 2.6321158409118652 + }, + { + "auxiliary_loss_clip": 0.01137368, + "auxiliary_loss_mlp": 0.01175324, + "balance_loss_clip": 1.00209343, + "balance_loss_mlp": 1.00127828, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.6178828184530527, + "language_loss": 0.88252711, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90565407, + "num_input_tokens_seen": 27566520, + "step": 1296, + "time_per_iteration": 2.6621057987213135 + }, + { + "auxiliary_loss_clip": 0.01186669, + "auxiliary_loss_mlp": 0.01175186, + "balance_loss_clip": 1.00244164, + "balance_loss_mlp": 1.00161719, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 2.417685195539298, + "language_loss": 0.96887338, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.99249196, + "num_input_tokens_seen": 27581960, + "step": 1297, + "time_per_iteration": 2.5140221118927 + }, + { + "auxiliary_loss_clip": 0.01153987, + "auxiliary_loss_mlp": 0.01175246, + "balance_loss_clip": 1.00227332, + "balance_loss_mlp": 1.00167751, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.4384824638804354, + "language_loss": 0.7613191, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78461146, + "num_input_tokens_seen": 27601415, + "step": 1298, + "time_per_iteration": 2.6316537857055664 + }, + { + "auxiliary_loss_clip": 0.01121331, + "auxiliary_loss_mlp": 0.00749548, + "balance_loss_clip": 1.00211477, + "balance_loss_mlp": 1.00048625, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.839396976473766, + "language_loss": 0.80371916, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.82242799, + "num_input_tokens_seen": 27621490, + "step": 1299, + "time_per_iteration": 2.687293767929077 + }, + { + "auxiliary_loss_clip": 0.011367, + "auxiliary_loss_mlp": 0.01174835, + "balance_loss_clip": 1.00221062, + "balance_loss_mlp": 1.00136185, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 1.8517475487403772, + "language_loss": 0.86427414, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88738954, + "num_input_tokens_seen": 27640600, + "step": 1300, + "time_per_iteration": 2.681772470474243 + }, + { + "auxiliary_loss_clip": 0.01186717, + "auxiliary_loss_mlp": 0.01174903, + "balance_loss_clip": 1.00250304, + "balance_loss_mlp": 1.00114369, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.796523780685078, + "language_loss": 0.72087246, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74448866, + "num_input_tokens_seen": 27663070, + "step": 1301, + "time_per_iteration": 2.699493646621704 + }, + { + "auxiliary_loss_clip": 0.01170587, + "auxiliary_loss_mlp": 0.01176176, + "balance_loss_clip": 1.00242877, + "balance_loss_mlp": 1.00232148, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.5300643433434449, + "language_loss": 0.71079195, + "learning_rate": 3.97559855928952e-06, + "loss": 0.73425961, + "num_input_tokens_seen": 27686425, + "step": 1302, + "time_per_iteration": 2.746563196182251 + }, + { + "auxiliary_loss_clip": 0.01154576, + "auxiliary_loss_mlp": 0.00749546, + "balance_loss_clip": 1.00239825, + "balance_loss_mlp": 1.00045431, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 3.2565557197621167, + "language_loss": 0.81782389, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.83686507, + "num_input_tokens_seen": 27704900, + "step": 1303, + "time_per_iteration": 2.6433868408203125 + }, + { + "auxiliary_loss_clip": 0.01170179, + "auxiliary_loss_mlp": 0.01175162, + "balance_loss_clip": 1.00230408, + "balance_loss_mlp": 1.00168824, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 2.400421290879595, + "language_loss": 0.75022495, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77367836, + "num_input_tokens_seen": 27724890, + "step": 1304, + "time_per_iteration": 2.5726897716522217 + }, + { + "auxiliary_loss_clip": 0.01186644, + "auxiliary_loss_mlp": 0.0117545, + "balance_loss_clip": 1.00246334, + "balance_loss_mlp": 1.00197673, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.623356180121722, + "language_loss": 0.76143116, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78505206, + "num_input_tokens_seen": 27743115, + "step": 1305, + "time_per_iteration": 2.549168825149536 + }, + { + "auxiliary_loss_clip": 0.0110387, + "auxiliary_loss_mlp": 0.01175475, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00181103, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 1.6553610118473754, + "language_loss": 0.85313511, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87592852, + "num_input_tokens_seen": 27763570, + "step": 1306, + "time_per_iteration": 2.764927387237549 + }, + { + "auxiliary_loss_clip": 0.01169882, + "auxiliary_loss_mlp": 0.01175009, + "balance_loss_clip": 1.00234783, + "balance_loss_mlp": 1.00144053, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 4.92483815784745, + "language_loss": 0.90447229, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92792118, + "num_input_tokens_seen": 27780030, + "step": 1307, + "time_per_iteration": 3.996081829071045 + }, + { + "auxiliary_loss_clip": 0.01121092, + "auxiliary_loss_mlp": 0.01174879, + "balance_loss_clip": 1.00222993, + "balance_loss_mlp": 1.00131011, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.87101517980323, + "language_loss": 0.83422816, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85718787, + "num_input_tokens_seen": 27796225, + "step": 1308, + "time_per_iteration": 2.679438591003418 + }, + { + "auxiliary_loss_clip": 0.01140517, + "auxiliary_loss_mlp": 0.01174773, + "balance_loss_clip": 1.00225151, + "balance_loss_mlp": 1.00149012, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.4814331815391053, + "language_loss": 0.77764136, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80079424, + "num_input_tokens_seen": 27815975, + "step": 1309, + "time_per_iteration": 3.9981658458709717 + }, + { + "auxiliary_loss_clip": 0.01170347, + "auxiliary_loss_mlp": 0.01175307, + "balance_loss_clip": 1.00238299, + "balance_loss_mlp": 1.00154805, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.8841888242986564, + "language_loss": 0.805812, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82926857, + "num_input_tokens_seen": 27832255, + "step": 1310, + "time_per_iteration": 3.9910688400268555 + }, + { + "auxiliary_loss_clip": 0.01153553, + "auxiliary_loss_mlp": 0.00749543, + "balance_loss_clip": 1.00234711, + "balance_loss_mlp": 1.0005337, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7137033770700758, + "language_loss": 0.73240346, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75143445, + "num_input_tokens_seen": 27852180, + "step": 1311, + "time_per_iteration": 4.055295944213867 + }, + { + "auxiliary_loss_clip": 0.0117071, + "auxiliary_loss_mlp": 0.01176146, + "balance_loss_clip": 1.00246882, + "balance_loss_mlp": 1.00248218, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.7224146033917618, + "language_loss": 0.85869831, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88216686, + "num_input_tokens_seen": 27871435, + "step": 1312, + "time_per_iteration": 2.575205087661743 + }, + { + "auxiliary_loss_clip": 0.01153781, + "auxiliary_loss_mlp": 0.01175349, + "balance_loss_clip": 1.00233519, + "balance_loss_mlp": 1.00187588, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.8602985857000063, + "language_loss": 0.82297647, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84626776, + "num_input_tokens_seen": 27890625, + "step": 1313, + "time_per_iteration": 2.6037981510162354 + }, + { + "auxiliary_loss_clip": 0.01170782, + "auxiliary_loss_mlp": 0.00749551, + "balance_loss_clip": 1.00240147, + "balance_loss_mlp": 1.0005064, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 3.085941254556987, + "language_loss": 0.73371756, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75292087, + "num_input_tokens_seen": 27906530, + "step": 1314, + "time_per_iteration": 2.5234339237213135 + }, + { + "auxiliary_loss_clip": 0.01139966, + "auxiliary_loss_mlp": 0.00749587, + "balance_loss_clip": 1.00220156, + "balance_loss_mlp": 1.00055945, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.613949187407365, + "language_loss": 0.79710019, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81599575, + "num_input_tokens_seen": 27926725, + "step": 1315, + "time_per_iteration": 2.66695499420166 + }, + { + "auxiliary_loss_clip": 0.01170427, + "auxiliary_loss_mlp": 0.01175093, + "balance_loss_clip": 1.00236046, + "balance_loss_mlp": 1.00181067, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.8400546166549774, + "language_loss": 0.73754001, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76099527, + "num_input_tokens_seen": 27947875, + "step": 1316, + "time_per_iteration": 2.612907648086548 + }, + { + "auxiliary_loss_clip": 0.01137604, + "auxiliary_loss_mlp": 0.01175548, + "balance_loss_clip": 1.00218678, + "balance_loss_mlp": 1.00178826, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.9398055983000524, + "language_loss": 0.65487486, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67800641, + "num_input_tokens_seen": 27965040, + "step": 1317, + "time_per_iteration": 2.672069549560547 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.01175215, + "balance_loss_clip": 1.00233114, + "balance_loss_mlp": 1.00126505, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.8400183884551526, + "language_loss": 0.72658688, + "learning_rate": 3.974618542868415e-06, + "loss": 0.74974167, + "num_input_tokens_seen": 27985330, + "step": 1318, + "time_per_iteration": 2.671268939971924 + }, + { + "auxiliary_loss_clip": 0.01121551, + "auxiliary_loss_mlp": 0.01174978, + "balance_loss_clip": 1.00208247, + "balance_loss_mlp": 1.00140929, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.7854958286305387, + "language_loss": 0.90382206, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92678738, + "num_input_tokens_seen": 28007615, + "step": 1319, + "time_per_iteration": 2.7178115844726562 + }, + { + "auxiliary_loss_clip": 0.01170626, + "auxiliary_loss_mlp": 0.0117492, + "balance_loss_clip": 1.00233817, + "balance_loss_mlp": 1.00154185, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.415142436959094, + "language_loss": 0.80026436, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82371986, + "num_input_tokens_seen": 28027765, + "step": 1320, + "time_per_iteration": 2.583037853240967 + }, + { + "auxiliary_loss_clip": 0.01153786, + "auxiliary_loss_mlp": 0.01174839, + "balance_loss_clip": 1.00243831, + "balance_loss_mlp": 1.00146139, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.0532276445425732, + "language_loss": 0.68972456, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71301079, + "num_input_tokens_seen": 28044225, + "step": 1321, + "time_per_iteration": 2.5940136909484863 + }, + { + "auxiliary_loss_clip": 0.01169909, + "auxiliary_loss_mlp": 0.01174732, + "balance_loss_clip": 1.0023638, + "balance_loss_mlp": 1.00154495, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 2.0453320497730996, + "language_loss": 0.84073544, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.86418188, + "num_input_tokens_seen": 28062915, + "step": 1322, + "time_per_iteration": 2.5874340534210205 + }, + { + "auxiliary_loss_clip": 0.0118637, + "auxiliary_loss_mlp": 0.01174971, + "balance_loss_clip": 1.00239933, + "balance_loss_mlp": 1.00159335, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 1.8633372361571736, + "language_loss": 0.90364099, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92725444, + "num_input_tokens_seen": 28082175, + "step": 1323, + "time_per_iteration": 2.52305006980896 + }, + { + "auxiliary_loss_clip": 0.01138231, + "auxiliary_loss_mlp": 0.01174811, + "balance_loss_clip": 1.00233042, + "balance_loss_mlp": 1.0015285, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 3.4802358296038127, + "language_loss": 0.82557201, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84870243, + "num_input_tokens_seen": 28102645, + "step": 1324, + "time_per_iteration": 2.718114137649536 + }, + { + "auxiliary_loss_clip": 0.01153307, + "auxiliary_loss_mlp": 0.01174657, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00127912, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 3.265932130450797, + "language_loss": 0.79132116, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81460083, + "num_input_tokens_seen": 28122805, + "step": 1325, + "time_per_iteration": 2.59330153465271 + }, + { + "auxiliary_loss_clip": 0.01105703, + "auxiliary_loss_mlp": 0.00749531, + "balance_loss_clip": 1.00219393, + "balance_loss_mlp": 1.00045359, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.0625520754747853, + "language_loss": 0.88174951, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90030181, + "num_input_tokens_seen": 28140530, + "step": 1326, + "time_per_iteration": 2.7014975547790527 + }, + { + "auxiliary_loss_clip": 0.01186214, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_clip": 1.00229442, + "balance_loss_mlp": 1.00147378, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 1.8387952452456195, + "language_loss": 0.82964778, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85325652, + "num_input_tokens_seen": 28159640, + "step": 1327, + "time_per_iteration": 2.5130515098571777 + }, + { + "auxiliary_loss_clip": 0.01186473, + "auxiliary_loss_mlp": 0.01174917, + "balance_loss_clip": 1.00247514, + "balance_loss_mlp": 1.00125253, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 3.08317120927344, + "language_loss": 0.78804052, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.81165445, + "num_input_tokens_seen": 28177050, + "step": 1328, + "time_per_iteration": 2.544692277908325 + }, + { + "auxiliary_loss_clip": 0.01170161, + "auxiliary_loss_mlp": 0.01174705, + "balance_loss_clip": 1.00232542, + "balance_loss_mlp": 1.00123143, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.3495040255692747, + "language_loss": 0.74150413, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76495278, + "num_input_tokens_seen": 28193245, + "step": 1329, + "time_per_iteration": 2.5334184169769287 + }, + { + "auxiliary_loss_clip": 0.01153384, + "auxiliary_loss_mlp": 0.0117472, + "balance_loss_clip": 1.00220287, + "balance_loss_mlp": 1.00153267, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.512170115702225, + "language_loss": 0.81187809, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83515918, + "num_input_tokens_seen": 28213570, + "step": 1330, + "time_per_iteration": 2.641101837158203 + }, + { + "auxiliary_loss_clip": 0.01186364, + "auxiliary_loss_mlp": 0.00749611, + "balance_loss_clip": 1.002496, + "balance_loss_mlp": 1.00055277, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.2234168837650157, + "language_loss": 0.88722825, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.90658796, + "num_input_tokens_seen": 28229980, + "step": 1331, + "time_per_iteration": 2.4903557300567627 + }, + { + "auxiliary_loss_clip": 0.01170451, + "auxiliary_loss_mlp": 0.00749621, + "balance_loss_clip": 1.00238872, + "balance_loss_mlp": 1.00055981, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 5.2891614621330145, + "language_loss": 0.73273087, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75193155, + "num_input_tokens_seen": 28253840, + "step": 1332, + "time_per_iteration": 2.7225680351257324 + }, + { + "auxiliary_loss_clip": 0.01155997, + "auxiliary_loss_mlp": 0.01174653, + "balance_loss_clip": 1.00239694, + "balance_loss_mlp": 1.00137007, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.8755069676172131, + "language_loss": 0.82964706, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85295349, + "num_input_tokens_seen": 28271675, + "step": 1333, + "time_per_iteration": 2.5731961727142334 + }, + { + "auxiliary_loss_clip": 0.01123987, + "auxiliary_loss_mlp": 0.01174507, + "balance_loss_clip": 1.00223637, + "balance_loss_mlp": 1.00122452, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.2358502617297584, + "language_loss": 0.74948537, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.77247036, + "num_input_tokens_seen": 28291850, + "step": 1334, + "time_per_iteration": 2.6918814182281494 + }, + { + "auxiliary_loss_clip": 0.01153565, + "auxiliary_loss_mlp": 0.01174604, + "balance_loss_clip": 1.00238562, + "balance_loss_mlp": 1.00160718, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 2.3735235044570824, + "language_loss": 0.79901391, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82229561, + "num_input_tokens_seen": 28310780, + "step": 1335, + "time_per_iteration": 2.62015438079834 + }, + { + "auxiliary_loss_clip": 0.0113609, + "auxiliary_loss_mlp": 0.01170566, + "balance_loss_clip": 1.00330043, + "balance_loss_mlp": 1.00043106, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7411109121141969, + "language_loss": 0.56054807, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58361459, + "num_input_tokens_seen": 28369985, + "step": 1336, + "time_per_iteration": 3.251227617263794 + }, + { + "auxiliary_loss_clip": 0.01154276, + "auxiliary_loss_mlp": 0.0117471, + "balance_loss_clip": 1.00242674, + "balance_loss_mlp": 1.00152254, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 3.011368687526075, + "language_loss": 0.67456341, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.69785327, + "num_input_tokens_seen": 28388670, + "step": 1337, + "time_per_iteration": 2.635891914367676 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.0117458, + "balance_loss_clip": 1.00231338, + "balance_loss_mlp": 1.00177431, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.711087845086429, + "language_loss": 0.86850822, + "learning_rate": 3.973366567512453e-06, + "loss": 0.89179081, + "num_input_tokens_seen": 28411845, + "step": 1338, + "time_per_iteration": 2.7084808349609375 + }, + { + "auxiliary_loss_clip": 0.01121765, + "auxiliary_loss_mlp": 0.01174734, + "balance_loss_clip": 1.00213575, + "balance_loss_mlp": 1.00135636, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 3.716055313622412, + "language_loss": 0.87327564, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89624059, + "num_input_tokens_seen": 28427875, + "step": 1339, + "time_per_iteration": 2.6610817909240723 + }, + { + "auxiliary_loss_clip": 0.01169951, + "auxiliary_loss_mlp": 0.01174267, + "balance_loss_clip": 1.00241983, + "balance_loss_mlp": 1.00146151, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 1.9929920042353333, + "language_loss": 0.89796507, + "learning_rate": 3.973239723395988e-06, + "loss": 0.92140722, + "num_input_tokens_seen": 28446615, + "step": 1340, + "time_per_iteration": 2.5576000213623047 + }, + { + "auxiliary_loss_clip": 0.01169128, + "auxiliary_loss_mlp": 0.01170414, + "balance_loss_clip": 1.0031234, + "balance_loss_mlp": 1.00027847, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8824840955583614, + "language_loss": 0.64834148, + "learning_rate": 3.97317618909838e-06, + "loss": 0.6717369, + "num_input_tokens_seen": 28505290, + "step": 1341, + "time_per_iteration": 3.095977783203125 + }, + { + "auxiliary_loss_clip": 0.01170231, + "auxiliary_loss_mlp": 0.01174715, + "balance_loss_clip": 1.00233507, + "balance_loss_mlp": 1.00114644, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.0612987706542, + "language_loss": 0.89915979, + "learning_rate": 3.973112579977733e-06, + "loss": 0.92260933, + "num_input_tokens_seen": 28522735, + "step": 1342, + "time_per_iteration": 2.5905539989471436 + }, + { + "auxiliary_loss_clip": 0.01153798, + "auxiliary_loss_mlp": 0.01174511, + "balance_loss_clip": 1.00250149, + "balance_loss_mlp": 1.00122809, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.8971873327439015, + "language_loss": 0.76393926, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78722239, + "num_input_tokens_seen": 28539460, + "step": 1343, + "time_per_iteration": 2.647702932357788 + }, + { + "auxiliary_loss_clip": 0.01156203, + "auxiliary_loss_mlp": 0.01170279, + "balance_loss_clip": 1.00319457, + "balance_loss_mlp": 1.00014389, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.7972055071149954, + "language_loss": 0.57381433, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59707916, + "num_input_tokens_seen": 28599855, + "step": 1344, + "time_per_iteration": 3.1036269664764404 + }, + { + "auxiliary_loss_clip": 0.0113849, + "auxiliary_loss_mlp": 0.01174947, + "balance_loss_clip": 1.00236773, + "balance_loss_mlp": 1.00147367, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.2107434827442796, + "language_loss": 0.86914206, + "learning_rate": 3.972921303701695e-06, + "loss": 0.89227647, + "num_input_tokens_seen": 28617585, + "step": 1345, + "time_per_iteration": 4.033085107803345 + }, + { + "auxiliary_loss_clip": 0.01186227, + "auxiliary_loss_mlp": 0.01174232, + "balance_loss_clip": 1.00247383, + "balance_loss_mlp": 1.00114036, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.6759757806584459, + "language_loss": 0.87851483, + "learning_rate": 3.972857395313042e-06, + "loss": 0.9021194, + "num_input_tokens_seen": 28636355, + "step": 1346, + "time_per_iteration": 2.6113429069519043 + }, + { + "auxiliary_loss_clip": 0.01169516, + "auxiliary_loss_mlp": 0.01174442, + "balance_loss_clip": 1.00222683, + "balance_loss_mlp": 1.00125492, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.5934006458514527, + "language_loss": 0.92656136, + "learning_rate": 3.972793412113439e-06, + "loss": 0.950001, + "num_input_tokens_seen": 28656260, + "step": 1347, + "time_per_iteration": 3.9540517330169678 + }, + { + "auxiliary_loss_clip": 0.01172291, + "auxiliary_loss_mlp": 0.01174679, + "balance_loss_clip": 1.0023427, + "balance_loss_mlp": 1.00168228, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 1.682372445264745, + "language_loss": 0.89612544, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91959512, + "num_input_tokens_seen": 28675865, + "step": 1348, + "time_per_iteration": 4.082314729690552 + }, + { + "auxiliary_loss_clip": 0.01122945, + "auxiliary_loss_mlp": 0.01174524, + "balance_loss_clip": 1.00211155, + "balance_loss_mlp": 1.00143242, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 1.9153707672539024, + "language_loss": 0.76592982, + "learning_rate": 3.97266522129109e-06, + "loss": 0.78890443, + "num_input_tokens_seen": 28696255, + "step": 1349, + "time_per_iteration": 4.080092668533325 + }, + { + "auxiliary_loss_clip": 0.01186144, + "auxiliary_loss_mlp": 0.01174409, + "balance_loss_clip": 1.00243831, + "balance_loss_mlp": 1.00141251, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 1.851494906670461, + "language_loss": 0.88292062, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90652621, + "num_input_tokens_seen": 28713905, + "step": 1350, + "time_per_iteration": 2.6159567832946777 + }, + { + "auxiliary_loss_clip": 0.01139616, + "auxiliary_loss_mlp": 0.00749585, + "balance_loss_clip": 1.00225353, + "balance_loss_mlp": 1.00056386, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.4581684311345535, + "language_loss": 0.82595003, + "learning_rate": 3.972536731254092e-06, + "loss": 0.84484208, + "num_input_tokens_seen": 28732075, + "step": 1351, + "time_per_iteration": 2.629221200942993 + }, + { + "auxiliary_loss_clip": 0.01186102, + "auxiliary_loss_mlp": 0.0117435, + "balance_loss_clip": 1.00237489, + "balance_loss_mlp": 1.00116324, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.850577622095361, + "language_loss": 0.75188792, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77549249, + "num_input_tokens_seen": 28751150, + "step": 1352, + "time_per_iteration": 2.553518295288086 + }, + { + "auxiliary_loss_clip": 0.0116972, + "auxiliary_loss_mlp": 0.00749562, + "balance_loss_clip": 1.00233495, + "balance_loss_mlp": 1.00061226, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 2.4623425311834772, + "language_loss": 0.83272195, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85191476, + "num_input_tokens_seen": 28773360, + "step": 1353, + "time_per_iteration": 2.6220526695251465 + }, + { + "auxiliary_loss_clip": 0.01169064, + "auxiliary_loss_mlp": 0.01170247, + "balance_loss_clip": 1.0037353, + "balance_loss_mlp": 1.0001111, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8504381005960931, + "language_loss": 0.59782243, + "learning_rate": 3.972343435213775e-06, + "loss": 0.62121546, + "num_input_tokens_seen": 28833390, + "step": 1354, + "time_per_iteration": 3.2096588611602783 + }, + { + "auxiliary_loss_clip": 0.01137139, + "auxiliary_loss_mlp": 0.01174568, + "balance_loss_clip": 1.00227702, + "balance_loss_mlp": 1.00147569, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 2.02705327700217, + "language_loss": 0.82643569, + "learning_rate": 3.972278853614154e-06, + "loss": 0.84955275, + "num_input_tokens_seen": 28852430, + "step": 1355, + "time_per_iteration": 2.710232734680176 + }, + { + "auxiliary_loss_clip": 0.01172319, + "auxiliary_loss_mlp": 0.01174228, + "balance_loss_clip": 1.00258791, + "balance_loss_mlp": 1.00132632, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.8714304380891231, + "language_loss": 0.71007663, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73354208, + "num_input_tokens_seen": 28870685, + "step": 1356, + "time_per_iteration": 2.573991298675537 + }, + { + "auxiliary_loss_clip": 0.01169784, + "auxiliary_loss_mlp": 0.01174216, + "balance_loss_clip": 1.00220942, + "balance_loss_mlp": 1.0010283, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 1.9931576874558232, + "language_loss": 0.70234621, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72578621, + "num_input_tokens_seen": 28889860, + "step": 1357, + "time_per_iteration": 2.5946860313415527 + }, + { + "auxiliary_loss_clip": 0.01169608, + "auxiliary_loss_mlp": 0.01174312, + "balance_loss_clip": 1.00227082, + "balance_loss_mlp": 1.00122011, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.163282273848258, + "language_loss": 0.84067315, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86411238, + "num_input_tokens_seen": 28905865, + "step": 1358, + "time_per_iteration": 2.5606346130371094 + }, + { + "auxiliary_loss_clip": 0.01153057, + "auxiliary_loss_mlp": 0.01174104, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.00120306, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 5.0697406107717, + "language_loss": 1.02271724, + "learning_rate": 3.972019779350084e-06, + "loss": 1.0459888, + "num_input_tokens_seen": 28925250, + "step": 1359, + "time_per_iteration": 2.6279261112213135 + }, + { + "auxiliary_loss_clip": 0.01103854, + "auxiliary_loss_mlp": 0.01174408, + "balance_loss_clip": 1.00193083, + "balance_loss_mlp": 1.00131643, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 1.9088507605838385, + "language_loss": 0.83390462, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85668725, + "num_input_tokens_seen": 28943445, + "step": 1360, + "time_per_iteration": 2.7535316944122314 + }, + { + "auxiliary_loss_clip": 0.01186109, + "auxiliary_loss_mlp": 0.01174859, + "balance_loss_clip": 1.00239468, + "balance_loss_mlp": 1.00167179, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.2254478843833274, + "language_loss": 0.72609729, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74970698, + "num_input_tokens_seen": 28962695, + "step": 1361, + "time_per_iteration": 2.5171451568603516 + }, + { + "auxiliary_loss_clip": 0.01152702, + "auxiliary_loss_mlp": 0.01173816, + "balance_loss_clip": 1.0019635, + "balance_loss_mlp": 1.00120103, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 2.3723174490997216, + "language_loss": 0.76523137, + "learning_rate": 3.971824688461976e-06, + "loss": 0.78849655, + "num_input_tokens_seen": 28982120, + "step": 1362, + "time_per_iteration": 2.614332437515259 + }, + { + "auxiliary_loss_clip": 0.01186008, + "auxiliary_loss_mlp": 0.01174302, + "balance_loss_clip": 1.00237823, + "balance_loss_mlp": 1.00130558, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 3.3831795967280756, + "language_loss": 0.72805619, + "learning_rate": 3.971759508619069e-06, + "loss": 0.75165927, + "num_input_tokens_seen": 28998100, + "step": 1363, + "time_per_iteration": 2.563488483428955 + }, + { + "auxiliary_loss_clip": 0.01185965, + "auxiliary_loss_mlp": 0.01174426, + "balance_loss_clip": 1.00237906, + "balance_loss_mlp": 1.00152516, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 8.239377823031479, + "language_loss": 0.7748937, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79849768, + "num_input_tokens_seen": 29017095, + "step": 1364, + "time_per_iteration": 2.5527584552764893 + }, + { + "auxiliary_loss_clip": 0.01123849, + "auxiliary_loss_mlp": 0.01174146, + "balance_loss_clip": 1.00240803, + "balance_loss_mlp": 1.00134063, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.1894633213290904, + "language_loss": 0.82152301, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84450299, + "num_input_tokens_seen": 29037240, + "step": 1365, + "time_per_iteration": 2.708547830581665 + }, + { + "auxiliary_loss_clip": 0.01169909, + "auxiliary_loss_mlp": 0.0117399, + "balance_loss_clip": 1.00256217, + "balance_loss_mlp": 1.0013752, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 1.903775861426112, + "language_loss": 0.82126892, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84470797, + "num_input_tokens_seen": 29056250, + "step": 1366, + "time_per_iteration": 2.588870048522949 + }, + { + "auxiliary_loss_clip": 0.01137361, + "auxiliary_loss_mlp": 0.0117457, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00176454, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 1.702031937128482, + "language_loss": 0.81836414, + "learning_rate": 3.97149804157902e-06, + "loss": 0.84148347, + "num_input_tokens_seen": 29073380, + "step": 1367, + "time_per_iteration": 2.661411762237549 + }, + { + "auxiliary_loss_clip": 0.0118611, + "auxiliary_loss_mlp": 0.01174382, + "balance_loss_clip": 1.00252092, + "balance_loss_mlp": 1.00157571, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.526205661084557, + "language_loss": 0.84141588, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.86502081, + "num_input_tokens_seen": 29091330, + "step": 1368, + "time_per_iteration": 2.536430835723877 + }, + { + "auxiliary_loss_clip": 0.01136721, + "auxiliary_loss_mlp": 0.01173494, + "balance_loss_clip": 1.00216866, + "balance_loss_mlp": 1.00126076, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.8966473017919467, + "language_loss": 0.81297028, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83607239, + "num_input_tokens_seen": 29110375, + "step": 1369, + "time_per_iteration": 2.712420701980591 + }, + { + "auxiliary_loss_clip": 0.01120456, + "auxiliary_loss_mlp": 0.00749484, + "balance_loss_clip": 1.00222731, + "balance_loss_mlp": 1.00044346, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.1314937779965444, + "language_loss": 0.74864972, + "learning_rate": 3.971301156316582e-06, + "loss": 0.76734912, + "num_input_tokens_seen": 29129395, + "step": 1370, + "time_per_iteration": 2.7683136463165283 + }, + { + "auxiliary_loss_clip": 0.01120285, + "auxiliary_loss_mlp": 0.01174322, + "balance_loss_clip": 1.00222445, + "balance_loss_mlp": 1.00142074, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.5503001434721655, + "language_loss": 0.74600387, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76894999, + "num_input_tokens_seen": 29148650, + "step": 1371, + "time_per_iteration": 2.7893691062927246 + }, + { + "auxiliary_loss_clip": 0.01071003, + "auxiliary_loss_mlp": 0.0117387, + "balance_loss_clip": 1.00181067, + "balance_loss_mlp": 1.00135028, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 1.883419611149401, + "language_loss": 0.71208459, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73453331, + "num_input_tokens_seen": 29170785, + "step": 1372, + "time_per_iteration": 2.9037203788757324 + }, + { + "auxiliary_loss_clip": 0.01140596, + "auxiliary_loss_mlp": 0.0117372, + "balance_loss_clip": 1.00241899, + "balance_loss_mlp": 1.00091457, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 2.9869362423626282, + "language_loss": 0.87941825, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90256137, + "num_input_tokens_seen": 29185210, + "step": 1373, + "time_per_iteration": 2.6057357788085938 + }, + { + "auxiliary_loss_clip": 0.01137538, + "auxiliary_loss_mlp": 0.01173869, + "balance_loss_clip": 1.00216269, + "balance_loss_mlp": 1.00115895, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 2.585276361962294, + "language_loss": 0.82044828, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84356236, + "num_input_tokens_seen": 29205210, + "step": 1374, + "time_per_iteration": 2.7108001708984375 + }, + { + "auxiliary_loss_clip": 0.01120199, + "auxiliary_loss_mlp": 0.0116982, + "balance_loss_clip": 1.00225806, + "balance_loss_mlp": 1.00044763, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8247298055418644, + "language_loss": 0.60688263, + "learning_rate": 3.970971519207095e-06, + "loss": 0.6297828, + "num_input_tokens_seen": 29265350, + "step": 1375, + "time_per_iteration": 3.2268359661102295 + }, + { + "auxiliary_loss_clip": 0.01152755, + "auxiliary_loss_mlp": 0.01169428, + "balance_loss_clip": 1.00332546, + "balance_loss_mlp": 1.00005519, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.96104916084324, + "language_loss": 0.62196207, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64518386, + "num_input_tokens_seen": 29321475, + "step": 1376, + "time_per_iteration": 3.1630868911743164 + }, + { + "auxiliary_loss_clip": 0.0112339, + "auxiliary_loss_mlp": 0.01174378, + "balance_loss_clip": 1.00227094, + "balance_loss_mlp": 1.00166738, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.7152450493511018, + "language_loss": 0.82414329, + "learning_rate": 3.970839141169718e-06, + "loss": 0.847121, + "num_input_tokens_seen": 29341405, + "step": 1377, + "time_per_iteration": 2.711460828781128 + }, + { + "auxiliary_loss_clip": 0.01153019, + "auxiliary_loss_mlp": 0.01173692, + "balance_loss_clip": 1.00219607, + "balance_loss_mlp": 1.00126755, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.862168493086518, + "language_loss": 0.84809494, + "learning_rate": 3.970772840048147e-06, + "loss": 0.87136209, + "num_input_tokens_seen": 29361955, + "step": 1378, + "time_per_iteration": 2.676527500152588 + }, + { + "auxiliary_loss_clip": 0.01169401, + "auxiliary_loss_mlp": 0.01173927, + "balance_loss_clip": 1.00230026, + "balance_loss_mlp": 1.00121653, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 3.9681908673912916, + "language_loss": 0.87357724, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89701056, + "num_input_tokens_seen": 29382395, + "step": 1379, + "time_per_iteration": 2.620987892150879 + }, + { + "auxiliary_loss_clip": 0.01139894, + "auxiliary_loss_mlp": 0.01173793, + "balance_loss_clip": 1.00235415, + "balance_loss_mlp": 1.00136876, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 2.08771176825024, + "language_loss": 0.78539377, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80853069, + "num_input_tokens_seen": 29404460, + "step": 1380, + "time_per_iteration": 2.8124234676361084 + }, + { + "auxiliary_loss_clip": 0.01169273, + "auxiliary_loss_mlp": 0.01173668, + "balance_loss_clip": 1.00238788, + "balance_loss_mlp": 1.00114799, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.317677712958515, + "language_loss": 0.86262167, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88605106, + "num_input_tokens_seen": 29422675, + "step": 1381, + "time_per_iteration": 2.5550060272216797 + }, + { + "auxiliary_loss_clip": 0.0117012, + "auxiliary_loss_mlp": 0.0074959, + "balance_loss_clip": 1.00247669, + "balance_loss_mlp": 1.00056875, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 3.7010880873426726, + "language_loss": 0.88086212, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90005928, + "num_input_tokens_seen": 29439840, + "step": 1382, + "time_per_iteration": 3.941300868988037 + }, + { + "auxiliary_loss_clip": 0.01136989, + "auxiliary_loss_mlp": 0.01173928, + "balance_loss_clip": 1.0022999, + "balance_loss_mlp": 1.00140858, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.0056475312009794, + "language_loss": 0.77040827, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79351747, + "num_input_tokens_seen": 29457360, + "step": 1383, + "time_per_iteration": 2.622157573699951 + }, + { + "auxiliary_loss_clip": 0.01169672, + "auxiliary_loss_mlp": 0.01174044, + "balance_loss_clip": 1.00236773, + "balance_loss_mlp": 1.00142872, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 3.431474935765277, + "language_loss": 0.82911509, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85255224, + "num_input_tokens_seen": 29477040, + "step": 1384, + "time_per_iteration": 2.5820202827453613 + }, + { + "auxiliary_loss_clip": 0.01121161, + "auxiliary_loss_mlp": 0.01173977, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00117111, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.6611753561800753, + "language_loss": 0.85413927, + "learning_rate": 3.970306639845e-06, + "loss": 0.87709069, + "num_input_tokens_seen": 29492010, + "step": 1385, + "time_per_iteration": 5.590746164321899 + }, + { + "auxiliary_loss_clip": 0.01137026, + "auxiliary_loss_mlp": 0.01174026, + "balance_loss_clip": 1.0022831, + "balance_loss_mlp": 1.00131512, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 2.4735505797210857, + "language_loss": 0.6904763, + "learning_rate": 3.970239740938835e-06, + "loss": 0.71358681, + "num_input_tokens_seen": 29511850, + "step": 1386, + "time_per_iteration": 4.1455864906311035 + }, + { + "auxiliary_loss_clip": 0.0116983, + "auxiliary_loss_mlp": 0.01173803, + "balance_loss_clip": 1.0023787, + "balance_loss_mlp": 1.00118828, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.8518609019182495, + "language_loss": 0.81991005, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84334636, + "num_input_tokens_seen": 29531415, + "step": 1387, + "time_per_iteration": 2.607593536376953 + }, + { + "auxiliary_loss_clip": 0.01152779, + "auxiliary_loss_mlp": 0.01174126, + "balance_loss_clip": 1.00220621, + "balance_loss_mlp": 1.0013206, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 1.9714695942810725, + "language_loss": 0.77256799, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79583699, + "num_input_tokens_seen": 29549525, + "step": 1388, + "time_per_iteration": 2.6343491077423096 + }, + { + "auxiliary_loss_clip": 0.01103925, + "auxiliary_loss_mlp": 0.01173844, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.00141931, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.003044558236542, + "language_loss": 0.79539889, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81817663, + "num_input_tokens_seen": 29568705, + "step": 1389, + "time_per_iteration": 2.7005341053009033 + }, + { + "auxiliary_loss_clip": 0.01153333, + "auxiliary_loss_mlp": 0.01174314, + "balance_loss_clip": 1.00250721, + "balance_loss_mlp": 1.00131798, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 6.221464144719645, + "language_loss": 0.87354481, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89682126, + "num_input_tokens_seen": 29585855, + "step": 1390, + "time_per_iteration": 2.5818281173706055 + }, + { + "auxiliary_loss_clip": 0.01153209, + "auxiliary_loss_mlp": 0.01173985, + "balance_loss_clip": 1.00228238, + "balance_loss_mlp": 1.0012747, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.6188998364611717, + "language_loss": 0.86671937, + "learning_rate": 3.969904125783517e-06, + "loss": 0.88999128, + "num_input_tokens_seen": 29607280, + "step": 1391, + "time_per_iteration": 2.642620325088501 + }, + { + "auxiliary_loss_clip": 0.011211, + "auxiliary_loss_mlp": 0.01174403, + "balance_loss_clip": 1.00224841, + "balance_loss_mlp": 1.00150144, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 3.7528803409684897, + "language_loss": 0.88024479, + "learning_rate": 3.969836778645371e-06, + "loss": 0.90319985, + "num_input_tokens_seen": 29624130, + "step": 1392, + "time_per_iteration": 2.653704881668091 + }, + { + "auxiliary_loss_clip": 0.01169338, + "auxiliary_loss_mlp": 0.01174103, + "balance_loss_clip": 1.00236487, + "balance_loss_mlp": 1.00148797, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.212769093057282, + "language_loss": 0.80722457, + "learning_rate": 3.969769356810819e-06, + "loss": 0.83065897, + "num_input_tokens_seen": 29643210, + "step": 1393, + "time_per_iteration": 2.5663902759552 + }, + { + "auxiliary_loss_clip": 0.01185859, + "auxiliary_loss_mlp": 0.01173804, + "balance_loss_clip": 1.00265861, + "balance_loss_mlp": 1.00156999, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.7421826221334713, + "language_loss": 0.85077691, + "learning_rate": 3.969701860282415e-06, + "loss": 0.87437356, + "num_input_tokens_seen": 29663920, + "step": 1394, + "time_per_iteration": 2.568995237350464 + }, + { + "auxiliary_loss_clip": 0.01120325, + "auxiliary_loss_mlp": 0.01173642, + "balance_loss_clip": 1.00214219, + "balance_loss_mlp": 1.00102687, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 3.4904258978503653, + "language_loss": 0.83042288, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85336256, + "num_input_tokens_seen": 29683825, + "step": 1395, + "time_per_iteration": 2.6586170196533203 + }, + { + "auxiliary_loss_clip": 0.01169426, + "auxiliary_loss_mlp": 0.00749583, + "balance_loss_clip": 1.00252259, + "balance_loss_mlp": 1.00057054, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 3.0558970508791035, + "language_loss": 0.82423365, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84342378, + "num_input_tokens_seen": 29698775, + "step": 1396, + "time_per_iteration": 2.5562431812286377 + }, + { + "auxiliary_loss_clip": 0.01169402, + "auxiliary_loss_mlp": 0.0117376, + "balance_loss_clip": 1.00251055, + "balance_loss_mlp": 1.00124002, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 1.935092633814565, + "language_loss": 0.76893955, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79237115, + "num_input_tokens_seen": 29719430, + "step": 1397, + "time_per_iteration": 2.600935459136963 + }, + { + "auxiliary_loss_clip": 0.01136495, + "auxiliary_loss_mlp": 0.01173718, + "balance_loss_clip": 1.00229263, + "balance_loss_mlp": 1.00100744, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 1.8693082904264067, + "language_loss": 0.78064728, + "learning_rate": 3.969431127281516e-06, + "loss": 0.80374938, + "num_input_tokens_seen": 29739685, + "step": 1398, + "time_per_iteration": 2.6927194595336914 + }, + { + "auxiliary_loss_clip": 0.01185803, + "auxiliary_loss_mlp": 0.01173769, + "balance_loss_clip": 1.00252271, + "balance_loss_mlp": 1.00134468, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 2.126416375330137, + "language_loss": 0.9509201, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97451586, + "num_input_tokens_seen": 29756165, + "step": 1399, + "time_per_iteration": 2.5163655281066895 + }, + { + "auxiliary_loss_clip": 0.01170241, + "auxiliary_loss_mlp": 0.01174169, + "balance_loss_clip": 1.00251842, + "balance_loss_mlp": 1.00126791, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.7179616048428255, + "language_loss": 0.8225615, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84600562, + "num_input_tokens_seen": 29776425, + "step": 1400, + "time_per_iteration": 2.6212923526763916 + }, + { + "auxiliary_loss_clip": 0.01152812, + "auxiliary_loss_mlp": 0.01173622, + "balance_loss_clip": 1.00232911, + "balance_loss_mlp": 1.00100708, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 1.8041157894290303, + "language_loss": 0.86487317, + "learning_rate": 3.969227293371099e-06, + "loss": 0.88813758, + "num_input_tokens_seen": 29796440, + "step": 1401, + "time_per_iteration": 2.6295037269592285 + }, + { + "auxiliary_loss_clip": 0.01185855, + "auxiliary_loss_mlp": 0.01174035, + "balance_loss_clip": 1.00238204, + "balance_loss_mlp": 1.00142038, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 1.7746429523117724, + "language_loss": 0.87289083, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89648962, + "num_input_tokens_seen": 29814755, + "step": 1402, + "time_per_iteration": 2.545071840286255 + }, + { + "auxiliary_loss_clip": 0.01139335, + "auxiliary_loss_mlp": 0.00749599, + "balance_loss_clip": 1.00222015, + "balance_loss_mlp": 1.00051391, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.1021450255094924, + "language_loss": 0.89081138, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90970075, + "num_input_tokens_seen": 29834785, + "step": 1403, + "time_per_iteration": 2.839660882949829 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01173399, + "balance_loss_clip": 1.00226998, + "balance_loss_mlp": 1.00097477, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 2.137823663934574, + "language_loss": 0.80393362, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82719433, + "num_input_tokens_seen": 29854695, + "step": 1404, + "time_per_iteration": 2.655214786529541 + }, + { + "auxiliary_loss_clip": 0.01153417, + "auxiliary_loss_mlp": 0.0117408, + "balance_loss_clip": 1.00228262, + "balance_loss_mlp": 1.00156069, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.100156192931601, + "language_loss": 0.83575404, + "learning_rate": 3.968954469409811e-06, + "loss": 0.85902905, + "num_input_tokens_seen": 29872180, + "step": 1405, + "time_per_iteration": 2.5771539211273193 + }, + { + "auxiliary_loss_clip": 0.01169636, + "auxiliary_loss_mlp": 0.01173323, + "balance_loss_clip": 1.00239623, + "balance_loss_mlp": 1.0010891, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.9384493793459154, + "language_loss": 0.80229837, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82572794, + "num_input_tokens_seen": 29893205, + "step": 1406, + "time_per_iteration": 2.651732921600342 + }, + { + "auxiliary_loss_clip": 0.01155528, + "auxiliary_loss_mlp": 0.01173806, + "balance_loss_clip": 1.00229418, + "balance_loss_mlp": 1.00157285, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.7183213254696184, + "language_loss": 0.79859084, + "learning_rate": 3.96881760944111e-06, + "loss": 0.82188421, + "num_input_tokens_seen": 29911970, + "step": 1407, + "time_per_iteration": 2.632808208465576 + }, + { + "auxiliary_loss_clip": 0.01169559, + "auxiliary_loss_mlp": 0.01173704, + "balance_loss_clip": 1.00240445, + "balance_loss_mlp": 1.00118494, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 2.2101590669723388, + "language_loss": 0.9202466, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94367921, + "num_input_tokens_seen": 29929925, + "step": 1408, + "time_per_iteration": 2.544079542160034 + }, + { + "auxiliary_loss_clip": 0.01152765, + "auxiliary_loss_mlp": 0.01168613, + "balance_loss_clip": 1.00360274, + "balance_loss_mlp": 1.00000346, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8926726350014749, + "language_loss": 0.61844003, + "learning_rate": 3.968680450841368e-06, + "loss": 0.64165378, + "num_input_tokens_seen": 29985950, + "step": 1409, + "time_per_iteration": 3.2575581073760986 + }, + { + "auxiliary_loss_clip": 0.0118554, + "auxiliary_loss_mlp": 0.01173184, + "balance_loss_clip": 1.00246954, + "balance_loss_mlp": 1.00133169, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 1.6659918768866144, + "language_loss": 0.86201692, + "learning_rate": 3.968611759561355e-06, + "loss": 0.8856042, + "num_input_tokens_seen": 30004330, + "step": 1410, + "time_per_iteration": 2.570279836654663 + }, + { + "auxiliary_loss_clip": 0.01169258, + "auxiliary_loss_mlp": 0.01173818, + "balance_loss_clip": 1.00233078, + "balance_loss_mlp": 1.00129819, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.2237608705453686, + "language_loss": 0.7424829, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76591372, + "num_input_tokens_seen": 30022555, + "step": 1411, + "time_per_iteration": 2.544919729232788 + }, + { + "auxiliary_loss_clip": 0.01185057, + "auxiliary_loss_mlp": 0.01168689, + "balance_loss_clip": 1.00376034, + "balance_loss_mlp": 1.00007904, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9050950937783483, + "language_loss": 0.56774485, + "learning_rate": 3.968474153054073e-06, + "loss": 0.59128225, + "num_input_tokens_seen": 30077220, + "step": 1412, + "time_per_iteration": 3.024279832839966 + }, + { + "auxiliary_loss_clip": 0.01153113, + "auxiliary_loss_mlp": 0.01173683, + "balance_loss_clip": 1.00225663, + "balance_loss_mlp": 1.00125897, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 1.9490169596758082, + "language_loss": 0.89362973, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91689765, + "num_input_tokens_seen": 30094600, + "step": 1413, + "time_per_iteration": 2.5906622409820557 + }, + { + "auxiliary_loss_clip": 0.01152515, + "auxiliary_loss_mlp": 0.01173668, + "balance_loss_clip": 1.00236881, + "balance_loss_mlp": 1.0011487, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 1.845145829201243, + "language_loss": 0.87785369, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90111554, + "num_input_tokens_seen": 30114475, + "step": 1414, + "time_per_iteration": 2.6340599060058594 + }, + { + "auxiliary_loss_clip": 0.0115271, + "auxiliary_loss_mlp": 0.01173823, + "balance_loss_clip": 1.00227475, + "balance_loss_mlp": 1.00139892, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.7419594081626957, + "language_loss": 0.77497149, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79823685, + "num_input_tokens_seen": 30133350, + "step": 1415, + "time_per_iteration": 2.6164252758026123 + }, + { + "auxiliary_loss_clip": 0.0116961, + "auxiliary_loss_mlp": 0.01173619, + "balance_loss_clip": 1.00233829, + "balance_loss_mlp": 1.00129032, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.6137144956487346, + "language_loss": 0.70763516, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73106748, + "num_input_tokens_seen": 30159005, + "step": 1416, + "time_per_iteration": 2.922147750854492 + }, + { + "auxiliary_loss_clip": 0.01152599, + "auxiliary_loss_mlp": 0.01173331, + "balance_loss_clip": 1.0022378, + "balance_loss_mlp": 1.00119233, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 4.361497946348274, + "language_loss": 0.74943817, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77269745, + "num_input_tokens_seen": 30179450, + "step": 1417, + "time_per_iteration": 2.6497340202331543 + }, + { + "auxiliary_loss_clip": 0.01153031, + "auxiliary_loss_mlp": 0.01173525, + "balance_loss_clip": 1.0023005, + "balance_loss_mlp": 1.0011003, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 7.036676541049047, + "language_loss": 0.82728374, + "learning_rate": 3.968059542142265e-06, + "loss": 0.85054934, + "num_input_tokens_seen": 30197235, + "step": 1418, + "time_per_iteration": 2.650756597518921 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.01168656, + "balance_loss_clip": 1.00312853, + "balance_loss_mlp": 1.00004601, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8615950263208345, + "language_loss": 0.56664962, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58969849, + "num_input_tokens_seen": 30257410, + "step": 1419, + "time_per_iteration": 4.498016357421875 + }, + { + "auxiliary_loss_clip": 0.01185654, + "auxiliary_loss_mlp": 0.01173522, + "balance_loss_clip": 1.00236464, + "balance_loss_mlp": 1.00128841, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.224458566077924, + "language_loss": 0.70442712, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72801888, + "num_input_tokens_seen": 30277865, + "step": 1420, + "time_per_iteration": 2.6063501834869385 + }, + { + "auxiliary_loss_clip": 0.011375, + "auxiliary_loss_mlp": 0.01173194, + "balance_loss_clip": 1.00225437, + "balance_loss_mlp": 1.00115144, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.66598677974585, + "language_loss": 0.88015389, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90326083, + "num_input_tokens_seen": 30298545, + "step": 1421, + "time_per_iteration": 2.7098772525787354 + }, + { + "auxiliary_loss_clip": 0.01184855, + "auxiliary_loss_mlp": 0.01168635, + "balance_loss_clip": 1.00371385, + "balance_loss_mlp": 1.00002515, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7966064722999943, + "language_loss": 0.63539028, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65892518, + "num_input_tokens_seen": 30361725, + "step": 1422, + "time_per_iteration": 3.104600191116333 + }, + { + "auxiliary_loss_clip": 0.01135813, + "auxiliary_loss_mlp": 0.01173438, + "balance_loss_clip": 1.00226212, + "balance_loss_mlp": 1.00139546, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 4.878373201500072, + "language_loss": 0.83393884, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85703135, + "num_input_tokens_seen": 30382180, + "step": 1423, + "time_per_iteration": 5.608163833618164 + }, + { + "auxiliary_loss_clip": 0.01137409, + "auxiliary_loss_mlp": 0.01173434, + "balance_loss_clip": 1.00232244, + "balance_loss_mlp": 1.00129545, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.6900460958900856, + "language_loss": 0.75255048, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77565891, + "num_input_tokens_seen": 30402980, + "step": 1424, + "time_per_iteration": 4.156313419342041 + }, + { + "auxiliary_loss_clip": 0.01119741, + "auxiliary_loss_mlp": 0.01173468, + "balance_loss_clip": 1.00205791, + "balance_loss_mlp": 1.00152111, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.9815251493434671, + "language_loss": 0.75926507, + "learning_rate": 3.96757243383196e-06, + "loss": 0.78219712, + "num_input_tokens_seen": 30420800, + "step": 1425, + "time_per_iteration": 2.66595458984375 + }, + { + "auxiliary_loss_clip": 0.01185625, + "auxiliary_loss_mlp": 0.01173347, + "balance_loss_clip": 1.00253582, + "balance_loss_mlp": 1.00120902, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.256538277485435, + "language_loss": 0.9331069, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95669663, + "num_input_tokens_seen": 30439620, + "step": 1426, + "time_per_iteration": 2.553572654724121 + }, + { + "auxiliary_loss_clip": 0.01104357, + "auxiliary_loss_mlp": 0.01173867, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.00163305, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 3.2180679456705024, + "language_loss": 0.75414044, + "learning_rate": 3.967432588494471e-06, + "loss": 0.77692264, + "num_input_tokens_seen": 30457300, + "step": 1427, + "time_per_iteration": 2.717972755432129 + }, + { + "auxiliary_loss_clip": 0.01185541, + "auxiliary_loss_mlp": 0.01173266, + "balance_loss_clip": 1.00244129, + "balance_loss_mlp": 1.00112796, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.919695322838684, + "language_loss": 0.82301062, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84659868, + "num_input_tokens_seen": 30471580, + "step": 1428, + "time_per_iteration": 2.5397567749023438 + }, + { + "auxiliary_loss_clip": 0.01169713, + "auxiliary_loss_mlp": 0.01173608, + "balance_loss_clip": 1.00245929, + "balance_loss_mlp": 1.00137448, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 1.8764601144952913, + "language_loss": 0.80316561, + "learning_rate": 3.967292444736023e-06, + "loss": 0.82659888, + "num_input_tokens_seen": 30492720, + "step": 1429, + "time_per_iteration": 2.667044162750244 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01173081, + "balance_loss_clip": 1.00220287, + "balance_loss_mlp": 1.00122881, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 1.9767989724857793, + "language_loss": 0.88467878, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90793407, + "num_input_tokens_seen": 30509535, + "step": 1430, + "time_per_iteration": 2.642730236053467 + }, + { + "auxiliary_loss_clip": 0.01136701, + "auxiliary_loss_mlp": 0.01173626, + "balance_loss_clip": 1.00232923, + "balance_loss_mlp": 1.00196481, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.7564915590222558, + "language_loss": 0.82047915, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84358245, + "num_input_tokens_seen": 30529490, + "step": 1431, + "time_per_iteration": 2.679874897003174 + }, + { + "auxiliary_loss_clip": 0.01136084, + "auxiliary_loss_mlp": 0.01173834, + "balance_loss_clip": 1.00215125, + "balance_loss_mlp": 1.00141001, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 3.867637688156139, + "language_loss": 0.77699244, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80009156, + "num_input_tokens_seen": 30550205, + "step": 1432, + "time_per_iteration": 2.6850550174713135 + }, + { + "auxiliary_loss_clip": 0.01152739, + "auxiliary_loss_mlp": 0.01173397, + "balance_loss_clip": 1.00219882, + "balance_loss_mlp": 1.0014497, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.075628482644236, + "language_loss": 0.72921908, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75248045, + "num_input_tokens_seen": 30568830, + "step": 1433, + "time_per_iteration": 2.6153576374053955 + }, + { + "auxiliary_loss_clip": 0.01136723, + "auxiliary_loss_mlp": 0.00749615, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.00063586, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 3.0576388159750247, + "language_loss": 0.85412884, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87299216, + "num_input_tokens_seen": 30585730, + "step": 1434, + "time_per_iteration": 2.634974718093872 + }, + { + "auxiliary_loss_clip": 0.01152813, + "auxiliary_loss_mlp": 0.01172974, + "balance_loss_clip": 1.00220954, + "balance_loss_mlp": 1.001122, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 2.6594166611466825, + "language_loss": 0.78547686, + "learning_rate": 3.966870223147707e-06, + "loss": 0.80873466, + "num_input_tokens_seen": 30603180, + "step": 1435, + "time_per_iteration": 2.611423969268799 + }, + { + "auxiliary_loss_clip": 0.01139416, + "auxiliary_loss_mlp": 0.01167854, + "balance_loss_clip": 1.0038271, + "balance_loss_mlp": 1.00000715, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.879744426399056, + "language_loss": 0.5789476, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60202026, + "num_input_tokens_seen": 30668895, + "step": 1436, + "time_per_iteration": 3.3291640281677246 + }, + { + "auxiliary_loss_clip": 0.01169521, + "auxiliary_loss_mlp": 0.01172826, + "balance_loss_clip": 1.00241995, + "balance_loss_mlp": 1.00106883, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.1411359695067955, + "language_loss": 0.68617368, + "learning_rate": 3.966728885918437e-06, + "loss": 0.70959717, + "num_input_tokens_seen": 30688955, + "step": 1437, + "time_per_iteration": 2.651641845703125 + }, + { + "auxiliary_loss_clip": 0.01104257, + "auxiliary_loss_mlp": 0.01173064, + "balance_loss_clip": 1.00204515, + "balance_loss_mlp": 1.00111699, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.0548344221509685, + "language_loss": 0.72798395, + "learning_rate": 3.966658105434627e-06, + "loss": 0.7507571, + "num_input_tokens_seen": 30706095, + "step": 1438, + "time_per_iteration": 2.7246484756469727 + }, + { + "auxiliary_loss_clip": 0.0116882, + "auxiliary_loss_mlp": 0.01172797, + "balance_loss_clip": 1.00237775, + "balance_loss_mlp": 1.00104022, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.7098439767608684, + "language_loss": 0.64089322, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66430938, + "num_input_tokens_seen": 30729025, + "step": 1439, + "time_per_iteration": 2.7089462280273438 + }, + { + "auxiliary_loss_clip": 0.01136099, + "auxiliary_loss_mlp": 0.01172559, + "balance_loss_clip": 1.00218475, + "balance_loss_mlp": 1.00108838, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 1.936697454842285, + "language_loss": 0.87329125, + "learning_rate": 3.966516320742077e-06, + "loss": 0.8963778, + "num_input_tokens_seen": 30746155, + "step": 1440, + "time_per_iteration": 2.6658644676208496 + }, + { + "auxiliary_loss_clip": 0.0113608, + "auxiliary_loss_mlp": 0.00749605, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.00063658, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.030948485587219, + "language_loss": 0.83487034, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85372716, + "num_input_tokens_seen": 30761410, + "step": 1441, + "time_per_iteration": 2.7004690170288086 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.01167875, + "balance_loss_clip": 1.00343323, + "balance_loss_mlp": 1.00002813, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8475106490942806, + "language_loss": 0.60499048, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62851369, + "num_input_tokens_seen": 30823010, + "step": 1442, + "time_per_iteration": 3.2524285316467285 + }, + { + "auxiliary_loss_clip": 0.01152514, + "auxiliary_loss_mlp": 0.01172974, + "balance_loss_clip": 1.00238049, + "balance_loss_mlp": 1.00102687, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 2.2393088969845967, + "language_loss": 0.79162478, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81487966, + "num_input_tokens_seen": 30841980, + "step": 1443, + "time_per_iteration": 2.7041940689086914 + }, + { + "auxiliary_loss_clip": 0.01169001, + "auxiliary_loss_mlp": 0.01173132, + "balance_loss_clip": 1.00229943, + "balance_loss_mlp": 1.00108969, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.7621263366257998, + "language_loss": 0.82379639, + "learning_rate": 3.966231856532584e-06, + "loss": 0.84721768, + "num_input_tokens_seen": 30863280, + "step": 1444, + "time_per_iteration": 2.6500797271728516 + }, + { + "auxiliary_loss_clip": 0.01185552, + "auxiliary_loss_mlp": 0.01173108, + "balance_loss_clip": 1.00244117, + "balance_loss_mlp": 1.00106537, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.8223579866087471, + "language_loss": 0.86637282, + "learning_rate": 3.966160554074189e-06, + "loss": 0.88995945, + "num_input_tokens_seen": 30881710, + "step": 1445, + "time_per_iteration": 2.516470193862915 + }, + { + "auxiliary_loss_clip": 0.01168963, + "auxiliary_loss_mlp": 0.01173201, + "balance_loss_clip": 1.00248325, + "balance_loss_mlp": 1.00134921, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 2.1078870027188445, + "language_loss": 0.81435764, + "learning_rate": 3.96608917705879e-06, + "loss": 0.83777928, + "num_input_tokens_seen": 30900225, + "step": 1446, + "time_per_iteration": 2.567401170730591 + }, + { + "auxiliary_loss_clip": 0.01167822, + "auxiliary_loss_mlp": 0.01167894, + "balance_loss_clip": 1.0033257, + "balance_loss_mlp": 1.00004733, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.7306790263174067, + "language_loss": 0.54736483, + "learning_rate": 3.966017725489091e-06, + "loss": 0.57072198, + "num_input_tokens_seen": 30959580, + "step": 1447, + "time_per_iteration": 3.172292470932007 + }, + { + "auxiliary_loss_clip": 0.01136161, + "auxiliary_loss_mlp": 0.01172846, + "balance_loss_clip": 1.00208151, + "balance_loss_mlp": 1.00128031, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 4.448416113303806, + "language_loss": 0.84501469, + "learning_rate": 3.965946199367804e-06, + "loss": 0.86810482, + "num_input_tokens_seen": 30976775, + "step": 1448, + "time_per_iteration": 2.6167709827423096 + }, + { + "auxiliary_loss_clip": 0.01185566, + "auxiliary_loss_mlp": 0.01173352, + "balance_loss_clip": 1.00248837, + "balance_loss_mlp": 1.00130963, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 2.8119643765644033, + "language_loss": 0.80731869, + "learning_rate": 3.965874598697638e-06, + "loss": 0.83090794, + "num_input_tokens_seen": 30990495, + "step": 1449, + "time_per_iteration": 2.568652391433716 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01172862, + "balance_loss_clip": 1.00225925, + "balance_loss_mlp": 1.00100994, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 1.497944509995706, + "language_loss": 0.70692319, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73001754, + "num_input_tokens_seen": 31014080, + "step": 1450, + "time_per_iteration": 2.794854164123535 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.01172713, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.00095582, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.9831910163368813, + "language_loss": 0.83652705, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85945421, + "num_input_tokens_seen": 31031210, + "step": 1451, + "time_per_iteration": 2.6552047729492188 + }, + { + "auxiliary_loss_clip": 0.01119115, + "auxiliary_loss_mlp": 0.00749512, + "balance_loss_clip": 1.00202227, + "balance_loss_mlp": 1.000494, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 2.0290655924025183, + "language_loss": 0.74501181, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76369804, + "num_input_tokens_seen": 31049710, + "step": 1452, + "time_per_iteration": 2.702075958251953 + }, + { + "auxiliary_loss_clip": 0.01153278, + "auxiliary_loss_mlp": 0.01173115, + "balance_loss_clip": 1.00240147, + "balance_loss_mlp": 1.00135839, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 5.777344905482667, + "language_loss": 0.79881024, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82207417, + "num_input_tokens_seen": 31066160, + "step": 1453, + "time_per_iteration": 2.6582441329956055 + }, + { + "auxiliary_loss_clip": 0.01152468, + "auxiliary_loss_mlp": 0.0117267, + "balance_loss_clip": 1.002177, + "balance_loss_mlp": 1.00148559, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 2.8775973672123927, + "language_loss": 0.71352756, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73677897, + "num_input_tokens_seen": 31085270, + "step": 1454, + "time_per_iteration": 2.623690128326416 + }, + { + "auxiliary_loss_clip": 0.01168437, + "auxiliary_loss_mlp": 0.01167827, + "balance_loss_clip": 1.00327969, + "balance_loss_mlp": 0.99997991, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7821508763779305, + "language_loss": 0.58586633, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60922897, + "num_input_tokens_seen": 31148445, + "step": 1455, + "time_per_iteration": 3.168488025665283 + }, + { + "auxiliary_loss_clip": 0.01185307, + "auxiliary_loss_mlp": 0.01172853, + "balance_loss_clip": 1.00244319, + "balance_loss_mlp": 1.0012871, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.5081092117339667, + "language_loss": 0.77543241, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79901403, + "num_input_tokens_seen": 31168770, + "step": 1456, + "time_per_iteration": 2.6710121631622314 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01172907, + "balance_loss_clip": 1.00211573, + "balance_loss_mlp": 1.00143623, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 1.8594765972393532, + "language_loss": 0.72380012, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74673253, + "num_input_tokens_seen": 31189270, + "step": 1457, + "time_per_iteration": 4.151994943618774 + }, + { + "auxiliary_loss_clip": 0.01168801, + "auxiliary_loss_mlp": 0.01172701, + "balance_loss_clip": 1.00224864, + "balance_loss_mlp": 1.00132608, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.9840507454358518, + "language_loss": 0.86471319, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88812822, + "num_input_tokens_seen": 31210385, + "step": 1458, + "time_per_iteration": 2.600456476211548 + }, + { + "auxiliary_loss_clip": 0.01153014, + "auxiliary_loss_mlp": 0.01172882, + "balance_loss_clip": 1.00252581, + "balance_loss_mlp": 1.00150704, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.6178834501807688, + "language_loss": 0.80353928, + "learning_rate": 3.965154492406486e-06, + "loss": 0.82679832, + "num_input_tokens_seen": 31229745, + "step": 1459, + "time_per_iteration": 2.5932059288024902 + }, + { + "auxiliary_loss_clip": 0.01102522, + "auxiliary_loss_mlp": 0.01172584, + "balance_loss_clip": 1.00192595, + "balance_loss_mlp": 1.00111389, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.2291383258123973, + "language_loss": 0.84578478, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86853582, + "num_input_tokens_seen": 31248280, + "step": 1460, + "time_per_iteration": 5.500978708267212 + }, + { + "auxiliary_loss_clip": 0.0116876, + "auxiliary_loss_mlp": 0.0117273, + "balance_loss_clip": 1.00224936, + "balance_loss_mlp": 1.00144994, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 3.830381596510552, + "language_loss": 0.80201948, + "learning_rate": 3.965009576834394e-06, + "loss": 0.82543433, + "num_input_tokens_seen": 31262190, + "step": 1461, + "time_per_iteration": 4.000235080718994 + }, + { + "auxiliary_loss_clip": 0.01155399, + "auxiliary_loss_mlp": 0.01172792, + "balance_loss_clip": 1.00249326, + "balance_loss_mlp": 1.00151253, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.5904866554562336, + "language_loss": 0.76380169, + "learning_rate": 3.964937007276932e-06, + "loss": 0.78708363, + "num_input_tokens_seen": 31283690, + "step": 1462, + "time_per_iteration": 2.7151238918304443 + }, + { + "auxiliary_loss_clip": 0.01152923, + "auxiliary_loss_mlp": 0.01172973, + "balance_loss_clip": 1.00226915, + "balance_loss_mlp": 1.00121617, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 1.9947776393199343, + "language_loss": 0.7456091, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76886809, + "num_input_tokens_seen": 31302505, + "step": 1463, + "time_per_iteration": 2.5979738235473633 + }, + { + "auxiliary_loss_clip": 0.01169072, + "auxiliary_loss_mlp": 0.01172654, + "balance_loss_clip": 1.00231695, + "balance_loss_mlp": 1.00118303, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.261915589473258, + "language_loss": 0.83200389, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85542119, + "num_input_tokens_seen": 31323070, + "step": 1464, + "time_per_iteration": 2.6215076446533203 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.01173039, + "balance_loss_clip": 1.00231361, + "balance_loss_mlp": 1.00156879, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 1.8875335181813466, + "language_loss": 0.78123212, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80449283, + "num_input_tokens_seen": 31341880, + "step": 1465, + "time_per_iteration": 2.64022159576416 + }, + { + "auxiliary_loss_clip": 0.01185242, + "auxiliary_loss_mlp": 0.01173051, + "balance_loss_clip": 1.0024668, + "balance_loss_mlp": 1.00167525, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 2.0690725303078983, + "language_loss": 0.85403335, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87761629, + "num_input_tokens_seen": 31361995, + "step": 1466, + "time_per_iteration": 2.5451676845550537 + }, + { + "auxiliary_loss_clip": 0.01120515, + "auxiliary_loss_mlp": 0.0074946, + "balance_loss_clip": 1.00225496, + "balance_loss_mlp": 1.00046134, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 2.494885283559166, + "language_loss": 0.83760846, + "learning_rate": 3.964573041885641e-06, + "loss": 0.85630822, + "num_input_tokens_seen": 31381515, + "step": 1467, + "time_per_iteration": 2.735844850540161 + }, + { + "auxiliary_loss_clip": 0.01168607, + "auxiliary_loss_mlp": 0.01172871, + "balance_loss_clip": 1.00232458, + "balance_loss_mlp": 1.0013051, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.8563303443655372, + "language_loss": 0.75907302, + "learning_rate": 3.964500025305907e-06, + "loss": 0.78248787, + "num_input_tokens_seen": 31400345, + "step": 1468, + "time_per_iteration": 2.5756306648254395 + }, + { + "auxiliary_loss_clip": 0.01168677, + "auxiliary_loss_mlp": 0.01172802, + "balance_loss_clip": 1.00228405, + "balance_loss_mlp": 1.00142634, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.5300840097584654, + "language_loss": 0.8045705, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82798529, + "num_input_tokens_seen": 31419620, + "step": 1469, + "time_per_iteration": 2.605367422103882 + }, + { + "auxiliary_loss_clip": 0.01185195, + "auxiliary_loss_mlp": 0.01172572, + "balance_loss_clip": 1.00252748, + "balance_loss_mlp": 1.00157821, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 2.331891476099762, + "language_loss": 0.77612364, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.79970133, + "num_input_tokens_seen": 31437970, + "step": 1470, + "time_per_iteration": 2.522987127304077 + }, + { + "auxiliary_loss_clip": 0.01185198, + "auxiliary_loss_mlp": 0.01172718, + "balance_loss_clip": 1.00242281, + "balance_loss_mlp": 1.00153399, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.9760113553234084, + "language_loss": 0.84279764, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86637682, + "num_input_tokens_seen": 31457040, + "step": 1471, + "time_per_iteration": 2.5091168880462646 + }, + { + "auxiliary_loss_clip": 0.01139511, + "auxiliary_loss_mlp": 0.01172463, + "balance_loss_clip": 1.00239539, + "balance_loss_mlp": 1.00156474, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.5266610551896884, + "language_loss": 0.83682525, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85994494, + "num_input_tokens_seen": 31477520, + "step": 1472, + "time_per_iteration": 2.6470227241516113 + }, + { + "auxiliary_loss_clip": 0.01152758, + "auxiliary_loss_mlp": 0.01172342, + "balance_loss_clip": 1.0023911, + "balance_loss_mlp": 1.00125265, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 4.30199707772233, + "language_loss": 0.82768804, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85093904, + "num_input_tokens_seen": 31495575, + "step": 1473, + "time_per_iteration": 2.628190279006958 + }, + { + "auxiliary_loss_clip": 0.01103757, + "auxiliary_loss_mlp": 0.01172529, + "balance_loss_clip": 1.00189483, + "balance_loss_mlp": 1.00144017, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.5465229743211057, + "language_loss": 0.78401744, + "learning_rate": 3.964060361549816e-06, + "loss": 0.80678034, + "num_input_tokens_seen": 31520020, + "step": 1474, + "time_per_iteration": 2.8036997318267822 + }, + { + "auxiliary_loss_clip": 0.01139191, + "auxiliary_loss_mlp": 0.0117227, + "balance_loss_clip": 1.00239062, + "balance_loss_mlp": 1.00137186, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.6505770931831094, + "language_loss": 0.78906101, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81217563, + "num_input_tokens_seen": 31539265, + "step": 1475, + "time_per_iteration": 2.6839146614074707 + }, + { + "auxiliary_loss_clip": 0.01185209, + "auxiliary_loss_mlp": 0.01172474, + "balance_loss_clip": 1.00243008, + "balance_loss_mlp": 1.00128961, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.8358943825584468, + "language_loss": 0.73993528, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76351213, + "num_input_tokens_seen": 31563425, + "step": 1476, + "time_per_iteration": 2.7298195362091064 + }, + { + "auxiliary_loss_clip": 0.01152063, + "auxiliary_loss_mlp": 0.01172931, + "balance_loss_clip": 1.00224268, + "balance_loss_mlp": 1.00155556, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.6336792478345388, + "language_loss": 0.74621987, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76946986, + "num_input_tokens_seen": 31584525, + "step": 1477, + "time_per_iteration": 2.7032604217529297 + }, + { + "auxiliary_loss_clip": 0.01185204, + "auxiliary_loss_mlp": 0.01172454, + "balance_loss_clip": 1.00242031, + "balance_loss_mlp": 1.00098372, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.188511832880215, + "language_loss": 0.86799669, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89157331, + "num_input_tokens_seen": 31603325, + "step": 1478, + "time_per_iteration": 2.543424606323242 + }, + { + "auxiliary_loss_clip": 0.01169214, + "auxiliary_loss_mlp": 0.01172516, + "balance_loss_clip": 1.00234962, + "balance_loss_mlp": 1.00142694, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 1.7422418159241315, + "language_loss": 0.77783763, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80125499, + "num_input_tokens_seen": 31624820, + "step": 1479, + "time_per_iteration": 2.601470947265625 + }, + { + "auxiliary_loss_clip": 0.01155152, + "auxiliary_loss_mlp": 0.01172395, + "balance_loss_clip": 1.0021584, + "balance_loss_mlp": 1.00121093, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.250734448561988, + "language_loss": 0.77915466, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80243015, + "num_input_tokens_seen": 31646080, + "step": 1480, + "time_per_iteration": 2.627958297729492 + }, + { + "auxiliary_loss_clip": 0.01168753, + "auxiliary_loss_mlp": 0.01173218, + "balance_loss_clip": 1.00229168, + "balance_loss_mlp": 1.00155616, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 1.569740168030817, + "language_loss": 0.66888285, + "learning_rate": 3.963544031823624e-06, + "loss": 0.69230247, + "num_input_tokens_seen": 31665770, + "step": 1481, + "time_per_iteration": 2.6219828128814697 + }, + { + "auxiliary_loss_clip": 0.01120244, + "auxiliary_loss_mlp": 0.01172237, + "balance_loss_clip": 1.00213242, + "balance_loss_mlp": 1.00105226, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 2.801866630100904, + "language_loss": 0.96238112, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98530591, + "num_input_tokens_seen": 31683805, + "step": 1482, + "time_per_iteration": 2.7150588035583496 + }, + { + "auxiliary_loss_clip": 0.01153188, + "auxiliary_loss_mlp": 0.01172704, + "balance_loss_clip": 1.00223494, + "balance_loss_mlp": 1.0012337, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 1.8126058978320212, + "language_loss": 0.78478062, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80803955, + "num_input_tokens_seen": 31704630, + "step": 1483, + "time_per_iteration": 2.697258234024048 + }, + { + "auxiliary_loss_clip": 0.01169311, + "auxiliary_loss_mlp": 0.01173275, + "balance_loss_clip": 1.00243092, + "balance_loss_mlp": 1.00170898, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 1.921606055023808, + "language_loss": 0.85492647, + "learning_rate": 3.963321630732448e-06, + "loss": 0.87835228, + "num_input_tokens_seen": 31723255, + "step": 1484, + "time_per_iteration": 2.5479447841644287 + }, + { + "auxiliary_loss_clip": 0.01185386, + "auxiliary_loss_mlp": 0.01172944, + "balance_loss_clip": 1.00252068, + "balance_loss_mlp": 1.00147319, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.685226776543103, + "language_loss": 0.80271614, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82629943, + "num_input_tokens_seen": 31747045, + "step": 1485, + "time_per_iteration": 2.618434190750122 + }, + { + "auxiliary_loss_clip": 0.01168884, + "auxiliary_loss_mlp": 0.0117291, + "balance_loss_clip": 1.00220156, + "balance_loss_mlp": 1.00134373, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 3.002653131558574, + "language_loss": 0.83173555, + "learning_rate": 3.96317299108688e-06, + "loss": 0.8551535, + "num_input_tokens_seen": 31766615, + "step": 1486, + "time_per_iteration": 2.6137845516204834 + }, + { + "auxiliary_loss_clip": 0.01135764, + "auxiliary_loss_mlp": 0.01172747, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.00146663, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 2.018596577251711, + "language_loss": 0.76795709, + "learning_rate": 3.963098559597111e-06, + "loss": 0.79104221, + "num_input_tokens_seen": 31785855, + "step": 1487, + "time_per_iteration": 2.6741790771484375 + }, + { + "auxiliary_loss_clip": 0.01153142, + "auxiliary_loss_mlp": 0.0117237, + "balance_loss_clip": 1.00237572, + "balance_loss_mlp": 1.00118589, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.131461575817156, + "language_loss": 0.82777596, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85103106, + "num_input_tokens_seen": 31804210, + "step": 1488, + "time_per_iteration": 2.60817289352417 + }, + { + "auxiliary_loss_clip": 0.01169195, + "auxiliary_loss_mlp": 0.01171929, + "balance_loss_clip": 1.00242186, + "balance_loss_mlp": 1.00112581, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9894226286410261, + "language_loss": 0.72124279, + "learning_rate": 3.962949473297718e-06, + "loss": 0.74465406, + "num_input_tokens_seen": 31826150, + "step": 1489, + "time_per_iteration": 2.821889877319336 + }, + { + "auxiliary_loss_clip": 0.01135876, + "auxiliary_loss_mlp": 0.01171869, + "balance_loss_clip": 1.00211179, + "balance_loss_mlp": 1.00097072, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.900771220136635, + "language_loss": 0.89892781, + "learning_rate": 3.962874818493745e-06, + "loss": 0.92200518, + "num_input_tokens_seen": 31848060, + "step": 1490, + "time_per_iteration": 2.822266101837158 + }, + { + "auxiliary_loss_clip": 0.01168568, + "auxiliary_loss_mlp": 0.01172299, + "balance_loss_clip": 1.00228024, + "balance_loss_mlp": 1.00120974, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 1.8708319812561514, + "language_loss": 0.73523241, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75864106, + "num_input_tokens_seen": 31870040, + "step": 1491, + "time_per_iteration": 2.6081347465515137 + }, + { + "auxiliary_loss_clip": 0.01184868, + "auxiliary_loss_mlp": 0.0074938, + "balance_loss_clip": 1.00230086, + "balance_loss_mlp": 1.00035799, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6434513898666878, + "language_loss": 0.76744407, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.78678656, + "num_input_tokens_seen": 31890400, + "step": 1492, + "time_per_iteration": 2.552825927734375 + }, + { + "auxiliary_loss_clip": 0.01185023, + "auxiliary_loss_mlp": 0.01172292, + "balance_loss_clip": 1.00244427, + "balance_loss_mlp": 1.00120246, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.130722137159313, + "language_loss": 0.70999825, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73357141, + "num_input_tokens_seen": 31913435, + "step": 1493, + "time_per_iteration": 2.6269702911376953 + }, + { + "auxiliary_loss_clip": 0.01184997, + "auxiliary_loss_mlp": 0.01172567, + "balance_loss_clip": 1.00240231, + "balance_loss_mlp": 1.00128675, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 1.788319094530604, + "language_loss": 0.86932707, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89290273, + "num_input_tokens_seen": 31932435, + "step": 1494, + "time_per_iteration": 3.919513702392578 + }, + { + "auxiliary_loss_clip": 0.0108742, + "auxiliary_loss_mlp": 0.01172656, + "balance_loss_clip": 1.00186765, + "balance_loss_mlp": 1.00137603, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.7816275955532141, + "language_loss": 0.83113694, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85373771, + "num_input_tokens_seen": 31950125, + "step": 1495, + "time_per_iteration": 2.7333455085754395 + }, + { + "auxiliary_loss_clip": 0.01169315, + "auxiliary_loss_mlp": 0.01172523, + "balance_loss_clip": 1.00256586, + "balance_loss_mlp": 1.00114751, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 2.106294752109485, + "language_loss": 0.7019521, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72537053, + "num_input_tokens_seen": 31968050, + "step": 1496, + "time_per_iteration": 2.5369410514831543 + }, + { + "auxiliary_loss_clip": 0.01152221, + "auxiliary_loss_mlp": 0.01172336, + "balance_loss_clip": 1.00222683, + "balance_loss_mlp": 1.00124717, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6459753321742387, + "language_loss": 0.79864448, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82189006, + "num_input_tokens_seen": 31985675, + "step": 1497, + "time_per_iteration": 2.580235242843628 + }, + { + "auxiliary_loss_clip": 0.01103103, + "auxiliary_loss_mlp": 0.01172654, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00127864, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 8.848115202281296, + "language_loss": 0.82867008, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.85142767, + "num_input_tokens_seen": 32005180, + "step": 1498, + "time_per_iteration": 5.476789712905884 + }, + { + "auxiliary_loss_clip": 0.01152835, + "auxiliary_loss_mlp": 0.01172938, + "balance_loss_clip": 1.00243354, + "balance_loss_mlp": 1.00165808, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.188902168579791, + "language_loss": 0.78657466, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80983245, + "num_input_tokens_seen": 32022970, + "step": 1499, + "time_per_iteration": 4.028857707977295 + }, + { + "auxiliary_loss_clip": 0.01169241, + "auxiliary_loss_mlp": 0.00749376, + "balance_loss_clip": 1.00249457, + "balance_loss_mlp": 1.00039124, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.7653638929509865, + "language_loss": 0.9315877, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95077384, + "num_input_tokens_seen": 32043055, + "step": 1500, + "time_per_iteration": 2.602778434753418 + }, + { + "auxiliary_loss_clip": 0.01135371, + "auxiliary_loss_mlp": 0.01172595, + "balance_loss_clip": 1.00207984, + "balance_loss_mlp": 1.00122023, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.285522763949321, + "language_loss": 0.7397815, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76286113, + "num_input_tokens_seen": 32061900, + "step": 1501, + "time_per_iteration": 2.6592798233032227 + }, + { + "auxiliary_loss_clip": 0.01135547, + "auxiliary_loss_mlp": 0.01167098, + "balance_loss_clip": 1.00324345, + "balance_loss_mlp": 1.00001383, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.753930615712022, + "language_loss": 0.58356273, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60658908, + "num_input_tokens_seen": 32122745, + "step": 1502, + "time_per_iteration": 3.247115135192871 + }, + { + "auxiliary_loss_clip": 0.01153074, + "auxiliary_loss_mlp": 0.01172077, + "balance_loss_clip": 1.00224042, + "balance_loss_mlp": 1.00117898, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.3590021243538226, + "language_loss": 0.69746304, + "learning_rate": 3.961897533727119e-06, + "loss": 0.72071457, + "num_input_tokens_seen": 32145125, + "step": 1503, + "time_per_iteration": 2.7929575443267822 + }, + { + "auxiliary_loss_clip": 0.01119846, + "auxiliary_loss_mlp": 0.01172268, + "balance_loss_clip": 1.00201023, + "balance_loss_mlp": 1.0011791, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 1.994882524401047, + "language_loss": 0.86536986, + "learning_rate": 3.961821837128306e-06, + "loss": 0.888291, + "num_input_tokens_seen": 32166255, + "step": 1504, + "time_per_iteration": 2.725780725479126 + }, + { + "auxiliary_loss_clip": 0.01136566, + "auxiliary_loss_mlp": 0.01172797, + "balance_loss_clip": 1.00223863, + "balance_loss_mlp": 1.00142181, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 2.312170758945136, + "language_loss": 0.72480041, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74789405, + "num_input_tokens_seen": 32184010, + "step": 1505, + "time_per_iteration": 2.6520884037017822 + }, + { + "auxiliary_loss_clip": 0.01135949, + "auxiliary_loss_mlp": 0.01172167, + "balance_loss_clip": 1.00225329, + "balance_loss_mlp": 1.00117314, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.3423846073879098, + "language_loss": 0.80445671, + "learning_rate": 3.961670220756114e-06, + "loss": 0.82753783, + "num_input_tokens_seen": 32201635, + "step": 1506, + "time_per_iteration": 2.65755033493042 + }, + { + "auxiliary_loss_clip": 0.01119906, + "auxiliary_loss_mlp": 0.01172157, + "balance_loss_clip": 1.00208294, + "balance_loss_mlp": 1.00125885, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.1727801731273257, + "language_loss": 0.76250619, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78542686, + "num_input_tokens_seen": 32221940, + "step": 1507, + "time_per_iteration": 2.7532474994659424 + }, + { + "auxiliary_loss_clip": 0.0115091, + "auxiliary_loss_mlp": 0.01166372, + "balance_loss_clip": 1.00275683, + "balance_loss_mlp": 1.0000515, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7317941092557572, + "language_loss": 0.57652885, + "learning_rate": 3.961518306836998e-06, + "loss": 0.5997017, + "num_input_tokens_seen": 32276495, + "step": 1508, + "time_per_iteration": 3.1068897247314453 + }, + { + "auxiliary_loss_clip": 0.01152565, + "auxiliary_loss_mlp": 0.01171983, + "balance_loss_clip": 1.00229526, + "balance_loss_mlp": 1.00127542, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 1.906075828918904, + "language_loss": 0.85281837, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87606382, + "num_input_tokens_seen": 32294130, + "step": 1509, + "time_per_iteration": 2.639204740524292 + }, + { + "auxiliary_loss_clip": 0.01151742, + "auxiliary_loss_mlp": 0.01172304, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.00121486, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 3.0187719380279767, + "language_loss": 0.84805167, + "learning_rate": 3.961366095394002e-06, + "loss": 0.87129211, + "num_input_tokens_seen": 32313555, + "step": 1510, + "time_per_iteration": 2.639799118041992 + }, + { + "auxiliary_loss_clip": 0.01152625, + "auxiliary_loss_mlp": 0.01172047, + "balance_loss_clip": 1.00228071, + "balance_loss_mlp": 1.00105333, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.8743008824005607, + "language_loss": 0.85263538, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87588215, + "num_input_tokens_seen": 32331430, + "step": 1511, + "time_per_iteration": 2.6235668659210205 + }, + { + "auxiliary_loss_clip": 0.011357, + "auxiliary_loss_mlp": 0.01172052, + "balance_loss_clip": 1.00214267, + "balance_loss_mlp": 1.00115323, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.4700215991344892, + "language_loss": 0.84857643, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87165397, + "num_input_tokens_seen": 32353705, + "step": 1512, + "time_per_iteration": 2.7166457176208496 + }, + { + "auxiliary_loss_clip": 0.01135491, + "auxiliary_loss_mlp": 0.01171731, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00102305, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.221748217150877, + "language_loss": 0.86904216, + "learning_rate": 3.961137220422749e-06, + "loss": 0.8921144, + "num_input_tokens_seen": 32370520, + "step": 1513, + "time_per_iteration": 2.6114399433135986 + }, + { + "auxiliary_loss_clip": 0.01168553, + "auxiliary_loss_mlp": 0.01172214, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.00112486, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.945905367216776, + "language_loss": 0.86433047, + "learning_rate": 3.961060780028764e-06, + "loss": 0.88773817, + "num_input_tokens_seen": 32389105, + "step": 1514, + "time_per_iteration": 2.587045192718506 + }, + { + "auxiliary_loss_clip": 0.01118611, + "auxiliary_loss_mlp": 0.01172252, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00135326, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.808157699073747, + "language_loss": 0.89941013, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92231882, + "num_input_tokens_seen": 32408065, + "step": 1515, + "time_per_iteration": 2.7201716899871826 + }, + { + "auxiliary_loss_clip": 0.01151778, + "auxiliary_loss_mlp": 0.01172315, + "balance_loss_clip": 1.00214279, + "balance_loss_mlp": 1.00141621, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.148041892410447, + "language_loss": 0.85150921, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87475014, + "num_input_tokens_seen": 32427225, + "step": 1516, + "time_per_iteration": 2.6655149459838867 + }, + { + "auxiliary_loss_clip": 0.01152237, + "auxiliary_loss_mlp": 0.01171961, + "balance_loss_clip": 1.00225401, + "balance_loss_mlp": 1.00115776, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5668175104027924, + "language_loss": 0.80893135, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83217335, + "num_input_tokens_seen": 32450510, + "step": 1517, + "time_per_iteration": 2.726223945617676 + }, + { + "auxiliary_loss_clip": 0.01168342, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_clip": 1.00239575, + "balance_loss_mlp": 1.00187182, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.857689318662588, + "language_loss": 0.7790271, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80243623, + "num_input_tokens_seen": 32468425, + "step": 1518, + "time_per_iteration": 2.6385433673858643 + }, + { + "auxiliary_loss_clip": 0.01168894, + "auxiliary_loss_mlp": 0.01172091, + "balance_loss_clip": 1.00234878, + "balance_loss_mlp": 1.00119233, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.911074646370275, + "language_loss": 0.86360228, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88701206, + "num_input_tokens_seen": 32487510, + "step": 1519, + "time_per_iteration": 2.576314926147461 + }, + { + "auxiliary_loss_clip": 0.01151751, + "auxiliary_loss_mlp": 0.01172115, + "balance_loss_clip": 1.00220752, + "balance_loss_mlp": 1.00102603, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.282228803907285, + "language_loss": 0.73013765, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75337631, + "num_input_tokens_seen": 32507250, + "step": 1520, + "time_per_iteration": 2.6270241737365723 + }, + { + "auxiliary_loss_clip": 0.01152023, + "auxiliary_loss_mlp": 0.0117221, + "balance_loss_clip": 1.00217152, + "balance_loss_mlp": 1.00121593, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 3.4692790692523343, + "language_loss": 0.85280198, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87604427, + "num_input_tokens_seen": 32526045, + "step": 1521, + "time_per_iteration": 2.6012370586395264 + }, + { + "auxiliary_loss_clip": 0.01103707, + "auxiliary_loss_mlp": 0.01172352, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.001454, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 1.9340326892251185, + "language_loss": 0.84085453, + "learning_rate": 3.960446580030599e-06, + "loss": 0.8636151, + "num_input_tokens_seen": 32546575, + "step": 1522, + "time_per_iteration": 2.7414746284484863 + }, + { + "auxiliary_loss_clip": 0.01184753, + "auxiliary_loss_mlp": 0.01171968, + "balance_loss_clip": 1.00240898, + "balance_loss_mlp": 1.00154638, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 1.9045400280587381, + "language_loss": 0.81041068, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83397788, + "num_input_tokens_seen": 32568795, + "step": 1523, + "time_per_iteration": 2.5740225315093994 + }, + { + "auxiliary_loss_clip": 0.01152337, + "auxiliary_loss_mlp": 0.00749484, + "balance_loss_clip": 1.00220013, + "balance_loss_mlp": 1.00050902, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.0871142436291974, + "language_loss": 0.74488807, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76390624, + "num_input_tokens_seen": 32587010, + "step": 1524, + "time_per_iteration": 2.618882179260254 + }, + { + "auxiliary_loss_clip": 0.01136098, + "auxiliary_loss_mlp": 0.01172589, + "balance_loss_clip": 1.00204992, + "balance_loss_mlp": 1.00140429, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.1381257863104777, + "language_loss": 0.85831499, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88140178, + "num_input_tokens_seen": 32602375, + "step": 1525, + "time_per_iteration": 2.6075427532196045 + }, + { + "auxiliary_loss_clip": 0.01151782, + "auxiliary_loss_mlp": 0.0117181, + "balance_loss_clip": 1.0022254, + "balance_loss_mlp": 1.0010072, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.287529501179401, + "language_loss": 0.74895847, + "learning_rate": 3.96013769577032e-06, + "loss": 0.77219439, + "num_input_tokens_seen": 32621460, + "step": 1526, + "time_per_iteration": 2.6854851245880127 + }, + { + "auxiliary_loss_clip": 0.01184712, + "auxiliary_loss_mlp": 0.01172142, + "balance_loss_clip": 1.00237274, + "balance_loss_mlp": 1.00124407, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.8146743091595845, + "language_loss": 0.77325988, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79682839, + "num_input_tokens_seen": 32640440, + "step": 1527, + "time_per_iteration": 2.543384075164795 + }, + { + "auxiliary_loss_clip": 0.01168004, + "auxiliary_loss_mlp": 0.01171855, + "balance_loss_clip": 1.0021677, + "balance_loss_mlp": 1.00114751, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 2.1549167354823324, + "language_loss": 0.78261894, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80601752, + "num_input_tokens_seen": 32660020, + "step": 1528, + "time_per_iteration": 2.6008522510528564 + }, + { + "auxiliary_loss_clip": 0.01136447, + "auxiliary_loss_mlp": 0.01172217, + "balance_loss_clip": 1.00215578, + "balance_loss_mlp": 1.00112796, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 4.177719030847131, + "language_loss": 0.7666446, + "learning_rate": 3.959905252114384e-06, + "loss": 0.78973126, + "num_input_tokens_seen": 32678170, + "step": 1529, + "time_per_iteration": 2.6684672832489014 + }, + { + "auxiliary_loss_clip": 0.01184803, + "auxiliary_loss_mlp": 0.00749484, + "balance_loss_clip": 1.00232577, + "balance_loss_mlp": 1.00053835, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.8164046330051915, + "language_loss": 0.83089042, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85023332, + "num_input_tokens_seen": 32697540, + "step": 1530, + "time_per_iteration": 2.603217601776123 + }, + { + "auxiliary_loss_clip": 0.01122316, + "auxiliary_loss_mlp": 0.01172162, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00135899, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.132638691898268, + "language_loss": 0.83635175, + "learning_rate": 3.959749918073179e-06, + "loss": 0.8592965, + "num_input_tokens_seen": 32716805, + "step": 1531, + "time_per_iteration": 2.6671719551086426 + }, + { + "auxiliary_loss_clip": 0.01136139, + "auxiliary_loss_mlp": 0.01172, + "balance_loss_clip": 1.00208557, + "balance_loss_mlp": 1.00100636, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 2.1078832068953433, + "language_loss": 0.81350076, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83658212, + "num_input_tokens_seen": 32736385, + "step": 1532, + "time_per_iteration": 4.024760961532593 + }, + { + "auxiliary_loss_clip": 0.01152218, + "auxiliary_loss_mlp": 0.01171807, + "balance_loss_clip": 1.0022037, + "balance_loss_mlp": 1.00109887, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 1.9209576606690644, + "language_loss": 0.83323449, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85647476, + "num_input_tokens_seen": 32757140, + "step": 1533, + "time_per_iteration": 2.725670576095581 + }, + { + "auxiliary_loss_clip": 0.01135095, + "auxiliary_loss_mlp": 0.01172057, + "balance_loss_clip": 1.00212193, + "balance_loss_mlp": 1.00115871, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 1.991918924848956, + "language_loss": 0.90161514, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92468667, + "num_input_tokens_seen": 32774860, + "step": 1534, + "time_per_iteration": 2.6182596683502197 + }, + { + "auxiliary_loss_clip": 0.01136316, + "auxiliary_loss_mlp": 0.01172132, + "balance_loss_clip": 1.00208354, + "balance_loss_mlp": 1.0012337, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 1.8698055263978233, + "language_loss": 0.7529797, + "learning_rate": 3.959438358247424e-06, + "loss": 0.77606422, + "num_input_tokens_seen": 32795250, + "step": 1535, + "time_per_iteration": 4.02425742149353 + }, + { + "auxiliary_loss_clip": 0.01168696, + "auxiliary_loss_mlp": 0.01171659, + "balance_loss_clip": 1.00230813, + "balance_loss_mlp": 1.00123751, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8275381107509898, + "language_loss": 0.81562936, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83903295, + "num_input_tokens_seen": 32813805, + "step": 1536, + "time_per_iteration": 5.420169830322266 + }, + { + "auxiliary_loss_clip": 0.01184615, + "auxiliary_loss_mlp": 0.01171828, + "balance_loss_clip": 1.00233209, + "balance_loss_mlp": 1.00121617, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 1.8116547017189286, + "language_loss": 0.89358079, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91714525, + "num_input_tokens_seen": 32830960, + "step": 1537, + "time_per_iteration": 2.539949417114258 + }, + { + "auxiliary_loss_clip": 0.01151988, + "auxiliary_loss_mlp": 0.01172129, + "balance_loss_clip": 1.00211108, + "balance_loss_mlp": 1.00123072, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 1.9518612264795934, + "language_loss": 0.80759352, + "learning_rate": 3.959203908195741e-06, + "loss": 0.83083475, + "num_input_tokens_seen": 32848275, + "step": 1538, + "time_per_iteration": 2.5929784774780273 + }, + { + "auxiliary_loss_clip": 0.01149738, + "auxiliary_loss_mlp": 0.01166434, + "balance_loss_clip": 1.00283957, + "balance_loss_mlp": 1.00011349, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.7444480712293168, + "language_loss": 0.57426494, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59742665, + "num_input_tokens_seen": 32917730, + "step": 1539, + "time_per_iteration": 3.277139186859131 + }, + { + "auxiliary_loss_clip": 0.0115131, + "auxiliary_loss_mlp": 0.01171697, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.0012753, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 4.38115858783374, + "language_loss": 0.67381871, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69704884, + "num_input_tokens_seen": 32934910, + "step": 1540, + "time_per_iteration": 2.6235783100128174 + }, + { + "auxiliary_loss_clip": 0.0113625, + "auxiliary_loss_mlp": 0.01171667, + "balance_loss_clip": 1.0020622, + "balance_loss_mlp": 1.00105429, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.8428841031248402, + "language_loss": 0.83658683, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85966599, + "num_input_tokens_seen": 32953840, + "step": 1541, + "time_per_iteration": 2.641550302505493 + }, + { + "auxiliary_loss_clip": 0.0118343, + "auxiliary_loss_mlp": 0.01166379, + "balance_loss_clip": 1.00332701, + "balance_loss_mlp": 1.00005865, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8833838694452999, + "language_loss": 0.61953831, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64303643, + "num_input_tokens_seen": 33011410, + "step": 1542, + "time_per_iteration": 3.1227340698242188 + }, + { + "auxiliary_loss_clip": 0.011517, + "auxiliary_loss_mlp": 0.01172009, + "balance_loss_clip": 1.00218606, + "balance_loss_mlp": 1.00139642, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.5071772960998322, + "language_loss": 0.82753503, + "learning_rate": 3.958811672285086e-06, + "loss": 0.85077214, + "num_input_tokens_seen": 33031675, + "step": 1543, + "time_per_iteration": 2.628693103790283 + }, + { + "auxiliary_loss_clip": 0.01136004, + "auxiliary_loss_mlp": 0.01171798, + "balance_loss_clip": 1.00219297, + "balance_loss_mlp": 1.001472, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.704204189331213, + "language_loss": 0.72219574, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74527383, + "num_input_tokens_seen": 33056355, + "step": 1544, + "time_per_iteration": 2.9541995525360107 + }, + { + "auxiliary_loss_clip": 0.01168671, + "auxiliary_loss_mlp": 0.0117168, + "balance_loss_clip": 1.00220478, + "balance_loss_mlp": 1.00097203, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.6061335582403895, + "language_loss": 0.77169633, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79509985, + "num_input_tokens_seen": 33079520, + "step": 1545, + "time_per_iteration": 2.6536076068878174 + }, + { + "auxiliary_loss_clip": 0.01134927, + "auxiliary_loss_mlp": 0.01171692, + "balance_loss_clip": 1.00212049, + "balance_loss_mlp": 1.00108004, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 3.046126957980969, + "language_loss": 0.74789369, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.77095997, + "num_input_tokens_seen": 33096135, + "step": 1546, + "time_per_iteration": 2.609661340713501 + }, + { + "auxiliary_loss_clip": 0.01168925, + "auxiliary_loss_mlp": 0.01171676, + "balance_loss_clip": 1.00232625, + "balance_loss_mlp": 1.00096846, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 1.8772314827929681, + "language_loss": 0.84633362, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86973965, + "num_input_tokens_seen": 33115245, + "step": 1547, + "time_per_iteration": 2.6056363582611084 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01172085, + "balance_loss_clip": 1.00218916, + "balance_loss_mlp": 1.00137722, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 2.4474600983078214, + "language_loss": 0.67375016, + "learning_rate": 3.958417579416199e-06, + "loss": 0.69683361, + "num_input_tokens_seen": 33136640, + "step": 1548, + "time_per_iteration": 2.732578754425049 + }, + { + "auxiliary_loss_clip": 0.01103537, + "auxiliary_loss_mlp": 0.01171757, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.00114453, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 1.925952081508901, + "language_loss": 0.83388495, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85663784, + "num_input_tokens_seen": 33155060, + "step": 1549, + "time_per_iteration": 2.748922348022461 + }, + { + "auxiliary_loss_clip": 0.01168176, + "auxiliary_loss_mlp": 0.01171534, + "balance_loss_clip": 1.00243914, + "balance_loss_mlp": 1.00101662, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.6319826351860205, + "language_loss": 0.75594592, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77934301, + "num_input_tokens_seen": 33175420, + "step": 1550, + "time_per_iteration": 2.6268177032470703 + }, + { + "auxiliary_loss_clip": 0.01135855, + "auxiliary_loss_mlp": 0.01171959, + "balance_loss_clip": 1.00218582, + "balance_loss_mlp": 1.00125146, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.312767362194556, + "language_loss": 0.83438301, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85746121, + "num_input_tokens_seen": 33194120, + "step": 1551, + "time_per_iteration": 2.709763765335083 + }, + { + "auxiliary_loss_clip": 0.01167114, + "auxiliary_loss_mlp": 0.00749129, + "balance_loss_clip": 1.00306511, + "balance_loss_mlp": 1.00001466, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7382351951004674, + "language_loss": 0.61798835, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63715076, + "num_input_tokens_seen": 33261080, + "step": 1552, + "time_per_iteration": 3.2901039123535156 + }, + { + "auxiliary_loss_clip": 0.01167225, + "auxiliary_loss_mlp": 0.01166759, + "balance_loss_clip": 1.00264239, + "balance_loss_mlp": 1.00043797, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8357994201876476, + "language_loss": 0.59020209, + "learning_rate": 3.958021629962681e-06, + "loss": 0.6135419, + "num_input_tokens_seen": 33330235, + "step": 1553, + "time_per_iteration": 3.281175374984741 + }, + { + "auxiliary_loss_clip": 0.01138857, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_clip": 1.00223613, + "balance_loss_mlp": 1.00117421, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 1.7956436492633638, + "language_loss": 0.87549716, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89860356, + "num_input_tokens_seen": 33349035, + "step": 1554, + "time_per_iteration": 2.689244031906128 + }, + { + "auxiliary_loss_clip": 0.01152049, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_clip": 1.00220525, + "balance_loss_mlp": 1.00141406, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.0739748044876802, + "language_loss": 0.8156032, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83884013, + "num_input_tokens_seen": 33368060, + "step": 1555, + "time_per_iteration": 2.734794855117798 + }, + { + "auxiliary_loss_clip": 0.01166717, + "auxiliary_loss_mlp": 0.01166342, + "balance_loss_clip": 1.00320721, + "balance_loss_mlp": 1.00002122, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8704231087259128, + "language_loss": 0.59660411, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61993468, + "num_input_tokens_seen": 33430825, + "step": 1556, + "time_per_iteration": 3.12412691116333 + }, + { + "auxiliary_loss_clip": 0.01168618, + "auxiliary_loss_mlp": 0.01171334, + "balance_loss_clip": 1.00240099, + "balance_loss_mlp": 1.00119853, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.7346256492900938, + "language_loss": 0.84510314, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86850262, + "num_input_tokens_seen": 33454855, + "step": 1557, + "time_per_iteration": 2.722898483276367 + }, + { + "auxiliary_loss_clip": 0.01086616, + "auxiliary_loss_mlp": 0.01171574, + "balance_loss_clip": 1.00167072, + "balance_loss_mlp": 1.00134349, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.6126680843701717, + "language_loss": 0.78124863, + "learning_rate": 3.957623824299893e-06, + "loss": 0.8038305, + "num_input_tokens_seen": 33476000, + "step": 1558, + "time_per_iteration": 2.943042278289795 + }, + { + "auxiliary_loss_clip": 0.01151508, + "auxiliary_loss_mlp": 0.01171769, + "balance_loss_clip": 1.00213456, + "balance_loss_mlp": 1.00125194, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 1.9416751574398823, + "language_loss": 0.79842108, + "learning_rate": 3.957544040455379e-06, + "loss": 0.82165384, + "num_input_tokens_seen": 33493845, + "step": 1559, + "time_per_iteration": 2.8364462852478027 + }, + { + "auxiliary_loss_clip": 0.01135298, + "auxiliary_loss_mlp": 0.01172017, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00150013, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 3.571948511726574, + "language_loss": 0.76744008, + "learning_rate": 3.957464182380599e-06, + "loss": 0.79051328, + "num_input_tokens_seen": 33510850, + "step": 1560, + "time_per_iteration": 2.6264073848724365 + }, + { + "auxiliary_loss_clip": 0.01135698, + "auxiliary_loss_mlp": 0.01171586, + "balance_loss_clip": 1.00212073, + "balance_loss_mlp": 1.00106883, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.7735900486164349, + "language_loss": 0.81186664, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83493948, + "num_input_tokens_seen": 33530430, + "step": 1561, + "time_per_iteration": 2.6652140617370605 + }, + { + "auxiliary_loss_clip": 0.01167871, + "auxiliary_loss_mlp": 0.01171257, + "balance_loss_clip": 1.00217628, + "balance_loss_mlp": 1.00093102, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 3.418175012987206, + "language_loss": 0.61836481, + "learning_rate": 3.957304243552354e-06, + "loss": 0.64175612, + "num_input_tokens_seen": 33551975, + "step": 1562, + "time_per_iteration": 2.6821036338806152 + }, + { + "auxiliary_loss_clip": 0.01151994, + "auxiliary_loss_mlp": 0.01171844, + "balance_loss_clip": 1.0022862, + "balance_loss_mlp": 1.00151825, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 1.779155404565033, + "language_loss": 0.84973931, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87297767, + "num_input_tokens_seen": 33569850, + "step": 1563, + "time_per_iteration": 2.5954577922821045 + }, + { + "auxiliary_loss_clip": 0.01151566, + "auxiliary_loss_mlp": 0.01171502, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00127077, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.732744185836293, + "language_loss": 0.7657643, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78899497, + "num_input_tokens_seen": 33590510, + "step": 1564, + "time_per_iteration": 2.6678242683410645 + }, + { + "auxiliary_loss_clip": 0.01155245, + "auxiliary_loss_mlp": 0.01171666, + "balance_loss_clip": 1.00232565, + "balance_loss_mlp": 1.00153017, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 1.8518336862511482, + "language_loss": 0.80231285, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82558203, + "num_input_tokens_seen": 33608810, + "step": 1565, + "time_per_iteration": 2.705369472503662 + }, + { + "auxiliary_loss_clip": 0.01151276, + "auxiliary_loss_mlp": 0.01172012, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00159061, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 2.2670884138200527, + "language_loss": 0.75464535, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77787828, + "num_input_tokens_seen": 33627265, + "step": 1566, + "time_per_iteration": 2.624497890472412 + }, + { + "auxiliary_loss_clip": 0.0113469, + "auxiliary_loss_mlp": 0.00749382, + "balance_loss_clip": 1.00193655, + "balance_loss_mlp": 1.00035679, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.1137567710601606, + "language_loss": 0.78249538, + "learning_rate": 3.956903097664407e-06, + "loss": 0.80133605, + "num_input_tokens_seen": 33644810, + "step": 1567, + "time_per_iteration": 2.6658308506011963 + }, + { + "auxiliary_loss_clip": 0.01152218, + "auxiliary_loss_mlp": 0.01171617, + "balance_loss_clip": 1.00213218, + "balance_loss_mlp": 1.00148153, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 1.7281026675406177, + "language_loss": 0.82442856, + "learning_rate": 3.956822645856749e-06, + "loss": 0.84766698, + "num_input_tokens_seen": 33665665, + "step": 1568, + "time_per_iteration": 2.6490719318389893 + }, + { + "auxiliary_loss_clip": 0.01184574, + "auxiliary_loss_mlp": 0.01171501, + "balance_loss_clip": 1.00234771, + "balance_loss_mlp": 1.00107932, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 1.8582661349894034, + "language_loss": 0.76756281, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.79112351, + "num_input_tokens_seen": 33684760, + "step": 1569, + "time_per_iteration": 2.5266506671905518 + }, + { + "auxiliary_loss_clip": 0.01119095, + "auxiliary_loss_mlp": 0.0117186, + "balance_loss_clip": 1.00204802, + "balance_loss_mlp": 1.00134349, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.2052094463134138, + "language_loss": 0.85861063, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88152015, + "num_input_tokens_seen": 33700750, + "step": 1570, + "time_per_iteration": 4.003573894500732 + }, + { + "auxiliary_loss_clip": 0.01119674, + "auxiliary_loss_mlp": 0.01171382, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00105572, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.587189032232012, + "language_loss": 0.76342475, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78633529, + "num_input_tokens_seen": 33724430, + "step": 1571, + "time_per_iteration": 2.7652065753936768 + }, + { + "auxiliary_loss_clip": 0.01120404, + "auxiliary_loss_mlp": 0.01171664, + "balance_loss_clip": 1.00202119, + "balance_loss_mlp": 1.00162411, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.5543987731328606, + "language_loss": 0.79478532, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81770599, + "num_input_tokens_seen": 33743455, + "step": 1572, + "time_per_iteration": 2.6945159435272217 + }, + { + "auxiliary_loss_clip": 0.01135715, + "auxiliary_loss_mlp": 0.01171354, + "balance_loss_clip": 1.00207508, + "balance_loss_mlp": 1.00140977, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.6802500361525021, + "language_loss": 0.87744898, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90051967, + "num_input_tokens_seen": 33763435, + "step": 1573, + "time_per_iteration": 4.05666971206665 + }, + { + "auxiliary_loss_clip": 0.01151521, + "auxiliary_loss_mlp": 0.01172327, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00171447, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 1.947528410470739, + "language_loss": 0.81569481, + "learning_rate": 3.95633837685665e-06, + "loss": 0.83893329, + "num_input_tokens_seen": 33784325, + "step": 1574, + "time_per_iteration": 5.562394857406616 + }, + { + "auxiliary_loss_clip": 0.01154569, + "auxiliary_loss_mlp": 0.01171705, + "balance_loss_clip": 1.00227499, + "balance_loss_mlp": 1.00156927, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 1.6525803228047464, + "language_loss": 0.80823267, + "learning_rate": 3.95625740569284e-06, + "loss": 0.8314954, + "num_input_tokens_seen": 33802510, + "step": 1575, + "time_per_iteration": 2.6239144802093506 + }, + { + "auxiliary_loss_clip": 0.01184407, + "auxiliary_loss_mlp": 0.01171865, + "balance_loss_clip": 1.00231004, + "balance_loss_mlp": 1.00163388, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 1.8270434761830252, + "language_loss": 0.8641156, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88767827, + "num_input_tokens_seen": 33819980, + "step": 1576, + "time_per_iteration": 2.6041293144226074 + }, + { + "auxiliary_loss_clip": 0.01150804, + "auxiliary_loss_mlp": 0.01165571, + "balance_loss_clip": 1.00267839, + "balance_loss_mlp": 1.00001287, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 1.0169786931556735, + "language_loss": 0.65877736, + "learning_rate": 3.956095240823862e-06, + "loss": 0.68194115, + "num_input_tokens_seen": 33878925, + "step": 1577, + "time_per_iteration": 3.162278652191162 + }, + { + "auxiliary_loss_clip": 0.01135262, + "auxiliary_loss_mlp": 0.01171339, + "balance_loss_clip": 1.00216818, + "balance_loss_mlp": 1.00120354, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 1.8791837555196451, + "language_loss": 0.79450846, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81757444, + "num_input_tokens_seen": 33897600, + "step": 1578, + "time_per_iteration": 2.694824695587158 + }, + { + "auxiliary_loss_clip": 0.01184433, + "auxiliary_loss_mlp": 0.01171546, + "balance_loss_clip": 1.00221205, + "balance_loss_mlp": 1.00150609, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.5642875571755566, + "language_loss": 0.77886271, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80242252, + "num_input_tokens_seen": 33917365, + "step": 1579, + "time_per_iteration": 2.572633743286133 + }, + { + "auxiliary_loss_clip": 0.01136092, + "auxiliary_loss_mlp": 0.01171494, + "balance_loss_clip": 1.00205386, + "balance_loss_mlp": 1.0014534, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 3.9897933476859184, + "language_loss": 0.73173749, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75481331, + "num_input_tokens_seen": 33936680, + "step": 1580, + "time_per_iteration": 2.698341131210327 + }, + { + "auxiliary_loss_clip": 0.01150998, + "auxiliary_loss_mlp": 0.01171037, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00118768, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 1.7747471111613777, + "language_loss": 0.77522755, + "learning_rate": 3.955770021006627e-06, + "loss": 0.79844797, + "num_input_tokens_seen": 33960685, + "step": 1581, + "time_per_iteration": 2.7324352264404297 + }, + { + "auxiliary_loss_clip": 0.01138374, + "auxiliary_loss_mlp": 0.01171225, + "balance_loss_clip": 1.00212348, + "balance_loss_mlp": 1.00118494, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 1.7984619346187434, + "language_loss": 0.86724961, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89034557, + "num_input_tokens_seen": 33980015, + "step": 1582, + "time_per_iteration": 2.6870431900024414 + }, + { + "auxiliary_loss_clip": 0.01168186, + "auxiliary_loss_mlp": 0.01171653, + "balance_loss_clip": 1.00222611, + "balance_loss_mlp": 1.00142252, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 2.1675278823384256, + "language_loss": 0.66852558, + "learning_rate": 3.955606966107699e-06, + "loss": 0.69192398, + "num_input_tokens_seen": 33997705, + "step": 1583, + "time_per_iteration": 2.5768930912017822 + }, + { + "auxiliary_loss_clip": 0.01167878, + "auxiliary_loss_mlp": 0.01171396, + "balance_loss_clip": 1.00227058, + "balance_loss_mlp": 1.00106955, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 2.52635360835494, + "language_loss": 0.70871556, + "learning_rate": 3.95552532742147e-06, + "loss": 0.73210835, + "num_input_tokens_seen": 34017465, + "step": 1584, + "time_per_iteration": 2.6728763580322266 + }, + { + "auxiliary_loss_clip": 0.01119257, + "auxiliary_loss_mlp": 0.01171166, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00131702, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.4781975188493137, + "language_loss": 0.80914259, + "learning_rate": 3.955443614581525e-06, + "loss": 0.83204687, + "num_input_tokens_seen": 34038550, + "step": 1585, + "time_per_iteration": 2.7438011169433594 + }, + { + "auxiliary_loss_clip": 0.01152249, + "auxiliary_loss_mlp": 0.0117155, + "balance_loss_clip": 1.00213814, + "balance_loss_mlp": 1.00122356, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.9924683489608566, + "language_loss": 0.72113013, + "learning_rate": 3.955361827590961e-06, + "loss": 0.74436808, + "num_input_tokens_seen": 34058665, + "step": 1586, + "time_per_iteration": 2.6770095825195312 + }, + { + "auxiliary_loss_clip": 0.0113334, + "auxiliary_loss_mlp": 0.01165615, + "balance_loss_clip": 1.00232112, + "balance_loss_mlp": 1.00005722, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8121421876016777, + "language_loss": 0.55429858, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57728815, + "num_input_tokens_seen": 34109655, + "step": 1587, + "time_per_iteration": 3.0553195476531982 + }, + { + "auxiliary_loss_clip": 0.01136088, + "auxiliary_loss_mlp": 0.01171174, + "balance_loss_clip": 1.00212228, + "balance_loss_mlp": 1.0013243, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.8418735978380252, + "language_loss": 0.81419849, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83727109, + "num_input_tokens_seen": 34131115, + "step": 1588, + "time_per_iteration": 2.708181619644165 + }, + { + "auxiliary_loss_clip": 0.01122453, + "auxiliary_loss_mlp": 0.01171304, + "balance_loss_clip": 1.00226188, + "balance_loss_mlp": 1.0012641, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 1.482940648102264, + "language_loss": 0.81672305, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83966064, + "num_input_tokens_seen": 34151925, + "step": 1589, + "time_per_iteration": 2.722142457962036 + }, + { + "auxiliary_loss_clip": 0.01118963, + "auxiliary_loss_mlp": 0.00749445, + "balance_loss_clip": 1.00190473, + "balance_loss_mlp": 1.00039363, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.367917530862161, + "language_loss": 0.64758343, + "learning_rate": 3.955033938184601e-06, + "loss": 0.66626751, + "num_input_tokens_seen": 34175395, + "step": 1590, + "time_per_iteration": 2.8661718368530273 + }, + { + "auxiliary_loss_clip": 0.01135017, + "auxiliary_loss_mlp": 0.01171218, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00136852, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.574301658207124, + "language_loss": 0.82924426, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85230654, + "num_input_tokens_seen": 34197760, + "step": 1591, + "time_per_iteration": 2.746518135070801 + }, + { + "auxiliary_loss_clip": 0.01151513, + "auxiliary_loss_mlp": 0.01171719, + "balance_loss_clip": 1.00220191, + "balance_loss_mlp": 1.0013926, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.318819705905935, + "language_loss": 0.74243343, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76566577, + "num_input_tokens_seen": 34215330, + "step": 1592, + "time_per_iteration": 2.6270222663879395 + }, + { + "auxiliary_loss_clip": 0.01168175, + "auxiliary_loss_mlp": 0.01171059, + "balance_loss_clip": 1.00211573, + "balance_loss_mlp": 1.00120974, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.7632690939527438, + "language_loss": 0.74179745, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76518977, + "num_input_tokens_seen": 34237745, + "step": 1593, + "time_per_iteration": 2.6413068771362305 + }, + { + "auxiliary_loss_clip": 0.01168479, + "auxiliary_loss_mlp": 0.0117108, + "balance_loss_clip": 1.00229478, + "balance_loss_mlp": 1.00142169, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.7335355811493782, + "language_loss": 0.69526494, + "learning_rate": 3.954704862616971e-06, + "loss": 0.71866059, + "num_input_tokens_seen": 34256565, + "step": 1594, + "time_per_iteration": 2.602349281311035 + }, + { + "auxiliary_loss_clip": 0.01167496, + "auxiliary_loss_mlp": 0.01171083, + "balance_loss_clip": 1.00213885, + "balance_loss_mlp": 1.00132942, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.3094247784212936, + "language_loss": 0.82083732, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84422314, + "num_input_tokens_seen": 34275970, + "step": 1595, + "time_per_iteration": 2.568870782852173 + }, + { + "auxiliary_loss_clip": 0.0115202, + "auxiliary_loss_mlp": 0.01170629, + "balance_loss_clip": 1.00217414, + "balance_loss_mlp": 1.00106561, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 1.9091202791169353, + "language_loss": 0.84646821, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86969471, + "num_input_tokens_seen": 34295490, + "step": 1596, + "time_per_iteration": 2.613081216812134 + }, + { + "auxiliary_loss_clip": 0.01168445, + "auxiliary_loss_mlp": 0.0117129, + "balance_loss_clip": 1.00227106, + "balance_loss_mlp": 1.00125015, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 1.9043127579284866, + "language_loss": 0.69174922, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71514654, + "num_input_tokens_seen": 34319990, + "step": 1597, + "time_per_iteration": 2.732538938522339 + }, + { + "auxiliary_loss_clip": 0.0116826, + "auxiliary_loss_mlp": 0.00749447, + "balance_loss_clip": 1.00209904, + "balance_loss_mlp": 1.00045168, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.1479134825724624, + "language_loss": 0.74952483, + "learning_rate": 3.954374601087729e-06, + "loss": 0.76870185, + "num_input_tokens_seen": 34339225, + "step": 1598, + "time_per_iteration": 2.618830919265747 + }, + { + "auxiliary_loss_clip": 0.01167745, + "auxiliary_loss_mlp": 0.01171236, + "balance_loss_clip": 1.00225627, + "balance_loss_mlp": 1.00129175, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6441542022589315, + "language_loss": 0.69096309, + "learning_rate": 3.954291850422382e-06, + "loss": 0.71435297, + "num_input_tokens_seen": 34361020, + "step": 1599, + "time_per_iteration": 2.726163625717163 + }, + { + "auxiliary_loss_clip": 0.01135509, + "auxiliary_loss_mlp": 0.01170974, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00122023, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.3621770972535128, + "language_loss": 0.84515691, + "learning_rate": 3.954209025650093e-06, + "loss": 0.8682217, + "num_input_tokens_seen": 34378630, + "step": 1600, + "time_per_iteration": 2.686235189437866 + }, + { + "auxiliary_loss_clip": 0.01151576, + "auxiliary_loss_mlp": 0.01171005, + "balance_loss_clip": 1.00210094, + "balance_loss_mlp": 1.00134635, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 2.233793315932642, + "language_loss": 0.80605304, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82927883, + "num_input_tokens_seen": 34397110, + "step": 1601, + "time_per_iteration": 2.6448776721954346 + }, + { + "auxiliary_loss_clip": 0.01168509, + "auxiliary_loss_mlp": 0.01171307, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00126731, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.698678352591149, + "language_loss": 0.82461637, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84801459, + "num_input_tokens_seen": 34414165, + "step": 1602, + "time_per_iteration": 2.642425298690796 + }, + { + "auxiliary_loss_clip": 0.01135608, + "auxiliary_loss_mlp": 0.01170831, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.00117254, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.925898272773512, + "language_loss": 0.62581474, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64887911, + "num_input_tokens_seen": 34434445, + "step": 1603, + "time_per_iteration": 2.6805789470672607 + }, + { + "auxiliary_loss_clip": 0.01184291, + "auxiliary_loss_mlp": 0.01170989, + "balance_loss_clip": 1.00229514, + "balance_loss_mlp": 1.00104427, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.2731520422440075, + "language_loss": 0.71183574, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73538858, + "num_input_tokens_seen": 34453095, + "step": 1604, + "time_per_iteration": 2.586245536804199 + }, + { + "auxiliary_loss_clip": 0.0116797, + "auxiliary_loss_mlp": 0.01170933, + "balance_loss_clip": 1.00229573, + "balance_loss_mlp": 1.00146544, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.2118674643251337, + "language_loss": 0.79987925, + "learning_rate": 3.953793790294527e-06, + "loss": 0.82326829, + "num_input_tokens_seen": 34473680, + "step": 1605, + "time_per_iteration": 2.639660596847534 + }, + { + "auxiliary_loss_clip": 0.0115121, + "auxiliary_loss_mlp": 0.01170806, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00105238, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 1.99480729669537, + "language_loss": 0.74146271, + "learning_rate": 3.953710520946634e-06, + "loss": 0.76468289, + "num_input_tokens_seen": 34492610, + "step": 1606, + "time_per_iteration": 2.6675353050231934 + }, + { + "auxiliary_loss_clip": 0.01167783, + "auxiliary_loss_mlp": 0.01170867, + "balance_loss_clip": 1.00206089, + "balance_loss_mlp": 1.00139928, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.7795780556058267, + "language_loss": 0.75516832, + "learning_rate": 3.953627177513843e-06, + "loss": 0.7785548, + "num_input_tokens_seen": 34511855, + "step": 1607, + "time_per_iteration": 4.0514075756073 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.01170812, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00124836, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 1.7966337964881187, + "language_loss": 0.86922818, + "learning_rate": 3.953543759999312e-06, + "loss": 0.89229238, + "num_input_tokens_seen": 34528905, + "step": 1608, + "time_per_iteration": 2.703498363494873 + }, + { + "auxiliary_loss_clip": 0.01102447, + "auxiliary_loss_mlp": 0.01171397, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00135708, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.0180235157520072, + "language_loss": 0.71470952, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73744798, + "num_input_tokens_seen": 34548480, + "step": 1609, + "time_per_iteration": 2.8548097610473633 + }, + { + "auxiliary_loss_clip": 0.01134303, + "auxiliary_loss_mlp": 0.01171059, + "balance_loss_clip": 1.00193298, + "balance_loss_mlp": 1.00140023, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 2.088735616699872, + "language_loss": 0.84765911, + "learning_rate": 3.953376702737693e-06, + "loss": 0.8707127, + "num_input_tokens_seen": 34565410, + "step": 1610, + "time_per_iteration": 2.6674320697784424 + }, + { + "auxiliary_loss_clip": 0.01151141, + "auxiliary_loss_mlp": 0.01170647, + "balance_loss_clip": 1.0020014, + "balance_loss_mlp": 1.00117934, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.2649930650753216, + "language_loss": 0.67220873, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69542658, + "num_input_tokens_seen": 34584840, + "step": 1611, + "time_per_iteration": 4.095090866088867 + }, + { + "auxiliary_loss_clip": 0.01119884, + "auxiliary_loss_mlp": 0.01170766, + "balance_loss_clip": 1.00206459, + "balance_loss_mlp": 1.00120258, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.669457570319491, + "language_loss": 0.81239665, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83530319, + "num_input_tokens_seen": 34603360, + "step": 1612, + "time_per_iteration": 4.148070812225342 + }, + { + "auxiliary_loss_clip": 0.01168433, + "auxiliary_loss_mlp": 0.01171217, + "balance_loss_clip": 1.00232422, + "balance_loss_mlp": 1.00136781, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.0528402264014773, + "language_loss": 0.81043649, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83383298, + "num_input_tokens_seen": 34620760, + "step": 1613, + "time_per_iteration": 2.5492238998413086 + }, + { + "auxiliary_loss_clip": 0.01137872, + "auxiliary_loss_mlp": 0.01170791, + "balance_loss_clip": 1.00201774, + "balance_loss_mlp": 1.00122786, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 1.9116177178725167, + "language_loss": 0.8443073, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86739397, + "num_input_tokens_seen": 34640695, + "step": 1614, + "time_per_iteration": 2.693758964538574 + }, + { + "auxiliary_loss_clip": 0.01166077, + "auxiliary_loss_mlp": 0.00748992, + "balance_loss_clip": 1.00293207, + "balance_loss_mlp": 0.99979478, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.6985481482578334, + "language_loss": 0.54654831, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56569898, + "num_input_tokens_seen": 34702395, + "step": 1615, + "time_per_iteration": 3.134431838989258 + }, + { + "auxiliary_loss_clip": 0.01117862, + "auxiliary_loss_mlp": 0.01164913, + "balance_loss_clip": 1.00262082, + "balance_loss_mlp": 1.00011849, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7632984642376406, + "language_loss": 0.5824331, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60526085, + "num_input_tokens_seen": 34768910, + "step": 1616, + "time_per_iteration": 3.6467766761779785 + }, + { + "auxiliary_loss_clip": 0.01152622, + "auxiliary_loss_mlp": 0.01170694, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00122595, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 2.3177545477895976, + "language_loss": 0.69029367, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71352679, + "num_input_tokens_seen": 34787680, + "step": 1617, + "time_per_iteration": 2.8599367141723633 + }, + { + "auxiliary_loss_clip": 0.01135395, + "auxiliary_loss_mlp": 0.01170653, + "balance_loss_clip": 1.00195026, + "balance_loss_mlp": 1.00099409, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.8995617343257294, + "language_loss": 0.80469275, + "learning_rate": 3.952705511055698e-06, + "loss": 0.82775319, + "num_input_tokens_seen": 34808330, + "step": 1618, + "time_per_iteration": 2.763612985610962 + }, + { + "auxiliary_loss_clip": 0.01151623, + "auxiliary_loss_mlp": 0.01170185, + "balance_loss_clip": 1.00207829, + "balance_loss_mlp": 1.00109899, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.6881762115055035, + "language_loss": 0.92965573, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95287377, + "num_input_tokens_seen": 34830020, + "step": 1619, + "time_per_iteration": 2.718299388885498 + }, + { + "auxiliary_loss_clip": 0.01168061, + "auxiliary_loss_mlp": 0.01170528, + "balance_loss_clip": 1.00229287, + "balance_loss_mlp": 1.00144184, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 2.112141211050264, + "language_loss": 0.88942033, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.91280621, + "num_input_tokens_seen": 34850330, + "step": 1620, + "time_per_iteration": 2.7115046977996826 + }, + { + "auxiliary_loss_clip": 0.01152222, + "auxiliary_loss_mlp": 0.01170815, + "balance_loss_clip": 1.00215054, + "balance_loss_mlp": 1.0012517, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 4.795352796775052, + "language_loss": 0.77570403, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79893446, + "num_input_tokens_seen": 34871640, + "step": 1621, + "time_per_iteration": 2.6544952392578125 + }, + { + "auxiliary_loss_clip": 0.01135558, + "auxiliary_loss_mlp": 0.01170453, + "balance_loss_clip": 1.00206423, + "balance_loss_mlp": 1.00127137, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 2.024618698349522, + "language_loss": 0.78148425, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80454445, + "num_input_tokens_seen": 34888100, + "step": 1622, + "time_per_iteration": 2.6371490955352783 + }, + { + "auxiliary_loss_clip": 0.01151849, + "auxiliary_loss_mlp": 0.0117057, + "balance_loss_clip": 1.00225377, + "balance_loss_mlp": 1.00100732, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 2.610762602951364, + "language_loss": 0.85835695, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88158113, + "num_input_tokens_seen": 34910485, + "step": 1623, + "time_per_iteration": 2.726370334625244 + }, + { + "auxiliary_loss_clip": 0.01167759, + "auxiliary_loss_mlp": 0.01170478, + "balance_loss_clip": 1.00221038, + "balance_loss_mlp": 1.00120103, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.2596395361243204, + "language_loss": 0.80305505, + "learning_rate": 3.952199007240184e-06, + "loss": 0.82643735, + "num_input_tokens_seen": 34928615, + "step": 1624, + "time_per_iteration": 2.5947868824005127 + }, + { + "auxiliary_loss_clip": 0.01168131, + "auxiliary_loss_mlp": 0.0117015, + "balance_loss_clip": 1.00217891, + "balance_loss_mlp": 1.00096798, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.2846545385749955, + "language_loss": 0.85874987, + "learning_rate": 3.952114330822364e-06, + "loss": 0.88213265, + "num_input_tokens_seen": 34946045, + "step": 1625, + "time_per_iteration": 2.562417984008789 + }, + { + "auxiliary_loss_clip": 0.01168255, + "auxiliary_loss_mlp": 0.01170687, + "balance_loss_clip": 1.00226045, + "balance_loss_mlp": 1.00121868, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 2.470662136191208, + "language_loss": 0.85796964, + "learning_rate": 3.952029580380172e-06, + "loss": 0.8813591, + "num_input_tokens_seen": 34962865, + "step": 1626, + "time_per_iteration": 2.6453635692596436 + }, + { + "auxiliary_loss_clip": 0.01168354, + "auxiliary_loss_mlp": 0.00749364, + "balance_loss_clip": 1.00220418, + "balance_loss_mlp": 1.00035381, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 2.052978628239654, + "language_loss": 0.83320385, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85238099, + "num_input_tokens_seen": 34983505, + "step": 1627, + "time_per_iteration": 2.621853828430176 + }, + { + "auxiliary_loss_clip": 0.01167425, + "auxiliary_loss_mlp": 0.01170481, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00120366, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.9493785164796442, + "language_loss": 0.84283388, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86621296, + "num_input_tokens_seen": 35001825, + "step": 1628, + "time_per_iteration": 2.6091480255126953 + }, + { + "auxiliary_loss_clip": 0.01167835, + "auxiliary_loss_mlp": 0.0117053, + "balance_loss_clip": 1.00218165, + "balance_loss_mlp": 1.00115752, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.4806019943224396, + "language_loss": 0.7589401, + "learning_rate": 3.951774884939523e-06, + "loss": 0.78232372, + "num_input_tokens_seen": 35023075, + "step": 1629, + "time_per_iteration": 2.5977423191070557 + }, + { + "auxiliary_loss_clip": 0.01118245, + "auxiliary_loss_mlp": 0.01170612, + "balance_loss_clip": 1.0020206, + "balance_loss_mlp": 1.00133514, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 2.159969755055667, + "language_loss": 0.781757, + "learning_rate": 3.951689838432013e-06, + "loss": 0.8046456, + "num_input_tokens_seen": 35043480, + "step": 1630, + "time_per_iteration": 2.734475612640381 + }, + { + "auxiliary_loss_clip": 0.01151561, + "auxiliary_loss_mlp": 0.01170671, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.00120318, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.837785731576855, + "language_loss": 0.8644048, + "learning_rate": 3.951604717916228e-06, + "loss": 0.88762712, + "num_input_tokens_seen": 35061490, + "step": 1631, + "time_per_iteration": 2.5667147636413574 + }, + { + "auxiliary_loss_clip": 0.01151949, + "auxiliary_loss_mlp": 0.01170705, + "balance_loss_clip": 1.00216568, + "balance_loss_mlp": 1.00123715, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 1.8851121856892172, + "language_loss": 0.83174336, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85496998, + "num_input_tokens_seen": 35079670, + "step": 1632, + "time_per_iteration": 2.640533208847046 + }, + { + "auxiliary_loss_clip": 0.0113493, + "auxiliary_loss_mlp": 0.01170663, + "balance_loss_clip": 1.00213742, + "balance_loss_mlp": 1.00129044, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.7027254320550789, + "language_loss": 0.78726345, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81031942, + "num_input_tokens_seen": 35099205, + "step": 1633, + "time_per_iteration": 2.640505075454712 + }, + { + "auxiliary_loss_clip": 0.01167929, + "auxiliary_loss_mlp": 0.01170561, + "balance_loss_clip": 1.00219846, + "balance_loss_mlp": 1.00128436, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 1.9574588575030323, + "language_loss": 0.73432153, + "learning_rate": 3.951348912351521e-06, + "loss": 0.7577064, + "num_input_tokens_seen": 35115270, + "step": 1634, + "time_per_iteration": 2.5847249031066895 + }, + { + "auxiliary_loss_clip": 0.01150969, + "auxiliary_loss_mlp": 0.01170592, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00131512, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.7563232441193355, + "language_loss": 0.72861028, + "learning_rate": 3.951263495834947e-06, + "loss": 0.75182593, + "num_input_tokens_seen": 35134065, + "step": 1635, + "time_per_iteration": 2.6073596477508545 + }, + { + "auxiliary_loss_clip": 0.01151578, + "auxiliary_loss_mlp": 0.01170723, + "balance_loss_clip": 1.00224066, + "balance_loss_mlp": 1.00135088, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.896263397942047, + "language_loss": 0.78165007, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80487311, + "num_input_tokens_seen": 35154870, + "step": 1636, + "time_per_iteration": 2.671980142593384 + }, + { + "auxiliary_loss_clip": 0.01154327, + "auxiliary_loss_mlp": 0.01170894, + "balance_loss_clip": 1.00223649, + "balance_loss_mlp": 1.00123501, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 1.9357832440606288, + "language_loss": 0.69845611, + "learning_rate": 3.951092440828715e-06, + "loss": 0.7217083, + "num_input_tokens_seen": 35171850, + "step": 1637, + "time_per_iteration": 2.583770275115967 + }, + { + "auxiliary_loss_clip": 0.01184066, + "auxiliary_loss_mlp": 0.01170659, + "balance_loss_clip": 1.00237203, + "balance_loss_mlp": 1.00147724, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.1609897742145523, + "language_loss": 0.77061307, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79416031, + "num_input_tokens_seen": 35188795, + "step": 1638, + "time_per_iteration": 2.552795171737671 + }, + { + "auxiliary_loss_clip": 0.01134409, + "auxiliary_loss_mlp": 0.01170269, + "balance_loss_clip": 1.00207162, + "balance_loss_mlp": 1.00099182, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 4.452028703617176, + "language_loss": 0.72492683, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74797362, + "num_input_tokens_seen": 35212100, + "step": 1639, + "time_per_iteration": 2.7552332878112793 + }, + { + "auxiliary_loss_clip": 0.01167177, + "auxiliary_loss_mlp": 0.01170289, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00082111, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.7110493882690792, + "language_loss": 0.88821459, + "learning_rate": 3.950835303435337e-06, + "loss": 0.91158926, + "num_input_tokens_seen": 35230390, + "step": 1640, + "time_per_iteration": 2.6106865406036377 + }, + { + "auxiliary_loss_clip": 0.01170938, + "auxiliary_loss_mlp": 0.0117032, + "balance_loss_clip": 1.00239921, + "balance_loss_mlp": 1.00094795, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 1.9384614099053792, + "language_loss": 0.81033355, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83374619, + "num_input_tokens_seen": 35250405, + "step": 1641, + "time_per_iteration": 2.573725938796997 + }, + { + "auxiliary_loss_clip": 0.01167446, + "auxiliary_loss_mlp": 0.01170481, + "balance_loss_clip": 1.00215518, + "balance_loss_mlp": 1.00110817, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 3.4208042163529218, + "language_loss": 0.86162019, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88499951, + "num_input_tokens_seen": 35262820, + "step": 1642, + "time_per_iteration": 2.5538556575775146 + }, + { + "auxiliary_loss_clip": 0.011346, + "auxiliary_loss_mlp": 0.01170667, + "balance_loss_clip": 1.00200915, + "balance_loss_mlp": 1.00129437, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.6579222678044294, + "language_loss": 0.80957997, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83263266, + "num_input_tokens_seen": 35284490, + "step": 1643, + "time_per_iteration": 2.724242687225342 + }, + { + "auxiliary_loss_clip": 0.01170879, + "auxiliary_loss_mlp": 0.01170733, + "balance_loss_clip": 1.00243115, + "balance_loss_mlp": 1.00145626, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.810451364811339, + "language_loss": 0.8245616, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84797776, + "num_input_tokens_seen": 35302815, + "step": 1644, + "time_per_iteration": 2.571277379989624 + }, + { + "auxiliary_loss_clip": 0.01167848, + "auxiliary_loss_mlp": 0.00749312, + "balance_loss_clip": 1.00229311, + "balance_loss_mlp": 1.0003376, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.728630732014082, + "language_loss": 0.68592668, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70509833, + "num_input_tokens_seen": 35321175, + "step": 1645, + "time_per_iteration": 3.9969897270202637 + }, + { + "auxiliary_loss_clip": 0.01149806, + "auxiliary_loss_mlp": 0.01164857, + "balance_loss_clip": 1.00300229, + "balance_loss_mlp": 1.00006187, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.8424542097038198, + "language_loss": 0.60889399, + "learning_rate": 3.950319031388119e-06, + "loss": 0.63204062, + "num_input_tokens_seen": 35381740, + "step": 1646, + "time_per_iteration": 3.148066282272339 + }, + { + "auxiliary_loss_clip": 0.01138239, + "auxiliary_loss_mlp": 0.0117048, + "balance_loss_clip": 1.00219393, + "balance_loss_mlp": 1.0012027, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 1.7475920957221818, + "language_loss": 0.73455173, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75763887, + "num_input_tokens_seen": 35403760, + "step": 1647, + "time_per_iteration": 2.6984996795654297 + }, + { + "auxiliary_loss_clip": 0.01135737, + "auxiliary_loss_mlp": 0.01170704, + "balance_loss_clip": 1.00223804, + "balance_loss_mlp": 1.00161743, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 2.030400792349347, + "language_loss": 0.84130776, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86437219, + "num_input_tokens_seen": 35424050, + "step": 1648, + "time_per_iteration": 2.759220600128174 + }, + { + "auxiliary_loss_clip": 0.01166406, + "auxiliary_loss_mlp": 0.01164838, + "balance_loss_clip": 1.00323904, + "balance_loss_mlp": 1.0000428, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7308844750762044, + "language_loss": 0.55684483, + "learning_rate": 3.950059896910473e-06, + "loss": 0.58015728, + "num_input_tokens_seen": 35481690, + "step": 1649, + "time_per_iteration": 4.444530248641968 + }, + { + "auxiliary_loss_clip": 0.0116722, + "auxiliary_loss_mlp": 0.01169701, + "balance_loss_clip": 1.00212896, + "balance_loss_mlp": 1.00080514, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.0323307827609205, + "language_loss": 0.901039, + "learning_rate": 3.949973370853954e-06, + "loss": 0.9244082, + "num_input_tokens_seen": 35498635, + "step": 1650, + "time_per_iteration": 4.140197515487671 + }, + { + "auxiliary_loss_clip": 0.01118242, + "auxiliary_loss_mlp": 0.00749094, + "balance_loss_clip": 1.00303376, + "balance_loss_mlp": 0.99992734, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.7935781387137881, + "language_loss": 0.6370219, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65569532, + "num_input_tokens_seen": 35565720, + "step": 1651, + "time_per_iteration": 3.5499212741851807 + }, + { + "auxiliary_loss_clip": 0.01167209, + "auxiliary_loss_mlp": 0.0117024, + "balance_loss_clip": 1.00221407, + "balance_loss_mlp": 1.00143933, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.6259266913489452, + "language_loss": 0.88274044, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90611494, + "num_input_tokens_seen": 35586000, + "step": 1652, + "time_per_iteration": 3.1077940464019775 + }, + { + "auxiliary_loss_clip": 0.01150793, + "auxiliary_loss_mlp": 0.01169977, + "balance_loss_clip": 1.00220704, + "balance_loss_mlp": 1.00117636, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.765513993069911, + "language_loss": 0.82028604, + "learning_rate": 3.949713349038422e-06, + "loss": 0.8434937, + "num_input_tokens_seen": 35604355, + "step": 1653, + "time_per_iteration": 2.5874788761138916 + }, + { + "auxiliary_loss_clip": 0.01167375, + "auxiliary_loss_mlp": 0.00749387, + "balance_loss_clip": 1.00220478, + "balance_loss_mlp": 1.00052822, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 1.8979996081687094, + "language_loss": 0.79362601, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81279361, + "num_input_tokens_seen": 35625495, + "step": 1654, + "time_per_iteration": 2.6308600902557373 + }, + { + "auxiliary_loss_clip": 0.01183658, + "auxiliary_loss_mlp": 0.01169858, + "balance_loss_clip": 1.00242734, + "balance_loss_mlp": 1.00134373, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.5590946887559554, + "language_loss": 0.8134616, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83699679, + "num_input_tokens_seen": 35645030, + "step": 1655, + "time_per_iteration": 2.5181071758270264 + }, + { + "auxiliary_loss_clip": 0.01183676, + "auxiliary_loss_mlp": 0.01169715, + "balance_loss_clip": 1.0022397, + "balance_loss_mlp": 1.0010103, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 2.3802396430219015, + "language_loss": 0.80724359, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83077753, + "num_input_tokens_seen": 35664305, + "step": 1656, + "time_per_iteration": 2.5751137733459473 + }, + { + "auxiliary_loss_clip": 0.01167005, + "auxiliary_loss_mlp": 0.0117034, + "balance_loss_clip": 1.0022229, + "balance_loss_mlp": 1.00153995, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 4.776864577759953, + "language_loss": 0.88900888, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91238236, + "num_input_tokens_seen": 35684060, + "step": 1657, + "time_per_iteration": 2.5487096309661865 + }, + { + "auxiliary_loss_clip": 0.01150698, + "auxiliary_loss_mlp": 0.01170705, + "balance_loss_clip": 1.0020951, + "balance_loss_mlp": 1.00152326, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.3979469126946977, + "language_loss": 0.85714656, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.8803606, + "num_input_tokens_seen": 35703250, + "step": 1658, + "time_per_iteration": 2.6107473373413086 + }, + { + "auxiliary_loss_clip": 0.01182152, + "auxiliary_loss_mlp": 0.0116404, + "balance_loss_clip": 1.00305951, + "balance_loss_mlp": 1.00000823, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9086376061149665, + "language_loss": 0.60775912, + "learning_rate": 3.949191309296585e-06, + "loss": 0.63122106, + "num_input_tokens_seen": 35762165, + "step": 1659, + "time_per_iteration": 3.1377944946289062 + }, + { + "auxiliary_loss_clip": 0.01151395, + "auxiliary_loss_mlp": 0.01169907, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00139332, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8049764169422433, + "language_loss": 0.85755014, + "learning_rate": 3.949104043956321e-06, + "loss": 0.88076311, + "num_input_tokens_seen": 35781520, + "step": 1660, + "time_per_iteration": 2.6539418697357178 + }, + { + "auxiliary_loss_clip": 0.01151245, + "auxiliary_loss_mlp": 0.01170282, + "balance_loss_clip": 1.00238431, + "balance_loss_mlp": 1.00148106, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 1.8612410104123176, + "language_loss": 0.80295694, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82617217, + "num_input_tokens_seen": 35799565, + "step": 1661, + "time_per_iteration": 2.5905728340148926 + }, + { + "auxiliary_loss_clip": 0.01151729, + "auxiliary_loss_mlp": 0.01170027, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.0011313, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 1.7389044769867381, + "language_loss": 0.83983994, + "learning_rate": 3.948929291548443e-06, + "loss": 0.86305749, + "num_input_tokens_seen": 35821085, + "step": 1662, + "time_per_iteration": 2.6773598194122314 + }, + { + "auxiliary_loss_clip": 0.01152148, + "auxiliary_loss_mlp": 0.0116963, + "balance_loss_clip": 1.00224507, + "balance_loss_mlp": 1.00130677, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 1.9373628216676835, + "language_loss": 0.89150935, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.91472721, + "num_input_tokens_seen": 35839840, + "step": 1663, + "time_per_iteration": 2.589557409286499 + }, + { + "auxiliary_loss_clip": 0.01167548, + "auxiliary_loss_mlp": 0.0117003, + "balance_loss_clip": 1.00224209, + "balance_loss_mlp": 1.00123, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.6087302989937404, + "language_loss": 0.70217252, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72554833, + "num_input_tokens_seen": 35861545, + "step": 1664, + "time_per_iteration": 2.624987840652466 + }, + { + "auxiliary_loss_clip": 0.01134251, + "auxiliary_loss_mlp": 0.01170113, + "balance_loss_clip": 1.00190413, + "balance_loss_mlp": 1.00112152, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 1.9017636959047632, + "language_loss": 0.78575623, + "learning_rate": 3.94866660866797e-06, + "loss": 0.80879986, + "num_input_tokens_seen": 35878295, + "step": 1665, + "time_per_iteration": 2.593137264251709 + }, + { + "auxiliary_loss_clip": 0.01167339, + "auxiliary_loss_mlp": 0.01170105, + "balance_loss_clip": 1.00238097, + "balance_loss_mlp": 1.00159097, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.63625599819125, + "language_loss": 0.7010377, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72441208, + "num_input_tokens_seen": 35898990, + "step": 1666, + "time_per_iteration": 2.610501527786255 + }, + { + "auxiliary_loss_clip": 0.01088955, + "auxiliary_loss_mlp": 0.01170737, + "balance_loss_clip": 1.00198293, + "balance_loss_mlp": 1.00155473, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.8462080109422625, + "language_loss": 0.78807992, + "learning_rate": 3.948491117273956e-06, + "loss": 0.81067687, + "num_input_tokens_seen": 35916225, + "step": 1667, + "time_per_iteration": 2.7541391849517822 + }, + { + "auxiliary_loss_clip": 0.01151255, + "auxiliary_loss_mlp": 0.01169826, + "balance_loss_clip": 1.00220537, + "balance_loss_mlp": 1.00121629, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.212564508352791, + "language_loss": 0.775213, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79842377, + "num_input_tokens_seen": 35934630, + "step": 1668, + "time_per_iteration": 2.663238286972046 + }, + { + "auxiliary_loss_clip": 0.01183691, + "auxiliary_loss_mlp": 0.01170061, + "balance_loss_clip": 1.00241208, + "balance_loss_mlp": 1.00135624, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.730747534715526, + "language_loss": 0.7834301, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80696762, + "num_input_tokens_seen": 35953855, + "step": 1669, + "time_per_iteration": 2.627383232116699 + }, + { + "auxiliary_loss_clip": 0.01183808, + "auxiliary_loss_mlp": 0.0117069, + "balance_loss_clip": 1.0024116, + "balance_loss_mlp": 1.00169921, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.5783911501516115, + "language_loss": 0.85528147, + "learning_rate": 3.948227326038933e-06, + "loss": 0.8788265, + "num_input_tokens_seen": 35974555, + "step": 1670, + "time_per_iteration": 2.609419107437134 + }, + { + "auxiliary_loss_clip": 0.01183482, + "auxiliary_loss_mlp": 0.01169643, + "balance_loss_clip": 1.00233436, + "balance_loss_mlp": 1.00131965, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.4979494640785231, + "language_loss": 0.77106124, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.7945925, + "num_input_tokens_seen": 35996830, + "step": 1671, + "time_per_iteration": 2.6099696159362793 + }, + { + "auxiliary_loss_clip": 0.01166049, + "auxiliary_loss_mlp": 0.01164175, + "balance_loss_clip": 1.00285578, + "balance_loss_mlp": 1.00014329, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7649003755177625, + "language_loss": 0.60716248, + "learning_rate": 3.948051095825149e-06, + "loss": 0.63046467, + "num_input_tokens_seen": 36054465, + "step": 1672, + "time_per_iteration": 3.1386330127716064 + }, + { + "auxiliary_loss_clip": 0.01134302, + "auxiliary_loss_mlp": 0.01169894, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.00138009, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.597445762882888, + "language_loss": 0.77064538, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79368734, + "num_input_tokens_seen": 36073480, + "step": 1673, + "time_per_iteration": 2.6370534896850586 + }, + { + "auxiliary_loss_clip": 0.01134791, + "auxiliary_loss_mlp": 0.01169776, + "balance_loss_clip": 1.00202179, + "balance_loss_mlp": 1.00126183, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.474841498651074, + "language_loss": 0.73553568, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75858134, + "num_input_tokens_seen": 36091830, + "step": 1674, + "time_per_iteration": 2.63871693611145 + }, + { + "auxiliary_loss_clip": 0.01167053, + "auxiliary_loss_mlp": 0.00749366, + "balance_loss_clip": 1.00219321, + "balance_loss_mlp": 1.00053537, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 1.8898804224055292, + "language_loss": 0.79572523, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81488943, + "num_input_tokens_seen": 36111400, + "step": 1675, + "time_per_iteration": 2.6143181324005127 + }, + { + "auxiliary_loss_clip": 0.01183756, + "auxiliary_loss_mlp": 0.01170179, + "balance_loss_clip": 1.00246549, + "balance_loss_mlp": 1.00166428, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.2168374146097163, + "language_loss": 0.81510311, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83864242, + "num_input_tokens_seen": 36129345, + "step": 1676, + "time_per_iteration": 2.5795204639434814 + }, + { + "auxiliary_loss_clip": 0.01167287, + "auxiliary_loss_mlp": 0.01170122, + "balance_loss_clip": 1.00232434, + "balance_loss_mlp": 1.00132179, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 1.9594125847710662, + "language_loss": 0.8607406, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88411462, + "num_input_tokens_seen": 36146255, + "step": 1677, + "time_per_iteration": 2.572126865386963 + }, + { + "auxiliary_loss_clip": 0.01151231, + "auxiliary_loss_mlp": 0.01169907, + "balance_loss_clip": 1.0022974, + "balance_loss_mlp": 1.00148809, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 1.9870690161369788, + "language_loss": 0.86509538, + "learning_rate": 3.947520632403936e-06, + "loss": 0.8883068, + "num_input_tokens_seen": 36164050, + "step": 1678, + "time_per_iteration": 2.5914580821990967 + }, + { + "auxiliary_loss_clip": 0.01154169, + "auxiliary_loss_mlp": 0.01169612, + "balance_loss_clip": 1.00228822, + "balance_loss_mlp": 1.00128818, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.2181076111137545, + "language_loss": 0.89706653, + "learning_rate": 3.947431963338532e-06, + "loss": 0.9203043, + "num_input_tokens_seen": 36183530, + "step": 1679, + "time_per_iteration": 2.6357548236846924 + }, + { + "auxiliary_loss_clip": 0.01181995, + "auxiliary_loss_mlp": 0.01164069, + "balance_loss_clip": 1.00302958, + "balance_loss_mlp": 1.00003707, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7805340608688272, + "language_loss": 0.53016025, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55362087, + "num_input_tokens_seen": 36248550, + "step": 1680, + "time_per_iteration": 3.179088592529297 + }, + { + "auxiliary_loss_clip": 0.01183775, + "auxiliary_loss_mlp": 0.0074932, + "balance_loss_clip": 1.002437, + "balance_loss_mlp": 1.00046086, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.7372211244439426, + "language_loss": 0.76707268, + "learning_rate": 3.947254403670641e-06, + "loss": 0.78640366, + "num_input_tokens_seen": 36266065, + "step": 1681, + "time_per_iteration": 2.5386440753936768 + }, + { + "auxiliary_loss_clip": 0.01151289, + "auxiliary_loss_mlp": 0.01169851, + "balance_loss_clip": 1.00232744, + "balance_loss_mlp": 1.00124121, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.4572603307641043, + "language_loss": 0.94042069, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96363205, + "num_input_tokens_seen": 36280960, + "step": 1682, + "time_per_iteration": 2.600470781326294 + }, + { + "auxiliary_loss_clip": 0.01167508, + "auxiliary_loss_mlp": 0.01169535, + "balance_loss_clip": 1.00232804, + "balance_loss_mlp": 1.00111651, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 2.154786321215008, + "language_loss": 0.88014364, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90351409, + "num_input_tokens_seen": 36299010, + "step": 1683, + "time_per_iteration": 4.047745227813721 + }, + { + "auxiliary_loss_clip": 0.01135771, + "auxiliary_loss_mlp": 0.01169856, + "balance_loss_clip": 1.00218153, + "balance_loss_mlp": 1.00115073, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.7985370215008878, + "language_loss": 0.74815488, + "learning_rate": 3.946987510376624e-06, + "loss": 0.77121115, + "num_input_tokens_seen": 36318400, + "step": 1684, + "time_per_iteration": 2.7188005447387695 + }, + { + "auxiliary_loss_clip": 0.01149459, + "auxiliary_loss_mlp": 0.01164017, + "balance_loss_clip": 1.00291276, + "balance_loss_mlp": 0.99998492, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.7529270295882873, + "language_loss": 0.6105969, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.6337316, + "num_input_tokens_seen": 36381815, + "step": 1685, + "time_per_iteration": 3.259542465209961 + }, + { + "auxiliary_loss_clip": 0.01150745, + "auxiliary_loss_mlp": 0.01170091, + "balance_loss_clip": 1.00216746, + "balance_loss_mlp": 1.00138605, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.6336050936502002, + "language_loss": 0.61953914, + "learning_rate": 3.946809212358516e-06, + "loss": 0.64274746, + "num_input_tokens_seen": 36404320, + "step": 1686, + "time_per_iteration": 2.746516466140747 + }, + { + "auxiliary_loss_clip": 0.01134792, + "auxiliary_loss_mlp": 0.01169795, + "balance_loss_clip": 1.002195, + "balance_loss_mlp": 1.00147116, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 3.715620705305134, + "language_loss": 0.8121078, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83515358, + "num_input_tokens_seen": 36427510, + "step": 1687, + "time_per_iteration": 4.122025728225708 + }, + { + "auxiliary_loss_clip": 0.01170829, + "auxiliary_loss_mlp": 0.01170116, + "balance_loss_clip": 1.00257802, + "balance_loss_mlp": 1.00122058, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.7785952272858832, + "language_loss": 0.71968055, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74309003, + "num_input_tokens_seen": 36448230, + "step": 1688, + "time_per_iteration": 4.147178411483765 + }, + { + "auxiliary_loss_clip": 0.01151178, + "auxiliary_loss_mlp": 0.01170182, + "balance_loss_clip": 1.00231493, + "balance_loss_mlp": 1.00176311, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 1.9722060857565318, + "language_loss": 0.87328339, + "learning_rate": 3.94654121166582e-06, + "loss": 0.89649695, + "num_input_tokens_seen": 36464395, + "step": 1689, + "time_per_iteration": 2.6232898235321045 + }, + { + "auxiliary_loss_clip": 0.01167726, + "auxiliary_loss_mlp": 0.01169899, + "balance_loss_clip": 1.00218248, + "balance_loss_mlp": 1.00128984, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 2.4707359014485704, + "language_loss": 0.88132358, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90469974, + "num_input_tokens_seen": 36486475, + "step": 1690, + "time_per_iteration": 2.66546368598938 + }, + { + "auxiliary_loss_clip": 0.01150441, + "auxiliary_loss_mlp": 0.01169973, + "balance_loss_clip": 1.00203562, + "balance_loss_mlp": 1.00126839, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 1.9353868226996749, + "language_loss": 0.83607638, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85928047, + "num_input_tokens_seen": 36505310, + "step": 1691, + "time_per_iteration": 2.5916783809661865 + }, + { + "auxiliary_loss_clip": 0.01151249, + "auxiliary_loss_mlp": 0.01169844, + "balance_loss_clip": 1.00229144, + "balance_loss_mlp": 1.00132942, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.6672444103547568, + "language_loss": 0.66870439, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69191527, + "num_input_tokens_seen": 36529820, + "step": 1692, + "time_per_iteration": 2.7720329761505127 + }, + { + "auxiliary_loss_clip": 0.01151373, + "auxiliary_loss_mlp": 0.011704, + "balance_loss_clip": 1.00214851, + "balance_loss_mlp": 1.00178993, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.5481415076433918, + "language_loss": 0.75824189, + "learning_rate": 3.94618284404223e-06, + "loss": 0.78145957, + "num_input_tokens_seen": 36549000, + "step": 1693, + "time_per_iteration": 2.665444850921631 + }, + { + "auxiliary_loss_clip": 0.01135868, + "auxiliary_loss_mlp": 0.01169616, + "balance_loss_clip": 1.00222206, + "balance_loss_mlp": 1.00129223, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.864997876366511, + "language_loss": 0.87372607, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89678097, + "num_input_tokens_seen": 36567515, + "step": 1694, + "time_per_iteration": 2.716296672821045 + }, + { + "auxiliary_loss_clip": 0.01118305, + "auxiliary_loss_mlp": 0.01169902, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.00119662, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 2.9421590096618098, + "language_loss": 0.79570651, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81858855, + "num_input_tokens_seen": 36586190, + "step": 1695, + "time_per_iteration": 2.65417218208313 + }, + { + "auxiliary_loss_clip": 0.01117735, + "auxiliary_loss_mlp": 0.01169644, + "balance_loss_clip": 1.00189292, + "balance_loss_mlp": 1.00160646, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7056417897923315, + "language_loss": 0.86525255, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88812637, + "num_input_tokens_seen": 36607495, + "step": 1696, + "time_per_iteration": 2.7279772758483887 + }, + { + "auxiliary_loss_clip": 0.01167742, + "auxiliary_loss_mlp": 0.01170024, + "balance_loss_clip": 1.00231194, + "balance_loss_mlp": 1.00160491, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.805940386662302, + "language_loss": 0.82299244, + "learning_rate": 3.945823295627519e-06, + "loss": 0.8463701, + "num_input_tokens_seen": 36628555, + "step": 1697, + "time_per_iteration": 2.600592851638794 + }, + { + "auxiliary_loss_clip": 0.01183617, + "auxiliary_loss_mlp": 0.01169866, + "balance_loss_clip": 1.00237501, + "balance_loss_mlp": 1.00116074, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 2.0732264826442637, + "language_loss": 0.80902267, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.83255756, + "num_input_tokens_seen": 36646250, + "step": 1698, + "time_per_iteration": 2.5672459602355957 + }, + { + "auxiliary_loss_clip": 0.011385, + "auxiliary_loss_mlp": 0.01169605, + "balance_loss_clip": 1.00250864, + "balance_loss_mlp": 1.00099504, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 2.4886575023724746, + "language_loss": 0.75848538, + "learning_rate": 3.945643078691637e-06, + "loss": 0.78156644, + "num_input_tokens_seen": 36666675, + "step": 1699, + "time_per_iteration": 2.647355079650879 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.0116943, + "balance_loss_clip": 1.00218892, + "balance_loss_mlp": 1.00120211, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.669735481139253, + "language_loss": 0.80312115, + "learning_rate": 3.945552859553516e-06, + "loss": 0.82632124, + "num_input_tokens_seen": 36685225, + "step": 1700, + "time_per_iteration": 2.597705602645874 + }, + { + "auxiliary_loss_clip": 0.0116696, + "auxiliary_loss_mlp": 0.01169561, + "balance_loss_clip": 1.00220573, + "balance_loss_mlp": 1.00123703, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 2.94570734261903, + "language_loss": 0.76948822, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79285347, + "num_input_tokens_seen": 36705985, + "step": 1701, + "time_per_iteration": 2.6760170459747314 + }, + { + "auxiliary_loss_clip": 0.0116742, + "auxiliary_loss_mlp": 0.01169693, + "balance_loss_clip": 1.00226426, + "balance_loss_mlp": 1.00117922, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 1.9083932955371483, + "language_loss": 0.77959234, + "learning_rate": 3.945372199954019e-06, + "loss": 0.8029635, + "num_input_tokens_seen": 36725815, + "step": 1702, + "time_per_iteration": 2.617478847503662 + }, + { + "auxiliary_loss_clip": 0.01151058, + "auxiliary_loss_mlp": 0.01169503, + "balance_loss_clip": 1.00227726, + "balance_loss_mlp": 1.0012753, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 3.400362065906011, + "language_loss": 0.94687921, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97008491, + "num_input_tokens_seen": 36742345, + "step": 1703, + "time_per_iteration": 2.635340929031372 + }, + { + "auxiliary_loss_clip": 0.01118035, + "auxiliary_loss_mlp": 0.0116349, + "balance_loss_clip": 1.00268793, + "balance_loss_mlp": 1.00022089, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8760204758274647, + "language_loss": 0.55097687, + "learning_rate": 3.94519124527969e-06, + "loss": 0.5737921, + "num_input_tokens_seen": 36798775, + "step": 1704, + "time_per_iteration": 3.3025732040405273 + }, + { + "auxiliary_loss_clip": 0.01183547, + "auxiliary_loss_mlp": 0.01169669, + "balance_loss_clip": 1.00238824, + "balance_loss_mlp": 1.00124979, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 2.2687921965816695, + "language_loss": 0.84053254, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86406469, + "num_input_tokens_seen": 36816295, + "step": 1705, + "time_per_iteration": 2.9657139778137207 + }, + { + "auxiliary_loss_clip": 0.01149742, + "auxiliary_loss_mlp": 0.01164558, + "balance_loss_clip": 1.00291753, + "balance_loss_mlp": 1.00052643, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7650555839930149, + "language_loss": 0.60458457, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62772763, + "num_input_tokens_seen": 36882030, + "step": 1706, + "time_per_iteration": 3.2708919048309326 + }, + { + "auxiliary_loss_clip": 0.01135666, + "auxiliary_loss_mlp": 0.0116974, + "balance_loss_clip": 1.00218534, + "balance_loss_mlp": 1.00103498, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 3.199721459042769, + "language_loss": 0.86537874, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88843274, + "num_input_tokens_seen": 36899245, + "step": 1707, + "time_per_iteration": 2.6503639221191406 + }, + { + "auxiliary_loss_clip": 0.01166848, + "auxiliary_loss_mlp": 0.01169634, + "balance_loss_clip": 1.00224602, + "balance_loss_mlp": 1.00102472, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.4600038214756, + "language_loss": 0.72787887, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75124365, + "num_input_tokens_seen": 36920950, + "step": 1708, + "time_per_iteration": 2.6184873580932617 + }, + { + "auxiliary_loss_clip": 0.01150542, + "auxiliary_loss_mlp": 0.00749363, + "balance_loss_clip": 1.00226665, + "balance_loss_mlp": 1.00051117, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 2.027607298464378, + "language_loss": 0.91345465, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93245363, + "num_input_tokens_seen": 36938900, + "step": 1709, + "time_per_iteration": 2.6469902992248535 + }, + { + "auxiliary_loss_clip": 0.01135445, + "auxiliary_loss_mlp": 0.01169454, + "balance_loss_clip": 1.00231791, + "balance_loss_mlp": 1.00132108, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 2.5112314328638368, + "language_loss": 0.88327754, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90632653, + "num_input_tokens_seen": 36957010, + "step": 1710, + "time_per_iteration": 2.7585716247558594 + }, + { + "auxiliary_loss_clip": 0.01167385, + "auxiliary_loss_mlp": 0.0116958, + "balance_loss_clip": 1.00225997, + "balance_loss_mlp": 1.00144696, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.8383513470472015, + "language_loss": 0.79373741, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81710708, + "num_input_tokens_seen": 36977690, + "step": 1711, + "time_per_iteration": 2.5829524993896484 + }, + { + "auxiliary_loss_clip": 0.01134723, + "auxiliary_loss_mlp": 0.01169648, + "balance_loss_clip": 1.00214243, + "balance_loss_mlp": 1.00132489, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 1.637228315010517, + "language_loss": 0.73920667, + "learning_rate": 3.944464476383668e-06, + "loss": 0.76225036, + "num_input_tokens_seen": 36997300, + "step": 1712, + "time_per_iteration": 2.6873843669891357 + }, + { + "auxiliary_loss_clip": 0.01134991, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_clip": 1.0023855, + "balance_loss_mlp": 1.0011313, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.7950392109884536, + "language_loss": 0.86947179, + "learning_rate": 3.94437329843114e-06, + "loss": 0.89251435, + "num_input_tokens_seen": 37016110, + "step": 1713, + "time_per_iteration": 2.6634504795074463 + }, + { + "auxiliary_loss_clip": 0.0116672, + "auxiliary_loss_mlp": 0.01169771, + "balance_loss_clip": 1.00225139, + "balance_loss_mlp": 1.00135231, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 1.612647134863898, + "language_loss": 0.727548, + "learning_rate": 3.944282046747782e-06, + "loss": 0.75091296, + "num_input_tokens_seen": 37036405, + "step": 1714, + "time_per_iteration": 2.6225578784942627 + }, + { + "auxiliary_loss_clip": 0.01167031, + "auxiliary_loss_mlp": 0.0116957, + "balance_loss_clip": 1.00224578, + "balance_loss_mlp": 1.00143683, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 1.7751158790151562, + "language_loss": 0.90685987, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93022585, + "num_input_tokens_seen": 37057580, + "step": 1715, + "time_per_iteration": 2.6331241130828857 + }, + { + "auxiliary_loss_clip": 0.01167188, + "auxiliary_loss_mlp": 0.01169606, + "balance_loss_clip": 1.00229621, + "balance_loss_mlp": 1.00118697, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.7822679944961164, + "language_loss": 0.75863802, + "learning_rate": 3.944099322202418e-06, + "loss": 0.78200597, + "num_input_tokens_seen": 37079120, + "step": 1716, + "time_per_iteration": 2.7238352298736572 + }, + { + "auxiliary_loss_clip": 0.01153964, + "auxiliary_loss_mlp": 0.01170209, + "balance_loss_clip": 1.00224113, + "balance_loss_mlp": 1.00159895, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 1.7275719815483568, + "language_loss": 0.84985328, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87309492, + "num_input_tokens_seen": 37099710, + "step": 1717, + "time_per_iteration": 2.7239811420440674 + }, + { + "auxiliary_loss_clip": 0.01135488, + "auxiliary_loss_mlp": 0.01170176, + "balance_loss_clip": 1.00234103, + "balance_loss_mlp": 1.00213861, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 1.984113619564641, + "language_loss": 0.83064902, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85370564, + "num_input_tokens_seen": 37117775, + "step": 1718, + "time_per_iteration": 2.6607868671417236 + }, + { + "auxiliary_loss_clip": 0.01167002, + "auxiliary_loss_mlp": 0.01169155, + "balance_loss_clip": 1.00238061, + "balance_loss_mlp": 1.00121331, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 2.0296869835581304, + "language_loss": 0.73365134, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75701296, + "num_input_tokens_seen": 37140280, + "step": 1719, + "time_per_iteration": 2.7245090007781982 + }, + { + "auxiliary_loss_clip": 0.01166849, + "auxiliary_loss_mlp": 0.01169202, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00125992, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 1.699265676353771, + "language_loss": 0.92765296, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.95101351, + "num_input_tokens_seen": 37158350, + "step": 1720, + "time_per_iteration": 2.5614166259765625 + }, + { + "auxiliary_loss_clip": 0.01134511, + "auxiliary_loss_mlp": 0.0116927, + "balance_loss_clip": 1.00215721, + "balance_loss_mlp": 1.00113678, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 2.3600545499887033, + "language_loss": 0.7912153, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81425309, + "num_input_tokens_seen": 37177120, + "step": 1721, + "time_per_iteration": 4.054248332977295 + }, + { + "auxiliary_loss_clip": 0.01117959, + "auxiliary_loss_mlp": 0.01170062, + "balance_loss_clip": 1.00196517, + "balance_loss_mlp": 1.00154734, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 1.7887558246888289, + "language_loss": 0.8091768, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.832057, + "num_input_tokens_seen": 37195895, + "step": 1722, + "time_per_iteration": 2.7299818992614746 + }, + { + "auxiliary_loss_clip": 0.01148814, + "auxiliary_loss_mlp": 0.0116323, + "balance_loss_clip": 1.0027492, + "balance_loss_mlp": 0.99996072, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9270166934994644, + "language_loss": 0.67193818, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69505858, + "num_input_tokens_seen": 37247270, + "step": 1723, + "time_per_iteration": 2.955411434173584 + }, + { + "auxiliary_loss_clip": 0.0116689, + "auxiliary_loss_mlp": 0.01169782, + "balance_loss_clip": 1.00222373, + "balance_loss_mlp": 1.00145841, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 4.481848678051385, + "language_loss": 0.7806704, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80403709, + "num_input_tokens_seen": 37265595, + "step": 1724, + "time_per_iteration": 2.6074750423431396 + }, + { + "auxiliary_loss_clip": 0.01134851, + "auxiliary_loss_mlp": 0.0117014, + "balance_loss_clip": 1.00210106, + "balance_loss_mlp": 1.00181675, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 2.153562581172917, + "language_loss": 0.75095403, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77400398, + "num_input_tokens_seen": 37286660, + "step": 1725, + "time_per_iteration": 5.593619346618652 + }, + { + "auxiliary_loss_clip": 0.01134658, + "auxiliary_loss_mlp": 0.0116973, + "balance_loss_clip": 1.00221038, + "balance_loss_mlp": 1.00150168, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 1.9593730135680874, + "language_loss": 0.75153446, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77457833, + "num_input_tokens_seen": 37304915, + "step": 1726, + "time_per_iteration": 4.171969413757324 + }, + { + "auxiliary_loss_clip": 0.01152014, + "auxiliary_loss_mlp": 0.0116997, + "balance_loss_clip": 1.00233114, + "balance_loss_mlp": 1.00155115, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 2.0390871994729967, + "language_loss": 0.73993582, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76315558, + "num_input_tokens_seen": 37325265, + "step": 1727, + "time_per_iteration": 2.679257869720459 + }, + { + "auxiliary_loss_clip": 0.01167701, + "auxiliary_loss_mlp": 0.01169626, + "balance_loss_clip": 1.0023154, + "balance_loss_mlp": 1.00139749, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.166209056647805, + "language_loss": 0.84833461, + "learning_rate": 3.942996783386422e-06, + "loss": 0.87170792, + "num_input_tokens_seen": 37341650, + "step": 1728, + "time_per_iteration": 2.587217092514038 + }, + { + "auxiliary_loss_clip": 0.0115075, + "auxiliary_loss_mlp": 0.01169396, + "balance_loss_clip": 1.00223207, + "balance_loss_mlp": 1.0012629, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.0436066668599486, + "language_loss": 0.70526588, + "learning_rate": 3.942904426157406e-06, + "loss": 0.72846735, + "num_input_tokens_seen": 37360270, + "step": 1729, + "time_per_iteration": 2.6191327571868896 + }, + { + "auxiliary_loss_clip": 0.01167688, + "auxiliary_loss_mlp": 0.01169844, + "balance_loss_clip": 1.00236058, + "balance_loss_mlp": 1.00152016, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.364135863302968, + "language_loss": 0.81407398, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83744931, + "num_input_tokens_seen": 37375225, + "step": 1730, + "time_per_iteration": 2.5432851314544678 + }, + { + "auxiliary_loss_clip": 0.01069709, + "auxiliary_loss_mlp": 0.01169291, + "balance_loss_clip": 1.00180173, + "balance_loss_mlp": 1.00115812, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 1.8949470260450538, + "language_loss": 0.75977671, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78216672, + "num_input_tokens_seen": 37395165, + "step": 1731, + "time_per_iteration": 3.125375509262085 + }, + { + "auxiliary_loss_clip": 0.01118105, + "auxiliary_loss_mlp": 0.01169525, + "balance_loss_clip": 1.00201714, + "balance_loss_mlp": 1.00148809, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.9791172259455354, + "language_loss": 0.82772183, + "learning_rate": 3.9426269124336e-06, + "loss": 0.85059816, + "num_input_tokens_seen": 37414845, + "step": 1732, + "time_per_iteration": 3.187774419784546 + }, + { + "auxiliary_loss_clip": 0.01134229, + "auxiliary_loss_mlp": 0.01169786, + "balance_loss_clip": 1.00223672, + "balance_loss_mlp": 1.00146222, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.102968211975695, + "language_loss": 0.83034778, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85338783, + "num_input_tokens_seen": 37432490, + "step": 1733, + "time_per_iteration": 2.634725570678711 + }, + { + "auxiliary_loss_clip": 0.01150587, + "auxiliary_loss_mlp": 0.0116963, + "balance_loss_clip": 1.00212061, + "balance_loss_mlp": 1.0014019, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.4958683751710775, + "language_loss": 0.76226258, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78546464, + "num_input_tokens_seen": 37449435, + "step": 1734, + "time_per_iteration": 2.633617639541626 + }, + { + "auxiliary_loss_clip": 0.01134148, + "auxiliary_loss_mlp": 0.01168984, + "balance_loss_clip": 1.00201511, + "balance_loss_mlp": 1.00113761, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.793737322892497, + "language_loss": 0.74882126, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77185255, + "num_input_tokens_seen": 37469105, + "step": 1735, + "time_per_iteration": 2.683197498321533 + }, + { + "auxiliary_loss_clip": 0.01166781, + "auxiliary_loss_mlp": 0.01168984, + "balance_loss_clip": 1.00218105, + "balance_loss_mlp": 1.00104165, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.6748290812362658, + "language_loss": 0.78870261, + "learning_rate": 3.94225586284712e-06, + "loss": 0.81206024, + "num_input_tokens_seen": 37490540, + "step": 1736, + "time_per_iteration": 2.638904333114624 + }, + { + "auxiliary_loss_clip": 0.01166436, + "auxiliary_loss_mlp": 0.01168955, + "balance_loss_clip": 1.00226617, + "balance_loss_mlp": 1.0012989, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.9514441455779885, + "language_loss": 0.70742583, + "learning_rate": 3.942162916315356e-06, + "loss": 0.73077977, + "num_input_tokens_seen": 37511905, + "step": 1737, + "time_per_iteration": 2.6129181385040283 + }, + { + "auxiliary_loss_clip": 0.01134966, + "auxiliary_loss_mlp": 0.01169504, + "balance_loss_clip": 1.00200701, + "balance_loss_mlp": 1.00127554, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 1.9734270301350987, + "language_loss": 0.81697834, + "learning_rate": 3.942069896136581e-06, + "loss": 0.84002304, + "num_input_tokens_seen": 37533635, + "step": 1738, + "time_per_iteration": 2.678227663040161 + }, + { + "auxiliary_loss_clip": 0.01183271, + "auxiliary_loss_mlp": 0.01169086, + "balance_loss_clip": 1.0022974, + "balance_loss_mlp": 1.00133443, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 3.1681596649885235, + "language_loss": 0.75413346, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77765697, + "num_input_tokens_seen": 37552035, + "step": 1739, + "time_per_iteration": 2.552438735961914 + }, + { + "auxiliary_loss_clip": 0.01137296, + "auxiliary_loss_mlp": 0.01169021, + "balance_loss_clip": 1.00214934, + "balance_loss_mlp": 1.00117421, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.6010079833767858, + "language_loss": 0.77027446, + "learning_rate": 3.941883634852104e-06, + "loss": 0.7933377, + "num_input_tokens_seen": 37571540, + "step": 1740, + "time_per_iteration": 2.701179265975952 + }, + { + "auxiliary_loss_clip": 0.01150666, + "auxiliary_loss_mlp": 0.01169758, + "balance_loss_clip": 1.00226891, + "balance_loss_mlp": 1.00172091, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.077287492478931, + "language_loss": 0.86098772, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88419199, + "num_input_tokens_seen": 37588265, + "step": 1741, + "time_per_iteration": 2.6107234954833984 + }, + { + "auxiliary_loss_clip": 0.01150154, + "auxiliary_loss_mlp": 0.01169386, + "balance_loss_clip": 1.00215387, + "balance_loss_mlp": 1.0009675, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 2.2980974325846852, + "language_loss": 0.75276154, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77595699, + "num_input_tokens_seen": 37606860, + "step": 1742, + "time_per_iteration": 2.5971527099609375 + }, + { + "auxiliary_loss_clip": 0.01117812, + "auxiliary_loss_mlp": 0.01169248, + "balance_loss_clip": 1.00207734, + "balance_loss_mlp": 1.00149632, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 2.0371973256134708, + "language_loss": 0.87468803, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89755869, + "num_input_tokens_seen": 37625210, + "step": 1743, + "time_per_iteration": 2.688241720199585 + }, + { + "auxiliary_loss_clip": 0.01133478, + "auxiliary_loss_mlp": 0.01168494, + "balance_loss_clip": 1.0017885, + "balance_loss_mlp": 1.00093317, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 3.016544677768723, + "language_loss": 0.75557768, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77859747, + "num_input_tokens_seen": 37644110, + "step": 1744, + "time_per_iteration": 2.626175880432129 + }, + { + "auxiliary_loss_clip": 0.01166554, + "auxiliary_loss_mlp": 0.01169012, + "balance_loss_clip": 1.00241339, + "balance_loss_mlp": 1.00116515, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.0756922490206464, + "language_loss": 0.78891039, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81226605, + "num_input_tokens_seen": 37665800, + "step": 1745, + "time_per_iteration": 2.741065263748169 + }, + { + "auxiliary_loss_clip": 0.01183121, + "auxiliary_loss_mlp": 0.01169137, + "balance_loss_clip": 1.00230074, + "balance_loss_mlp": 1.00157619, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 1.9575131738282068, + "language_loss": 0.8342458, + "learning_rate": 3.941323083837794e-06, + "loss": 0.85776836, + "num_input_tokens_seen": 37685095, + "step": 1746, + "time_per_iteration": 2.5706722736358643 + }, + { + "auxiliary_loss_clip": 0.01150778, + "auxiliary_loss_mlp": 0.01169191, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00153506, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.9972439838407605, + "language_loss": 0.70043206, + "learning_rate": 3.941229400994971e-06, + "loss": 0.72363174, + "num_input_tokens_seen": 37707445, + "step": 1747, + "time_per_iteration": 2.7850165367126465 + }, + { + "auxiliary_loss_clip": 0.01134044, + "auxiliary_loss_mlp": 0.01169375, + "balance_loss_clip": 1.00210166, + "balance_loss_mlp": 1.00143254, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.1585857114777123, + "language_loss": 0.84264559, + "learning_rate": 3.941135644540535e-06, + "loss": 0.8656798, + "num_input_tokens_seen": 37728325, + "step": 1748, + "time_per_iteration": 2.7031538486480713 + }, + { + "auxiliary_loss_clip": 0.01182952, + "auxiliary_loss_mlp": 0.01168723, + "balance_loss_clip": 1.00214458, + "balance_loss_mlp": 1.00116277, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.7182842644796759, + "language_loss": 0.71595097, + "learning_rate": 3.941041814478041e-06, + "loss": 0.73946774, + "num_input_tokens_seen": 37748910, + "step": 1749, + "time_per_iteration": 2.5940048694610596 + }, + { + "auxiliary_loss_clip": 0.01151025, + "auxiliary_loss_mlp": 0.01168681, + "balance_loss_clip": 1.00217783, + "balance_loss_mlp": 1.00121617, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 2.0857003168490467, + "language_loss": 0.81888944, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84208643, + "num_input_tokens_seen": 37765745, + "step": 1750, + "time_per_iteration": 2.635281562805176 + }, + { + "auxiliary_loss_clip": 0.01137691, + "auxiliary_loss_mlp": 0.01169056, + "balance_loss_clip": 1.00214624, + "balance_loss_mlp": 1.00149536, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 3.111401511608577, + "language_loss": 0.9260987, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94916612, + "num_input_tokens_seen": 37780520, + "step": 1751, + "time_per_iteration": 2.616433620452881 + }, + { + "auxiliary_loss_clip": 0.01166535, + "auxiliary_loss_mlp": 0.01168564, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.00119388, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.418315368451264, + "language_loss": 0.79184967, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81520069, + "num_input_tokens_seen": 37799515, + "step": 1752, + "time_per_iteration": 2.565020799636841 + }, + { + "auxiliary_loss_clip": 0.01117464, + "auxiliary_loss_mlp": 0.01168513, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.00123811, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.676047200804853, + "language_loss": 0.7565192, + "learning_rate": 3.940665758218686e-06, + "loss": 0.77937889, + "num_input_tokens_seen": 37818695, + "step": 1753, + "time_per_iteration": 2.7140204906463623 + }, + { + "auxiliary_loss_clip": 0.01133795, + "auxiliary_loss_mlp": 0.01169028, + "balance_loss_clip": 1.00198758, + "balance_loss_mlp": 1.00127721, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 1.9576400272940064, + "language_loss": 0.83832061, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86134887, + "num_input_tokens_seen": 37837860, + "step": 1754, + "time_per_iteration": 2.6462247371673584 + }, + { + "auxiliary_loss_clip": 0.01117918, + "auxiliary_loss_mlp": 0.01169079, + "balance_loss_clip": 1.0020591, + "balance_loss_mlp": 1.00104177, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.8734826010999868, + "language_loss": 0.68350852, + "learning_rate": 3.940477288533302e-06, + "loss": 0.70637846, + "num_input_tokens_seen": 37856260, + "step": 1755, + "time_per_iteration": 2.6465067863464355 + }, + { + "auxiliary_loss_clip": 0.01167265, + "auxiliary_loss_mlp": 0.01169257, + "balance_loss_clip": 1.00224113, + "balance_loss_mlp": 1.00141037, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 2.1798160064481853, + "language_loss": 0.76801348, + "learning_rate": 3.940382943314182e-06, + "loss": 0.79137868, + "num_input_tokens_seen": 37876960, + "step": 1756, + "time_per_iteration": 2.602482557296753 + }, + { + "auxiliary_loss_clip": 0.01183003, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_clip": 1.00219774, + "balance_loss_mlp": 1.0014832, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.5797474912043745, + "language_loss": 0.79968178, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82320321, + "num_input_tokens_seen": 37897070, + "step": 1757, + "time_per_iteration": 2.5310475826263428 + }, + { + "auxiliary_loss_clip": 0.01135205, + "auxiliary_loss_mlp": 0.01168919, + "balance_loss_clip": 1.00218844, + "balance_loss_mlp": 1.00126326, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.5415756439202088, + "language_loss": 0.78629541, + "learning_rate": 3.940194032140976e-06, + "loss": 0.80933666, + "num_input_tokens_seen": 37923635, + "step": 1758, + "time_per_iteration": 4.3538572788238525 + }, + { + "auxiliary_loss_clip": 0.01150636, + "auxiliary_loss_mlp": 0.01168746, + "balance_loss_clip": 1.00214696, + "balance_loss_mlp": 1.00109041, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 2.125255480549906, + "language_loss": 0.91642761, + "learning_rate": 3.940099466194054e-06, + "loss": 0.93962145, + "num_input_tokens_seen": 37942650, + "step": 1759, + "time_per_iteration": 2.6269378662109375 + }, + { + "auxiliary_loss_clip": 0.01150631, + "auxiliary_loss_mlp": 0.01168732, + "balance_loss_clip": 1.00204206, + "balance_loss_mlp": 1.00107646, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.1269532450400543, + "language_loss": 0.77567554, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79886913, + "num_input_tokens_seen": 37960660, + "step": 1760, + "time_per_iteration": 2.5814058780670166 + }, + { + "auxiliary_loss_clip": 0.01167239, + "auxiliary_loss_mlp": 0.01169125, + "balance_loss_clip": 1.00229406, + "balance_loss_mlp": 1.00137389, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.7835882259252163, + "language_loss": 0.89044952, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91381317, + "num_input_tokens_seen": 37978625, + "step": 1761, + "time_per_iteration": 2.597900390625 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.00749244, + "balance_loss_clip": 1.00197756, + "balance_loss_mlp": 1.00029206, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 1.897439213896005, + "language_loss": 0.77901024, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.79752362, + "num_input_tokens_seen": 38000005, + "step": 1762, + "time_per_iteration": 4.479990482330322 + }, + { + "auxiliary_loss_clip": 0.01148621, + "auxiliary_loss_mlp": 0.01163236, + "balance_loss_clip": 1.00257301, + "balance_loss_mlp": 1.00072956, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.7629605773286796, + "language_loss": 0.60535061, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62846923, + "num_input_tokens_seen": 38066165, + "step": 1763, + "time_per_iteration": 4.763385057449341 + }, + { + "auxiliary_loss_clip": 0.01149791, + "auxiliary_loss_mlp": 0.01168659, + "balance_loss_clip": 1.00198376, + "balance_loss_mlp": 1.00138438, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 1.5391901410828386, + "language_loss": 0.7965436, + "learning_rate": 3.939625532999763e-06, + "loss": 0.81972802, + "num_input_tokens_seen": 38086150, + "step": 1764, + "time_per_iteration": 2.6097874641418457 + }, + { + "auxiliary_loss_clip": 0.01135337, + "auxiliary_loss_mlp": 0.01168718, + "balance_loss_clip": 1.00211143, + "balance_loss_mlp": 1.00125277, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.613815291243603, + "language_loss": 0.7994051, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82244563, + "num_input_tokens_seen": 38104205, + "step": 1765, + "time_per_iteration": 4.093387126922607 + }, + { + "auxiliary_loss_clip": 0.01166281, + "auxiliary_loss_mlp": 0.01168411, + "balance_loss_clip": 1.00210321, + "balance_loss_mlp": 1.00132751, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.8859208063561907, + "language_loss": 0.76901287, + "learning_rate": 3.939435444841306e-06, + "loss": 0.79235977, + "num_input_tokens_seen": 38122005, + "step": 1766, + "time_per_iteration": 2.5882132053375244 + }, + { + "auxiliary_loss_clip": 0.01182967, + "auxiliary_loss_mlp": 0.011691, + "balance_loss_clip": 1.00225449, + "balance_loss_mlp": 1.00153947, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6851665478414575, + "language_loss": 0.77254689, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79606748, + "num_input_tokens_seen": 38143365, + "step": 1767, + "time_per_iteration": 2.6064863204956055 + }, + { + "auxiliary_loss_clip": 0.01069582, + "auxiliary_loss_mlp": 0.01164233, + "balance_loss_clip": 1.00248861, + "balance_loss_mlp": 1.00020146, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6775935905366101, + "language_loss": 0.57889652, + "learning_rate": 3.939245062508506e-06, + "loss": 0.60123467, + "num_input_tokens_seen": 38210035, + "step": 1768, + "time_per_iteration": 3.7615230083465576 + }, + { + "auxiliary_loss_clip": 0.01135127, + "auxiliary_loss_mlp": 0.01168484, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.001019, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.363370682253301, + "language_loss": 0.86741006, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89044619, + "num_input_tokens_seen": 38231230, + "step": 1769, + "time_per_iteration": 4.08635687828064 + }, + { + "auxiliary_loss_clip": 0.01121525, + "auxiliary_loss_mlp": 0.00749197, + "balance_loss_clip": 1.00198686, + "balance_loss_mlp": 1.00024533, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.7019417124503795, + "language_loss": 0.61688507, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.63559228, + "num_input_tokens_seen": 38253890, + "step": 1770, + "time_per_iteration": 2.813105583190918 + }, + { + "auxiliary_loss_clip": 0.01165618, + "auxiliary_loss_mlp": 0.01163719, + "balance_loss_clip": 1.00279236, + "balance_loss_mlp": 1.00045037, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 1.0824979317994028, + "language_loss": 0.57090306, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59419644, + "num_input_tokens_seen": 38304290, + "step": 1771, + "time_per_iteration": 3.06549072265625 + }, + { + "auxiliary_loss_clip": 0.01133693, + "auxiliary_loss_mlp": 0.01168743, + "balance_loss_clip": 1.00216305, + "balance_loss_mlp": 1.00165927, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.7515009438556617, + "language_loss": 0.88421857, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90724289, + "num_input_tokens_seen": 38324725, + "step": 1772, + "time_per_iteration": 2.6749179363250732 + }, + { + "auxiliary_loss_clip": 0.01182965, + "auxiliary_loss_mlp": 0.01168609, + "balance_loss_clip": 1.00224161, + "balance_loss_mlp": 1.00114346, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.7507406439977484, + "language_loss": 0.76112103, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78463686, + "num_input_tokens_seen": 38340735, + "step": 1773, + "time_per_iteration": 2.5146374702453613 + }, + { + "auxiliary_loss_clip": 0.01135166, + "auxiliary_loss_mlp": 0.01168783, + "balance_loss_clip": 1.00232935, + "balance_loss_mlp": 1.00112736, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.3674243594662427, + "language_loss": 0.83118391, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85422337, + "num_input_tokens_seen": 38361315, + "step": 1774, + "time_per_iteration": 2.7533295154571533 + }, + { + "auxiliary_loss_clip": 0.01149735, + "auxiliary_loss_mlp": 0.00749232, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00025225, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.3988436745791617, + "language_loss": 0.76495874, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78394836, + "num_input_tokens_seen": 38377425, + "step": 1775, + "time_per_iteration": 2.6303114891052246 + }, + { + "auxiliary_loss_clip": 0.01181244, + "auxiliary_loss_mlp": 0.01162736, + "balance_loss_clip": 1.00278735, + "balance_loss_mlp": 1.00022984, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8289338029027077, + "language_loss": 0.57446146, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59790134, + "num_input_tokens_seen": 38440275, + "step": 1776, + "time_per_iteration": 3.1127400398254395 + }, + { + "auxiliary_loss_clip": 0.01151208, + "auxiliary_loss_mlp": 0.01168739, + "balance_loss_clip": 1.00233078, + "balance_loss_mlp": 1.00155973, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 2.2881793914498085, + "language_loss": 0.83315265, + "learning_rate": 3.938384702378727e-06, + "loss": 0.85635209, + "num_input_tokens_seen": 38461820, + "step": 1777, + "time_per_iteration": 2.6474649906158447 + }, + { + "auxiliary_loss_clip": 0.01118207, + "auxiliary_loss_mlp": 0.00749211, + "balance_loss_clip": 1.0021596, + "balance_loss_mlp": 1.00033617, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 1.810936875802564, + "language_loss": 0.87105823, + "learning_rate": 3.938288739241625e-06, + "loss": 0.88973248, + "num_input_tokens_seen": 38482235, + "step": 1778, + "time_per_iteration": 2.7407400608062744 + }, + { + "auxiliary_loss_clip": 0.01101931, + "auxiliary_loss_mlp": 0.00749227, + "balance_loss_clip": 1.00199485, + "balance_loss_mlp": 1.00032437, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 1.8209502476312684, + "language_loss": 0.8399713, + "learning_rate": 3.938192702604417e-06, + "loss": 0.85848296, + "num_input_tokens_seen": 38500690, + "step": 1779, + "time_per_iteration": 2.7439019680023193 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.00749243, + "balance_loss_clip": 1.00233197, + "balance_loss_mlp": 1.00034595, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 1.8683328372829557, + "language_loss": 0.671592, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69059134, + "num_input_tokens_seen": 38518405, + "step": 1780, + "time_per_iteration": 2.636319398880005 + }, + { + "auxiliary_loss_clip": 0.0116645, + "auxiliary_loss_mlp": 0.01168452, + "balance_loss_clip": 1.00231719, + "balance_loss_mlp": 1.0012728, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.0967384344116455, + "language_loss": 0.9239769, + "learning_rate": 3.938000408844265e-06, + "loss": 0.94732589, + "num_input_tokens_seen": 38535060, + "step": 1781, + "time_per_iteration": 2.5668516159057617 + }, + { + "auxiliary_loss_clip": 0.01116757, + "auxiliary_loss_mlp": 0.01168226, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00123763, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.173818928984673, + "language_loss": 0.79276502, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81561494, + "num_input_tokens_seen": 38552855, + "step": 1782, + "time_per_iteration": 2.6714839935302734 + }, + { + "auxiliary_loss_clip": 0.01153709, + "auxiliary_loss_mlp": 0.01168901, + "balance_loss_clip": 1.00236535, + "balance_loss_mlp": 1.00143528, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 3.760787203479825, + "language_loss": 0.79432988, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81755596, + "num_input_tokens_seen": 38570075, + "step": 1783, + "time_per_iteration": 2.5894055366516113 + }, + { + "auxiliary_loss_clip": 0.01150166, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_clip": 1.00226808, + "balance_loss_mlp": 1.00128996, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 1.9614626951548204, + "language_loss": 0.86442816, + "learning_rate": 3.937711417044395e-06, + "loss": 0.8876164, + "num_input_tokens_seen": 38587970, + "step": 1784, + "time_per_iteration": 2.640655040740967 + }, + { + "auxiliary_loss_clip": 0.01149967, + "auxiliary_loss_mlp": 0.01168818, + "balance_loss_clip": 1.00207341, + "balance_loss_mlp": 1.00135255, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 3.8480831784971588, + "language_loss": 1.00728476, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03047264, + "num_input_tokens_seen": 38605840, + "step": 1785, + "time_per_iteration": 2.690593957901001 + }, + { + "auxiliary_loss_clip": 0.01166516, + "auxiliary_loss_mlp": 0.01168449, + "balance_loss_clip": 1.00237703, + "balance_loss_mlp": 1.00146031, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.36751440369618, + "language_loss": 0.84901279, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87236243, + "num_input_tokens_seen": 38627070, + "step": 1786, + "time_per_iteration": 2.6406655311584473 + }, + { + "auxiliary_loss_clip": 0.0118289, + "auxiliary_loss_mlp": 0.0116875, + "balance_loss_clip": 1.00226617, + "balance_loss_mlp": 1.00118911, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.8026326783806794, + "language_loss": 0.78689981, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81041616, + "num_input_tokens_seen": 38645840, + "step": 1787, + "time_per_iteration": 2.548570156097412 + }, + { + "auxiliary_loss_clip": 0.01166354, + "auxiliary_loss_mlp": 0.01168488, + "balance_loss_clip": 1.00219679, + "balance_loss_mlp": 1.00111794, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 1.751224550775617, + "language_loss": 0.82894921, + "learning_rate": 3.937325065966719e-06, + "loss": 0.85229766, + "num_input_tokens_seen": 38664770, + "step": 1788, + "time_per_iteration": 2.5571110248565674 + }, + { + "auxiliary_loss_clip": 0.01182902, + "auxiliary_loss_mlp": 0.01168797, + "balance_loss_clip": 1.00233209, + "balance_loss_mlp": 1.00142717, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 1.8263726348731846, + "language_loss": 0.78126419, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80478114, + "num_input_tokens_seen": 38683865, + "step": 1789, + "time_per_iteration": 2.545809507369995 + }, + { + "auxiliary_loss_clip": 0.01183072, + "auxiliary_loss_mlp": 0.01168718, + "balance_loss_clip": 1.00244534, + "balance_loss_mlp": 1.00115752, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.4321867823063927, + "language_loss": 0.75064349, + "learning_rate": 3.937131449631859e-06, + "loss": 0.7741614, + "num_input_tokens_seen": 38702485, + "step": 1790, + "time_per_iteration": 2.597289800643921 + }, + { + "auxiliary_loss_clip": 0.01167155, + "auxiliary_loss_mlp": 0.00749273, + "balance_loss_clip": 1.00236678, + "balance_loss_mlp": 1.00042868, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.223342266247626, + "language_loss": 0.78264523, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80180949, + "num_input_tokens_seen": 38722475, + "step": 1791, + "time_per_iteration": 2.583580732345581 + }, + { + "auxiliary_loss_clip": 0.01133971, + "auxiliary_loss_mlp": 0.01168646, + "balance_loss_clip": 1.00210857, + "balance_loss_mlp": 1.00175333, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.9381444374988928, + "language_loss": 0.70905018, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73207641, + "num_input_tokens_seen": 38743285, + "step": 1792, + "time_per_iteration": 2.7031071186065674 + }, + { + "auxiliary_loss_clip": 0.01133096, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_clip": 1.00191474, + "balance_loss_mlp": 1.0007441, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 3.9750693527054284, + "language_loss": 0.76057267, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78358191, + "num_input_tokens_seen": 38763035, + "step": 1793, + "time_per_iteration": 2.6372196674346924 + }, + { + "auxiliary_loss_clip": 0.01117988, + "auxiliary_loss_mlp": 0.01168479, + "balance_loss_clip": 1.00221133, + "balance_loss_mlp": 1.00129974, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.5484740703979913, + "language_loss": 0.85354745, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87641215, + "num_input_tokens_seen": 38784900, + "step": 1794, + "time_per_iteration": 2.7193992137908936 + }, + { + "auxiliary_loss_clip": 0.01101611, + "auxiliary_loss_mlp": 0.01169081, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00142467, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.6459523605053965, + "language_loss": 0.74746096, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77016783, + "num_input_tokens_seen": 38804695, + "step": 1795, + "time_per_iteration": 2.729743719100952 + }, + { + "auxiliary_loss_clip": 0.01118681, + "auxiliary_loss_mlp": 0.01168691, + "balance_loss_clip": 1.00213099, + "balance_loss_mlp": 1.00122559, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.576054482819374, + "language_loss": 0.81769043, + "learning_rate": 3.936548837795741e-06, + "loss": 0.84056413, + "num_input_tokens_seen": 38822395, + "step": 1796, + "time_per_iteration": 4.045880317687988 + }, + { + "auxiliary_loss_clip": 0.01151528, + "auxiliary_loss_mlp": 0.01169553, + "balance_loss_clip": 1.00244212, + "balance_loss_mlp": 1.00180197, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.2380711931078827, + "language_loss": 0.7462064, + "learning_rate": 3.936451478782111e-06, + "loss": 0.76941717, + "num_input_tokens_seen": 38839865, + "step": 1797, + "time_per_iteration": 2.623988151550293 + }, + { + "auxiliary_loss_clip": 0.01151047, + "auxiliary_loss_mlp": 0.01168099, + "balance_loss_clip": 1.00222182, + "balance_loss_mlp": 1.00120556, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 2.2538982738850537, + "language_loss": 0.81915432, + "learning_rate": 3.936354046338046e-06, + "loss": 0.84234583, + "num_input_tokens_seen": 38857300, + "step": 1798, + "time_per_iteration": 2.6341657638549805 + }, + { + "auxiliary_loss_clip": 0.01133323, + "auxiliary_loss_mlp": 0.011683, + "balance_loss_clip": 1.0019238, + "balance_loss_mlp": 1.00112057, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.215128195176857, + "language_loss": 0.85429597, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87731218, + "num_input_tokens_seen": 38874960, + "step": 1799, + "time_per_iteration": 3.9833438396453857 + }, + { + "auxiliary_loss_clip": 0.01150301, + "auxiliary_loss_mlp": 0.01168291, + "balance_loss_clip": 1.0021739, + "balance_loss_mlp": 1.00139761, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 1.733517193190799, + "language_loss": 0.7730329, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79621887, + "num_input_tokens_seen": 38893610, + "step": 1800, + "time_per_iteration": 3.980337142944336 + }, + { + "auxiliary_loss_clip": 0.0118271, + "auxiliary_loss_mlp": 0.01167961, + "balance_loss_clip": 1.00228167, + "balance_loss_mlp": 1.00116324, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 1.5406507690815445, + "language_loss": 0.72688961, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.75039637, + "num_input_tokens_seen": 38913485, + "step": 1801, + "time_per_iteration": 2.5703837871551514 + }, + { + "auxiliary_loss_clip": 0.01182811, + "auxiliary_loss_mlp": 0.01168642, + "balance_loss_clip": 1.00230789, + "balance_loss_mlp": 1.00117636, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.7756591276339595, + "language_loss": 0.66125333, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68476784, + "num_input_tokens_seen": 38935650, + "step": 1802, + "time_per_iteration": 3.9848179817199707 + }, + { + "auxiliary_loss_clip": 0.01167095, + "auxiliary_loss_mlp": 0.01168462, + "balance_loss_clip": 1.00234902, + "balance_loss_mlp": 1.00137877, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7582351356485557, + "language_loss": 0.82031733, + "learning_rate": 3.935865782790621e-06, + "loss": 0.84367287, + "num_input_tokens_seen": 38954130, + "step": 1803, + "time_per_iteration": 2.56284236907959 + }, + { + "auxiliary_loss_clip": 0.0116624, + "auxiliary_loss_mlp": 0.01168318, + "balance_loss_clip": 1.00216818, + "balance_loss_mlp": 1.00123429, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 1.611044960918849, + "language_loss": 0.90897071, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93231636, + "num_input_tokens_seen": 38972905, + "step": 1804, + "time_per_iteration": 2.54367733001709 + }, + { + "auxiliary_loss_clip": 0.01117394, + "auxiliary_loss_mlp": 0.01168487, + "balance_loss_clip": 1.0019505, + "balance_loss_mlp": 1.00111723, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 1.899055857979597, + "language_loss": 0.75990117, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78275996, + "num_input_tokens_seen": 38993255, + "step": 1805, + "time_per_iteration": 2.7154858112335205 + }, + { + "auxiliary_loss_clip": 0.01150823, + "auxiliary_loss_mlp": 0.01168647, + "balance_loss_clip": 1.00246894, + "balance_loss_mlp": 1.00127697, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 2.613548341309692, + "language_loss": 0.85925847, + "learning_rate": 3.935571943733843e-06, + "loss": 0.8824532, + "num_input_tokens_seen": 39012610, + "step": 1806, + "time_per_iteration": 2.670264482498169 + }, + { + "auxiliary_loss_clip": 0.01166534, + "auxiliary_loss_mlp": 0.00749331, + "balance_loss_clip": 1.00225091, + "balance_loss_mlp": 1.00041878, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 2.410257629836168, + "language_loss": 0.80701184, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82617044, + "num_input_tokens_seen": 39030120, + "step": 1807, + "time_per_iteration": 2.569795846939087 + }, + { + "auxiliary_loss_clip": 0.0113361, + "auxiliary_loss_mlp": 0.01168434, + "balance_loss_clip": 1.00215793, + "balance_loss_mlp": 1.00163627, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.7920529181991114, + "language_loss": 0.78885293, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.81187344, + "num_input_tokens_seen": 39049875, + "step": 1808, + "time_per_iteration": 2.7001020908355713 + }, + { + "auxiliary_loss_clip": 0.01149858, + "auxiliary_loss_mlp": 0.01168206, + "balance_loss_clip": 1.0021987, + "balance_loss_mlp": 1.00131285, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.531575000096418, + "language_loss": 0.79091179, + "learning_rate": 3.935277444103342e-06, + "loss": 0.8140924, + "num_input_tokens_seen": 39068935, + "step": 1809, + "time_per_iteration": 2.640690565109253 + }, + { + "auxiliary_loss_clip": 0.01182574, + "auxiliary_loss_mlp": 0.01168244, + "balance_loss_clip": 1.00224745, + "balance_loss_mlp": 1.00135112, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.035923576624253, + "language_loss": 0.84888422, + "learning_rate": 3.935179130783046e-06, + "loss": 0.87239242, + "num_input_tokens_seen": 39087370, + "step": 1810, + "time_per_iteration": 2.531550407409668 + }, + { + "auxiliary_loss_clip": 0.01135633, + "auxiliary_loss_mlp": 0.01168249, + "balance_loss_clip": 1.00227022, + "balance_loss_mlp": 1.00106966, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.6878689239484028, + "language_loss": 0.63617682, + "learning_rate": 3.935080744080564e-06, + "loss": 0.65921557, + "num_input_tokens_seen": 39106635, + "step": 1811, + "time_per_iteration": 2.7116165161132812 + }, + { + "auxiliary_loss_clip": 0.01153367, + "auxiliary_loss_mlp": 0.01168352, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00117278, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 1.8628811863943984, + "language_loss": 0.74158174, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76479888, + "num_input_tokens_seen": 39126335, + "step": 1812, + "time_per_iteration": 2.6505532264709473 + }, + { + "auxiliary_loss_clip": 0.01150493, + "auxiliary_loss_mlp": 0.0116804, + "balance_loss_clip": 1.00219727, + "balance_loss_mlp": 1.0012424, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 2.3623871970073673, + "language_loss": 0.72972363, + "learning_rate": 3.934883750543966e-06, + "loss": 0.75290895, + "num_input_tokens_seen": 39144820, + "step": 1813, + "time_per_iteration": 2.5979771614074707 + }, + { + "auxiliary_loss_clip": 0.01151383, + "auxiliary_loss_mlp": 0.01167834, + "balance_loss_clip": 1.00234759, + "balance_loss_mlp": 1.00132215, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 1.6489345352203977, + "language_loss": 0.82381344, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84700561, + "num_input_tokens_seen": 39165945, + "step": 1814, + "time_per_iteration": 2.610806465148926 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.01168518, + "balance_loss_clip": 1.0020479, + "balance_loss_mlp": 1.00124311, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.2577354516026396, + "language_loss": 0.8390128, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86202526, + "num_input_tokens_seen": 39183520, + "step": 1815, + "time_per_iteration": 2.6537771224975586 + }, + { + "auxiliary_loss_clip": 0.01133858, + "auxiliary_loss_mlp": 0.01168284, + "balance_loss_clip": 1.00210559, + "balance_loss_mlp": 1.00120032, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.9000355082633633, + "language_loss": 0.71577966, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73880106, + "num_input_tokens_seen": 39201190, + "step": 1816, + "time_per_iteration": 2.6137020587921143 + }, + { + "auxiliary_loss_clip": 0.01166938, + "auxiliary_loss_mlp": 0.01168381, + "balance_loss_clip": 1.0022192, + "balance_loss_mlp": 1.00148773, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 1.965884937405156, + "language_loss": 0.73162878, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75498194, + "num_input_tokens_seen": 39221210, + "step": 1817, + "time_per_iteration": 2.615131139755249 + }, + { + "auxiliary_loss_clip": 0.01100307, + "auxiliary_loss_mlp": 0.01168086, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00128841, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.5417407353313857, + "language_loss": 0.66996247, + "learning_rate": 3.934389982775706e-06, + "loss": 0.69264638, + "num_input_tokens_seen": 39242025, + "step": 1818, + "time_per_iteration": 2.893963098526001 + }, + { + "auxiliary_loss_clip": 0.01150177, + "auxiliary_loss_mlp": 0.01168389, + "balance_loss_clip": 1.00223923, + "balance_loss_mlp": 1.00130558, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.222259722878134, + "language_loss": 0.72691262, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75009829, + "num_input_tokens_seen": 39259870, + "step": 1819, + "time_per_iteration": 2.785614013671875 + }, + { + "auxiliary_loss_clip": 0.01150287, + "auxiliary_loss_mlp": 0.00749225, + "balance_loss_clip": 1.00214863, + "balance_loss_mlp": 1.00045228, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 2.959997494745523, + "language_loss": 0.73963535, + "learning_rate": 3.934191962176335e-06, + "loss": 0.75863051, + "num_input_tokens_seen": 39278500, + "step": 1820, + "time_per_iteration": 2.626326084136963 + }, + { + "auxiliary_loss_clip": 0.01182594, + "auxiliary_loss_mlp": 0.01168412, + "balance_loss_clip": 1.00239038, + "balance_loss_mlp": 1.00151908, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.101546864644682, + "language_loss": 0.82428479, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84779489, + "num_input_tokens_seen": 39294800, + "step": 1821, + "time_per_iteration": 2.494927406311035 + }, + { + "auxiliary_loss_clip": 0.01150184, + "auxiliary_loss_mlp": 0.01168006, + "balance_loss_clip": 1.00223708, + "balance_loss_mlp": 1.00111341, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 1.8863000883848895, + "language_loss": 0.76254833, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78573024, + "num_input_tokens_seen": 39314625, + "step": 1822, + "time_per_iteration": 2.6675846576690674 + }, + { + "auxiliary_loss_clip": 0.01150415, + "auxiliary_loss_mlp": 0.01167765, + "balance_loss_clip": 1.00218248, + "balance_loss_mlp": 1.0012536, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 2.0654969071233182, + "language_loss": 0.79779392, + "learning_rate": 3.933894381201034e-06, + "loss": 0.82097572, + "num_input_tokens_seen": 39336465, + "step": 1823, + "time_per_iteration": 2.7288777828216553 + }, + { + "auxiliary_loss_clip": 0.01150102, + "auxiliary_loss_mlp": 0.0116757, + "balance_loss_clip": 1.00224495, + "balance_loss_mlp": 1.00105882, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.4119952580833917, + "language_loss": 0.79686409, + "learning_rate": 3.933795040870645e-06, + "loss": 0.82004082, + "num_input_tokens_seen": 39357930, + "step": 1824, + "time_per_iteration": 2.687058448791504 + }, + { + "auxiliary_loss_clip": 0.01149961, + "auxiliary_loss_mlp": 0.01168038, + "balance_loss_clip": 1.00217795, + "balance_loss_mlp": 1.00143099, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 1.6668039906857444, + "language_loss": 0.88245451, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90563452, + "num_input_tokens_seen": 39376380, + "step": 1825, + "time_per_iteration": 2.614797353744507 + }, + { + "auxiliary_loss_clip": 0.0113453, + "auxiliary_loss_mlp": 0.01168139, + "balance_loss_clip": 1.00212967, + "balance_loss_mlp": 1.00124598, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 2.112451075242399, + "language_loss": 0.76340234, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78642905, + "num_input_tokens_seen": 39399935, + "step": 1826, + "time_per_iteration": 2.7735116481781006 + }, + { + "auxiliary_loss_clip": 0.01164908, + "auxiliary_loss_mlp": 0.0116229, + "balance_loss_clip": 1.00317848, + "balance_loss_mlp": 1.00054729, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8381465741738738, + "language_loss": 0.54966557, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57293755, + "num_input_tokens_seen": 39460685, + "step": 1827, + "time_per_iteration": 3.1944727897644043 + }, + { + "auxiliary_loss_clip": 0.01181527, + "auxiliary_loss_mlp": 0.01162366, + "balance_loss_clip": 1.00350761, + "balance_loss_mlp": 1.00062299, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.8618396436646405, + "language_loss": 0.55353236, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57697129, + "num_input_tokens_seen": 39524765, + "step": 1828, + "time_per_iteration": 3.121051788330078 + }, + { + "auxiliary_loss_clip": 0.01149532, + "auxiliary_loss_mlp": 0.01168244, + "balance_loss_clip": 1.00212061, + "balance_loss_mlp": 1.00115991, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.427162878086941, + "language_loss": 0.84167629, + "learning_rate": 3.933297239348612e-06, + "loss": 0.8648541, + "num_input_tokens_seen": 39543640, + "step": 1829, + "time_per_iteration": 2.668518543243408 + }, + { + "auxiliary_loss_clip": 0.01116328, + "auxiliary_loss_mlp": 0.01167992, + "balance_loss_clip": 1.00190806, + "balance_loss_mlp": 1.00128961, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 4.131040843957966, + "language_loss": 0.89030218, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91314542, + "num_input_tokens_seen": 39567525, + "step": 1830, + "time_per_iteration": 2.8727827072143555 + }, + { + "auxiliary_loss_clip": 0.01148312, + "auxiliary_loss_mlp": 0.01162149, + "balance_loss_clip": 1.00323176, + "balance_loss_mlp": 1.00040627, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6874246593264125, + "language_loss": 0.55512643, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57823104, + "num_input_tokens_seen": 39628470, + "step": 1831, + "time_per_iteration": 3.176417589187622 + }, + { + "auxiliary_loss_clip": 0.01149424, + "auxiliary_loss_mlp": 0.01168204, + "balance_loss_clip": 1.00210404, + "balance_loss_mlp": 1.00131094, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.0386239133097237, + "language_loss": 0.90872383, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93190014, + "num_input_tokens_seen": 39646670, + "step": 1832, + "time_per_iteration": 2.688354253768921 + }, + { + "auxiliary_loss_clip": 0.01168917, + "auxiliary_loss_mlp": 0.01162105, + "balance_loss_clip": 1.00338912, + "balance_loss_mlp": 1.00036144, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7155585555289863, + "language_loss": 0.59920979, + "learning_rate": 3.932897678513523e-06, + "loss": 0.62252003, + "num_input_tokens_seen": 39712915, + "step": 1833, + "time_per_iteration": 3.125906229019165 + }, + { + "auxiliary_loss_clip": 0.01166804, + "auxiliary_loss_mlp": 0.01167811, + "balance_loss_clip": 1.00226283, + "balance_loss_mlp": 1.00101304, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6111086539798465, + "language_loss": 0.80721986, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83056599, + "num_input_tokens_seen": 39730650, + "step": 1834, + "time_per_iteration": 3.9794673919677734 + }, + { + "auxiliary_loss_clip": 0.01117666, + "auxiliary_loss_mlp": 0.01168275, + "balance_loss_clip": 1.00210476, + "balance_loss_mlp": 1.00128663, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.3830136533705306, + "language_loss": 0.90536153, + "learning_rate": 3.932697458306779e-06, + "loss": 0.92822093, + "num_input_tokens_seen": 39751065, + "step": 1835, + "time_per_iteration": 2.744053602218628 + }, + { + "auxiliary_loss_clip": 0.0113399, + "auxiliary_loss_mlp": 0.01167862, + "balance_loss_clip": 1.00219774, + "balance_loss_mlp": 1.00116014, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 3.5726799719216444, + "language_loss": 0.63617837, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65919691, + "num_input_tokens_seen": 39769245, + "step": 1836, + "time_per_iteration": 2.6777169704437256 + }, + { + "auxiliary_loss_clip": 0.01133742, + "auxiliary_loss_mlp": 0.01167628, + "balance_loss_clip": 1.00195825, + "balance_loss_mlp": 1.00121188, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.6900426322634703, + "language_loss": 0.72834778, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75136149, + "num_input_tokens_seen": 39790830, + "step": 1837, + "time_per_iteration": 4.077815294265747 + }, + { + "auxiliary_loss_clip": 0.01166843, + "auxiliary_loss_mlp": 0.011683, + "balance_loss_clip": 1.00235701, + "balance_loss_mlp": 1.00159764, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.1629617921733106, + "language_loss": 0.78449357, + "learning_rate": 3.93239657834556e-06, + "loss": 0.807845, + "num_input_tokens_seen": 39809475, + "step": 1838, + "time_per_iteration": 4.084299325942993 + }, + { + "auxiliary_loss_clip": 0.01149261, + "auxiliary_loss_mlp": 0.01167876, + "balance_loss_clip": 1.00220883, + "balance_loss_mlp": 1.00155544, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 1.9303119749712672, + "language_loss": 0.71718419, + "learning_rate": 3.932296138466736e-06, + "loss": 0.74035555, + "num_input_tokens_seen": 39826355, + "step": 1839, + "time_per_iteration": 2.598813056945801 + }, + { + "auxiliary_loss_clip": 0.01182604, + "auxiliary_loss_mlp": 0.00749196, + "balance_loss_clip": 1.00239158, + "balance_loss_mlp": 1.00044739, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 1.9586677915454762, + "language_loss": 0.78711236, + "learning_rate": 3.93219562531505e-06, + "loss": 0.8064304, + "num_input_tokens_seen": 39845335, + "step": 1840, + "time_per_iteration": 3.953993558883667 + }, + { + "auxiliary_loss_clip": 0.01166465, + "auxiliary_loss_mlp": 0.01167465, + "balance_loss_clip": 1.00225568, + "balance_loss_mlp": 1.00104856, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 1.5675678844280765, + "language_loss": 0.87948596, + "learning_rate": 3.932095038894311e-06, + "loss": 0.90282524, + "num_input_tokens_seen": 39865065, + "step": 1841, + "time_per_iteration": 2.5867998600006104 + }, + { + "auxiliary_loss_clip": 0.0113422, + "auxiliary_loss_mlp": 0.01167539, + "balance_loss_clip": 1.00221658, + "balance_loss_mlp": 1.00150442, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 2.07594029738906, + "language_loss": 0.9035008, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92651844, + "num_input_tokens_seen": 39882780, + "step": 1842, + "time_per_iteration": 2.613665819168091 + }, + { + "auxiliary_loss_clip": 0.01149701, + "auxiliary_loss_mlp": 0.01168074, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.00146675, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 1.8832642989076607, + "language_loss": 0.85808378, + "learning_rate": 3.931893646260937e-06, + "loss": 0.88126153, + "num_input_tokens_seen": 39900295, + "step": 1843, + "time_per_iteration": 2.5896761417388916 + }, + { + "auxiliary_loss_clip": 0.01116912, + "auxiliary_loss_mlp": 0.00749225, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00041842, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.5957926472477537, + "language_loss": 0.7465142, + "learning_rate": 3.931792840055941e-06, + "loss": 0.76517558, + "num_input_tokens_seen": 39922075, + "step": 1844, + "time_per_iteration": 2.756924629211426 + }, + { + "auxiliary_loss_clip": 0.01182276, + "auxiliary_loss_mlp": 0.01167722, + "balance_loss_clip": 1.00222468, + "balance_loss_mlp": 1.00130606, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 1.8385308081326694, + "language_loss": 0.75840747, + "learning_rate": 3.931691960597165e-06, + "loss": 0.78190744, + "num_input_tokens_seen": 39940115, + "step": 1845, + "time_per_iteration": 2.524979591369629 + }, + { + "auxiliary_loss_clip": 0.01149286, + "auxiliary_loss_mlp": 0.01167382, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.00115681, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.4854230586115686, + "language_loss": 0.76358169, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.78674841, + "num_input_tokens_seen": 39959920, + "step": 1846, + "time_per_iteration": 2.5911922454833984 + }, + { + "auxiliary_loss_clip": 0.01166033, + "auxiliary_loss_mlp": 0.01167958, + "balance_loss_clip": 1.00221384, + "balance_loss_mlp": 1.00125587, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.441476728583489, + "language_loss": 0.8607887, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88412863, + "num_input_tokens_seen": 39974755, + "step": 1847, + "time_per_iteration": 2.563344955444336 + }, + { + "auxiliary_loss_clip": 0.01182342, + "auxiliary_loss_mlp": 0.01167868, + "balance_loss_clip": 1.00223231, + "balance_loss_mlp": 1.00116611, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 3.2715532923811557, + "language_loss": 0.77399802, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79750013, + "num_input_tokens_seen": 39993355, + "step": 1848, + "time_per_iteration": 2.554694414138794 + }, + { + "auxiliary_loss_clip": 0.01166298, + "auxiliary_loss_mlp": 0.01167506, + "balance_loss_clip": 1.00236845, + "balance_loss_mlp": 1.00137591, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 1.7796937409696956, + "language_loss": 0.77413678, + "learning_rate": 3.931287710300832e-06, + "loss": 0.79747474, + "num_input_tokens_seen": 40012410, + "step": 1849, + "time_per_iteration": 2.573753833770752 + }, + { + "auxiliary_loss_clip": 0.01132768, + "auxiliary_loss_mlp": 0.00749161, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00034308, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 2.564679474713294, + "language_loss": 0.714517, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73333627, + "num_input_tokens_seen": 40029315, + "step": 1850, + "time_per_iteration": 2.6062870025634766 + }, + { + "auxiliary_loss_clip": 0.0116596, + "auxiliary_loss_mlp": 0.01167676, + "balance_loss_clip": 1.00224352, + "balance_loss_mlp": 1.00116444, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.222232001673662, + "language_loss": 0.81364125, + "learning_rate": 3.931085145729588e-06, + "loss": 0.8369776, + "num_input_tokens_seen": 40045765, + "step": 1851, + "time_per_iteration": 2.5171151161193848 + }, + { + "auxiliary_loss_clip": 0.01166221, + "auxiliary_loss_mlp": 0.01168128, + "balance_loss_clip": 1.00225604, + "balance_loss_mlp": 1.00123549, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 6.811660153813927, + "language_loss": 0.8863945, + "learning_rate": 3.930983753601631e-06, + "loss": 0.90973794, + "num_input_tokens_seen": 40061660, + "step": 1852, + "time_per_iteration": 2.527841567993164 + }, + { + "auxiliary_loss_clip": 0.01165749, + "auxiliary_loss_mlp": 0.01167774, + "balance_loss_clip": 1.00220656, + "balance_loss_mlp": 1.00126266, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 1.9164735533076476, + "language_loss": 0.72071004, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74404532, + "num_input_tokens_seen": 40080180, + "step": 1853, + "time_per_iteration": 2.531125783920288 + }, + { + "auxiliary_loss_clip": 0.01180872, + "auxiliary_loss_mlp": 0.01161218, + "balance_loss_clip": 1.0030365, + "balance_loss_mlp": 1.00023782, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7731189079562306, + "language_loss": 0.53684855, + "learning_rate": 3.930780749680273e-06, + "loss": 0.56026936, + "num_input_tokens_seen": 40138910, + "step": 1854, + "time_per_iteration": 3.061527967453003 + }, + { + "auxiliary_loss_clip": 0.01150026, + "auxiliary_loss_mlp": 0.01168055, + "balance_loss_clip": 1.0020771, + "balance_loss_mlp": 1.0012573, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.626340670440545, + "language_loss": 0.847776, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.87095684, + "num_input_tokens_seen": 40157745, + "step": 1855, + "time_per_iteration": 2.6074368953704834 + }, + { + "auxiliary_loss_clip": 0.01150515, + "auxiliary_loss_mlp": 0.0116804, + "balance_loss_clip": 1.00215793, + "balance_loss_mlp": 1.00162435, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 1.983036203723697, + "language_loss": 0.81758428, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84076977, + "num_input_tokens_seen": 40175375, + "step": 1856, + "time_per_iteration": 2.59611439704895 + }, + { + "auxiliary_loss_clip": 0.01165687, + "auxiliary_loss_mlp": 0.01167514, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00128865, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 1.9128304438563057, + "language_loss": 0.83103186, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85436386, + "num_input_tokens_seen": 40195715, + "step": 1857, + "time_per_iteration": 2.5988848209381104 + }, + { + "auxiliary_loss_clip": 0.01133252, + "auxiliary_loss_mlp": 0.0116758, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00106907, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.1865119822428736, + "language_loss": 0.8276248, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85063314, + "num_input_tokens_seen": 40213975, + "step": 1858, + "time_per_iteration": 2.5909786224365234 + }, + { + "auxiliary_loss_clip": 0.01133922, + "auxiliary_loss_mlp": 0.01167712, + "balance_loss_clip": 1.00199211, + "balance_loss_mlp": 1.00129616, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.269785402075391, + "language_loss": 0.91591245, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93892872, + "num_input_tokens_seen": 40233905, + "step": 1859, + "time_per_iteration": 2.6468324661254883 + }, + { + "auxiliary_loss_clip": 0.01165557, + "auxiliary_loss_mlp": 0.01167942, + "balance_loss_clip": 1.0020858, + "balance_loss_mlp": 1.00123966, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.125657257226921, + "language_loss": 0.81898886, + "learning_rate": 3.930169980870018e-06, + "loss": 0.8423239, + "num_input_tokens_seen": 40252810, + "step": 1860, + "time_per_iteration": 2.5592410564422607 + }, + { + "auxiliary_loss_clip": 0.0114944, + "auxiliary_loss_mlp": 0.01167798, + "balance_loss_clip": 1.00204384, + "balance_loss_mlp": 1.00147772, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 1.7685914671585383, + "language_loss": 0.75145215, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77462447, + "num_input_tokens_seen": 40272000, + "step": 1861, + "time_per_iteration": 2.6601359844207764 + }, + { + "auxiliary_loss_clip": 0.01182202, + "auxiliary_loss_mlp": 0.01167143, + "balance_loss_clip": 1.00225973, + "balance_loss_mlp": 1.00110865, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 2.037088929891749, + "language_loss": 0.88824427, + "learning_rate": 3.929965805687474e-06, + "loss": 0.91173774, + "num_input_tokens_seen": 40290660, + "step": 1862, + "time_per_iteration": 2.553767442703247 + }, + { + "auxiliary_loss_clip": 0.01149539, + "auxiliary_loss_mlp": 0.01167122, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.00118256, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.7360637123256444, + "language_loss": 0.86871517, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89188176, + "num_input_tokens_seen": 40307820, + "step": 1863, + "time_per_iteration": 2.633340835571289 + }, + { + "auxiliary_loss_clip": 0.01149974, + "auxiliary_loss_mlp": 0.011672, + "balance_loss_clip": 1.00210094, + "balance_loss_mlp": 1.0010705, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.0094178905468447, + "language_loss": 0.64463091, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66780263, + "num_input_tokens_seen": 40327430, + "step": 1864, + "time_per_iteration": 2.6325621604919434 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.01167261, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00103545, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0480248270440415, + "language_loss": 0.74309719, + "learning_rate": 3.929658994039627e-06, + "loss": 0.7657724, + "num_input_tokens_seen": 40344545, + "step": 1865, + "time_per_iteration": 2.7194650173187256 + }, + { + "auxiliary_loss_clip": 0.01101859, + "auxiliary_loss_mlp": 0.01167707, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00119567, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.151070084917746, + "language_loss": 0.84746796, + "learning_rate": 3.929556577139446e-06, + "loss": 0.87016362, + "num_input_tokens_seen": 40362300, + "step": 1866, + "time_per_iteration": 2.753605604171753 + }, + { + "auxiliary_loss_clip": 0.01102725, + "auxiliary_loss_mlp": 0.00749121, + "balance_loss_clip": 1.00198328, + "balance_loss_mlp": 1.00039315, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5196342239540528, + "language_loss": 0.81214958, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83066797, + "num_input_tokens_seen": 40384720, + "step": 1867, + "time_per_iteration": 2.8051164150238037 + }, + { + "auxiliary_loss_clip": 0.01182187, + "auxiliary_loss_mlp": 0.01167424, + "balance_loss_clip": 1.00218058, + "balance_loss_mlp": 1.00119889, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.1393862964901103, + "language_loss": 0.86836672, + "learning_rate": 3.929351523836035e-06, + "loss": 0.89186287, + "num_input_tokens_seen": 40404000, + "step": 1868, + "time_per_iteration": 2.579766273498535 + }, + { + "auxiliary_loss_clip": 0.01149618, + "auxiliary_loss_mlp": 0.00749082, + "balance_loss_clip": 1.00223196, + "balance_loss_mlp": 1.00030041, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.1906665622366535, + "language_loss": 0.68618554, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70517254, + "num_input_tokens_seen": 40418665, + "step": 1869, + "time_per_iteration": 2.578279972076416 + }, + { + "auxiliary_loss_clip": 0.01133482, + "auxiliary_loss_mlp": 0.01167809, + "balance_loss_clip": 1.00203454, + "balance_loss_mlp": 1.00139296, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 1.6740252411210104, + "language_loss": 0.77383447, + "learning_rate": 3.929146177887814e-06, + "loss": 0.7968474, + "num_input_tokens_seen": 40437870, + "step": 1870, + "time_per_iteration": 2.724531888961792 + }, + { + "auxiliary_loss_clip": 0.01116704, + "auxiliary_loss_mlp": 0.01167661, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00124526, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.7974248426359887, + "language_loss": 0.75789601, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78073967, + "num_input_tokens_seen": 40455570, + "step": 1871, + "time_per_iteration": 2.641885757446289 + }, + { + "auxiliary_loss_clip": 0.0109939, + "auxiliary_loss_mlp": 0.01167745, + "balance_loss_clip": 1.00163674, + "balance_loss_mlp": 1.00123322, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 1.8550941082827472, + "language_loss": 0.81610411, + "learning_rate": 3.928940539325929e-06, + "loss": 0.8387754, + "num_input_tokens_seen": 40473600, + "step": 1872, + "time_per_iteration": 4.073937892913818 + }, + { + "auxiliary_loss_clip": 0.01182416, + "auxiliary_loss_mlp": 0.01167699, + "balance_loss_clip": 1.00237823, + "balance_loss_mlp": 1.0011878, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.333404382971913, + "language_loss": 0.82969642, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85319757, + "num_input_tokens_seen": 40490025, + "step": 1873, + "time_per_iteration": 2.515629529953003 + }, + { + "auxiliary_loss_clip": 0.01132471, + "auxiliary_loss_mlp": 0.01167294, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00097346, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 1.8459240914935204, + "language_loss": 0.92176145, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94475907, + "num_input_tokens_seen": 40511580, + "step": 1874, + "time_per_iteration": 4.054711580276489 + }, + { + "auxiliary_loss_clip": 0.01134953, + "auxiliary_loss_mlp": 0.0116768, + "balance_loss_clip": 1.00216436, + "balance_loss_mlp": 1.00164545, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.5295540952507263, + "language_loss": 0.75420904, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77723539, + "num_input_tokens_seen": 40530155, + "step": 1875, + "time_per_iteration": 4.027900457382202 + }, + { + "auxiliary_loss_clip": 0.01165774, + "auxiliary_loss_mlp": 0.01167331, + "balance_loss_clip": 1.00221705, + "balance_loss_mlp": 1.0012958, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 2.2490109310277546, + "language_loss": 0.71704495, + "learning_rate": 3.928528384485984e-06, + "loss": 0.740376, + "num_input_tokens_seen": 40549500, + "step": 1876, + "time_per_iteration": 2.602672576904297 + }, + { + "auxiliary_loss_clip": 0.01149921, + "auxiliary_loss_mlp": 0.0116686, + "balance_loss_clip": 1.00212955, + "balance_loss_mlp": 1.00101638, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 1.8134405523442096, + "language_loss": 0.76920629, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.79237407, + "num_input_tokens_seen": 40567475, + "step": 1877, + "time_per_iteration": 4.056175708770752 + }, + { + "auxiliary_loss_clip": 0.01166459, + "auxiliary_loss_mlp": 0.01167641, + "balance_loss_clip": 1.00227666, + "balance_loss_mlp": 1.0013206, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.3315173301693712, + "language_loss": 0.87726384, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90060484, + "num_input_tokens_seen": 40583280, + "step": 1878, + "time_per_iteration": 2.5910654067993164 + }, + { + "auxiliary_loss_clip": 0.01132741, + "auxiliary_loss_mlp": 0.01167232, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00110185, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.2512717001054106, + "language_loss": 0.81339741, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83639717, + "num_input_tokens_seen": 40603080, + "step": 1879, + "time_per_iteration": 2.6595845222473145 + }, + { + "auxiliary_loss_clip": 0.01150667, + "auxiliary_loss_mlp": 0.01167541, + "balance_loss_clip": 1.00211143, + "balance_loss_mlp": 1.00131512, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.106918853052269, + "language_loss": 0.70479351, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72797561, + "num_input_tokens_seen": 40623255, + "step": 1880, + "time_per_iteration": 2.6743879318237305 + }, + { + "auxiliary_loss_clip": 0.01153161, + "auxiliary_loss_mlp": 0.01167121, + "balance_loss_clip": 1.00227726, + "balance_loss_mlp": 1.00108624, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.6615389176082669, + "language_loss": 0.72672987, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74993271, + "num_input_tokens_seen": 40641570, + "step": 1881, + "time_per_iteration": 2.664613723754883 + }, + { + "auxiliary_loss_clip": 0.01133167, + "auxiliary_loss_mlp": 0.00749163, + "balance_loss_clip": 1.00194383, + "balance_loss_mlp": 1.00037193, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.268470058752317, + "language_loss": 0.74208128, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76090455, + "num_input_tokens_seen": 40658775, + "step": 1882, + "time_per_iteration": 2.658811569213867 + }, + { + "auxiliary_loss_clip": 0.01182303, + "auxiliary_loss_mlp": 0.01167452, + "balance_loss_clip": 1.0022701, + "balance_loss_mlp": 1.00103617, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.0933952942037988, + "language_loss": 0.79257321, + "learning_rate": 3.92780429816244e-06, + "loss": 0.8160708, + "num_input_tokens_seen": 40679555, + "step": 1883, + "time_per_iteration": 2.578343152999878 + }, + { + "auxiliary_loss_clip": 0.01133695, + "auxiliary_loss_mlp": 0.01167185, + "balance_loss_clip": 1.00203741, + "balance_loss_mlp": 1.00115061, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 2.4797885566300697, + "language_loss": 0.77283823, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79584706, + "num_input_tokens_seen": 40697295, + "step": 1884, + "time_per_iteration": 2.6551120281219482 + }, + { + "auxiliary_loss_clip": 0.01164011, + "auxiliary_loss_mlp": 0.01160981, + "balance_loss_clip": 1.00265348, + "balance_loss_mlp": 1.00000048, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7936293089687295, + "language_loss": 0.55202115, + "learning_rate": 3.927596758374019e-06, + "loss": 0.57527107, + "num_input_tokens_seen": 40758095, + "step": 1885, + "time_per_iteration": 3.0799720287323 + }, + { + "auxiliary_loss_clip": 0.01084501, + "auxiliary_loss_mlp": 0.01166759, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00120139, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 1.7611881029031442, + "language_loss": 0.90571231, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92822492, + "num_input_tokens_seen": 40777140, + "step": 1886, + "time_per_iteration": 3.0812222957611084 + }, + { + "auxiliary_loss_clip": 0.01118858, + "auxiliary_loss_mlp": 0.01166339, + "balance_loss_clip": 1.00213432, + "balance_loss_mlp": 1.00097179, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.5470502003486404, + "language_loss": 0.84942913, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87228107, + "num_input_tokens_seen": 40797505, + "step": 1887, + "time_per_iteration": 3.1952805519104004 + }, + { + "auxiliary_loss_clip": 0.01132286, + "auxiliary_loss_mlp": 0.01166997, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00124824, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 2.7594745726818677, + "language_loss": 0.76114893, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78414178, + "num_input_tokens_seen": 40812970, + "step": 1888, + "time_per_iteration": 2.7448360919952393 + }, + { + "auxiliary_loss_clip": 0.01117159, + "auxiliary_loss_mlp": 0.01167139, + "balance_loss_clip": 1.00201154, + "balance_loss_mlp": 1.00110412, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 1.663827732270763, + "language_loss": 0.68226409, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70510697, + "num_input_tokens_seen": 40837745, + "step": 1889, + "time_per_iteration": 2.881024122238159 + }, + { + "auxiliary_loss_clip": 0.01182188, + "auxiliary_loss_mlp": 0.01166749, + "balance_loss_clip": 1.00226545, + "balance_loss_mlp": 1.00109577, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 1.6683133222783797, + "language_loss": 0.83941305, + "learning_rate": 3.927076629815362e-06, + "loss": 0.8629024, + "num_input_tokens_seen": 40856490, + "step": 1890, + "time_per_iteration": 2.524799108505249 + }, + { + "auxiliary_loss_clip": 0.01153131, + "auxiliary_loss_mlp": 0.01167308, + "balance_loss_clip": 1.00215912, + "balance_loss_mlp": 1.00155902, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.119169331326466, + "language_loss": 0.64600945, + "learning_rate": 3.926972384863022e-06, + "loss": 0.66921377, + "num_input_tokens_seen": 40874070, + "step": 1891, + "time_per_iteration": 2.594735622406006 + }, + { + "auxiliary_loss_clip": 0.0113389, + "auxiliary_loss_mlp": 0.01166763, + "balance_loss_clip": 1.00207353, + "balance_loss_mlp": 1.00101399, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.984349432774334, + "language_loss": 0.88448012, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90748668, + "num_input_tokens_seen": 40892425, + "step": 1892, + "time_per_iteration": 2.6557376384735107 + }, + { + "auxiliary_loss_clip": 0.01100701, + "auxiliary_loss_mlp": 0.01167621, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00129986, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.049432259754845, + "language_loss": 0.72338855, + "learning_rate": 3.926763675749339e-06, + "loss": 0.74607182, + "num_input_tokens_seen": 40912190, + "step": 1893, + "time_per_iteration": 2.735960006713867 + }, + { + "auxiliary_loss_clip": 0.01182148, + "auxiliary_loss_mlp": 0.01167139, + "balance_loss_clip": 1.00217652, + "balance_loss_mlp": 1.0013907, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 1.8354396140860858, + "language_loss": 0.79641879, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81991172, + "num_input_tokens_seen": 40928395, + "step": 1894, + "time_per_iteration": 2.5924601554870605 + }, + { + "auxiliary_loss_clip": 0.0114955, + "auxiliary_loss_mlp": 0.01167368, + "balance_loss_clip": 1.00216937, + "balance_loss_mlp": 1.00114202, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.5752981237471175, + "language_loss": 0.7943294, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81749856, + "num_input_tokens_seen": 40946555, + "step": 1895, + "time_per_iteration": 2.708711862564087 + }, + { + "auxiliary_loss_clip": 0.01180232, + "auxiliary_loss_mlp": 0.01160995, + "balance_loss_clip": 1.00259423, + "balance_loss_mlp": 1.00001454, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8026291790444612, + "language_loss": 0.63361734, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65702963, + "num_input_tokens_seen": 41004910, + "step": 1896, + "time_per_iteration": 3.1860666275024414 + }, + { + "auxiliary_loss_clip": 0.01148944, + "auxiliary_loss_mlp": 0.01166787, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.00122881, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.5943280572297425, + "language_loss": 0.85194105, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87509835, + "num_input_tokens_seen": 41026385, + "step": 1897, + "time_per_iteration": 2.6368699073791504 + }, + { + "auxiliary_loss_clip": 0.01182149, + "auxiliary_loss_mlp": 0.00749058, + "balance_loss_clip": 1.00225806, + "balance_loss_mlp": 1.0003531, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.1186721739566154, + "language_loss": 0.79444361, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.81375569, + "num_input_tokens_seen": 41045315, + "step": 1898, + "time_per_iteration": 2.593465566635132 + }, + { + "auxiliary_loss_clip": 0.01117519, + "auxiliary_loss_mlp": 0.01166982, + "balance_loss_clip": 1.00197601, + "balance_loss_mlp": 1.00113785, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 2.213105198768618, + "language_loss": 0.73056519, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75341022, + "num_input_tokens_seen": 41063390, + "step": 1899, + "time_per_iteration": 2.7058653831481934 + }, + { + "auxiliary_loss_clip": 0.01115949, + "auxiliary_loss_mlp": 0.01160257, + "balance_loss_clip": 1.0024929, + "balance_loss_mlp": 1.00003958, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9061080860299529, + "language_loss": 0.63467926, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65744132, + "num_input_tokens_seen": 41124180, + "step": 1900, + "time_per_iteration": 3.2105157375335693 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01166836, + "balance_loss_clip": 1.00177288, + "balance_loss_mlp": 1.00127816, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.8521886658249864, + "language_loss": 0.78357971, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80625701, + "num_input_tokens_seen": 41143485, + "step": 1901, + "time_per_iteration": 2.7756094932556152 + }, + { + "auxiliary_loss_clip": 0.0116642, + "auxiliary_loss_mlp": 0.01167183, + "balance_loss_clip": 1.0022347, + "balance_loss_mlp": 1.00143433, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 2.097711772769421, + "language_loss": 0.8414613, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86479735, + "num_input_tokens_seen": 41161695, + "step": 1902, + "time_per_iteration": 2.580596923828125 + }, + { + "auxiliary_loss_clip": 0.01166258, + "auxiliary_loss_mlp": 0.01166855, + "balance_loss_clip": 1.00219774, + "balance_loss_mlp": 1.0011059, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.764951097195839, + "language_loss": 0.77939069, + "learning_rate": 3.925715747031356e-06, + "loss": 0.80272174, + "num_input_tokens_seen": 41181715, + "step": 1903, + "time_per_iteration": 2.6340768337249756 + }, + { + "auxiliary_loss_clip": 0.01149796, + "auxiliary_loss_mlp": 0.01166333, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.00086999, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.1528014444990506, + "language_loss": 0.75662494, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77978623, + "num_input_tokens_seen": 41201770, + "step": 1904, + "time_per_iteration": 2.6568589210510254 + }, + { + "auxiliary_loss_clip": 0.01150106, + "auxiliary_loss_mlp": 0.01166622, + "balance_loss_clip": 1.00218534, + "balance_loss_mlp": 1.00087321, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.1757370433967878, + "language_loss": 0.92652297, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94969022, + "num_input_tokens_seen": 41220590, + "step": 1905, + "time_per_iteration": 2.643256902694702 + }, + { + "auxiliary_loss_clip": 0.01153785, + "auxiliary_loss_mlp": 0.01167019, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00079322, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.7287426276221978, + "language_loss": 0.77719015, + "learning_rate": 3.925399944279861e-06, + "loss": 0.80039823, + "num_input_tokens_seen": 41237250, + "step": 1906, + "time_per_iteration": 2.5857274532318115 + }, + { + "auxiliary_loss_clip": 0.01182136, + "auxiliary_loss_mlp": 0.01167129, + "balance_loss_clip": 1.00229621, + "balance_loss_mlp": 1.00118923, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.1188483106411438, + "language_loss": 0.81905293, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84254551, + "num_input_tokens_seen": 41256680, + "step": 1907, + "time_per_iteration": 2.573254346847534 + }, + { + "auxiliary_loss_clip": 0.01133812, + "auxiliary_loss_mlp": 0.0116739, + "balance_loss_clip": 1.00214028, + "balance_loss_mlp": 1.0014503, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 2.443121834905182, + "language_loss": 0.84719408, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87020606, + "num_input_tokens_seen": 41270955, + "step": 1908, + "time_per_iteration": 2.6463916301727295 + }, + { + "auxiliary_loss_clip": 0.01114931, + "auxiliary_loss_mlp": 0.01160172, + "balance_loss_clip": 1.00229812, + "balance_loss_mlp": 0.99995452, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.916383658688791, + "language_loss": 0.61093086, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63368189, + "num_input_tokens_seen": 41319180, + "step": 1909, + "time_per_iteration": 3.0420918464660645 + }, + { + "auxiliary_loss_clip": 0.01182162, + "auxiliary_loss_mlp": 0.01167207, + "balance_loss_clip": 1.00235391, + "balance_loss_mlp": 1.00136352, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 3.131819957432241, + "language_loss": 0.79366165, + "learning_rate": 3.924977851804197e-06, + "loss": 0.81715536, + "num_input_tokens_seen": 41337480, + "step": 1910, + "time_per_iteration": 3.9608054161071777 + }, + { + "auxiliary_loss_clip": 0.01153058, + "auxiliary_loss_mlp": 0.01167018, + "balance_loss_clip": 1.00222993, + "balance_loss_mlp": 1.0012691, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 3.0181443288514727, + "language_loss": 0.76951826, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79271901, + "num_input_tokens_seen": 41354650, + "step": 1911, + "time_per_iteration": 2.6323955059051514 + }, + { + "auxiliary_loss_clip": 0.01150011, + "auxiliary_loss_mlp": 0.01166512, + "balance_loss_clip": 1.00216377, + "balance_loss_mlp": 1.00104964, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 1.5782079305033956, + "language_loss": 0.7915833, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81474853, + "num_input_tokens_seen": 41376935, + "step": 1912, + "time_per_iteration": 4.17895770072937 + }, + { + "auxiliary_loss_clip": 0.01182042, + "auxiliary_loss_mlp": 0.00749115, + "balance_loss_clip": 1.00221658, + "balance_loss_mlp": 1.00032735, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.958467661339381, + "language_loss": 0.77924442, + "learning_rate": 3.924660515982246e-06, + "loss": 0.79855603, + "num_input_tokens_seen": 41396105, + "step": 1913, + "time_per_iteration": 2.5690512657165527 + }, + { + "auxiliary_loss_clip": 0.01165323, + "auxiliary_loss_mlp": 0.01166703, + "balance_loss_clip": 1.00203228, + "balance_loss_mlp": 1.00095475, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 1.8524817012161952, + "language_loss": 0.70451665, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72783685, + "num_input_tokens_seen": 41415600, + "step": 1914, + "time_per_iteration": 2.572968006134033 + }, + { + "auxiliary_loss_clip": 0.01101627, + "auxiliary_loss_mlp": 0.0116103, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00005007, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7600341867762491, + "language_loss": 0.61016011, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63278669, + "num_input_tokens_seen": 41478760, + "step": 1915, + "time_per_iteration": 4.777540683746338 + }, + { + "auxiliary_loss_clip": 0.01165767, + "auxiliary_loss_mlp": 0.01167153, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.0014044, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.0298027820069073, + "language_loss": 0.93010247, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95343173, + "num_input_tokens_seen": 41495720, + "step": 1916, + "time_per_iteration": 2.5517239570617676 + }, + { + "auxiliary_loss_clip": 0.01149462, + "auxiliary_loss_mlp": 0.01166955, + "balance_loss_clip": 1.0021174, + "balance_loss_mlp": 1.00130165, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.7838909521263915, + "language_loss": 0.72804749, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.7512117, + "num_input_tokens_seen": 41513585, + "step": 1917, + "time_per_iteration": 2.5992519855499268 + }, + { + "auxiliary_loss_clip": 0.01132337, + "auxiliary_loss_mlp": 0.0116636, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00080204, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.3772051021994343, + "language_loss": 0.74515724, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76814419, + "num_input_tokens_seen": 41533390, + "step": 1918, + "time_per_iteration": 2.655775308609009 + }, + { + "auxiliary_loss_clip": 0.01133307, + "auxiliary_loss_mlp": 0.01166295, + "balance_loss_clip": 1.00208855, + "balance_loss_mlp": 1.00092816, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.0369457143611207, + "language_loss": 0.86600488, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88900089, + "num_input_tokens_seen": 41551015, + "step": 1919, + "time_per_iteration": 2.6544618606567383 + }, + { + "auxiliary_loss_clip": 0.01133586, + "auxiliary_loss_mlp": 0.01166405, + "balance_loss_clip": 1.00199521, + "balance_loss_mlp": 1.00113368, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 2.083764059457626, + "language_loss": 0.86895168, + "learning_rate": 3.923917511502512e-06, + "loss": 0.89195156, + "num_input_tokens_seen": 41568055, + "step": 1920, + "time_per_iteration": 2.727224826812744 + }, + { + "auxiliary_loss_clip": 0.01165387, + "auxiliary_loss_mlp": 0.01166442, + "balance_loss_clip": 1.0022192, + "balance_loss_mlp": 1.00107479, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.815195903773737, + "language_loss": 0.79555345, + "learning_rate": 3.923811076152589e-06, + "loss": 0.81887174, + "num_input_tokens_seen": 41587435, + "step": 1921, + "time_per_iteration": 2.611959457397461 + }, + { + "auxiliary_loss_clip": 0.01165904, + "auxiliary_loss_mlp": 0.01167035, + "balance_loss_clip": 1.002159, + "balance_loss_mlp": 1.00128627, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 2.1464531080741787, + "language_loss": 0.78826475, + "learning_rate": 3.923704567851557e-06, + "loss": 0.81159413, + "num_input_tokens_seen": 41604975, + "step": 1922, + "time_per_iteration": 2.5996603965759277 + }, + { + "auxiliary_loss_clip": 0.0106748, + "auxiliary_loss_mlp": 0.01167082, + "balance_loss_clip": 1.00162745, + "balance_loss_mlp": 1.00181055, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8622168620803887, + "language_loss": 0.84313738, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86548305, + "num_input_tokens_seen": 41626155, + "step": 1923, + "time_per_iteration": 2.972461223602295 + }, + { + "auxiliary_loss_clip": 0.01165459, + "auxiliary_loss_mlp": 0.01166636, + "balance_loss_clip": 1.00211954, + "balance_loss_mlp": 1.00107849, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.011025798113299, + "language_loss": 0.81139672, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.83471775, + "num_input_tokens_seen": 41644805, + "step": 1924, + "time_per_iteration": 3.3592922687530518 + }, + { + "auxiliary_loss_clip": 0.01147278, + "auxiliary_loss_mlp": 0.01160229, + "balance_loss_clip": 1.00251651, + "balance_loss_mlp": 1.0000118, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.8120980168973098, + "language_loss": 0.61193085, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63500595, + "num_input_tokens_seen": 41709345, + "step": 1925, + "time_per_iteration": 3.3765764236450195 + }, + { + "auxiliary_loss_clip": 0.01166351, + "auxiliary_loss_mlp": 0.01166819, + "balance_loss_clip": 1.00232947, + "balance_loss_mlp": 1.00164223, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 1.6791928006979535, + "language_loss": 0.74816275, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77149439, + "num_input_tokens_seen": 41730210, + "step": 1926, + "time_per_iteration": 2.6157350540161133 + }, + { + "auxiliary_loss_clip": 0.01102054, + "auxiliary_loss_mlp": 0.00749095, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00032413, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.7261794389799614, + "language_loss": 0.7288866, + "learning_rate": 3.923170932221222e-06, + "loss": 0.74739808, + "num_input_tokens_seen": 41750270, + "step": 1927, + "time_per_iteration": 2.7478673458099365 + }, + { + "auxiliary_loss_clip": 0.01149711, + "auxiliary_loss_mlp": 0.01166607, + "balance_loss_clip": 1.0021559, + "balance_loss_mlp": 1.00104928, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 3.1300230391358594, + "language_loss": 0.86993945, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89310259, + "num_input_tokens_seen": 41772975, + "step": 1928, + "time_per_iteration": 2.675163745880127 + }, + { + "auxiliary_loss_clip": 0.01133389, + "auxiliary_loss_mlp": 0.01166505, + "balance_loss_clip": 1.00215364, + "balance_loss_mlp": 1.00123346, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.8703385520920532, + "language_loss": 0.77667713, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79967606, + "num_input_tokens_seen": 41791765, + "step": 1929, + "time_per_iteration": 2.663642168045044 + }, + { + "auxiliary_loss_clip": 0.01181958, + "auxiliary_loss_mlp": 0.01166546, + "balance_loss_clip": 1.00231671, + "balance_loss_mlp": 1.00136983, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6189226500897538, + "language_loss": 0.7721417, + "learning_rate": 3.922849875688626e-06, + "loss": 0.7956267, + "num_input_tokens_seen": 41815615, + "step": 1930, + "time_per_iteration": 2.627253532409668 + }, + { + "auxiliary_loss_clip": 0.01149899, + "auxiliary_loss_mlp": 0.01166344, + "balance_loss_clip": 1.0021441, + "balance_loss_mlp": 1.00116801, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.8350529837289151, + "language_loss": 0.7215628, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74472523, + "num_input_tokens_seen": 41834810, + "step": 1931, + "time_per_iteration": 2.6641244888305664 + }, + { + "auxiliary_loss_clip": 0.01149658, + "auxiliary_loss_mlp": 0.01166684, + "balance_loss_clip": 1.00211024, + "balance_loss_mlp": 1.00131691, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.8741958094685134, + "language_loss": 0.82495111, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84811461, + "num_input_tokens_seen": 41854975, + "step": 1932, + "time_per_iteration": 2.6368062496185303 + }, + { + "auxiliary_loss_clip": 0.0113215, + "auxiliary_loss_mlp": 0.01160406, + "balance_loss_clip": 1.00286627, + "balance_loss_mlp": 1.00018883, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7709509690991654, + "language_loss": 0.61129993, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63422549, + "num_input_tokens_seen": 41911105, + "step": 1933, + "time_per_iteration": 3.0910089015960693 + }, + { + "auxiliary_loss_clip": 0.01104285, + "auxiliary_loss_mlp": 0.00749065, + "balance_loss_clip": 1.00303817, + "balance_loss_mlp": 1.00034511, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.0784113335823236, + "language_loss": 0.85640001, + "learning_rate": 3.922420779525586e-06, + "loss": 0.8749336, + "num_input_tokens_seen": 41931750, + "step": 1934, + "time_per_iteration": 2.7393758296966553 + }, + { + "auxiliary_loss_clip": 0.01117425, + "auxiliary_loss_mlp": 0.0116714, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00120056, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.275509044070521, + "language_loss": 0.66208994, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.68493557, + "num_input_tokens_seen": 41949400, + "step": 1935, + "time_per_iteration": 2.682417869567871 + }, + { + "auxiliary_loss_clip": 0.01182123, + "auxiliary_loss_mlp": 0.01166756, + "balance_loss_clip": 1.00234115, + "balance_loss_mlp": 1.00119853, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.794746984475115, + "language_loss": 0.75418526, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77767408, + "num_input_tokens_seen": 41968100, + "step": 1936, + "time_per_iteration": 2.5288727283477783 + }, + { + "auxiliary_loss_clip": 0.0118194, + "auxiliary_loss_mlp": 0.01166461, + "balance_loss_clip": 1.00226402, + "balance_loss_mlp": 1.00099802, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.7870095412375875, + "language_loss": 0.84936875, + "learning_rate": 3.922098191955998e-06, + "loss": 0.87285274, + "num_input_tokens_seen": 41986375, + "step": 1937, + "time_per_iteration": 2.523430585861206 + }, + { + "auxiliary_loss_clip": 0.01149473, + "auxiliary_loss_mlp": 0.01166223, + "balance_loss_clip": 1.00205934, + "balance_loss_mlp": 1.00095129, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 1.8419500996221903, + "language_loss": 0.76294303, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78610003, + "num_input_tokens_seen": 42006055, + "step": 1938, + "time_per_iteration": 2.6756982803344727 + }, + { + "auxiliary_loss_clip": 0.01182161, + "auxiliary_loss_mlp": 0.01166528, + "balance_loss_clip": 1.00234759, + "balance_loss_mlp": 1.00097036, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 1.7337459373989446, + "language_loss": 0.79675305, + "learning_rate": 3.921882769138696e-06, + "loss": 0.82023996, + "num_input_tokens_seen": 42024995, + "step": 1939, + "time_per_iteration": 2.5438969135284424 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.01166442, + "balance_loss_clip": 1.00222325, + "balance_loss_mlp": 1.00126553, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 2.1366160910541345, + "language_loss": 0.86309683, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.8862657, + "num_input_tokens_seen": 42042640, + "step": 1940, + "time_per_iteration": 2.638031482696533 + }, + { + "auxiliary_loss_clip": 0.01148818, + "auxiliary_loss_mlp": 0.01166079, + "balance_loss_clip": 1.00217295, + "balance_loss_mlp": 1.0012846, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.3584473959471906, + "language_loss": 0.76099336, + "learning_rate": 3.921667054809449e-06, + "loss": 0.78414237, + "num_input_tokens_seen": 42067005, + "step": 1941, + "time_per_iteration": 2.786832332611084 + }, + { + "auxiliary_loss_clip": 0.01148806, + "auxiliary_loss_mlp": 0.00749153, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00036335, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.0376314443735133, + "language_loss": 0.88372022, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90269971, + "num_input_tokens_seen": 42082295, + "step": 1942, + "time_per_iteration": 2.5909156799316406 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01166324, + "balance_loss_clip": 1.00220013, + "balance_loss_mlp": 1.00133848, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.5918866857424996, + "language_loss": 0.67683613, + "learning_rate": 3.921451049000975e-06, + "loss": 0.70015359, + "num_input_tokens_seen": 42105295, + "step": 1943, + "time_per_iteration": 2.707066535949707 + }, + { + "auxiliary_loss_clip": 0.01148952, + "auxiliary_loss_mlp": 0.01166255, + "balance_loss_clip": 1.00217092, + "balance_loss_mlp": 1.00117373, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 1.8636824832209542, + "language_loss": 0.6946249, + "learning_rate": 3.921342936802265e-06, + "loss": 0.71777701, + "num_input_tokens_seen": 42125520, + "step": 1944, + "time_per_iteration": 2.7606852054595947 + }, + { + "auxiliary_loss_clip": 0.01166037, + "auxiliary_loss_mlp": 0.01165973, + "balance_loss_clip": 1.00224161, + "balance_loss_mlp": 1.0010829, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 1.5887806023732496, + "language_loss": 0.82548547, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84880561, + "num_input_tokens_seen": 42146335, + "step": 1945, + "time_per_iteration": 2.6396872997283936 + }, + { + "auxiliary_loss_clip": 0.01150275, + "auxiliary_loss_mlp": 0.01166204, + "balance_loss_clip": 1.00210273, + "balance_loss_mlp": 1.0012182, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 1.988543654438439, + "language_loss": 0.76035416, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.78351891, + "num_input_tokens_seen": 42165320, + "step": 1946, + "time_per_iteration": 2.685633659362793 + }, + { + "auxiliary_loss_clip": 0.01133519, + "auxiliary_loss_mlp": 0.01166189, + "balance_loss_clip": 1.00205898, + "balance_loss_mlp": 1.00129843, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 1.8816482133831258, + "language_loss": 0.68806672, + "learning_rate": 3.921018163077448e-06, + "loss": 0.71106386, + "num_input_tokens_seen": 42182955, + "step": 1947, + "time_per_iteration": 2.634019136428833 + }, + { + "auxiliary_loss_clip": 0.01149495, + "auxiliary_loss_mlp": 0.01167216, + "balance_loss_clip": 1.00228453, + "balance_loss_mlp": 1.00194371, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.9292781620010444, + "language_loss": 0.85370207, + "learning_rate": 3.920909759473295e-06, + "loss": 0.87686926, + "num_input_tokens_seen": 42200760, + "step": 1948, + "time_per_iteration": 3.984999418258667 + }, + { + "auxiliary_loss_clip": 0.01147833, + "auxiliary_loss_mlp": 0.00748658, + "balance_loss_clip": 1.00259387, + "balance_loss_mlp": 0.99970526, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8193217097511993, + "language_loss": 0.65161234, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67057729, + "num_input_tokens_seen": 42265745, + "step": 1949, + "time_per_iteration": 3.251007318496704 + }, + { + "auxiliary_loss_clip": 0.01165242, + "auxiliary_loss_mlp": 0.01166514, + "balance_loss_clip": 1.00219941, + "balance_loss_mlp": 1.00124276, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.4748221893487061, + "language_loss": 0.71996206, + "learning_rate": 3.920692733745835e-06, + "loss": 0.74327958, + "num_input_tokens_seen": 42286245, + "step": 1950, + "time_per_iteration": 5.313292026519775 + }, + { + "auxiliary_loss_clip": 0.01165903, + "auxiliary_loss_mlp": 0.01166706, + "balance_loss_clip": 1.0022887, + "balance_loss_mlp": 1.00143409, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 2.221433650386546, + "language_loss": 0.7669636, + "learning_rate": 3.920584111630755e-06, + "loss": 0.7902897, + "num_input_tokens_seen": 42302710, + "step": 1951, + "time_per_iteration": 2.589293956756592 + }, + { + "auxiliary_loss_clip": 0.01117388, + "auxiliary_loss_mlp": 0.01166534, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00154829, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 1.8120969186411686, + "language_loss": 0.75904781, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78188699, + "num_input_tokens_seen": 42324115, + "step": 1952, + "time_per_iteration": 2.7390263080596924 + }, + { + "auxiliary_loss_clip": 0.0113413, + "auxiliary_loss_mlp": 0.01166503, + "balance_loss_clip": 1.00221705, + "balance_loss_mlp": 1.00123167, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 1.8571384238771163, + "language_loss": 0.72607344, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74907976, + "num_input_tokens_seen": 42342505, + "step": 1953, + "time_per_iteration": 4.086268186569214 + }, + { + "auxiliary_loss_clip": 0.01149594, + "auxiliary_loss_mlp": 0.00749114, + "balance_loss_clip": 1.00211406, + "balance_loss_mlp": 1.0004158, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.452355477032209, + "language_loss": 0.79348886, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81247592, + "num_input_tokens_seen": 42360525, + "step": 1954, + "time_per_iteration": 2.5935099124908447 + }, + { + "auxiliary_loss_clip": 0.01104261, + "auxiliary_loss_mlp": 0.0116671, + "balance_loss_clip": 1.00263035, + "balance_loss_mlp": 1.00124764, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 1.7134117046289923, + "language_loss": 0.8570016, + "learning_rate": 3.920148894924246e-06, + "loss": 0.87971133, + "num_input_tokens_seen": 42377045, + "step": 1955, + "time_per_iteration": 2.6948165893554688 + }, + { + "auxiliary_loss_clip": 0.01166083, + "auxiliary_loss_mlp": 0.00749147, + "balance_loss_clip": 1.0022856, + "balance_loss_mlp": 1.00051892, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 2.2027673626108006, + "language_loss": 0.77743089, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79658318, + "num_input_tokens_seen": 42393960, + "step": 1956, + "time_per_iteration": 2.5756895542144775 + }, + { + "auxiliary_loss_clip": 0.01165467, + "auxiliary_loss_mlp": 0.01166083, + "balance_loss_clip": 1.0021317, + "balance_loss_mlp": 1.00128841, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 1.7638155292179296, + "language_loss": 0.80345893, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82677448, + "num_input_tokens_seen": 42413160, + "step": 1957, + "time_per_iteration": 2.590514659881592 + }, + { + "auxiliary_loss_clip": 0.01165517, + "auxiliary_loss_mlp": 0.01166627, + "balance_loss_clip": 1.00223374, + "balance_loss_mlp": 1.00116444, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 2.008028778162544, + "language_loss": 0.77778137, + "learning_rate": 3.919821717851428e-06, + "loss": 0.80110276, + "num_input_tokens_seen": 42432590, + "step": 1958, + "time_per_iteration": 2.570300817489624 + }, + { + "auxiliary_loss_clip": 0.01148456, + "auxiliary_loss_mlp": 0.01166328, + "balance_loss_clip": 1.0020721, + "balance_loss_mlp": 1.00115132, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 1.6909626411986631, + "language_loss": 0.76821542, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79136336, + "num_input_tokens_seen": 42450135, + "step": 1959, + "time_per_iteration": 2.5980587005615234 + }, + { + "auxiliary_loss_clip": 0.01149076, + "auxiliary_loss_mlp": 0.01166177, + "balance_loss_clip": 1.00214124, + "balance_loss_mlp": 1.00109589, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 1.8435517067975076, + "language_loss": 0.7035284, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72668099, + "num_input_tokens_seen": 42470050, + "step": 1960, + "time_per_iteration": 2.6671836376190186 + }, + { + "auxiliary_loss_clip": 0.01148898, + "auxiliary_loss_mlp": 0.01166228, + "balance_loss_clip": 1.00199401, + "balance_loss_mlp": 1.00114727, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 3.3770200404713675, + "language_loss": 0.81788588, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.84103709, + "num_input_tokens_seen": 42484335, + "step": 1961, + "time_per_iteration": 2.6017768383026123 + }, + { + "auxiliary_loss_clip": 0.01165895, + "auxiliary_loss_mlp": 0.00749117, + "balance_loss_clip": 1.0021472, + "balance_loss_mlp": 1.00045395, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 1.7969950513222108, + "language_loss": 0.92793798, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94708812, + "num_input_tokens_seen": 42502720, + "step": 1962, + "time_per_iteration": 2.6359057426452637 + }, + { + "auxiliary_loss_clip": 0.01133267, + "auxiliary_loss_mlp": 0.01166663, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00120044, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.092855739431949, + "language_loss": 0.87372303, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89672232, + "num_input_tokens_seen": 42519460, + "step": 1963, + "time_per_iteration": 2.634641408920288 + }, + { + "auxiliary_loss_clip": 0.0114896, + "auxiliary_loss_mlp": 0.00749185, + "balance_loss_clip": 1.00209188, + "balance_loss_mlp": 1.00046897, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 1.816207513617111, + "language_loss": 0.84171468, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86069608, + "num_input_tokens_seen": 42539420, + "step": 1964, + "time_per_iteration": 2.8375754356384277 + }, + { + "auxiliary_loss_clip": 0.01134214, + "auxiliary_loss_mlp": 0.01166495, + "balance_loss_clip": 1.00242186, + "balance_loss_mlp": 1.00131857, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.8277190688678746, + "language_loss": 0.83060622, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85361326, + "num_input_tokens_seen": 42558225, + "step": 1965, + "time_per_iteration": 2.7041306495666504 + }, + { + "auxiliary_loss_clip": 0.0118195, + "auxiliary_loss_mlp": 0.01166499, + "balance_loss_clip": 1.0022893, + "balance_loss_mlp": 1.00103629, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.839050207936712, + "language_loss": 0.7440365, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76752096, + "num_input_tokens_seen": 42580790, + "step": 1966, + "time_per_iteration": 3.0784125328063965 + }, + { + "auxiliary_loss_clip": 0.01148803, + "auxiliary_loss_mlp": 0.01166897, + "balance_loss_clip": 1.00220728, + "balance_loss_mlp": 1.0014348, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 2.5760859901889144, + "language_loss": 0.7323848, + "learning_rate": 3.918836255889908e-06, + "loss": 0.7555418, + "num_input_tokens_seen": 42597355, + "step": 1967, + "time_per_iteration": 2.5776455402374268 + }, + { + "auxiliary_loss_clip": 0.01165314, + "auxiliary_loss_mlp": 0.01166494, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.00131702, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.7951749618024833, + "language_loss": 0.88376647, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90708458, + "num_input_tokens_seen": 42616060, + "step": 1968, + "time_per_iteration": 2.573038339614868 + }, + { + "auxiliary_loss_clip": 0.0114938, + "auxiliary_loss_mlp": 0.01166286, + "balance_loss_clip": 1.00215733, + "balance_loss_mlp": 1.00091898, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 1.7263009473133089, + "language_loss": 0.67365003, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69680667, + "num_input_tokens_seen": 42636285, + "step": 1969, + "time_per_iteration": 2.6407108306884766 + }, + { + "auxiliary_loss_clip": 0.01133973, + "auxiliary_loss_mlp": 0.01166501, + "balance_loss_clip": 1.00229609, + "balance_loss_mlp": 1.0011344, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 1.8690807807193235, + "language_loss": 0.80747783, + "learning_rate": 3.918506458695399e-06, + "loss": 0.8304826, + "num_input_tokens_seen": 42658320, + "step": 1970, + "time_per_iteration": 2.8861501216888428 + }, + { + "auxiliary_loss_clip": 0.01163421, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_clip": 1.00256443, + "balance_loss_mlp": 1.0000298, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.7974171728350634, + "language_loss": 0.66222751, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68545651, + "num_input_tokens_seen": 42721500, + "step": 1971, + "time_per_iteration": 3.128685235977173 + }, + { + "auxiliary_loss_clip": 0.01149773, + "auxiliary_loss_mlp": 0.01167041, + "balance_loss_clip": 1.0022738, + "balance_loss_mlp": 1.00129199, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 1.794898425384068, + "language_loss": 0.79500574, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81817389, + "num_input_tokens_seen": 42739825, + "step": 1972, + "time_per_iteration": 2.7406795024871826 + }, + { + "auxiliary_loss_clip": 0.01133007, + "auxiliary_loss_mlp": 0.00749088, + "balance_loss_clip": 1.00206566, + "balance_loss_mlp": 1.00047135, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.9132782014151015, + "language_loss": 0.72453117, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74335206, + "num_input_tokens_seen": 42758695, + "step": 1973, + "time_per_iteration": 2.6949100494384766 + }, + { + "auxiliary_loss_clip": 0.01133154, + "auxiliary_loss_mlp": 0.01166238, + "balance_loss_clip": 1.00225782, + "balance_loss_mlp": 1.00077534, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 1.5806431831046523, + "language_loss": 0.71952623, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74252015, + "num_input_tokens_seen": 42778510, + "step": 1974, + "time_per_iteration": 2.685103416442871 + }, + { + "auxiliary_loss_clip": 0.01137333, + "auxiliary_loss_mlp": 0.0116634, + "balance_loss_clip": 1.00258446, + "balance_loss_mlp": 1.00097251, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 3.3972078662911778, + "language_loss": 0.78295302, + "learning_rate": 3.917955341761128e-06, + "loss": 0.80598974, + "num_input_tokens_seen": 42793995, + "step": 1975, + "time_per_iteration": 2.668381690979004 + }, + { + "auxiliary_loss_clip": 0.01115334, + "auxiliary_loss_mlp": 0.01166025, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.00113416, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.051125637895056, + "language_loss": 0.75211728, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77493083, + "num_input_tokens_seen": 42809000, + "step": 1976, + "time_per_iteration": 2.666860818862915 + }, + { + "auxiliary_loss_clip": 0.01165278, + "auxiliary_loss_mlp": 0.01166302, + "balance_loss_clip": 1.00227308, + "balance_loss_mlp": 1.00112581, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.4524498941462065, + "language_loss": 0.75197154, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77528739, + "num_input_tokens_seen": 42831585, + "step": 1977, + "time_per_iteration": 2.669023036956787 + }, + { + "auxiliary_loss_clip": 0.01181842, + "auxiliary_loss_mlp": 0.01166958, + "balance_loss_clip": 1.00229478, + "balance_loss_mlp": 1.00140035, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 2.1170526643910246, + "language_loss": 0.74300158, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76648962, + "num_input_tokens_seen": 42848420, + "step": 1978, + "time_per_iteration": 2.5186164379119873 + }, + { + "auxiliary_loss_clip": 0.01132544, + "auxiliary_loss_mlp": 0.01166318, + "balance_loss_clip": 1.00214624, + "balance_loss_mlp": 1.00123739, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.8165420427560934, + "language_loss": 0.73685741, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75984609, + "num_input_tokens_seen": 42866645, + "step": 1979, + "time_per_iteration": 2.621641159057617 + }, + { + "auxiliary_loss_clip": 0.01131963, + "auxiliary_loss_mlp": 0.01166166, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.00108528, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.7477212293137145, + "language_loss": 0.98639679, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00937819, + "num_input_tokens_seen": 42888515, + "step": 1980, + "time_per_iteration": 2.7303307056427 + }, + { + "auxiliary_loss_clip": 0.01148591, + "auxiliary_loss_mlp": 0.01166234, + "balance_loss_clip": 1.00207758, + "balance_loss_mlp": 1.0010581, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.7409663378271878, + "language_loss": 0.86129493, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88444322, + "num_input_tokens_seen": 42909035, + "step": 1981, + "time_per_iteration": 2.631032943725586 + }, + { + "auxiliary_loss_clip": 0.01148977, + "auxiliary_loss_mlp": 0.01166695, + "balance_loss_clip": 1.00218701, + "balance_loss_mlp": 1.00132775, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.8896389692419135, + "language_loss": 0.85435218, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87750888, + "num_input_tokens_seen": 42927555, + "step": 1982, + "time_per_iteration": 2.649200677871704 + }, + { + "auxiliary_loss_clip": 0.01150527, + "auxiliary_loss_mlp": 0.01166094, + "balance_loss_clip": 1.00240207, + "balance_loss_mlp": 1.00120354, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.9407864031655835, + "language_loss": 0.85268319, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87584937, + "num_input_tokens_seen": 42945300, + "step": 1983, + "time_per_iteration": 2.609666585922241 + }, + { + "auxiliary_loss_clip": 0.01115481, + "auxiliary_loss_mlp": 0.01166495, + "balance_loss_clip": 1.00186765, + "balance_loss_mlp": 1.00122285, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 2.8935901528773367, + "language_loss": 0.77352536, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79634511, + "num_input_tokens_seen": 42961295, + "step": 1984, + "time_per_iteration": 2.6767120361328125 + }, + { + "auxiliary_loss_clip": 0.01165238, + "auxiliary_loss_mlp": 0.01166298, + "balance_loss_clip": 1.00226068, + "balance_loss_mlp": 1.00112152, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 2.5725747701360233, + "language_loss": 0.83641005, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85972542, + "num_input_tokens_seen": 42980330, + "step": 1985, + "time_per_iteration": 2.597554922103882 + }, + { + "auxiliary_loss_clip": 0.01153186, + "auxiliary_loss_mlp": 0.01166182, + "balance_loss_clip": 1.00227606, + "balance_loss_mlp": 1.00100589, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 2.0275934585145303, + "language_loss": 0.73793483, + "learning_rate": 3.916736485087216e-06, + "loss": 0.76112854, + "num_input_tokens_seen": 42996125, + "step": 1986, + "time_per_iteration": 3.9928016662597656 + }, + { + "auxiliary_loss_clip": 0.01149415, + "auxiliary_loss_mlp": 0.01166326, + "balance_loss_clip": 1.00223446, + "balance_loss_mlp": 1.00143576, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.032167110169929, + "language_loss": 0.72023666, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74339402, + "num_input_tokens_seen": 43014180, + "step": 1987, + "time_per_iteration": 4.13309383392334 + }, + { + "auxiliary_loss_clip": 0.01149094, + "auxiliary_loss_mlp": 0.01166636, + "balance_loss_clip": 1.00216627, + "balance_loss_mlp": 1.0011735, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 1.85917460525042, + "language_loss": 0.72213149, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74528879, + "num_input_tokens_seen": 43032120, + "step": 1988, + "time_per_iteration": 2.603929281234741 + }, + { + "auxiliary_loss_clip": 0.01165169, + "auxiliary_loss_mlp": 0.01166388, + "balance_loss_clip": 1.00217557, + "balance_loss_mlp": 1.00140262, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 1.817438654938029, + "language_loss": 0.81288743, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83620298, + "num_input_tokens_seen": 43052215, + "step": 1989, + "time_per_iteration": 2.5867063999176025 + }, + { + "auxiliary_loss_clip": 0.01133681, + "auxiliary_loss_mlp": 0.01166642, + "balance_loss_clip": 1.0021863, + "balance_loss_mlp": 1.0011797, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.6274689269780342, + "language_loss": 0.75438046, + "learning_rate": 3.916291083698784e-06, + "loss": 0.77738369, + "num_input_tokens_seen": 43069720, + "step": 1990, + "time_per_iteration": 2.6435248851776123 + }, + { + "auxiliary_loss_clip": 0.01148675, + "auxiliary_loss_mlp": 0.01159495, + "balance_loss_clip": 1.00275791, + "balance_loss_mlp": 1.00004029, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.85904510785634, + "language_loss": 0.55258107, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57566285, + "num_input_tokens_seen": 43123130, + "step": 1991, + "time_per_iteration": 4.554771184921265 + }, + { + "auxiliary_loss_clip": 0.01131801, + "auxiliary_loss_mlp": 0.01165873, + "balance_loss_clip": 1.00200856, + "balance_loss_mlp": 1.00126839, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 3.0673192426923697, + "language_loss": 0.78685367, + "learning_rate": 3.916067946991971e-06, + "loss": 0.80983037, + "num_input_tokens_seen": 43140015, + "step": 1992, + "time_per_iteration": 2.642423629760742 + }, + { + "auxiliary_loss_clip": 0.01181849, + "auxiliary_loss_mlp": 0.01166582, + "balance_loss_clip": 1.0023638, + "balance_loss_mlp": 1.00121498, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.6830270655686292, + "language_loss": 0.79025722, + "learning_rate": 3.915956269650216e-06, + "loss": 0.81374151, + "num_input_tokens_seen": 43160105, + "step": 1993, + "time_per_iteration": 2.576829433441162 + }, + { + "auxiliary_loss_clip": 0.01133305, + "auxiliary_loss_mlp": 0.01165917, + "balance_loss_clip": 1.00213146, + "balance_loss_mlp": 1.00112176, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 1.758518578268353, + "language_loss": 0.82196224, + "learning_rate": 3.915844519655208e-06, + "loss": 0.84495449, + "num_input_tokens_seen": 43179835, + "step": 1994, + "time_per_iteration": 2.6667184829711914 + }, + { + "auxiliary_loss_clip": 0.01152588, + "auxiliary_loss_mlp": 0.01166249, + "balance_loss_clip": 1.0021596, + "balance_loss_mlp": 1.00126362, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 1.9984551688821557, + "language_loss": 0.88402981, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9072181, + "num_input_tokens_seen": 43197210, + "step": 1995, + "time_per_iteration": 2.571007490158081 + }, + { + "auxiliary_loss_clip": 0.01148821, + "auxiliary_loss_mlp": 0.0116609, + "balance_loss_clip": 1.00223565, + "balance_loss_mlp": 1.00129497, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 1.90301668449042, + "language_loss": 0.7432009, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76635003, + "num_input_tokens_seen": 43215050, + "step": 1996, + "time_per_iteration": 2.6619696617126465 + }, + { + "auxiliary_loss_clip": 0.01132208, + "auxiliary_loss_mlp": 0.01165859, + "balance_loss_clip": 1.00199246, + "balance_loss_mlp": 1.00106359, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.916762371484884, + "language_loss": 0.87884009, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90182078, + "num_input_tokens_seen": 43233900, + "step": 1997, + "time_per_iteration": 2.6531317234039307 + }, + { + "auxiliary_loss_clip": 0.01165663, + "auxiliary_loss_mlp": 0.0074912, + "balance_loss_clip": 1.00228453, + "balance_loss_mlp": 1.00036979, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 1.8211928645464497, + "language_loss": 0.78822279, + "learning_rate": 3.915396793227428e-06, + "loss": 0.80737066, + "num_input_tokens_seen": 43252105, + "step": 1998, + "time_per_iteration": 2.628401756286621 + }, + { + "auxiliary_loss_clip": 0.01165695, + "auxiliary_loss_mlp": 0.007491, + "balance_loss_clip": 1.00238538, + "balance_loss_mlp": 1.00042772, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 1.924281012395334, + "language_loss": 0.73269105, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75183904, + "num_input_tokens_seen": 43270315, + "step": 1999, + "time_per_iteration": 2.5832393169403076 + }, + { + "auxiliary_loss_clip": 0.01181852, + "auxiliary_loss_mlp": 0.01166243, + "balance_loss_clip": 1.00237894, + "balance_loss_mlp": 1.00135326, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 2.51242271478205, + "language_loss": 0.75061238, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77409327, + "num_input_tokens_seen": 43289935, + "step": 2000, + "time_per_iteration": 2.562239408493042 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01165612, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00110304, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5301766011381093, + "language_loss": 0.85139257, + "learning_rate": 3.915060235755344e-06, + "loss": 0.87454361, + "num_input_tokens_seen": 43309325, + "step": 2001, + "time_per_iteration": 2.6238536834716797 + }, + { + "auxiliary_loss_clip": 0.01149296, + "auxiliary_loss_mlp": 0.01166189, + "balance_loss_clip": 1.00221825, + "balance_loss_mlp": 1.00120294, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.799716132081366, + "language_loss": 0.74469995, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76785481, + "num_input_tokens_seen": 43327010, + "step": 2002, + "time_per_iteration": 2.613551139831543 + }, + { + "auxiliary_loss_clip": 0.01133781, + "auxiliary_loss_mlp": 0.01166513, + "balance_loss_clip": 1.00236917, + "balance_loss_mlp": 1.0011456, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 1.806502663811844, + "language_loss": 0.78041625, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80341923, + "num_input_tokens_seen": 43345650, + "step": 2003, + "time_per_iteration": 2.669337511062622 + }, + { + "auxiliary_loss_clip": 0.01165223, + "auxiliary_loss_mlp": 0.01165746, + "balance_loss_clip": 1.00223005, + "balance_loss_mlp": 1.00104666, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.6503546007485368, + "language_loss": 0.72304624, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74635589, + "num_input_tokens_seen": 43365555, + "step": 2004, + "time_per_iteration": 2.6394641399383545 + }, + { + "auxiliary_loss_clip": 0.01149155, + "auxiliary_loss_mlp": 0.01166645, + "balance_loss_clip": 1.00230026, + "balance_loss_mlp": 1.00127792, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 1.6415581364944474, + "language_loss": 0.78336751, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80652553, + "num_input_tokens_seen": 43384990, + "step": 2005, + "time_per_iteration": 2.653543472290039 + }, + { + "auxiliary_loss_clip": 0.01180034, + "auxiliary_loss_mlp": 0.00748601, + "balance_loss_clip": 1.00286198, + "balance_loss_mlp": 0.99966872, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9266485934983805, + "language_loss": 0.58125907, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60054547, + "num_input_tokens_seen": 43436335, + "step": 2006, + "time_per_iteration": 2.9248650074005127 + }, + { + "auxiliary_loss_clip": 0.01153541, + "auxiliary_loss_mlp": 0.01165973, + "balance_loss_clip": 1.00233507, + "balance_loss_mlp": 1.00117803, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.7262091821134489, + "language_loss": 0.76510286, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78829801, + "num_input_tokens_seen": 43456495, + "step": 2007, + "time_per_iteration": 2.6611671447753906 + }, + { + "auxiliary_loss_clip": 0.01132287, + "auxiliary_loss_mlp": 0.01166583, + "balance_loss_clip": 1.00212753, + "balance_loss_mlp": 1.00131154, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 2.7819878681513615, + "language_loss": 0.83587575, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85886455, + "num_input_tokens_seen": 43473085, + "step": 2008, + "time_per_iteration": 2.7327003479003906 + }, + { + "auxiliary_loss_clip": 0.01181724, + "auxiliary_loss_mlp": 0.01166338, + "balance_loss_clip": 1.00229681, + "balance_loss_mlp": 1.00135279, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.0770768677870803, + "language_loss": 0.84124529, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86472583, + "num_input_tokens_seen": 43491135, + "step": 2009, + "time_per_iteration": 2.535987615585327 + }, + { + "auxiliary_loss_clip": 0.0118177, + "auxiliary_loss_mlp": 0.01166163, + "balance_loss_clip": 1.00236797, + "balance_loss_mlp": 1.00117719, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.6583780629691103, + "language_loss": 0.84295243, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86643177, + "num_input_tokens_seen": 43510440, + "step": 2010, + "time_per_iteration": 2.5393142700195312 + }, + { + "auxiliary_loss_clip": 0.01150256, + "auxiliary_loss_mlp": 0.00749147, + "balance_loss_clip": 1.00247538, + "balance_loss_mlp": 1.00037575, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.5961980484573737, + "language_loss": 0.83924401, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.85823804, + "num_input_tokens_seen": 43530145, + "step": 2011, + "time_per_iteration": 2.642587661743164 + }, + { + "auxiliary_loss_clip": 0.01133892, + "auxiliary_loss_mlp": 0.01166206, + "balance_loss_clip": 1.00228822, + "balance_loss_mlp": 1.0012207, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 2.1455205141528055, + "language_loss": 0.96219969, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98520076, + "num_input_tokens_seen": 43549315, + "step": 2012, + "time_per_iteration": 2.678352117538452 + }, + { + "auxiliary_loss_clip": 0.01148765, + "auxiliary_loss_mlp": 0.01165869, + "balance_loss_clip": 1.00211442, + "balance_loss_mlp": 1.0010736, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 1.8483403937989222, + "language_loss": 0.80790275, + "learning_rate": 3.913707471284283e-06, + "loss": 0.83104908, + "num_input_tokens_seen": 43569240, + "step": 2013, + "time_per_iteration": 2.6601366996765137 + }, + { + "auxiliary_loss_clip": 0.011207, + "auxiliary_loss_mlp": 0.01165918, + "balance_loss_clip": 1.00215626, + "balance_loss_mlp": 1.00093174, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 2.9203088124058576, + "language_loss": 0.77407485, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79694104, + "num_input_tokens_seen": 43587710, + "step": 2014, + "time_per_iteration": 2.668722152709961 + }, + { + "auxiliary_loss_clip": 0.01166031, + "auxiliary_loss_mlp": 0.01165808, + "balance_loss_clip": 1.00249803, + "balance_loss_mlp": 1.00110841, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 3.36609461158929, + "language_loss": 0.87020481, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89352322, + "num_input_tokens_seen": 43606000, + "step": 2015, + "time_per_iteration": 2.571728467941284 + }, + { + "auxiliary_loss_clip": 0.01181669, + "auxiliary_loss_mlp": 0.01165884, + "balance_loss_clip": 1.00238156, + "balance_loss_mlp": 1.00118399, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.972831504764365, + "language_loss": 0.69164997, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71512556, + "num_input_tokens_seen": 43624815, + "step": 2016, + "time_per_iteration": 2.522980213165283 + }, + { + "auxiliary_loss_clip": 0.01148504, + "auxiliary_loss_mlp": 0.01166089, + "balance_loss_clip": 1.00208664, + "balance_loss_mlp": 1.00081682, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 2.327580544931468, + "language_loss": 0.80365527, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82680118, + "num_input_tokens_seen": 43643960, + "step": 2017, + "time_per_iteration": 2.638421058654785 + }, + { + "auxiliary_loss_clip": 0.01166125, + "auxiliary_loss_mlp": 0.0116626, + "balance_loss_clip": 1.00247264, + "balance_loss_mlp": 1.00108385, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.0804114312213424, + "language_loss": 0.69452095, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71784478, + "num_input_tokens_seen": 43662650, + "step": 2018, + "time_per_iteration": 2.597581386566162 + }, + { + "auxiliary_loss_clip": 0.01132911, + "auxiliary_loss_mlp": 0.01166431, + "balance_loss_clip": 1.00231099, + "balance_loss_mlp": 1.00125492, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 1.7348928608328051, + "language_loss": 0.72525895, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74825233, + "num_input_tokens_seen": 43684205, + "step": 2019, + "time_per_iteration": 2.694955348968506 + }, + { + "auxiliary_loss_clip": 0.01099411, + "auxiliary_loss_mlp": 0.01166434, + "balance_loss_clip": 1.00193357, + "balance_loss_mlp": 1.00125742, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8313211379001566, + "language_loss": 0.92229009, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94494861, + "num_input_tokens_seen": 43706320, + "step": 2020, + "time_per_iteration": 2.87329363822937 + }, + { + "auxiliary_loss_clip": 0.01153854, + "auxiliary_loss_mlp": 0.01166467, + "balance_loss_clip": 1.00259244, + "balance_loss_mlp": 1.00148177, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 2.0121700705437053, + "language_loss": 0.77661645, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79981971, + "num_input_tokens_seen": 43724805, + "step": 2021, + "time_per_iteration": 2.868760585784912 + }, + { + "auxiliary_loss_clip": 0.01181693, + "auxiliary_loss_mlp": 0.01165776, + "balance_loss_clip": 1.00243735, + "balance_loss_mlp": 1.00107598, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 1.9691587956992012, + "language_loss": 0.80333114, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82680583, + "num_input_tokens_seen": 43742320, + "step": 2022, + "time_per_iteration": 2.5689008235931396 + }, + { + "auxiliary_loss_clip": 0.01149317, + "auxiliary_loss_mlp": 0.01166258, + "balance_loss_clip": 1.00228882, + "balance_loss_mlp": 1.00117731, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.9146472798691279, + "language_loss": 0.85097742, + "learning_rate": 3.912572184769108e-06, + "loss": 0.87413323, + "num_input_tokens_seen": 43760665, + "step": 2023, + "time_per_iteration": 2.6300439834594727 + }, + { + "auxiliary_loss_clip": 0.01133181, + "auxiliary_loss_mlp": 0.01166507, + "balance_loss_clip": 1.00221658, + "balance_loss_mlp": 1.00123525, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.6213744934184513, + "language_loss": 0.85517907, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87817591, + "num_input_tokens_seen": 43779020, + "step": 2024, + "time_per_iteration": 4.045836448669434 + }, + { + "auxiliary_loss_clip": 0.01181511, + "auxiliary_loss_mlp": 0.01166033, + "balance_loss_clip": 1.00221765, + "balance_loss_mlp": 1.00123835, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.1048854376934187, + "language_loss": 0.72059381, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74406922, + "num_input_tokens_seen": 43798850, + "step": 2025, + "time_per_iteration": 5.4680187702178955 + }, + { + "auxiliary_loss_clip": 0.01148289, + "auxiliary_loss_mlp": 0.01165578, + "balance_loss_clip": 1.00208092, + "balance_loss_mlp": 1.00087833, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 1.4963154363183138, + "language_loss": 0.75985825, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78299689, + "num_input_tokens_seen": 43820130, + "step": 2026, + "time_per_iteration": 2.6692802906036377 + }, + { + "auxiliary_loss_clip": 0.01153549, + "auxiliary_loss_mlp": 0.01166224, + "balance_loss_clip": 1.00233412, + "balance_loss_mlp": 1.00123811, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 1.877328395067599, + "language_loss": 0.88710201, + "learning_rate": 3.912116039223659e-06, + "loss": 0.91029978, + "num_input_tokens_seen": 43838485, + "step": 2027, + "time_per_iteration": 2.5922491550445557 + }, + { + "auxiliary_loss_clip": 0.01149031, + "auxiliary_loss_mlp": 0.01166237, + "balance_loss_clip": 1.00213706, + "balance_loss_mlp": 1.00134635, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 1.5973842481696934, + "language_loss": 0.75717396, + "learning_rate": 3.912001821557399e-06, + "loss": 0.7803266, + "num_input_tokens_seen": 43859080, + "step": 2028, + "time_per_iteration": 4.0735182762146 + }, + { + "auxiliary_loss_clip": 0.01119647, + "auxiliary_loss_mlp": 0.01165813, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00111341, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 1.9398569538091481, + "language_loss": 0.77160633, + "learning_rate": 3.911887531387839e-06, + "loss": 0.79446089, + "num_input_tokens_seen": 43879030, + "step": 2029, + "time_per_iteration": 2.7163753509521484 + }, + { + "auxiliary_loss_clip": 0.01165085, + "auxiliary_loss_mlp": 0.01165913, + "balance_loss_clip": 1.00233364, + "balance_loss_mlp": 1.00121307, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 1.6893338956309687, + "language_loss": 0.7939502, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81726015, + "num_input_tokens_seen": 43898505, + "step": 2030, + "time_per_iteration": 2.626652240753174 + }, + { + "auxiliary_loss_clip": 0.01181563, + "auxiliary_loss_mlp": 0.01165528, + "balance_loss_clip": 1.00236988, + "balance_loss_mlp": 1.00111485, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 2.0881848006202057, + "language_loss": 0.74037486, + "learning_rate": 3.911658733556155e-06, + "loss": 0.7638458, + "num_input_tokens_seen": 43917945, + "step": 2031, + "time_per_iteration": 2.554380416870117 + }, + { + "auxiliary_loss_clip": 0.01181789, + "auxiliary_loss_mlp": 0.01165468, + "balance_loss_clip": 1.00254762, + "balance_loss_mlp": 1.00105405, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 2.940321534268761, + "language_loss": 0.75387919, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77735174, + "num_input_tokens_seen": 43937385, + "step": 2032, + "time_per_iteration": 2.515214204788208 + }, + { + "auxiliary_loss_clip": 0.01164858, + "auxiliary_loss_mlp": 0.01165569, + "balance_loss_clip": 1.00217271, + "balance_loss_mlp": 1.00106025, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.6896422974101954, + "language_loss": 0.89115155, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91445577, + "num_input_tokens_seen": 43958130, + "step": 2033, + "time_per_iteration": 2.5953352451324463 + }, + { + "auxiliary_loss_clip": 0.01153364, + "auxiliary_loss_mlp": 0.01166063, + "balance_loss_clip": 1.00247502, + "balance_loss_mlp": 1.00126815, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.068942938673331, + "language_loss": 0.6555112, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67870551, + "num_input_tokens_seen": 43976800, + "step": 2034, + "time_per_iteration": 2.5847256183624268 + }, + { + "auxiliary_loss_clip": 0.01148873, + "auxiliary_loss_mlp": 0.01166295, + "balance_loss_clip": 1.00216639, + "balance_loss_mlp": 1.00150037, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.7326223487873447, + "language_loss": 0.76403677, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78718847, + "num_input_tokens_seen": 43996620, + "step": 2035, + "time_per_iteration": 2.6197102069854736 + }, + { + "auxiliary_loss_clip": 0.01181867, + "auxiliary_loss_mlp": 0.01166067, + "balance_loss_clip": 1.00244999, + "balance_loss_mlp": 1.00127161, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.6633316721334646, + "language_loss": 0.71307886, + "learning_rate": 3.911085470472892e-06, + "loss": 0.7365582, + "num_input_tokens_seen": 44016175, + "step": 2036, + "time_per_iteration": 2.5287249088287354 + }, + { + "auxiliary_loss_clip": 0.01148374, + "auxiliary_loss_mlp": 0.01165943, + "balance_loss_clip": 1.00229478, + "balance_loss_mlp": 1.00152946, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.6239784449538786, + "language_loss": 0.83301663, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85615981, + "num_input_tokens_seen": 44035060, + "step": 2037, + "time_per_iteration": 2.5856077671051025 + }, + { + "auxiliary_loss_clip": 0.01149397, + "auxiliary_loss_mlp": 0.01165761, + "balance_loss_clip": 1.00233936, + "balance_loss_mlp": 1.00106204, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.502165459315169, + "language_loss": 0.79840302, + "learning_rate": 3.910855657929267e-06, + "loss": 0.8215546, + "num_input_tokens_seen": 44053330, + "step": 2038, + "time_per_iteration": 2.6370601654052734 + }, + { + "auxiliary_loss_clip": 0.01164292, + "auxiliary_loss_mlp": 0.00748592, + "balance_loss_clip": 1.00289845, + "balance_loss_mlp": 0.99972755, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.8152881715587051, + "language_loss": 0.58618045, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60530925, + "num_input_tokens_seen": 44107575, + "step": 2039, + "time_per_iteration": 3.0673201084136963 + }, + { + "auxiliary_loss_clip": 0.01133893, + "auxiliary_loss_mlp": 0.01165591, + "balance_loss_clip": 1.00221026, + "balance_loss_mlp": 1.00108206, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.601928659136619, + "language_loss": 0.80949581, + "learning_rate": 3.910625555546292e-06, + "loss": 0.83249068, + "num_input_tokens_seen": 44126075, + "step": 2040, + "time_per_iteration": 2.6115598678588867 + }, + { + "auxiliary_loss_clip": 0.01148388, + "auxiliary_loss_mlp": 0.0116532, + "balance_loss_clip": 1.00215733, + "balance_loss_mlp": 1.00109768, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8936881265411336, + "language_loss": 0.832609, + "learning_rate": 3.910510395675953e-06, + "loss": 0.85574603, + "num_input_tokens_seen": 44145605, + "step": 2041, + "time_per_iteration": 2.6450257301330566 + }, + { + "auxiliary_loss_clip": 0.01150397, + "auxiliary_loss_mlp": 0.01165977, + "balance_loss_clip": 1.0023818, + "balance_loss_mlp": 1.00099146, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.4439385701908232, + "language_loss": 0.67053485, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69369864, + "num_input_tokens_seen": 44164770, + "step": 2042, + "time_per_iteration": 2.5996437072753906 + }, + { + "auxiliary_loss_clip": 0.0113238, + "auxiliary_loss_mlp": 0.0116595, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.00134611, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.778772654093224, + "language_loss": 0.81611621, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83909953, + "num_input_tokens_seen": 44184025, + "step": 2043, + "time_per_iteration": 2.6661229133605957 + }, + { + "auxiliary_loss_clip": 0.01152323, + "auxiliary_loss_mlp": 0.01165303, + "balance_loss_clip": 1.00219941, + "balance_loss_mlp": 1.00108004, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 1.7149473600533394, + "language_loss": 0.80354857, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82672489, + "num_input_tokens_seen": 44202950, + "step": 2044, + "time_per_iteration": 2.570263385772705 + }, + { + "auxiliary_loss_clip": 0.01114949, + "auxiliary_loss_mlp": 0.0116563, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.0013119, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7017997423402778, + "language_loss": 0.78260326, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80540901, + "num_input_tokens_seen": 44221115, + "step": 2045, + "time_per_iteration": 2.695690155029297 + }, + { + "auxiliary_loss_clip": 0.01164855, + "auxiliary_loss_mlp": 0.01165992, + "balance_loss_clip": 1.00230181, + "balance_loss_mlp": 1.00148368, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 1.9966948829502416, + "language_loss": 0.67515737, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69846582, + "num_input_tokens_seen": 44240575, + "step": 2046, + "time_per_iteration": 2.555337429046631 + }, + { + "auxiliary_loss_clip": 0.01181645, + "auxiliary_loss_mlp": 0.01165722, + "balance_loss_clip": 1.00249398, + "balance_loss_mlp": 1.00130904, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.390178970506005, + "language_loss": 0.72489458, + "learning_rate": 3.909817915225297e-06, + "loss": 0.74836826, + "num_input_tokens_seen": 44257145, + "step": 2047, + "time_per_iteration": 2.526698112487793 + }, + { + "auxiliary_loss_clip": 0.01165188, + "auxiliary_loss_mlp": 0.01166064, + "balance_loss_clip": 1.00236809, + "balance_loss_mlp": 1.00117397, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 2.354186503824876, + "language_loss": 0.7686131, + "learning_rate": 3.909702248319597e-06, + "loss": 0.79192567, + "num_input_tokens_seen": 44278035, + "step": 2048, + "time_per_iteration": 2.6157736778259277 + }, + { + "auxiliary_loss_clip": 0.01148371, + "auxiliary_loss_mlp": 0.01165728, + "balance_loss_clip": 1.00218678, + "balance_loss_mlp": 1.00102842, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 1.9805608959740613, + "language_loss": 0.85021615, + "learning_rate": 3.909586508997797e-06, + "loss": 0.87335712, + "num_input_tokens_seen": 44296980, + "step": 2049, + "time_per_iteration": 2.6402082443237305 + }, + { + "auxiliary_loss_clip": 0.011153, + "auxiliary_loss_mlp": 0.01165886, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.00118625, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 1.8077533139881996, + "language_loss": 0.7542218, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77703369, + "num_input_tokens_seen": 44318005, + "step": 2050, + "time_per_iteration": 2.7203123569488525 + }, + { + "auxiliary_loss_clip": 0.01132243, + "auxiliary_loss_mlp": 0.01165693, + "balance_loss_clip": 1.00213599, + "balance_loss_mlp": 1.00099373, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 2.130748435461313, + "language_loss": 0.81057686, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83355629, + "num_input_tokens_seen": 44335260, + "step": 2051, + "time_per_iteration": 2.6616735458374023 + }, + { + "auxiliary_loss_clip": 0.01181586, + "auxiliary_loss_mlp": 0.00749117, + "balance_loss_clip": 1.00251293, + "balance_loss_mlp": 1.00037837, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.6341784446415062, + "language_loss": 0.79953557, + "learning_rate": 3.909238856579693e-06, + "loss": 0.81884259, + "num_input_tokens_seen": 44355315, + "step": 2052, + "time_per_iteration": 2.6181910037994385 + }, + { + "auxiliary_loss_clip": 0.01165494, + "auxiliary_loss_mlp": 0.01166094, + "balance_loss_clip": 1.00239789, + "balance_loss_mlp": 1.00129855, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 1.9295114369125208, + "language_loss": 0.73909533, + "learning_rate": 3.909122827637406e-06, + "loss": 0.76241124, + "num_input_tokens_seen": 44373020, + "step": 2053, + "time_per_iteration": 2.6668002605438232 + }, + { + "auxiliary_loss_clip": 0.01181509, + "auxiliary_loss_mlp": 0.00749057, + "balance_loss_clip": 1.00234056, + "balance_loss_mlp": 1.0003252, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 1.6167079345381667, + "language_loss": 0.74413228, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76343793, + "num_input_tokens_seen": 44397525, + "step": 2054, + "time_per_iteration": 2.777904748916626 + }, + { + "auxiliary_loss_clip": 0.0114912, + "auxiliary_loss_mlp": 0.01165137, + "balance_loss_clip": 1.00215912, + "balance_loss_mlp": 1.00091386, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 1.6769011606994582, + "language_loss": 0.84949666, + "learning_rate": 3.908890552574849e-06, + "loss": 0.8726393, + "num_input_tokens_seen": 44415890, + "step": 2055, + "time_per_iteration": 2.6304733753204346 + }, + { + "auxiliary_loss_clip": 0.01115857, + "auxiliary_loss_mlp": 0.01165652, + "balance_loss_clip": 1.00194156, + "balance_loss_mlp": 1.00142896, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 1.9211911797152443, + "language_loss": 0.77332342, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79613853, + "num_input_tokens_seen": 44436625, + "step": 2056, + "time_per_iteration": 2.728952407836914 + }, + { + "auxiliary_loss_clip": 0.01165027, + "auxiliary_loss_mlp": 0.01165624, + "balance_loss_clip": 1.00229418, + "balance_loss_mlp": 1.00130582, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.8385535042498262, + "language_loss": 0.83086848, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85417497, + "num_input_tokens_seen": 44455265, + "step": 2057, + "time_per_iteration": 2.6069717407226562 + }, + { + "auxiliary_loss_clip": 0.01149257, + "auxiliary_loss_mlp": 0.01165928, + "balance_loss_clip": 1.00217652, + "balance_loss_mlp": 1.00122833, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.69663543357111, + "language_loss": 0.78008431, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80323613, + "num_input_tokens_seen": 44475815, + "step": 2058, + "time_per_iteration": 2.643904447555542 + }, + { + "auxiliary_loss_clip": 0.01149351, + "auxiliary_loss_mlp": 0.01165697, + "balance_loss_clip": 1.00226796, + "balance_loss_mlp": 1.00099707, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.232238490966684, + "language_loss": 0.83540863, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85855913, + "num_input_tokens_seen": 44494045, + "step": 2059, + "time_per_iteration": 2.5893325805664062 + }, + { + "auxiliary_loss_clip": 0.01132773, + "auxiliary_loss_mlp": 0.01165751, + "balance_loss_clip": 1.00226939, + "balance_loss_mlp": 1.00124168, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.0203091397017774, + "language_loss": 0.81710732, + "learning_rate": 3.908308598252523e-06, + "loss": 0.84009254, + "num_input_tokens_seen": 44509120, + "step": 2060, + "time_per_iteration": 2.6372716426849365 + }, + { + "auxiliary_loss_clip": 0.01152714, + "auxiliary_loss_mlp": 0.0116566, + "balance_loss_clip": 1.00238204, + "balance_loss_mlp": 1.00105596, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 3.112049181567135, + "language_loss": 0.86096263, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88414633, + "num_input_tokens_seen": 44525780, + "step": 2061, + "time_per_iteration": 2.5678226947784424 + }, + { + "auxiliary_loss_clip": 0.01164914, + "auxiliary_loss_mlp": 0.01165201, + "balance_loss_clip": 1.00235438, + "balance_loss_mlp": 1.00116873, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.0286535054093258, + "language_loss": 0.84840494, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87170613, + "num_input_tokens_seen": 44543125, + "step": 2062, + "time_per_iteration": 5.321699857711792 + }, + { + "auxiliary_loss_clip": 0.01133746, + "auxiliary_loss_mlp": 0.01165539, + "balance_loss_clip": 1.00231576, + "balance_loss_mlp": 1.0011251, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.220370124989897, + "language_loss": 0.78936368, + "learning_rate": 3.907958557264774e-06, + "loss": 0.81235647, + "num_input_tokens_seen": 44560275, + "step": 2063, + "time_per_iteration": 2.6286590099334717 + }, + { + "auxiliary_loss_clip": 0.0111548, + "auxiliary_loss_mlp": 0.01165422, + "balance_loss_clip": 1.00207043, + "balance_loss_mlp": 1.00129461, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 1.9554157468767255, + "language_loss": 0.79393077, + "learning_rate": 3.907841732229663e-06, + "loss": 0.8167398, + "num_input_tokens_seen": 44577640, + "step": 2064, + "time_per_iteration": 4.098936557769775 + }, + { + "auxiliary_loss_clip": 0.01148174, + "auxiliary_loss_mlp": 0.01165708, + "balance_loss_clip": 1.00220215, + "balance_loss_mlp": 1.00158072, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.0928894001726124, + "language_loss": 0.92657506, + "learning_rate": 3.907724834849002e-06, + "loss": 0.94971395, + "num_input_tokens_seen": 44594860, + "step": 2065, + "time_per_iteration": 2.6512632369995117 + }, + { + "auxiliary_loss_clip": 0.01148899, + "auxiliary_loss_mlp": 0.0116546, + "balance_loss_clip": 1.00229597, + "balance_loss_mlp": 1.00104642, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.761068567778081, + "language_loss": 0.80577815, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82892174, + "num_input_tokens_seen": 44614780, + "step": 2066, + "time_per_iteration": 4.08673357963562 + }, + { + "auxiliary_loss_clip": 0.01114534, + "auxiliary_loss_mlp": 0.01158877, + "balance_loss_clip": 1.00267613, + "balance_loss_mlp": 1.00018585, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8757277488900858, + "language_loss": 0.63350046, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65623462, + "num_input_tokens_seen": 44671240, + "step": 2067, + "time_per_iteration": 3.3016414642333984 + }, + { + "auxiliary_loss_clip": 0.01116937, + "auxiliary_loss_mlp": 0.01165518, + "balance_loss_clip": 1.00215745, + "balance_loss_mlp": 1.00119996, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 2.261693935210814, + "language_loss": 0.93201244, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95483696, + "num_input_tokens_seen": 44691050, + "step": 2068, + "time_per_iteration": 3.0334248542785645 + }, + { + "auxiliary_loss_clip": 0.011654, + "auxiliary_loss_mlp": 0.0116521, + "balance_loss_clip": 1.00237513, + "balance_loss_mlp": 1.0013684, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.8580420303023983, + "language_loss": 0.81333363, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83663976, + "num_input_tokens_seen": 44709850, + "step": 2069, + "time_per_iteration": 2.5930471420288086 + }, + { + "auxiliary_loss_clip": 0.01120315, + "auxiliary_loss_mlp": 0.01165741, + "balance_loss_clip": 1.00209463, + "balance_loss_mlp": 1.00132751, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.5861519527811616, + "language_loss": 0.77571225, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79857278, + "num_input_tokens_seen": 44731475, + "step": 2070, + "time_per_iteration": 2.7896978855133057 + }, + { + "auxiliary_loss_clip": 0.0116587, + "auxiliary_loss_mlp": 0.01165474, + "balance_loss_clip": 1.00249755, + "balance_loss_mlp": 1.00125134, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.3442063994535807, + "language_loss": 0.81213003, + "learning_rate": 3.907021931556922e-06, + "loss": 0.8354435, + "num_input_tokens_seen": 44749685, + "step": 2071, + "time_per_iteration": 2.569586753845215 + }, + { + "auxiliary_loss_clip": 0.01164737, + "auxiliary_loss_mlp": 0.01165048, + "balance_loss_clip": 1.00230098, + "balance_loss_mlp": 1.00111127, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.8256398388644914, + "language_loss": 0.78037208, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80366993, + "num_input_tokens_seen": 44772165, + "step": 2072, + "time_per_iteration": 2.688129186630249 + }, + { + "auxiliary_loss_clip": 0.0114944, + "auxiliary_loss_mlp": 0.01165504, + "balance_loss_clip": 1.00255108, + "balance_loss_mlp": 1.00137663, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 2.2249670275266276, + "language_loss": 0.74949002, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77263945, + "num_input_tokens_seen": 44790580, + "step": 2073, + "time_per_iteration": 2.6375038623809814 + }, + { + "auxiliary_loss_clip": 0.01103999, + "auxiliary_loss_mlp": 0.01164714, + "balance_loss_clip": 1.00193083, + "balance_loss_mlp": 1.00115836, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 1.9057485753112853, + "language_loss": 0.90443951, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92712665, + "num_input_tokens_seen": 44806730, + "step": 2074, + "time_per_iteration": 2.849716901779175 + }, + { + "auxiliary_loss_clip": 0.01100009, + "auxiliary_loss_mlp": 0.01165381, + "balance_loss_clip": 1.00174999, + "balance_loss_mlp": 1.0011586, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.3263825342446993, + "language_loss": 0.83903873, + "learning_rate": 3.906551883013728e-06, + "loss": 0.86169255, + "num_input_tokens_seen": 44825550, + "step": 2075, + "time_per_iteration": 2.768836259841919 + }, + { + "auxiliary_loss_clip": 0.01117435, + "auxiliary_loss_mlp": 0.01165212, + "balance_loss_clip": 1.00208533, + "balance_loss_mlp": 1.00108433, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 1.9073644948568964, + "language_loss": 0.7400679, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76289439, + "num_input_tokens_seen": 44844155, + "step": 2076, + "time_per_iteration": 2.717137098312378 + }, + { + "auxiliary_loss_clip": 0.01117158, + "auxiliary_loss_mlp": 0.01164921, + "balance_loss_clip": 1.00235796, + "balance_loss_mlp": 1.0011754, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.7040099064794507, + "language_loss": 0.76125455, + "learning_rate": 3.906316424944469e-06, + "loss": 0.78407532, + "num_input_tokens_seen": 44863780, + "step": 2077, + "time_per_iteration": 2.7092397212982178 + }, + { + "auxiliary_loss_clip": 0.01164576, + "auxiliary_loss_mlp": 0.01165296, + "balance_loss_clip": 1.00223136, + "balance_loss_mlp": 1.00135946, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 1.9552520153317958, + "language_loss": 0.82716906, + "learning_rate": 3.906198587476043e-06, + "loss": 0.8504678, + "num_input_tokens_seen": 44881480, + "step": 2078, + "time_per_iteration": 2.551928758621216 + }, + { + "auxiliary_loss_clip": 0.0114908, + "auxiliary_loss_mlp": 0.01165512, + "balance_loss_clip": 1.00229609, + "balance_loss_mlp": 1.00119436, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 1.8099904136222338, + "language_loss": 0.74782163, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77096754, + "num_input_tokens_seen": 44900390, + "step": 2079, + "time_per_iteration": 2.661370277404785 + }, + { + "auxiliary_loss_clip": 0.01165402, + "auxiliary_loss_mlp": 0.01165825, + "balance_loss_clip": 1.00244427, + "balance_loss_mlp": 1.0014112, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.279569116266265, + "language_loss": 0.832735, + "learning_rate": 3.905962695693935e-06, + "loss": 0.85604727, + "num_input_tokens_seen": 44920375, + "step": 2080, + "time_per_iteration": 2.6302013397216797 + }, + { + "auxiliary_loss_clip": 0.01164898, + "auxiliary_loss_mlp": 0.01165179, + "balance_loss_clip": 1.00231993, + "balance_loss_mlp": 1.00124288, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.2381246773254353, + "language_loss": 0.85038733, + "learning_rate": 3.9058446413892e-06, + "loss": 0.87368804, + "num_input_tokens_seen": 44938415, + "step": 2081, + "time_per_iteration": 2.5322747230529785 + }, + { + "auxiliary_loss_clip": 0.01164833, + "auxiliary_loss_mlp": 0.01164903, + "balance_loss_clip": 1.0022893, + "balance_loss_mlp": 1.00106144, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 2.2770100747869773, + "language_loss": 0.76640725, + "learning_rate": 3.905726514814646e-06, + "loss": 0.78970456, + "num_input_tokens_seen": 44957135, + "step": 2082, + "time_per_iteration": 2.561316728591919 + }, + { + "auxiliary_loss_clip": 0.01153032, + "auxiliary_loss_mlp": 0.01165633, + "balance_loss_clip": 1.00246644, + "balance_loss_mlp": 1.00112414, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.992208869951407, + "language_loss": 0.78632981, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.80951643, + "num_input_tokens_seen": 44974480, + "step": 2083, + "time_per_iteration": 2.581951141357422 + }, + { + "auxiliary_loss_clip": 0.01148342, + "auxiliary_loss_mlp": 0.01165299, + "balance_loss_clip": 1.00218368, + "balance_loss_mlp": 1.00098133, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 3.0373244254186003, + "language_loss": 0.90419072, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92732716, + "num_input_tokens_seen": 44990310, + "step": 2084, + "time_per_iteration": 2.6168477535247803 + }, + { + "auxiliary_loss_clip": 0.01133483, + "auxiliary_loss_mlp": 0.01165553, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00142574, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.9990728267400724, + "language_loss": 0.80043435, + "learning_rate": 3.905371701516869e-06, + "loss": 0.8234247, + "num_input_tokens_seen": 45010720, + "step": 2085, + "time_per_iteration": 2.732037305831909 + }, + { + "auxiliary_loss_clip": 0.0118114, + "auxiliary_loss_mlp": 0.01164874, + "balance_loss_clip": 1.0022738, + "balance_loss_mlp": 1.00103319, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.6354431852901634, + "language_loss": 0.88080537, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90426552, + "num_input_tokens_seen": 45030360, + "step": 2086, + "time_per_iteration": 2.5708770751953125 + }, + { + "auxiliary_loss_clip": 0.01148946, + "auxiliary_loss_mlp": 0.01165011, + "balance_loss_clip": 1.00212836, + "balance_loss_mlp": 1.00107479, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.2293157000208823, + "language_loss": 0.87017375, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89331329, + "num_input_tokens_seen": 45045085, + "step": 2087, + "time_per_iteration": 2.566089153289795 + }, + { + "auxiliary_loss_clip": 0.01148063, + "auxiliary_loss_mlp": 0.01165174, + "balance_loss_clip": 1.00221407, + "balance_loss_mlp": 1.00114155, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 1.6261868006290099, + "language_loss": 0.73151547, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75464791, + "num_input_tokens_seen": 45065145, + "step": 2088, + "time_per_iteration": 2.6622211933135986 + }, + { + "auxiliary_loss_clip": 0.01163797, + "auxiliary_loss_mlp": 0.01157991, + "balance_loss_clip": 1.00276971, + "balance_loss_mlp": 1.00006223, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.755799333227955, + "language_loss": 0.61791408, + "learning_rate": 3.904897605614418e-06, + "loss": 0.641132, + "num_input_tokens_seen": 45126230, + "step": 2089, + "time_per_iteration": 3.1138463020324707 + }, + { + "auxiliary_loss_clip": 0.0114872, + "auxiliary_loss_mlp": 0.0116509, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00105786, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.021856764052984, + "language_loss": 0.77601874, + "learning_rate": 3.904778901042793e-06, + "loss": 0.79915679, + "num_input_tokens_seen": 45145545, + "step": 2090, + "time_per_iteration": 2.628868341445923 + }, + { + "auxiliary_loss_clip": 0.01146285, + "auxiliary_loss_mlp": 0.01157992, + "balance_loss_clip": 1.00233972, + "balance_loss_mlp": 1.00006378, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.7806482441186624, + "language_loss": 0.59471393, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61775666, + "num_input_tokens_seen": 45206845, + "step": 2091, + "time_per_iteration": 3.1511247158050537 + }, + { + "auxiliary_loss_clip": 0.01164889, + "auxiliary_loss_mlp": 0.01164574, + "balance_loss_clip": 1.00242949, + "balance_loss_mlp": 1.00101876, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.7761084605188742, + "language_loss": 0.63171965, + "learning_rate": 3.904541275215825e-06, + "loss": 0.65501428, + "num_input_tokens_seen": 45228495, + "step": 2092, + "time_per_iteration": 2.750337600708008 + }, + { + "auxiliary_loss_clip": 0.01149294, + "auxiliary_loss_mlp": 0.01165731, + "balance_loss_clip": 1.00243115, + "balance_loss_mlp": 1.001508, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 2.2315077697702628, + "language_loss": 0.80097497, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82412517, + "num_input_tokens_seen": 45245720, + "step": 2093, + "time_per_iteration": 2.608726739883423 + }, + { + "auxiliary_loss_clip": 0.01165628, + "auxiliary_loss_mlp": 0.0116534, + "balance_loss_clip": 1.00237525, + "balance_loss_mlp": 1.00149822, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.8254653933923388, + "language_loss": 0.76179957, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78510922, + "num_input_tokens_seen": 45265650, + "step": 2094, + "time_per_iteration": 2.5656003952026367 + }, + { + "auxiliary_loss_clip": 0.01132413, + "auxiliary_loss_mlp": 0.01164391, + "balance_loss_clip": 1.00221682, + "balance_loss_mlp": 1.00102711, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.6232141808560905, + "language_loss": 0.7678538, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79082185, + "num_input_tokens_seen": 45287790, + "step": 2095, + "time_per_iteration": 2.8618264198303223 + }, + { + "auxiliary_loss_clip": 0.01148655, + "auxiliary_loss_mlp": 0.01164872, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.00131655, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.457521308578446, + "language_loss": 0.83115733, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85429257, + "num_input_tokens_seen": 45305720, + "step": 2096, + "time_per_iteration": 2.601041793823242 + }, + { + "auxiliary_loss_clip": 0.01165125, + "auxiliary_loss_mlp": 0.01164736, + "balance_loss_clip": 1.00230742, + "balance_loss_mlp": 1.00098968, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 2.42683190272539, + "language_loss": 0.75702703, + "learning_rate": 3.903945946870439e-06, + "loss": 0.78032565, + "num_input_tokens_seen": 45325290, + "step": 2097, + "time_per_iteration": 2.584178924560547 + }, + { + "auxiliary_loss_clip": 0.01164527, + "auxiliary_loss_mlp": 0.0116507, + "balance_loss_clip": 1.00226569, + "balance_loss_mlp": 1.00132442, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 4.56865486424507, + "language_loss": 0.87016094, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89345694, + "num_input_tokens_seen": 45344465, + "step": 2098, + "time_per_iteration": 2.6124918460845947 + }, + { + "auxiliary_loss_clip": 0.01116387, + "auxiliary_loss_mlp": 0.01165046, + "balance_loss_clip": 1.00201654, + "balance_loss_mlp": 1.00129962, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.7685430539044464, + "language_loss": 0.69776773, + "learning_rate": 3.903707310115912e-06, + "loss": 0.72058207, + "num_input_tokens_seen": 45362465, + "step": 2099, + "time_per_iteration": 2.697500705718994 + }, + { + "auxiliary_loss_clip": 0.01153321, + "auxiliary_loss_mlp": 0.01165024, + "balance_loss_clip": 1.00243211, + "balance_loss_mlp": 1.00127816, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 2.598266816058081, + "language_loss": 0.80992424, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83310771, + "num_input_tokens_seen": 45382700, + "step": 2100, + "time_per_iteration": 5.638598918914795 + }, + { + "auxiliary_loss_clip": 0.01148584, + "auxiliary_loss_mlp": 0.01164857, + "balance_loss_clip": 1.00206327, + "balance_loss_mlp": 1.00111103, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 1.84661602422827, + "language_loss": 0.80113584, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82427025, + "num_input_tokens_seen": 45401005, + "step": 2101, + "time_per_iteration": 2.6579370498657227 + }, + { + "auxiliary_loss_clip": 0.0117945, + "auxiliary_loss_mlp": 0.01157908, + "balance_loss_clip": 1.00291467, + "balance_loss_mlp": 0.9999792, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7323913844959992, + "language_loss": 0.57104576, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59441936, + "num_input_tokens_seen": 45466555, + "step": 2102, + "time_per_iteration": 4.662429571151733 + }, + { + "auxiliary_loss_clip": 0.01132534, + "auxiliary_loss_mlp": 0.01164463, + "balance_loss_clip": 1.00219321, + "balance_loss_mlp": 1.00109816, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 1.9675446543839095, + "language_loss": 0.93527687, + "learning_rate": 3.903229170377845e-06, + "loss": 0.95824683, + "num_input_tokens_seen": 45485165, + "step": 2103, + "time_per_iteration": 2.6413588523864746 + }, + { + "auxiliary_loss_clip": 0.01165209, + "auxiliary_loss_mlp": 0.01164362, + "balance_loss_clip": 1.00226355, + "balance_loss_mlp": 1.00080681, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.6223280365807244, + "language_loss": 0.78180861, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80510437, + "num_input_tokens_seen": 45504630, + "step": 2104, + "time_per_iteration": 4.0403265953063965 + }, + { + "auxiliary_loss_clip": 0.0113262, + "auxiliary_loss_mlp": 0.01164887, + "balance_loss_clip": 1.00226533, + "balance_loss_mlp": 1.00114131, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.9401226931448061, + "language_loss": 0.80951667, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83249176, + "num_input_tokens_seen": 45524885, + "step": 2105, + "time_per_iteration": 2.7074034214019775 + }, + { + "auxiliary_loss_clip": 0.0116906, + "auxiliary_loss_mlp": 0.0116538, + "balance_loss_clip": 1.00258183, + "balance_loss_mlp": 1.00125265, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.101373320980191, + "language_loss": 0.83678395, + "learning_rate": 3.90286980776671e-06, + "loss": 0.8601284, + "num_input_tokens_seen": 45545000, + "step": 2106, + "time_per_iteration": 2.628042459487915 + }, + { + "auxiliary_loss_clip": 0.01120134, + "auxiliary_loss_mlp": 0.01164778, + "balance_loss_clip": 1.00230241, + "balance_loss_mlp": 1.00131822, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 1.7216344871619853, + "language_loss": 0.73509943, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75794852, + "num_input_tokens_seen": 45564210, + "step": 2107, + "time_per_iteration": 2.714912176132202 + }, + { + "auxiliary_loss_clip": 0.01180965, + "auxiliary_loss_mlp": 0.01164493, + "balance_loss_clip": 1.00231957, + "balance_loss_mlp": 1.00103354, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.7038981835123894, + "language_loss": 0.79365844, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81711298, + "num_input_tokens_seen": 45583030, + "step": 2108, + "time_per_iteration": 2.540210247039795 + }, + { + "auxiliary_loss_clip": 0.01181138, + "auxiliary_loss_mlp": 0.01164602, + "balance_loss_clip": 1.00230908, + "balance_loss_mlp": 1.00095165, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.00705574873083, + "language_loss": 0.76074898, + "learning_rate": 3.902509795742467e-06, + "loss": 0.78420645, + "num_input_tokens_seen": 45602265, + "step": 2109, + "time_per_iteration": 2.5984833240509033 + }, + { + "auxiliary_loss_clip": 0.01134055, + "auxiliary_loss_mlp": 0.01164865, + "balance_loss_clip": 1.00249994, + "balance_loss_mlp": 1.00130963, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 1.624594766144739, + "language_loss": 0.8300404, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85302961, + "num_input_tokens_seen": 45620595, + "step": 2110, + "time_per_iteration": 2.721653699874878 + }, + { + "auxiliary_loss_clip": 0.01149415, + "auxiliary_loss_mlp": 0.00749042, + "balance_loss_clip": 1.0022254, + "balance_loss_mlp": 1.00021756, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.71123021682417, + "language_loss": 0.78668964, + "learning_rate": 3.90226942700191e-06, + "loss": 0.8056742, + "num_input_tokens_seen": 45641140, + "step": 2111, + "time_per_iteration": 2.6469688415527344 + }, + { + "auxiliary_loss_clip": 0.01115272, + "auxiliary_loss_mlp": 0.01164996, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00144112, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.027146122848215, + "language_loss": 0.76628065, + "learning_rate": 3.902149134427982e-06, + "loss": 0.78908336, + "num_input_tokens_seen": 45662315, + "step": 2112, + "time_per_iteration": 2.747738838195801 + }, + { + "auxiliary_loss_clip": 0.01132803, + "auxiliary_loss_mlp": 0.01164526, + "balance_loss_clip": 1.00204682, + "balance_loss_mlp": 1.0012573, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.8429070485313623, + "language_loss": 0.85731316, + "learning_rate": 3.902028769724367e-06, + "loss": 0.88028646, + "num_input_tokens_seen": 45680335, + "step": 2113, + "time_per_iteration": 2.675658702850342 + }, + { + "auxiliary_loss_clip": 0.01133147, + "auxiliary_loss_mlp": 0.01164779, + "balance_loss_clip": 1.0021534, + "balance_loss_mlp": 1.00141406, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.0699200004605203, + "language_loss": 0.73924828, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76222754, + "num_input_tokens_seen": 45696240, + "step": 2114, + "time_per_iteration": 2.6165215969085693 + }, + { + "auxiliary_loss_clip": 0.01164306, + "auxiliary_loss_mlp": 0.01164318, + "balance_loss_clip": 1.00220656, + "balance_loss_mlp": 1.00114477, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7714349345165612, + "language_loss": 0.83583564, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85912192, + "num_input_tokens_seen": 45713695, + "step": 2115, + "time_per_iteration": 2.5392239093780518 + }, + { + "auxiliary_loss_clip": 0.01148953, + "auxiliary_loss_mlp": 0.0116467, + "balance_loss_clip": 1.00224137, + "balance_loss_mlp": 1.00149655, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.7222615486617718, + "language_loss": 0.86730152, + "learning_rate": 3.901667242881065e-06, + "loss": 0.89043784, + "num_input_tokens_seen": 45736655, + "step": 2116, + "time_per_iteration": 2.67948055267334 + }, + { + "auxiliary_loss_clip": 0.01148976, + "auxiliary_loss_mlp": 0.00748946, + "balance_loss_clip": 1.00213695, + "balance_loss_mlp": 1.00020397, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.756059479753817, + "language_loss": 0.70800865, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72698784, + "num_input_tokens_seen": 45758195, + "step": 2117, + "time_per_iteration": 2.6965343952178955 + }, + { + "auxiliary_loss_clip": 0.01149521, + "auxiliary_loss_mlp": 0.01164504, + "balance_loss_clip": 1.00223637, + "balance_loss_mlp": 1.00133014, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 1.9570975209271504, + "language_loss": 0.86454391, + "learning_rate": 3.901425864420852e-06, + "loss": 0.88768417, + "num_input_tokens_seen": 45774280, + "step": 2118, + "time_per_iteration": 2.570850133895874 + }, + { + "auxiliary_loss_clip": 0.01164799, + "auxiliary_loss_mlp": 0.01164294, + "balance_loss_clip": 1.00221324, + "balance_loss_mlp": 1.00112045, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.9083045062169848, + "language_loss": 0.87406588, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89735675, + "num_input_tokens_seen": 45792760, + "step": 2119, + "time_per_iteration": 2.55285382270813 + }, + { + "auxiliary_loss_clip": 0.01165596, + "auxiliary_loss_mlp": 0.00748963, + "balance_loss_clip": 1.00249219, + "balance_loss_mlp": 1.00014377, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.2714508111520275, + "language_loss": 0.8762598, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89540541, + "num_input_tokens_seen": 45804300, + "step": 2120, + "time_per_iteration": 2.4917049407958984 + }, + { + "auxiliary_loss_clip": 0.01180995, + "auxiliary_loss_mlp": 0.01164418, + "balance_loss_clip": 1.00233364, + "balance_loss_mlp": 1.00105357, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 2.13243141875742, + "language_loss": 0.75502843, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77848256, + "num_input_tokens_seen": 45823780, + "step": 2121, + "time_per_iteration": 2.5246241092681885 + }, + { + "auxiliary_loss_clip": 0.0112056, + "auxiliary_loss_mlp": 0.01164725, + "balance_loss_clip": 1.00213456, + "balance_loss_mlp": 1.00126541, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.220651257131202, + "language_loss": 0.82822531, + "learning_rate": 3.900942242309978e-06, + "loss": 0.85107815, + "num_input_tokens_seen": 45840495, + "step": 2122, + "time_per_iteration": 2.700995445251465 + }, + { + "auxiliary_loss_clip": 0.01148373, + "auxiliary_loss_mlp": 0.01164698, + "balance_loss_clip": 1.0021553, + "balance_loss_mlp": 1.00104737, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.128845715413553, + "language_loss": 0.79052645, + "learning_rate": 3.90082115656099e-06, + "loss": 0.8136571, + "num_input_tokens_seen": 45857735, + "step": 2123, + "time_per_iteration": 2.596602439880371 + }, + { + "auxiliary_loss_clip": 0.01180867, + "auxiliary_loss_mlp": 0.01164731, + "balance_loss_clip": 1.00227499, + "balance_loss_mlp": 1.00127137, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.4942477292242609, + "language_loss": 0.79004323, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81349921, + "num_input_tokens_seen": 45876485, + "step": 2124, + "time_per_iteration": 2.5301513671875 + }, + { + "auxiliary_loss_clip": 0.01164858, + "auxiliary_loss_mlp": 0.00749031, + "balance_loss_clip": 1.0022372, + "balance_loss_mlp": 1.00018859, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.3673844979018686, + "language_loss": 0.75791907, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77705801, + "num_input_tokens_seen": 45894645, + "step": 2125, + "time_per_iteration": 2.5794625282287598 + }, + { + "auxiliary_loss_clip": 0.01165193, + "auxiliary_loss_mlp": 0.00748995, + "balance_loss_clip": 1.00221276, + "balance_loss_mlp": 1.00020516, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.97058996147663, + "language_loss": 0.78230369, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80144554, + "num_input_tokens_seen": 45913755, + "step": 2126, + "time_per_iteration": 2.5961310863494873 + }, + { + "auxiliary_loss_clip": 0.01132078, + "auxiliary_loss_mlp": 0.01164747, + "balance_loss_clip": 1.00213528, + "balance_loss_mlp": 1.0012877, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.780103380595343, + "language_loss": 0.68933451, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7123028, + "num_input_tokens_seen": 45936095, + "step": 2127, + "time_per_iteration": 2.8138718605041504 + }, + { + "auxiliary_loss_clip": 0.01114751, + "auxiliary_loss_mlp": 0.00748652, + "balance_loss_clip": 1.00250602, + "balance_loss_mlp": 0.99956381, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.853228931185775, + "language_loss": 0.62806934, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64670336, + "num_input_tokens_seen": 46004655, + "step": 2128, + "time_per_iteration": 3.286226749420166 + }, + { + "auxiliary_loss_clip": 0.01165562, + "auxiliary_loss_mlp": 0.01164738, + "balance_loss_clip": 1.00235438, + "balance_loss_mlp": 1.00108719, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.2804626342103553, + "language_loss": 0.77280074, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79610372, + "num_input_tokens_seen": 46023610, + "step": 2129, + "time_per_iteration": 2.5684242248535156 + }, + { + "auxiliary_loss_clip": 0.01116133, + "auxiliary_loss_mlp": 0.0116468, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.00102925, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.2288420510772275, + "language_loss": 0.79512924, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81793737, + "num_input_tokens_seen": 46041725, + "step": 2130, + "time_per_iteration": 2.7313525676727295 + }, + { + "auxiliary_loss_clip": 0.01152954, + "auxiliary_loss_mlp": 0.01164591, + "balance_loss_clip": 1.00230932, + "balance_loss_mlp": 1.00094032, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 1.805832588560599, + "language_loss": 0.71093786, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73411334, + "num_input_tokens_seen": 46061095, + "step": 2131, + "time_per_iteration": 2.5904057025909424 + }, + { + "auxiliary_loss_clip": 0.01099697, + "auxiliary_loss_mlp": 0.01164565, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00129628, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.054802522301856, + "language_loss": 0.72373414, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74637675, + "num_input_tokens_seen": 46082670, + "step": 2132, + "time_per_iteration": 2.8200674057006836 + }, + { + "auxiliary_loss_clip": 0.01121251, + "auxiliary_loss_mlp": 0.01164565, + "balance_loss_clip": 1.00213969, + "balance_loss_mlp": 1.00110519, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.1873451516220204, + "language_loss": 0.82261562, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84547377, + "num_input_tokens_seen": 46102410, + "step": 2133, + "time_per_iteration": 2.7150399684906006 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01165077, + "balance_loss_clip": 1.00218844, + "balance_loss_mlp": 1.00114083, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 3.8724747921104616, + "language_loss": 0.79802406, + "learning_rate": 3.899484457098528e-06, + "loss": 0.82132542, + "num_input_tokens_seen": 46121145, + "step": 2134, + "time_per_iteration": 2.58984375 + }, + { + "auxiliary_loss_clip": 0.01164549, + "auxiliary_loss_mlp": 0.01164351, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.00098622, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 1.7296696079477505, + "language_loss": 0.82771367, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85100269, + "num_input_tokens_seen": 46140740, + "step": 2135, + "time_per_iteration": 2.5720958709716797 + }, + { + "auxiliary_loss_clip": 0.01148514, + "auxiliary_loss_mlp": 0.01164974, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00132382, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.138608245950359, + "language_loss": 0.77026653, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79340148, + "num_input_tokens_seen": 46156805, + "step": 2136, + "time_per_iteration": 2.5976626873016357 + }, + { + "auxiliary_loss_clip": 0.0111397, + "auxiliary_loss_mlp": 0.01157925, + "balance_loss_clip": 1.0021112, + "balance_loss_mlp": 0.9999966, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.9027434175369375, + "language_loss": 0.59155148, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61427039, + "num_input_tokens_seen": 46222085, + "step": 2137, + "time_per_iteration": 3.355912208557129 + }, + { + "auxiliary_loss_clip": 0.01164267, + "auxiliary_loss_mlp": 0.01164645, + "balance_loss_clip": 1.00209856, + "balance_loss_mlp": 1.0012809, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.2353698395401764, + "language_loss": 0.82380956, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84709865, + "num_input_tokens_seen": 46239970, + "step": 2138, + "time_per_iteration": 3.9710755348205566 + }, + { + "auxiliary_loss_clip": 0.0116462, + "auxiliary_loss_mlp": 0.01165102, + "balance_loss_clip": 1.00230014, + "balance_loss_mlp": 1.00126076, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.476277272526319, + "language_loss": 0.77996707, + "learning_rate": 3.898873984919113e-06, + "loss": 0.80326426, + "num_input_tokens_seen": 46257740, + "step": 2139, + "time_per_iteration": 5.3915040493011475 + }, + { + "auxiliary_loss_clip": 0.0113179, + "auxiliary_loss_mlp": 0.01164361, + "balance_loss_clip": 1.00216079, + "balance_loss_mlp": 1.00137758, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.863089375761434, + "language_loss": 0.85206938, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.87503088, + "num_input_tokens_seen": 46275445, + "step": 2140, + "time_per_iteration": 2.6309750080108643 + }, + { + "auxiliary_loss_clip": 0.01148519, + "auxiliary_loss_mlp": 0.01164087, + "balance_loss_clip": 1.00207257, + "balance_loss_mlp": 1.00129473, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.044241711001973, + "language_loss": 0.85927641, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88240242, + "num_input_tokens_seen": 46291710, + "step": 2141, + "time_per_iteration": 4.022416591644287 + }, + { + "auxiliary_loss_clip": 0.01148549, + "auxiliary_loss_mlp": 0.01164731, + "balance_loss_clip": 1.00203514, + "balance_loss_mlp": 1.00117636, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 1.8778371984863285, + "language_loss": 0.68067098, + "learning_rate": 3.898506837508518e-06, + "loss": 0.70380378, + "num_input_tokens_seen": 46311335, + "step": 2142, + "time_per_iteration": 2.738335609436035 + }, + { + "auxiliary_loss_clip": 0.01164548, + "auxiliary_loss_mlp": 0.00748988, + "balance_loss_clip": 1.0021987, + "balance_loss_mlp": 1.00013757, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.1727649254863843, + "language_loss": 0.83055818, + "learning_rate": 3.89838431104899e-06, + "loss": 0.84969354, + "num_input_tokens_seen": 46330985, + "step": 2143, + "time_per_iteration": 2.6550841331481934 + }, + { + "auxiliary_loss_clip": 0.01181213, + "auxiliary_loss_mlp": 0.00748942, + "balance_loss_clip": 1.0025115, + "balance_loss_mlp": 1.00018549, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.6855443794714933, + "language_loss": 0.81883514, + "learning_rate": 3.898261712602539e-06, + "loss": 0.83813667, + "num_input_tokens_seen": 46351295, + "step": 2144, + "time_per_iteration": 2.551863670349121 + }, + { + "auxiliary_loss_clip": 0.01148738, + "auxiliary_loss_mlp": 0.01165053, + "balance_loss_clip": 1.00211215, + "balance_loss_mlp": 1.00111675, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 2.0473802863401303, + "language_loss": 0.78179526, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80493313, + "num_input_tokens_seen": 46368600, + "step": 2145, + "time_per_iteration": 2.6115009784698486 + }, + { + "auxiliary_loss_clip": 0.01180962, + "auxiliary_loss_mlp": 0.01164431, + "balance_loss_clip": 1.00234246, + "balance_loss_mlp": 1.00116158, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.176915310948943, + "language_loss": 0.82485902, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84831297, + "num_input_tokens_seen": 46387370, + "step": 2146, + "time_per_iteration": 2.528125524520874 + }, + { + "auxiliary_loss_clip": 0.01147841, + "auxiliary_loss_mlp": 0.01164676, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00112104, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.564996418062781, + "language_loss": 0.70961046, + "learning_rate": 3.897893485388149e-06, + "loss": 0.73273563, + "num_input_tokens_seen": 46409570, + "step": 2147, + "time_per_iteration": 2.728729248046875 + }, + { + "auxiliary_loss_clip": 0.01147794, + "auxiliary_loss_mlp": 0.01164602, + "balance_loss_clip": 1.00199819, + "balance_loss_mlp": 1.00114202, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.278471206888382, + "language_loss": 0.71435112, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73747504, + "num_input_tokens_seen": 46429320, + "step": 2148, + "time_per_iteration": 2.603848695755005 + }, + { + "auxiliary_loss_clip": 0.01180901, + "auxiliary_loss_mlp": 0.01164284, + "balance_loss_clip": 1.00230217, + "balance_loss_mlp": 1.00110996, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.5413050558550108, + "language_loss": 0.79077065, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81422246, + "num_input_tokens_seen": 46450155, + "step": 2149, + "time_per_iteration": 2.6269071102142334 + }, + { + "auxiliary_loss_clip": 0.01165149, + "auxiliary_loss_mlp": 0.01164097, + "balance_loss_clip": 1.00226259, + "balance_loss_mlp": 1.00120938, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.30094281976741, + "language_loss": 0.76751304, + "learning_rate": 3.897524610458975e-06, + "loss": 0.79080546, + "num_input_tokens_seen": 46470280, + "step": 2150, + "time_per_iteration": 2.6921467781066895 + }, + { + "auxiliary_loss_clip": 0.01164345, + "auxiliary_loss_mlp": 0.0116452, + "balance_loss_clip": 1.00232291, + "balance_loss_mlp": 1.0011555, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.0944320638772105, + "language_loss": 0.70367527, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72696394, + "num_input_tokens_seen": 46487605, + "step": 2151, + "time_per_iteration": 2.5743913650512695 + }, + { + "auxiliary_loss_clip": 0.01180992, + "auxiliary_loss_mlp": 0.01164646, + "balance_loss_clip": 1.00241637, + "balance_loss_mlp": 1.00118589, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.9771670411376328, + "language_loss": 0.83872747, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86218381, + "num_input_tokens_seen": 46505100, + "step": 2152, + "time_per_iteration": 2.5053060054779053 + }, + { + "auxiliary_loss_clip": 0.01165096, + "auxiliary_loss_mlp": 0.01164691, + "balance_loss_clip": 1.00232339, + "balance_loss_mlp": 1.00113571, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 1.6569885659873742, + "language_loss": 0.78625154, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80954945, + "num_input_tokens_seen": 46524020, + "step": 2153, + "time_per_iteration": 2.545398235321045 + }, + { + "auxiliary_loss_clip": 0.01132457, + "auxiliary_loss_mlp": 0.00748941, + "balance_loss_clip": 1.00225389, + "balance_loss_mlp": 1.00014257, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 1.6859115151068165, + "language_loss": 0.79917145, + "learning_rate": 3.897031769881364e-06, + "loss": 0.81798542, + "num_input_tokens_seen": 46544640, + "step": 2154, + "time_per_iteration": 2.707463502883911 + }, + { + "auxiliary_loss_clip": 0.0116538, + "auxiliary_loss_mlp": 0.01164719, + "balance_loss_clip": 1.002424, + "balance_loss_mlp": 1.00125909, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 1.9232377389068085, + "language_loss": 0.83520621, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85850716, + "num_input_tokens_seen": 46561395, + "step": 2155, + "time_per_iteration": 2.534374952316284 + }, + { + "auxiliary_loss_clip": 0.011648, + "auxiliary_loss_mlp": 0.01164434, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.00116491, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.605464479791483, + "language_loss": 0.76226544, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78555781, + "num_input_tokens_seen": 46579395, + "step": 2156, + "time_per_iteration": 2.576720952987671 + }, + { + "auxiliary_loss_clip": 0.01104569, + "auxiliary_loss_mlp": 0.01164299, + "balance_loss_clip": 1.00272822, + "balance_loss_mlp": 1.00131607, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.7692992705818755, + "language_loss": 0.86599219, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88868082, + "num_input_tokens_seen": 46597090, + "step": 2157, + "time_per_iteration": 2.70458722114563 + }, + { + "auxiliary_loss_clip": 0.01180825, + "auxiliary_loss_mlp": 0.01164616, + "balance_loss_clip": 1.00217175, + "balance_loss_mlp": 1.00125158, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 3.023853623124413, + "language_loss": 0.81322491, + "learning_rate": 3.896537778333651e-06, + "loss": 0.83667928, + "num_input_tokens_seen": 46617355, + "step": 2158, + "time_per_iteration": 2.5822250843048096 + }, + { + "auxiliary_loss_clip": 0.01180935, + "auxiliary_loss_mlp": 0.01164889, + "balance_loss_clip": 1.00231719, + "balance_loss_mlp": 1.00133431, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.46191007859823, + "language_loss": 0.74682885, + "learning_rate": 3.896414100642752e-06, + "loss": 0.77028716, + "num_input_tokens_seen": 46633130, + "step": 2159, + "time_per_iteration": 2.5384891033172607 + }, + { + "auxiliary_loss_clip": 0.01132584, + "auxiliary_loss_mlp": 0.01163981, + "balance_loss_clip": 1.00203872, + "balance_loss_mlp": 1.00099814, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.3325956445535465, + "language_loss": 0.82342494, + "learning_rate": 3.89629035103964e-06, + "loss": 0.8463906, + "num_input_tokens_seen": 46650575, + "step": 2160, + "time_per_iteration": 2.710045099258423 + }, + { + "auxiliary_loss_clip": 0.01164244, + "auxiliary_loss_mlp": 0.0116407, + "balance_loss_clip": 1.00229788, + "balance_loss_mlp": 1.00118232, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.7389894450449754, + "language_loss": 0.8212887, + "learning_rate": 3.896166529529008e-06, + "loss": 0.84457195, + "num_input_tokens_seen": 46668780, + "step": 2161, + "time_per_iteration": 2.5538718700408936 + }, + { + "auxiliary_loss_clip": 0.01152299, + "auxiliary_loss_mlp": 0.01164724, + "balance_loss_clip": 1.00253963, + "balance_loss_mlp": 1.00126386, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 1.9403853538618812, + "language_loss": 0.82620513, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84937537, + "num_input_tokens_seen": 46687550, + "step": 2162, + "time_per_iteration": 2.6776812076568604 + }, + { + "auxiliary_loss_clip": 0.01131261, + "auxiliary_loss_mlp": 0.01164235, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00125241, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.175073015997847, + "language_loss": 0.73172843, + "learning_rate": 3.895918670803968e-06, + "loss": 0.75468338, + "num_input_tokens_seen": 46706730, + "step": 2163, + "time_per_iteration": 2.6340551376342773 + }, + { + "auxiliary_loss_clip": 0.01180908, + "auxiliary_loss_mlp": 0.00749033, + "balance_loss_clip": 1.00226986, + "balance_loss_mlp": 1.00016081, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.2086980276382255, + "language_loss": 0.8120333, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83133268, + "num_input_tokens_seen": 46724250, + "step": 2164, + "time_per_iteration": 2.537475109100342 + }, + { + "auxiliary_loss_clip": 0.01115915, + "auxiliary_loss_mlp": 0.01164707, + "balance_loss_clip": 1.00204098, + "balance_loss_mlp": 1.00115228, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.0703445542152688, + "language_loss": 0.72370952, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74651575, + "num_input_tokens_seen": 46744105, + "step": 2165, + "time_per_iteration": 2.6933343410491943 + }, + { + "auxiliary_loss_clip": 0.0110338, + "auxiliary_loss_mlp": 0.01164609, + "balance_loss_clip": 1.00227976, + "balance_loss_mlp": 1.00095844, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 1.9829964118210002, + "language_loss": 0.74974644, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.77242625, + "num_input_tokens_seen": 46764250, + "step": 2166, + "time_per_iteration": 2.8338747024536133 + }, + { + "auxiliary_loss_clip": 0.01180843, + "auxiliary_loss_mlp": 0.01164422, + "balance_loss_clip": 1.00234389, + "balance_loss_mlp": 1.00115275, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.5565491513521645, + "language_loss": 0.82831097, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85176367, + "num_input_tokens_seen": 46786865, + "step": 2167, + "time_per_iteration": 2.866763114929199 + }, + { + "auxiliary_loss_clip": 0.01116468, + "auxiliary_loss_mlp": 0.01164507, + "balance_loss_clip": 1.00218415, + "balance_loss_mlp": 1.00123835, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.5113004225706494, + "language_loss": 0.83436787, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85717762, + "num_input_tokens_seen": 46807030, + "step": 2168, + "time_per_iteration": 2.707340955734253 + }, + { + "auxiliary_loss_clip": 0.01083936, + "auxiliary_loss_mlp": 0.01164545, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00127649, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.077309741893673, + "language_loss": 0.80139762, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82388246, + "num_input_tokens_seen": 46826280, + "step": 2169, + "time_per_iteration": 2.7758243083953857 + }, + { + "auxiliary_loss_clip": 0.01181056, + "auxiliary_loss_mlp": 0.01164324, + "balance_loss_clip": 1.00242901, + "balance_loss_mlp": 1.00095963, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 2.0409581211773884, + "language_loss": 0.66395456, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68740839, + "num_input_tokens_seen": 46846505, + "step": 2170, + "time_per_iteration": 2.5942885875701904 + }, + { + "auxiliary_loss_clip": 0.01147703, + "auxiliary_loss_mlp": 0.0116438, + "balance_loss_clip": 1.00220215, + "balance_loss_mlp": 1.00101578, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.5700837166904698, + "language_loss": 0.6701529, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69327378, + "num_input_tokens_seen": 46867380, + "step": 2171, + "time_per_iteration": 2.66928768157959 + }, + { + "auxiliary_loss_clip": 0.01164311, + "auxiliary_loss_mlp": 0.01164385, + "balance_loss_clip": 1.00215948, + "balance_loss_mlp": 1.00130653, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 2.4209238860334135, + "language_loss": 0.71947145, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74275839, + "num_input_tokens_seen": 46886810, + "step": 2172, + "time_per_iteration": 2.567812919616699 + }, + { + "auxiliary_loss_clip": 0.0113206, + "auxiliary_loss_mlp": 0.0116382, + "balance_loss_clip": 1.00225949, + "balance_loss_mlp": 1.00102782, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.7978907827705137, + "language_loss": 0.75779021, + "learning_rate": 3.894675064326678e-06, + "loss": 0.78074908, + "num_input_tokens_seen": 46905620, + "step": 2173, + "time_per_iteration": 2.661017656326294 + }, + { + "auxiliary_loss_clip": 0.01132654, + "auxiliary_loss_mlp": 0.01164833, + "balance_loss_clip": 1.00223172, + "balance_loss_mlp": 1.00099206, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.2831626643862464, + "language_loss": 0.70702702, + "learning_rate": 3.894550308446551e-06, + "loss": 0.73000181, + "num_input_tokens_seen": 46925120, + "step": 2174, + "time_per_iteration": 2.7410924434661865 + }, + { + "auxiliary_loss_clip": 0.0114569, + "auxiliary_loss_mlp": 0.01157122, + "balance_loss_clip": 1.00227308, + "balance_loss_mlp": 0.99995607, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8082520361886117, + "language_loss": 0.59113431, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61416245, + "num_input_tokens_seen": 46988195, + "step": 2175, + "time_per_iteration": 3.296912670135498 + }, + { + "auxiliary_loss_clip": 0.01165186, + "auxiliary_loss_mlp": 0.01164244, + "balance_loss_clip": 1.00231051, + "balance_loss_mlp": 1.00107074, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.379329866859145, + "language_loss": 0.80343854, + "learning_rate": 3.894300581166417e-06, + "loss": 0.82673281, + "num_input_tokens_seen": 47004720, + "step": 2176, + "time_per_iteration": 5.242400407791138 + }, + { + "auxiliary_loss_clip": 0.01180794, + "auxiliary_loss_mlp": 0.01164278, + "balance_loss_clip": 1.00229001, + "balance_loss_mlp": 1.00110459, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.891833380402867, + "language_loss": 0.74589896, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76934969, + "num_input_tokens_seen": 47024255, + "step": 2177, + "time_per_iteration": 4.028919696807861 + }, + { + "auxiliary_loss_clip": 0.01131196, + "auxiliary_loss_mlp": 0.01164535, + "balance_loss_clip": 1.00212967, + "balance_loss_mlp": 1.00117075, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 3.4377223853894487, + "language_loss": 0.82145071, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84440804, + "num_input_tokens_seen": 47042465, + "step": 2178, + "time_per_iteration": 2.6013875007629395 + }, + { + "auxiliary_loss_clip": 0.01180916, + "auxiliary_loss_mlp": 0.01164568, + "balance_loss_clip": 1.00232434, + "balance_loss_mlp": 1.00110853, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 3.508336756353218, + "language_loss": 0.74642479, + "learning_rate": 3.893925451517562e-06, + "loss": 0.76987964, + "num_input_tokens_seen": 47060370, + "step": 2179, + "time_per_iteration": 4.007976770401001 + }, + { + "auxiliary_loss_clip": 0.01131461, + "auxiliary_loss_mlp": 0.01163972, + "balance_loss_clip": 1.00205898, + "balance_loss_mlp": 1.00108397, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 9.714849976589965, + "language_loss": 0.84606957, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86902386, + "num_input_tokens_seen": 47081415, + "step": 2180, + "time_per_iteration": 2.657503604888916 + }, + { + "auxiliary_loss_clip": 0.01164542, + "auxiliary_loss_mlp": 0.0116431, + "balance_loss_clip": 1.00237155, + "balance_loss_mlp": 1.00142229, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 1.7618333580869758, + "language_loss": 0.89832318, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92161179, + "num_input_tokens_seen": 47099860, + "step": 2181, + "time_per_iteration": 2.5643179416656494 + }, + { + "auxiliary_loss_clip": 0.0116516, + "auxiliary_loss_mlp": 0.01164481, + "balance_loss_clip": 1.00230253, + "balance_loss_mlp": 1.00111628, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 2.0213556848006706, + "language_loss": 0.68794298, + "learning_rate": 3.893549675508137e-06, + "loss": 0.71123934, + "num_input_tokens_seen": 47118540, + "step": 2182, + "time_per_iteration": 2.6100471019744873 + }, + { + "auxiliary_loss_clip": 0.01132018, + "auxiliary_loss_mlp": 0.01163968, + "balance_loss_clip": 1.00206363, + "balance_loss_mlp": 1.001176, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.103062432818273, + "language_loss": 0.78728735, + "learning_rate": 3.893424273224806e-06, + "loss": 0.81024718, + "num_input_tokens_seen": 47136710, + "step": 2183, + "time_per_iteration": 2.692031145095825 + }, + { + "auxiliary_loss_clip": 0.01180719, + "auxiliary_loss_mlp": 0.01163643, + "balance_loss_clip": 1.00225282, + "balance_loss_mlp": 1.00094652, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 1.8644755975217555, + "language_loss": 0.85408825, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87753189, + "num_input_tokens_seen": 47157155, + "step": 2184, + "time_per_iteration": 2.5809364318847656 + }, + { + "auxiliary_loss_clip": 0.01131061, + "auxiliary_loss_mlp": 0.01164092, + "balance_loss_clip": 1.00205672, + "balance_loss_mlp": 1.00101352, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 1.8865826198185733, + "language_loss": 0.82034332, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84329486, + "num_input_tokens_seen": 47176820, + "step": 2185, + "time_per_iteration": 2.684147834777832 + }, + { + "auxiliary_loss_clip": 0.01147655, + "auxiliary_loss_mlp": 0.01164289, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.0011152, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 1.8086600851890946, + "language_loss": 0.72568071, + "learning_rate": 3.893047635600818e-06, + "loss": 0.74880016, + "num_input_tokens_seen": 47195855, + "step": 2186, + "time_per_iteration": 2.648632764816284 + }, + { + "auxiliary_loss_clip": 0.01163922, + "auxiliary_loss_mlp": 0.01164028, + "balance_loss_clip": 1.00217724, + "balance_loss_mlp": 1.00104547, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.146188617225645, + "language_loss": 0.80076551, + "learning_rate": 3.892921946150693e-06, + "loss": 0.824045, + "num_input_tokens_seen": 47214535, + "step": 2187, + "time_per_iteration": 2.631546974182129 + }, + { + "auxiliary_loss_clip": 0.01118447, + "auxiliary_loss_mlp": 0.01157131, + "balance_loss_clip": 1.00323939, + "balance_loss_mlp": 0.99996519, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8444529867096041, + "language_loss": 0.59025204, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61300778, + "num_input_tokens_seen": 47270300, + "step": 2188, + "time_per_iteration": 3.2637414932250977 + }, + { + "auxiliary_loss_clip": 0.01084599, + "auxiliary_loss_mlp": 0.01163827, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00122595, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 2.0000848372400433, + "language_loss": 0.7382139, + "learning_rate": 3.892670351915842e-06, + "loss": 0.7606982, + "num_input_tokens_seen": 47290720, + "step": 2189, + "time_per_iteration": 2.8962771892547607 + }, + { + "auxiliary_loss_clip": 0.01168787, + "auxiliary_loss_mlp": 0.01163853, + "balance_loss_clip": 1.00262356, + "balance_loss_mlp": 1.00096548, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.8216555662058957, + "language_loss": 0.7242552, + "learning_rate": 3.892544447140657e-06, + "loss": 0.7475816, + "num_input_tokens_seen": 47311820, + "step": 2190, + "time_per_iteration": 3.00484037399292 + }, + { + "auxiliary_loss_clip": 0.0116401, + "auxiliary_loss_mlp": 0.01164075, + "balance_loss_clip": 1.00223219, + "balance_loss_mlp": 1.00147307, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 1.899618578906328, + "language_loss": 0.74457181, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76785266, + "num_input_tokens_seen": 47331605, + "step": 2191, + "time_per_iteration": 2.573458194732666 + }, + { + "auxiliary_loss_clip": 0.01133439, + "auxiliary_loss_mlp": 0.01164112, + "balance_loss_clip": 1.00234306, + "balance_loss_mlp": 1.00093818, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.06032765528175, + "language_loss": 0.78744668, + "learning_rate": 3.892292422298637e-06, + "loss": 0.81042218, + "num_input_tokens_seen": 47350455, + "step": 2192, + "time_per_iteration": 2.690340995788574 + }, + { + "auxiliary_loss_clip": 0.01116219, + "auxiliary_loss_mlp": 0.0116392, + "balance_loss_clip": 1.00207758, + "balance_loss_mlp": 1.00103271, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 2.3228839199212605, + "language_loss": 0.8531127, + "learning_rate": 3.892166302241361e-06, + "loss": 0.8759141, + "num_input_tokens_seen": 47368225, + "step": 2193, + "time_per_iteration": 2.678147077560425 + }, + { + "auxiliary_loss_clip": 0.01147146, + "auxiliary_loss_mlp": 0.01157455, + "balance_loss_clip": 1.00287962, + "balance_loss_mlp": 1.00028944, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 1.8342460613505263, + "language_loss": 0.54142427, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56447029, + "num_input_tokens_seen": 47427125, + "step": 2194, + "time_per_iteration": 3.163085460662842 + }, + { + "auxiliary_loss_clip": 0.01180763, + "auxiliary_loss_mlp": 0.01164298, + "balance_loss_clip": 1.00235748, + "balance_loss_mlp": 1.00121975, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.8530561316873373, + "language_loss": 0.72280401, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74625468, + "num_input_tokens_seen": 47450275, + "step": 2195, + "time_per_iteration": 2.604079246520996 + }, + { + "auxiliary_loss_clip": 0.01130968, + "auxiliary_loss_mlp": 0.00748981, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.00013125, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.9550265334773116, + "language_loss": 0.77964628, + "learning_rate": 3.891787511581859e-06, + "loss": 0.7984457, + "num_input_tokens_seen": 47469155, + "step": 2196, + "time_per_iteration": 2.6654140949249268 + }, + { + "auxiliary_loss_clip": 0.01165197, + "auxiliary_loss_mlp": 0.01163861, + "balance_loss_clip": 1.00224781, + "balance_loss_mlp": 1.0009737, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.163355819930509, + "language_loss": 0.74905539, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77234602, + "num_input_tokens_seen": 47488405, + "step": 2197, + "time_per_iteration": 2.5934693813323975 + }, + { + "auxiliary_loss_clip": 0.01180822, + "auxiliary_loss_mlp": 0.01163892, + "balance_loss_clip": 1.00228071, + "balance_loss_mlp": 1.00090885, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.4479667641191623, + "language_loss": 0.79575849, + "learning_rate": 3.891534625783685e-06, + "loss": 0.81920552, + "num_input_tokens_seen": 47505650, + "step": 2198, + "time_per_iteration": 2.500981569290161 + }, + { + "auxiliary_loss_clip": 0.01180799, + "auxiliary_loss_mlp": 0.01163919, + "balance_loss_clip": 1.00236154, + "balance_loss_mlp": 1.00131726, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.2696731780505424, + "language_loss": 0.83018255, + "learning_rate": 3.891408075291425e-06, + "loss": 0.85362977, + "num_input_tokens_seen": 47521540, + "step": 2199, + "time_per_iteration": 2.5080928802490234 + }, + { + "auxiliary_loss_clip": 0.01116388, + "auxiliary_loss_mlp": 0.01164144, + "balance_loss_clip": 1.00213027, + "balance_loss_mlp": 1.00096989, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 1.6962472109911346, + "language_loss": 0.69259256, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71539783, + "num_input_tokens_seen": 47543625, + "step": 2200, + "time_per_iteration": 2.8099050521850586 + }, + { + "auxiliary_loss_clip": 0.01180528, + "auxiliary_loss_mlp": 0.01163659, + "balance_loss_clip": 1.00224853, + "balance_loss_mlp": 1.0012486, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 1.9481987525171514, + "language_loss": 0.84642529, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86986721, + "num_input_tokens_seen": 47563740, + "step": 2201, + "time_per_iteration": 2.543356418609619 + }, + { + "auxiliary_loss_clip": 0.01180816, + "auxiliary_loss_mlp": 0.01163997, + "balance_loss_clip": 1.00234258, + "balance_loss_mlp": 1.0009191, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 2.679340579981091, + "language_loss": 0.87023753, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89368564, + "num_input_tokens_seen": 47582655, + "step": 2202, + "time_per_iteration": 2.581691265106201 + }, + { + "auxiliary_loss_clip": 0.01147575, + "auxiliary_loss_mlp": 0.01164017, + "balance_loss_clip": 1.00225186, + "balance_loss_mlp": 1.00112891, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 1.9933602166721172, + "language_loss": 0.72746396, + "learning_rate": 3.89090115614658e-06, + "loss": 0.75057983, + "num_input_tokens_seen": 47600875, + "step": 2203, + "time_per_iteration": 2.704547166824341 + }, + { + "auxiliary_loss_clip": 0.01116234, + "auxiliary_loss_mlp": 0.01163641, + "balance_loss_clip": 1.00196087, + "balance_loss_mlp": 1.00103962, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.557972872664878, + "language_loss": 0.72940505, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75220376, + "num_input_tokens_seen": 47619250, + "step": 2204, + "time_per_iteration": 2.7316508293151855 + }, + { + "auxiliary_loss_clip": 0.01168874, + "auxiliary_loss_mlp": 0.01164406, + "balance_loss_clip": 1.00295472, + "balance_loss_mlp": 1.0011369, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.6863144156515464, + "language_loss": 0.78299296, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80632579, + "num_input_tokens_seen": 47639445, + "step": 2205, + "time_per_iteration": 2.6466939449310303 + }, + { + "auxiliary_loss_clip": 0.01132058, + "auxiliary_loss_mlp": 0.0116369, + "balance_loss_clip": 1.00205684, + "balance_loss_mlp": 1.00089753, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 1.7590218939754143, + "language_loss": 0.79342556, + "learning_rate": 3.890520213887941e-06, + "loss": 0.816383, + "num_input_tokens_seen": 47658740, + "step": 2206, + "time_per_iteration": 2.7103431224823 + }, + { + "auxiliary_loss_clip": 0.01131839, + "auxiliary_loss_mlp": 0.01163656, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00095868, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 1.9051130522772997, + "language_loss": 0.74244648, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76540142, + "num_input_tokens_seen": 47676880, + "step": 2207, + "time_per_iteration": 2.6226518154144287 + }, + { + "auxiliary_loss_clip": 0.01148458, + "auxiliary_loss_mlp": 0.01163698, + "balance_loss_clip": 1.00215209, + "balance_loss_mlp": 1.00090599, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 1.6715423385424364, + "language_loss": 0.83905274, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86217433, + "num_input_tokens_seen": 47696635, + "step": 2208, + "time_per_iteration": 2.650221347808838 + }, + { + "auxiliary_loss_clip": 0.01164224, + "auxiliary_loss_mlp": 0.01163621, + "balance_loss_clip": 1.0022974, + "balance_loss_mlp": 1.00121021, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 2.003554824576109, + "language_loss": 0.85185617, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87513459, + "num_input_tokens_seen": 47717760, + "step": 2209, + "time_per_iteration": 2.614226818084717 + }, + { + "auxiliary_loss_clip": 0.01153319, + "auxiliary_loss_mlp": 0.00748815, + "balance_loss_clip": 1.00309479, + "balance_loss_mlp": 1.00014281, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 1.863461378141949, + "language_loss": 0.82102633, + "learning_rate": 3.890011287256929e-06, + "loss": 0.84004766, + "num_input_tokens_seen": 47737685, + "step": 2210, + "time_per_iteration": 2.6602444648742676 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.00748572, + "balance_loss_clip": 1.00282693, + "balance_loss_mlp": 0.99971783, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7507860060546105, + "language_loss": 0.57995427, + "learning_rate": 3.889883876413563e-06, + "loss": 0.59874701, + "num_input_tokens_seen": 47802415, + "step": 2211, + "time_per_iteration": 3.354491710662842 + }, + { + "auxiliary_loss_clip": 0.01146711, + "auxiliary_loss_mlp": 0.01156399, + "balance_loss_clip": 1.00255108, + "balance_loss_mlp": 0.99999613, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.8146267903737676, + "language_loss": 0.55415428, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57718539, + "num_input_tokens_seen": 47871485, + "step": 2212, + "time_per_iteration": 3.2342047691345215 + }, + { + "auxiliary_loss_clip": 0.01132252, + "auxiliary_loss_mlp": 0.01163622, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00092554, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 2.2807902306866263, + "language_loss": 0.74455601, + "learning_rate": 3.889628839737908e-06, + "loss": 0.76751471, + "num_input_tokens_seen": 47888315, + "step": 2213, + "time_per_iteration": 5.354238510131836 + }, + { + "auxiliary_loss_clip": 0.01133689, + "auxiliary_loss_mlp": 0.01163198, + "balance_loss_clip": 1.00234926, + "balance_loss_mlp": 1.00097787, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 2.276066636715426, + "language_loss": 0.78957629, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81254518, + "num_input_tokens_seen": 47906600, + "step": 2214, + "time_per_iteration": 2.6656737327575684 + }, + { + "auxiliary_loss_clip": 0.01148391, + "auxiliary_loss_mlp": 0.01163562, + "balance_loss_clip": 1.00218701, + "balance_loss_mlp": 1.00115144, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 4.3170469834284875, + "language_loss": 0.6978206, + "learning_rate": 3.889373516442597e-06, + "loss": 0.72094017, + "num_input_tokens_seen": 47927630, + "step": 2215, + "time_per_iteration": 4.112346887588501 + }, + { + "auxiliary_loss_clip": 0.01164994, + "auxiliary_loss_mlp": 0.01163568, + "balance_loss_clip": 1.00226688, + "balance_loss_mlp": 1.00115776, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.05144714019465, + "language_loss": 0.81292629, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83621192, + "num_input_tokens_seen": 47947935, + "step": 2216, + "time_per_iteration": 2.6216773986816406 + }, + { + "auxiliary_loss_clip": 0.0116417, + "auxiliary_loss_mlp": 0.01163761, + "balance_loss_clip": 1.00226736, + "balance_loss_mlp": 1.00134981, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 2.278640322321359, + "language_loss": 0.87408501, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89736432, + "num_input_tokens_seen": 47965515, + "step": 2217, + "time_per_iteration": 3.98190975189209 + }, + { + "auxiliary_loss_clip": 0.0114786, + "auxiliary_loss_mlp": 0.01163515, + "balance_loss_clip": 1.002244, + "balance_loss_mlp": 1.00129473, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 3.3429553960748217, + "language_loss": 0.72754771, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75066137, + "num_input_tokens_seen": 47985675, + "step": 2218, + "time_per_iteration": 2.6771764755249023 + }, + { + "auxiliary_loss_clip": 0.01115589, + "auxiliary_loss_mlp": 0.01163096, + "balance_loss_clip": 1.00195706, + "balance_loss_mlp": 1.00087595, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.8845157946886442, + "language_loss": 0.87663543, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89942229, + "num_input_tokens_seen": 48004985, + "step": 2219, + "time_per_iteration": 2.7141847610473633 + }, + { + "auxiliary_loss_clip": 0.01131961, + "auxiliary_loss_mlp": 0.01163694, + "balance_loss_clip": 1.00211978, + "balance_loss_mlp": 1.00099742, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.4763849054385927, + "language_loss": 0.77180475, + "learning_rate": 3.888733954497574e-06, + "loss": 0.7947613, + "num_input_tokens_seen": 48024965, + "step": 2220, + "time_per_iteration": 2.6983556747436523 + }, + { + "auxiliary_loss_clip": 0.01148161, + "auxiliary_loss_mlp": 0.01163922, + "balance_loss_clip": 1.00202405, + "balance_loss_mlp": 1.00122499, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.1532093712868337, + "language_loss": 0.78734702, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81046784, + "num_input_tokens_seen": 48040890, + "step": 2221, + "time_per_iteration": 2.592416763305664 + }, + { + "auxiliary_loss_clip": 0.01162045, + "auxiliary_loss_mlp": 0.01156416, + "balance_loss_clip": 1.00259924, + "balance_loss_mlp": 1.00001371, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.98425963174567, + "language_loss": 0.69049692, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71368146, + "num_input_tokens_seen": 48091855, + "step": 2222, + "time_per_iteration": 2.9855401515960693 + }, + { + "auxiliary_loss_clip": 0.01130859, + "auxiliary_loss_mlp": 0.01163799, + "balance_loss_clip": 1.00204515, + "balance_loss_mlp": 1.00129247, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 2.8071056687271416, + "language_loss": 0.67399842, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69694501, + "num_input_tokens_seen": 48111350, + "step": 2223, + "time_per_iteration": 2.657501697540283 + }, + { + "auxiliary_loss_clip": 0.01163768, + "auxiliary_loss_mlp": 0.01164099, + "balance_loss_clip": 1.00215101, + "balance_loss_mlp": 1.0014019, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 2.051084852358526, + "language_loss": 0.82677233, + "learning_rate": 3.88822101573484e-06, + "loss": 0.85005093, + "num_input_tokens_seen": 48129840, + "step": 2224, + "time_per_iteration": 2.538294553756714 + }, + { + "auxiliary_loss_clip": 0.01180578, + "auxiliary_loss_mlp": 0.01163706, + "balance_loss_clip": 1.00222719, + "balance_loss_mlp": 1.00110483, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 1.9102327942628448, + "language_loss": 0.65956885, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68301177, + "num_input_tokens_seen": 48149240, + "step": 2225, + "time_per_iteration": 2.5735747814178467 + }, + { + "auxiliary_loss_clip": 0.01165045, + "auxiliary_loss_mlp": 0.01163408, + "balance_loss_clip": 1.00227523, + "balance_loss_mlp": 1.00099707, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 4.030865678711212, + "language_loss": 0.88846886, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91175336, + "num_input_tokens_seen": 48166330, + "step": 2226, + "time_per_iteration": 2.5616116523742676 + }, + { + "auxiliary_loss_clip": 0.0114832, + "auxiliary_loss_mlp": 0.01163927, + "balance_loss_clip": 1.00216818, + "balance_loss_mlp": 1.00132537, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 1.8634081262338529, + "language_loss": 0.73910981, + "learning_rate": 3.887835559829712e-06, + "loss": 0.7622323, + "num_input_tokens_seen": 48187600, + "step": 2227, + "time_per_iteration": 2.6491153240203857 + }, + { + "auxiliary_loss_clip": 0.01163884, + "auxiliary_loss_mlp": 0.01163722, + "balance_loss_clip": 1.00214839, + "balance_loss_mlp": 1.00102496, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.1378132374543086, + "language_loss": 0.8511374, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87441343, + "num_input_tokens_seen": 48204400, + "step": 2228, + "time_per_iteration": 2.5689144134521484 + }, + { + "auxiliary_loss_clip": 0.01147932, + "auxiliary_loss_mlp": 0.01163207, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.00098681, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 1.91563054449983, + "language_loss": 0.81171882, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83483022, + "num_input_tokens_seen": 48222180, + "step": 2229, + "time_per_iteration": 2.6458897590637207 + }, + { + "auxiliary_loss_clip": 0.01115215, + "auxiliary_loss_mlp": 0.01164037, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00153089, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 1.712557838845144, + "language_loss": 0.74396884, + "learning_rate": 3.887449459642378e-06, + "loss": 0.7667613, + "num_input_tokens_seen": 48243245, + "step": 2230, + "time_per_iteration": 2.706483840942383 + }, + { + "auxiliary_loss_clip": 0.01130716, + "auxiliary_loss_mlp": 0.01163716, + "balance_loss_clip": 1.00185835, + "balance_loss_mlp": 1.00120997, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 1.7526131674632686, + "language_loss": 0.7974906, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82043493, + "num_input_tokens_seen": 48262600, + "step": 2231, + "time_per_iteration": 2.6187744140625 + }, + { + "auxiliary_loss_clip": 0.01116279, + "auxiliary_loss_mlp": 0.01164024, + "balance_loss_clip": 1.00206184, + "balance_loss_mlp": 1.00161314, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.5561588257066976, + "language_loss": 0.72197676, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74477977, + "num_input_tokens_seen": 48285075, + "step": 2232, + "time_per_iteration": 2.7474682331085205 + }, + { + "auxiliary_loss_clip": 0.0113186, + "auxiliary_loss_mlp": 0.01163748, + "balance_loss_clip": 1.00216961, + "balance_loss_mlp": 1.00105166, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.892540383974264, + "language_loss": 0.66027403, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.6832301, + "num_input_tokens_seen": 48301285, + "step": 2233, + "time_per_iteration": 2.6910531520843506 + }, + { + "auxiliary_loss_clip": 0.01180413, + "auxiliary_loss_mlp": 0.01163073, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00094795, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 2.2176583508317513, + "language_loss": 0.81952292, + "learning_rate": 3.886933657403615e-06, + "loss": 0.84295774, + "num_input_tokens_seen": 48317835, + "step": 2234, + "time_per_iteration": 2.5139660835266113 + }, + { + "auxiliary_loss_clip": 0.01147989, + "auxiliary_loss_mlp": 0.01163679, + "balance_loss_clip": 1.00205588, + "balance_loss_mlp": 1.00126815, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.8653866641693104, + "language_loss": 0.82333642, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84645313, + "num_input_tokens_seen": 48335670, + "step": 2235, + "time_per_iteration": 2.676974058151245 + }, + { + "auxiliary_loss_clip": 0.01163898, + "auxiliary_loss_mlp": 0.01163721, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00111961, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.4608558199358759, + "language_loss": 0.86388111, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88715732, + "num_input_tokens_seen": 48357805, + "step": 2236, + "time_per_iteration": 2.604020595550537 + }, + { + "auxiliary_loss_clip": 0.01180359, + "auxiliary_loss_mlp": 0.01163776, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00126994, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.7460354280603734, + "language_loss": 0.77102637, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79446769, + "num_input_tokens_seen": 48377845, + "step": 2237, + "time_per_iteration": 2.565256118774414 + }, + { + "auxiliary_loss_clip": 0.01148647, + "auxiliary_loss_mlp": 0.01163366, + "balance_loss_clip": 1.00212216, + "balance_loss_mlp": 1.00095582, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 3.2365214209701425, + "language_loss": 0.78787458, + "learning_rate": 3.886416710321491e-06, + "loss": 0.81099474, + "num_input_tokens_seen": 48394735, + "step": 2238, + "time_per_iteration": 2.628373622894287 + }, + { + "auxiliary_loss_clip": 0.01147472, + "auxiliary_loss_mlp": 0.01162979, + "balance_loss_clip": 1.00204909, + "balance_loss_mlp": 1.00104547, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.3084661661534693, + "language_loss": 0.68530381, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70840836, + "num_input_tokens_seen": 48414200, + "step": 2239, + "time_per_iteration": 2.692656993865967 + }, + { + "auxiliary_loss_clip": 0.01151941, + "auxiliary_loss_mlp": 0.01163476, + "balance_loss_clip": 1.00262415, + "balance_loss_mlp": 1.00116038, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.139339410926004, + "language_loss": 0.81602037, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83917451, + "num_input_tokens_seen": 48431065, + "step": 2240, + "time_per_iteration": 2.5602009296417236 + }, + { + "auxiliary_loss_clip": 0.01115968, + "auxiliary_loss_mlp": 0.01163599, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00118804, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.8358287853805149, + "language_loss": 0.77851713, + "learning_rate": 3.886028248895093e-06, + "loss": 0.8013128, + "num_input_tokens_seen": 48450335, + "step": 2241, + "time_per_iteration": 2.702470302581787 + }, + { + "auxiliary_loss_clip": 0.01180396, + "auxiliary_loss_mlp": 0.01163415, + "balance_loss_clip": 1.00231624, + "balance_loss_mlp": 1.00100446, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.68728359451592, + "language_loss": 0.83083528, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85427338, + "num_input_tokens_seen": 48468555, + "step": 2242, + "time_per_iteration": 2.5499627590179443 + }, + { + "auxiliary_loss_clip": 0.01180475, + "auxiliary_loss_mlp": 0.01163729, + "balance_loss_clip": 1.00225627, + "balance_loss_mlp": 1.00112748, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 2.246954091031019, + "language_loss": 0.65155524, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67499733, + "num_input_tokens_seen": 48488515, + "step": 2243, + "time_per_iteration": 2.6149723529815674 + }, + { + "auxiliary_loss_clip": 0.01147776, + "auxiliary_loss_mlp": 0.01162913, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00117004, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.376894930673023, + "language_loss": 0.72578329, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74889022, + "num_input_tokens_seen": 48510515, + "step": 2244, + "time_per_iteration": 2.691112995147705 + }, + { + "auxiliary_loss_clip": 0.01163585, + "auxiliary_loss_mlp": 0.01163369, + "balance_loss_clip": 1.00213766, + "balance_loss_mlp": 1.00134003, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.5131766546721614, + "language_loss": 0.86232996, + "learning_rate": 3.88550929909221e-06, + "loss": 0.88559949, + "num_input_tokens_seen": 48529940, + "step": 2245, + "time_per_iteration": 2.6056337356567383 + }, + { + "auxiliary_loss_clip": 0.01164134, + "auxiliary_loss_mlp": 0.01163285, + "balance_loss_clip": 1.00215173, + "balance_loss_mlp": 1.00125611, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.5797021500501014, + "language_loss": 0.78683615, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81011039, + "num_input_tokens_seen": 48548190, + "step": 2246, + "time_per_iteration": 2.5770795345306396 + }, + { + "auxiliary_loss_clip": 0.01099044, + "auxiliary_loss_mlp": 0.01155951, + "balance_loss_clip": 1.00243759, + "balance_loss_mlp": 1.0003109, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7468098272924163, + "language_loss": 0.60529876, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62784863, + "num_input_tokens_seen": 48613165, + "step": 2247, + "time_per_iteration": 3.6261487007141113 + }, + { + "auxiliary_loss_clip": 0.01164053, + "auxiliary_loss_mlp": 0.01163834, + "balance_loss_clip": 1.00228322, + "balance_loss_mlp": 1.00113678, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.078951607424206, + "language_loss": 0.80747485, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83075368, + "num_input_tokens_seen": 48631705, + "step": 2248, + "time_per_iteration": 2.855268955230713 + }, + { + "auxiliary_loss_clip": 0.01148091, + "auxiliary_loss_mlp": 0.01163046, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00092101, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 1.7747461657346773, + "language_loss": 0.77708137, + "learning_rate": 3.884989205310157e-06, + "loss": 0.80019271, + "num_input_tokens_seen": 48649740, + "step": 2249, + "time_per_iteration": 2.6574113368988037 + }, + { + "auxiliary_loss_clip": 0.01130542, + "auxiliary_loss_mlp": 0.01163101, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00126266, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4177802313546886, + "language_loss": 0.84431446, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86725086, + "num_input_tokens_seen": 48671565, + "step": 2250, + "time_per_iteration": 2.7557802200317383 + }, + { + "auxiliary_loss_clip": 0.01163827, + "auxiliary_loss_mlp": 0.01163283, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.00115871, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 2.063163796566268, + "language_loss": 0.82030189, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84357297, + "num_input_tokens_seen": 48690425, + "step": 2251, + "time_per_iteration": 5.31679105758667 + }, + { + "auxiliary_loss_clip": 0.011802, + "auxiliary_loss_mlp": 0.01163257, + "balance_loss_clip": 1.00209033, + "balance_loss_mlp": 1.00113261, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 8.16098930482053, + "language_loss": 0.85804147, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88147604, + "num_input_tokens_seen": 48707505, + "step": 2252, + "time_per_iteration": 2.5298829078674316 + }, + { + "auxiliary_loss_clip": 0.0116227, + "auxiliary_loss_mlp": 0.01155747, + "balance_loss_clip": 1.00275636, + "balance_loss_mlp": 1.00010717, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.8183904533215101, + "language_loss": 0.61786032, + "learning_rate": 3.884467967864485e-06, + "loss": 0.64104056, + "num_input_tokens_seen": 48775895, + "step": 2253, + "time_per_iteration": 4.620393991470337 + }, + { + "auxiliary_loss_clip": 0.01163655, + "auxiliary_loss_mlp": 0.01163489, + "balance_loss_clip": 1.00218749, + "balance_loss_mlp": 1.00146008, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 1.8171426320357664, + "language_loss": 0.89264953, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91592097, + "num_input_tokens_seen": 48798370, + "step": 2254, + "time_per_iteration": 4.040761709213257 + }, + { + "auxiliary_loss_clip": 0.01148836, + "auxiliary_loss_mlp": 0.0116359, + "balance_loss_clip": 1.00210905, + "balance_loss_mlp": 1.00108409, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.043815521284547, + "language_loss": 0.84877443, + "learning_rate": 3.884206920366591e-06, + "loss": 0.87189877, + "num_input_tokens_seen": 48817955, + "step": 2255, + "time_per_iteration": 2.627840280532837 + }, + { + "auxiliary_loss_clip": 0.01180288, + "auxiliary_loss_mlp": 0.01163495, + "balance_loss_clip": 1.00224197, + "balance_loss_mlp": 1.00127518, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.1843599529033013, + "language_loss": 0.74463022, + "learning_rate": 3.884076289441196e-06, + "loss": 0.76806808, + "num_input_tokens_seen": 48836330, + "step": 2256, + "time_per_iteration": 2.5615334510803223 + }, + { + "auxiliary_loss_clip": 0.0113069, + "auxiliary_loss_mlp": 0.01163515, + "balance_loss_clip": 1.00186336, + "balance_loss_mlp": 1.00100923, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 2.009005537121994, + "language_loss": 0.83178544, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85472751, + "num_input_tokens_seen": 48851890, + "step": 2257, + "time_per_iteration": 2.6326000690460205 + }, + { + "auxiliary_loss_clip": 0.0114721, + "auxiliary_loss_mlp": 0.00748954, + "balance_loss_clip": 1.00199854, + "balance_loss_mlp": 1.00021827, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.714804247483336, + "language_loss": 0.82083756, + "learning_rate": 3.883814813262277e-06, + "loss": 0.83979923, + "num_input_tokens_seen": 48865510, + "step": 2258, + "time_per_iteration": 2.578282117843628 + }, + { + "auxiliary_loss_clip": 0.01163882, + "auxiliary_loss_mlp": 0.01163585, + "balance_loss_clip": 1.00205994, + "balance_loss_mlp": 1.00127017, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.571027635130646, + "language_loss": 0.82871139, + "learning_rate": 3.883683968018669e-06, + "loss": 0.85198605, + "num_input_tokens_seen": 48882360, + "step": 2259, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.01131398, + "auxiliary_loss_mlp": 0.01163186, + "balance_loss_clip": 1.0019753, + "balance_loss_mlp": 1.00144327, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 2.0224982563022045, + "language_loss": 0.73712659, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.76007235, + "num_input_tokens_seen": 48902700, + "step": 2260, + "time_per_iteration": 2.635201930999756 + }, + { + "auxiliary_loss_clip": 0.0114794, + "auxiliary_loss_mlp": 0.01163319, + "balance_loss_clip": 1.00215793, + "balance_loss_mlp": 1.00138557, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 4.856449113212162, + "language_loss": 0.7537235, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77683604, + "num_input_tokens_seen": 48922525, + "step": 2261, + "time_per_iteration": 2.696352958679199 + }, + { + "auxiliary_loss_clip": 0.01180088, + "auxiliary_loss_mlp": 0.01162907, + "balance_loss_clip": 1.00210524, + "balance_loss_mlp": 1.00125933, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 1.7953052541691465, + "language_loss": 0.6342957, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65772563, + "num_input_tokens_seen": 48942510, + "step": 2262, + "time_per_iteration": 2.601917028427124 + }, + { + "auxiliary_loss_clip": 0.01148082, + "auxiliary_loss_mlp": 0.0116312, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00109076, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.3446987720822405, + "language_loss": 0.8249886, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84810066, + "num_input_tokens_seen": 48962625, + "step": 2263, + "time_per_iteration": 2.647200345993042 + }, + { + "auxiliary_loss_clip": 0.01097818, + "auxiliary_loss_mlp": 0.01163715, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.00139952, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.9039410565245842, + "language_loss": 0.87675071, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.89936602, + "num_input_tokens_seen": 48982525, + "step": 2264, + "time_per_iteration": 2.727036952972412 + }, + { + "auxiliary_loss_clip": 0.01163493, + "auxiliary_loss_mlp": 0.01163369, + "balance_loss_clip": 1.00206196, + "balance_loss_mlp": 1.00124431, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 29.583538182775236, + "language_loss": 0.71642894, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73969758, + "num_input_tokens_seen": 48997605, + "step": 2265, + "time_per_iteration": 2.536057472229004 + }, + { + "auxiliary_loss_clip": 0.01114423, + "auxiliary_loss_mlp": 0.01163242, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00130844, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 1.871724135012291, + "language_loss": 0.6664505, + "learning_rate": 3.882766051566027e-06, + "loss": 0.68922716, + "num_input_tokens_seen": 49018535, + "step": 2266, + "time_per_iteration": 2.7388150691986084 + }, + { + "auxiliary_loss_clip": 0.01114392, + "auxiliary_loss_mlp": 0.01163021, + "balance_loss_clip": 1.00177014, + "balance_loss_mlp": 1.0012784, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.5640321749942885, + "language_loss": 0.767524, + "learning_rate": 3.882634635025694e-06, + "loss": 0.7902981, + "num_input_tokens_seen": 49038865, + "step": 2267, + "time_per_iteration": 2.736811399459839 + }, + { + "auxiliary_loss_clip": 0.01147711, + "auxiliary_loss_mlp": 0.01163109, + "balance_loss_clip": 1.00198472, + "balance_loss_mlp": 1.00107932, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 1.7889570498908947, + "language_loss": 0.82039076, + "learning_rate": 3.882503147095667e-06, + "loss": 0.84349895, + "num_input_tokens_seen": 49058010, + "step": 2268, + "time_per_iteration": 2.5950169563293457 + }, + { + "auxiliary_loss_clip": 0.01163604, + "auxiliary_loss_mlp": 0.01162795, + "balance_loss_clip": 1.00216699, + "balance_loss_mlp": 1.00105143, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 2.2068320558017462, + "language_loss": 0.76099241, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78425634, + "num_input_tokens_seen": 49080330, + "step": 2269, + "time_per_iteration": 2.661484956741333 + }, + { + "auxiliary_loss_clip": 0.01131952, + "auxiliary_loss_mlp": 0.01163094, + "balance_loss_clip": 1.00215459, + "balance_loss_mlp": 1.00096965, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 1.9163076026258912, + "language_loss": 0.81334996, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83630049, + "num_input_tokens_seen": 49097035, + "step": 2270, + "time_per_iteration": 2.6574254035949707 + }, + { + "auxiliary_loss_clip": 0.01147617, + "auxiliary_loss_mlp": 0.01163238, + "balance_loss_clip": 1.00206816, + "balance_loss_mlp": 1.00149524, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.9315881699810067, + "language_loss": 0.7562108, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77931941, + "num_input_tokens_seen": 49113945, + "step": 2271, + "time_per_iteration": 2.630645990371704 + }, + { + "auxiliary_loss_clip": 0.01164075, + "auxiliary_loss_mlp": 0.01163327, + "balance_loss_clip": 1.0020448, + "balance_loss_mlp": 1.0013926, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 1.94526660892146, + "language_loss": 0.80711055, + "learning_rate": 3.881976481578379e-06, + "loss": 0.83038461, + "num_input_tokens_seen": 49132855, + "step": 2272, + "time_per_iteration": 2.5389907360076904 + }, + { + "auxiliary_loss_clip": 0.01161473, + "auxiliary_loss_mlp": 0.0115482, + "balance_loss_clip": 1.0023222, + "balance_loss_mlp": 0.99994296, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.6872915116657771, + "language_loss": 0.60685223, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.63001519, + "num_input_tokens_seen": 49198310, + "step": 2273, + "time_per_iteration": 3.2284207344055176 + }, + { + "auxiliary_loss_clip": 0.01180072, + "auxiliary_loss_mlp": 0.00748882, + "balance_loss_clip": 1.00221527, + "balance_loss_mlp": 1.00012398, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.7981919121277565, + "language_loss": 0.78085423, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80014372, + "num_input_tokens_seen": 49217250, + "step": 2274, + "time_per_iteration": 2.5460946559906006 + }, + { + "auxiliary_loss_clip": 0.01164498, + "auxiliary_loss_mlp": 0.01162809, + "balance_loss_clip": 1.00209165, + "balance_loss_mlp": 1.00116086, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 1.8675834529870539, + "language_loss": 0.78189892, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80517197, + "num_input_tokens_seen": 49236615, + "step": 2275, + "time_per_iteration": 2.608675718307495 + }, + { + "auxiliary_loss_clip": 0.01163431, + "auxiliary_loss_mlp": 0.0116283, + "balance_loss_clip": 1.00202227, + "balance_loss_mlp": 1.00099111, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.4185676032764194, + "language_loss": 0.81321466, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83647728, + "num_input_tokens_seen": 49253935, + "step": 2276, + "time_per_iteration": 2.5138049125671387 + }, + { + "auxiliary_loss_clip": 0.01164487, + "auxiliary_loss_mlp": 0.01163692, + "balance_loss_clip": 1.0020597, + "balance_loss_mlp": 1.00137627, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.7407737237618854, + "language_loss": 0.69691968, + "learning_rate": 3.881316544012779e-06, + "loss": 0.72020149, + "num_input_tokens_seen": 49273605, + "step": 2277, + "time_per_iteration": 2.6323678493499756 + }, + { + "auxiliary_loss_clip": 0.0116467, + "auxiliary_loss_mlp": 0.00748951, + "balance_loss_clip": 1.00224316, + "balance_loss_mlp": 1.00023031, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.195534704001081, + "language_loss": 0.80231273, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82144892, + "num_input_tokens_seen": 49291785, + "step": 2278, + "time_per_iteration": 2.5842695236206055 + }, + { + "auxiliary_loss_clip": 0.01164224, + "auxiliary_loss_mlp": 0.01163265, + "balance_loss_clip": 1.00237823, + "balance_loss_mlp": 1.0012362, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.103663171550229, + "language_loss": 0.74970037, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77297521, + "num_input_tokens_seen": 49311405, + "step": 2279, + "time_per_iteration": 2.6762919425964355 + }, + { + "auxiliary_loss_clip": 0.01097993, + "auxiliary_loss_mlp": 0.01163473, + "balance_loss_clip": 1.00165021, + "balance_loss_mlp": 1.00134885, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 1.9206391489622923, + "language_loss": 0.7705611, + "learning_rate": 3.880919725356831e-06, + "loss": 0.79317576, + "num_input_tokens_seen": 49331835, + "step": 2280, + "time_per_iteration": 2.739295244216919 + }, + { + "auxiliary_loss_clip": 0.01114584, + "auxiliary_loss_mlp": 0.01162662, + "balance_loss_clip": 1.00172102, + "balance_loss_mlp": 1.0009191, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.612477789883387, + "language_loss": 0.79524159, + "learning_rate": 3.880787309815496e-06, + "loss": 0.81801403, + "num_input_tokens_seen": 49352290, + "step": 2281, + "time_per_iteration": 2.7609410285949707 + }, + { + "auxiliary_loss_clip": 0.01180339, + "auxiliary_loss_mlp": 0.01163813, + "balance_loss_clip": 1.00226319, + "balance_loss_mlp": 1.00149727, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.6484165397009596, + "language_loss": 0.83363748, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85707903, + "num_input_tokens_seen": 49370285, + "step": 2282, + "time_per_iteration": 2.571974515914917 + }, + { + "auxiliary_loss_clip": 0.01147441, + "auxiliary_loss_mlp": 0.01163008, + "balance_loss_clip": 1.00202906, + "balance_loss_mlp": 1.00107384, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.665228762489924, + "language_loss": 0.73343986, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.75654435, + "num_input_tokens_seen": 49389610, + "step": 2283, + "time_per_iteration": 2.6076042652130127 + }, + { + "auxiliary_loss_clip": 0.01164372, + "auxiliary_loss_mlp": 0.01163402, + "balance_loss_clip": 1.00229061, + "balance_loss_mlp": 1.00137246, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 4.4485201029895585, + "language_loss": 0.84109688, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86437464, + "num_input_tokens_seen": 49408390, + "step": 2284, + "time_per_iteration": 2.576247453689575 + }, + { + "auxiliary_loss_clip": 0.01147237, + "auxiliary_loss_mlp": 0.01163292, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00135827, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.8443603802343322, + "language_loss": 0.75057173, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77367699, + "num_input_tokens_seen": 49427725, + "step": 2285, + "time_per_iteration": 2.6848642826080322 + }, + { + "auxiliary_loss_clip": 0.01147001, + "auxiliary_loss_mlp": 0.01162952, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00120854, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.5950662864354985, + "language_loss": 0.747859, + "learning_rate": 3.880124162414689e-06, + "loss": 0.77095854, + "num_input_tokens_seen": 49449000, + "step": 2286, + "time_per_iteration": 2.6719789505004883 + }, + { + "auxiliary_loss_clip": 0.01115002, + "auxiliary_loss_mlp": 0.01163218, + "balance_loss_clip": 1.00204051, + "balance_loss_mlp": 1.00090313, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.4654714268532905, + "language_loss": 0.86235225, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88513446, + "num_input_tokens_seen": 49468360, + "step": 2287, + "time_per_iteration": 2.7322351932525635 + }, + { + "auxiliary_loss_clip": 0.01131281, + "auxiliary_loss_mlp": 0.01163469, + "balance_loss_clip": 1.00201511, + "balance_loss_mlp": 1.00134397, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 1.9412910029739405, + "language_loss": 0.6839782, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70692575, + "num_input_tokens_seen": 49493450, + "step": 2288, + "time_per_iteration": 5.597338914871216 + }, + { + "auxiliary_loss_clip": 0.0111515, + "auxiliary_loss_mlp": 0.01163321, + "balance_loss_clip": 1.00204897, + "balance_loss_mlp": 1.00138736, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 2.3050257876779714, + "language_loss": 0.87123692, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89402163, + "num_input_tokens_seen": 49511220, + "step": 2289, + "time_per_iteration": 2.6739425659179688 + }, + { + "auxiliary_loss_clip": 0.01147458, + "auxiliary_loss_mlp": 0.00748888, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.00028515, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.6698482895181148, + "language_loss": 0.74954933, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76851285, + "num_input_tokens_seen": 49529820, + "step": 2290, + "time_per_iteration": 4.090861558914185 + }, + { + "auxiliary_loss_clip": 0.01127895, + "auxiliary_loss_mlp": 0.01154906, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00002944, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7120213525444546, + "language_loss": 0.51591778, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53874588, + "num_input_tokens_seen": 49595325, + "step": 2291, + "time_per_iteration": 3.277315616607666 + }, + { + "auxiliary_loss_clip": 0.01164576, + "auxiliary_loss_mlp": 0.0116327, + "balance_loss_clip": 1.00217628, + "balance_loss_mlp": 1.00124061, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.1637780921723633, + "language_loss": 0.7089256, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73220408, + "num_input_tokens_seen": 49615850, + "step": 2292, + "time_per_iteration": 4.037820816040039 + }, + { + "auxiliary_loss_clip": 0.01164139, + "auxiliary_loss_mlp": 0.01163547, + "balance_loss_clip": 1.00226188, + "balance_loss_mlp": 1.00151789, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 3.152302766991848, + "language_loss": 0.79863155, + "learning_rate": 3.879192761826071e-06, + "loss": 0.82190835, + "num_input_tokens_seen": 49631860, + "step": 2293, + "time_per_iteration": 2.5684564113616943 + }, + { + "auxiliary_loss_clip": 0.01163604, + "auxiliary_loss_mlp": 0.01163024, + "balance_loss_clip": 1.00210321, + "balance_loss_mlp": 1.00099468, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.113247486389652, + "language_loss": 0.78300786, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80627412, + "num_input_tokens_seen": 49652145, + "step": 2294, + "time_per_iteration": 2.696909189224243 + }, + { + "auxiliary_loss_clip": 0.01131236, + "auxiliary_loss_mlp": 0.01162717, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.0014509, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.6567389820317486, + "language_loss": 0.79870427, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82164377, + "num_input_tokens_seen": 49669880, + "step": 2295, + "time_per_iteration": 2.695834159851074 + }, + { + "auxiliary_loss_clip": 0.01163398, + "auxiliary_loss_mlp": 0.01162849, + "balance_loss_clip": 1.0020026, + "balance_loss_mlp": 1.00101018, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.9506803865211086, + "language_loss": 0.78567982, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80894232, + "num_input_tokens_seen": 49687255, + "step": 2296, + "time_per_iteration": 2.5832223892211914 + }, + { + "auxiliary_loss_clip": 0.01164446, + "auxiliary_loss_mlp": 0.01163099, + "balance_loss_clip": 1.00222135, + "balance_loss_mlp": 1.00135565, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.7780941134750607, + "language_loss": 0.78527308, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80854851, + "num_input_tokens_seen": 49706650, + "step": 2297, + "time_per_iteration": 2.6512203216552734 + }, + { + "auxiliary_loss_clip": 0.0111551, + "auxiliary_loss_mlp": 0.01161848, + "balance_loss_clip": 1.00206649, + "balance_loss_mlp": 1.0008682, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 1.9089327862653542, + "language_loss": 0.68505347, + "learning_rate": 3.878525337815164e-06, + "loss": 0.70782697, + "num_input_tokens_seen": 49725715, + "step": 2298, + "time_per_iteration": 2.6982598304748535 + }, + { + "auxiliary_loss_clip": 0.0114818, + "auxiliary_loss_mlp": 0.01162842, + "balance_loss_clip": 1.00211036, + "balance_loss_mlp": 1.00119436, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.6946266160056989, + "language_loss": 0.86619991, + "learning_rate": 3.878391639291116e-06, + "loss": 0.88931012, + "num_input_tokens_seen": 49744710, + "step": 2299, + "time_per_iteration": 2.579674243927002 + }, + { + "auxiliary_loss_clip": 0.01180082, + "auxiliary_loss_mlp": 0.01162862, + "balance_loss_clip": 1.00222301, + "balance_loss_mlp": 1.00121427, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.561753285263719, + "language_loss": 0.75703859, + "learning_rate": 3.878257869538267e-06, + "loss": 0.78046799, + "num_input_tokens_seen": 49764300, + "step": 2300, + "time_per_iteration": 2.5441675186157227 + }, + { + "auxiliary_loss_clip": 0.0113092, + "auxiliary_loss_mlp": 0.01162491, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.00122464, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.96571360446241, + "language_loss": 0.82955527, + "learning_rate": 3.878124028561692e-06, + "loss": 0.85248941, + "num_input_tokens_seen": 49778380, + "step": 2301, + "time_per_iteration": 2.5946922302246094 + }, + { + "auxiliary_loss_clip": 0.01147208, + "auxiliary_loss_mlp": 0.00748918, + "balance_loss_clip": 1.0019834, + "balance_loss_mlp": 1.00020242, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.1127062391610645, + "language_loss": 0.86340344, + "learning_rate": 3.877990116366466e-06, + "loss": 0.88236475, + "num_input_tokens_seen": 49797460, + "step": 2302, + "time_per_iteration": 2.6745479106903076 + }, + { + "auxiliary_loss_clip": 0.01161012, + "auxiliary_loss_mlp": 0.01154877, + "balance_loss_clip": 1.00228977, + "balance_loss_mlp": 1.00000048, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7572993151517908, + "language_loss": 0.656214, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67937291, + "num_input_tokens_seen": 49868005, + "step": 2303, + "time_per_iteration": 3.269744634628296 + }, + { + "auxiliary_loss_clip": 0.01163377, + "auxiliary_loss_mlp": 0.01161903, + "balance_loss_clip": 1.00210428, + "balance_loss_mlp": 1.00073195, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 1.848037763619758, + "language_loss": 0.78159964, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80485237, + "num_input_tokens_seen": 49885825, + "step": 2304, + "time_per_iteration": 2.58453369140625 + }, + { + "auxiliary_loss_clip": 0.01163532, + "auxiliary_loss_mlp": 0.01163061, + "balance_loss_clip": 1.0021944, + "balance_loss_mlp": 1.00112748, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.6858711138051834, + "language_loss": 0.77750385, + "learning_rate": 3.877587952519672e-06, + "loss": 0.80076981, + "num_input_tokens_seen": 49905975, + "step": 2305, + "time_per_iteration": 2.57377552986145 + }, + { + "auxiliary_loss_clip": 0.01083783, + "auxiliary_loss_mlp": 0.01162506, + "balance_loss_clip": 1.00176764, + "balance_loss_mlp": 1.00114441, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 1.805556673505196, + "language_loss": 0.87996596, + "learning_rate": 3.877453755500647e-06, + "loss": 0.90242887, + "num_input_tokens_seen": 49925800, + "step": 2306, + "time_per_iteration": 2.778853416442871 + }, + { + "auxiliary_loss_clip": 0.01177775, + "auxiliary_loss_mlp": 0.01154852, + "balance_loss_clip": 1.00225687, + "balance_loss_mlp": 0.9999752, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8751407583443733, + "language_loss": 0.58968866, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61301494, + "num_input_tokens_seen": 49977620, + "step": 2307, + "time_per_iteration": 3.1499931812286377 + }, + { + "auxiliary_loss_clip": 0.01179932, + "auxiliary_loss_mlp": 0.00748991, + "balance_loss_clip": 1.00210106, + "balance_loss_mlp": 1.00026464, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.6725654117627053, + "language_loss": 0.79752606, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81681526, + "num_input_tokens_seen": 49996650, + "step": 2308, + "time_per_iteration": 2.549637794494629 + }, + { + "auxiliary_loss_clip": 0.01131206, + "auxiliary_loss_mlp": 0.01162602, + "balance_loss_clip": 1.00209606, + "balance_loss_mlp": 1.00105, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.1428285481056464, + "language_loss": 0.78011304, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80305111, + "num_input_tokens_seen": 50015640, + "step": 2309, + "time_per_iteration": 2.6765851974487305 + }, + { + "auxiliary_loss_clip": 0.01132094, + "auxiliary_loss_mlp": 0.01162859, + "balance_loss_clip": 1.0020026, + "balance_loss_mlp": 1.00092554, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 1.9850025986309787, + "language_loss": 0.6778937, + "learning_rate": 3.876916255543129e-06, + "loss": 0.70084321, + "num_input_tokens_seen": 50033500, + "step": 2310, + "time_per_iteration": 2.642418622970581 + }, + { + "auxiliary_loss_clip": 0.01179909, + "auxiliary_loss_mlp": 0.01162476, + "balance_loss_clip": 1.0021584, + "balance_loss_mlp": 1.00120974, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 2.1738997904787567, + "language_loss": 0.84038734, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86381114, + "num_input_tokens_seen": 50050075, + "step": 2311, + "time_per_iteration": 2.4874095916748047 + }, + { + "auxiliary_loss_clip": 0.01180027, + "auxiliary_loss_mlp": 0.01162462, + "balance_loss_clip": 1.00220037, + "balance_loss_mlp": 1.0010047, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 1.9506541581996735, + "language_loss": 0.81785703, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84128195, + "num_input_tokens_seen": 50070080, + "step": 2312, + "time_per_iteration": 2.540102005004883 + }, + { + "auxiliary_loss_clip": 0.01130281, + "auxiliary_loss_mlp": 0.00748949, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.00030422, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 1.7403953790413853, + "language_loss": 0.86504436, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88383663, + "num_input_tokens_seen": 50090040, + "step": 2313, + "time_per_iteration": 2.6775405406951904 + }, + { + "auxiliary_loss_clip": 0.01179875, + "auxiliary_loss_mlp": 0.01162395, + "balance_loss_clip": 1.00222981, + "balance_loss_mlp": 1.00122416, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 1.740213944341072, + "language_loss": 0.8025257, + "learning_rate": 3.876377616820024e-06, + "loss": 0.82594836, + "num_input_tokens_seen": 50110595, + "step": 2314, + "time_per_iteration": 2.528632879257202 + }, + { + "auxiliary_loss_clip": 0.0113085, + "auxiliary_loss_mlp": 0.01162532, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00107455, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 3.009428669779328, + "language_loss": 0.85155797, + "learning_rate": 3.876242779245409e-06, + "loss": 0.87449175, + "num_input_tokens_seen": 50125430, + "step": 2315, + "time_per_iteration": 2.6061313152313232 + }, + { + "auxiliary_loss_clip": 0.01164168, + "auxiliary_loss_mlp": 0.01163034, + "balance_loss_clip": 1.00206459, + "balance_loss_mlp": 1.00129116, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 1.9930752506645237, + "language_loss": 0.77228284, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79555488, + "num_input_tokens_seen": 50144120, + "step": 2316, + "time_per_iteration": 2.5321052074432373 + }, + { + "auxiliary_loss_clip": 0.01179822, + "auxiliary_loss_mlp": 0.00748932, + "balance_loss_clip": 1.00216174, + "balance_loss_mlp": 1.00026965, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.8072591294889626, + "language_loss": 0.77234364, + "learning_rate": 3.875972890659349e-06, + "loss": 0.79163116, + "num_input_tokens_seen": 50162500, + "step": 2317, + "time_per_iteration": 2.5092196464538574 + }, + { + "auxiliary_loss_clip": 0.01147432, + "auxiliary_loss_mlp": 0.01162327, + "balance_loss_clip": 1.00209892, + "balance_loss_mlp": 1.0009656, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 1.8678707015774434, + "language_loss": 0.80314285, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82624042, + "num_input_tokens_seen": 50182415, + "step": 2318, + "time_per_iteration": 2.6096696853637695 + }, + { + "auxiliary_loss_clip": 0.01149664, + "auxiliary_loss_mlp": 0.01154113, + "balance_loss_clip": 1.0030539, + "balance_loss_mlp": 0.99999887, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8487599226716306, + "language_loss": 0.5904032, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61344099, + "num_input_tokens_seen": 50245160, + "step": 2319, + "time_per_iteration": 3.2149906158447266 + }, + { + "auxiliary_loss_clip": 0.011323, + "auxiliary_loss_mlp": 0.01162544, + "balance_loss_clip": 1.00216842, + "balance_loss_mlp": 1.00108671, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.4565898192870694, + "language_loss": 0.65140843, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67435688, + "num_input_tokens_seen": 50268215, + "step": 2320, + "time_per_iteration": 2.751164436340332 + }, + { + "auxiliary_loss_clip": 0.01115791, + "auxiliary_loss_mlp": 0.01161954, + "balance_loss_clip": 1.00207889, + "balance_loss_mlp": 1.00097382, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.5761205117494788, + "language_loss": 0.71142954, + "learning_rate": 3.875432259883256e-06, + "loss": 0.73420703, + "num_input_tokens_seen": 50288575, + "step": 2321, + "time_per_iteration": 2.691873788833618 + }, + { + "auxiliary_loss_clip": 0.01131781, + "auxiliary_loss_mlp": 0.01162723, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00117028, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 1.9624921738818941, + "language_loss": 0.85698342, + "learning_rate": 3.875296924384965e-06, + "loss": 0.87992847, + "num_input_tokens_seen": 50308735, + "step": 2322, + "time_per_iteration": 2.6853511333465576 + }, + { + "auxiliary_loss_clip": 0.01132174, + "auxiliary_loss_mlp": 0.01161437, + "balance_loss_clip": 1.0019834, + "balance_loss_mlp": 1.00112438, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.5491753872654066, + "language_loss": 0.67101908, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69395518, + "num_input_tokens_seen": 50331025, + "step": 2323, + "time_per_iteration": 2.804856538772583 + }, + { + "auxiliary_loss_clip": 0.0113174, + "auxiliary_loss_mlp": 0.01162938, + "balance_loss_clip": 1.00201678, + "balance_loss_mlp": 1.00119543, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 1.8993005096482842, + "language_loss": 0.89238995, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91533673, + "num_input_tokens_seen": 50349725, + "step": 2324, + "time_per_iteration": 2.6411995887756348 + }, + { + "auxiliary_loss_clip": 0.01163134, + "auxiliary_loss_mlp": 0.01162184, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 1.00101328, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 2.6146304042073494, + "language_loss": 0.71253991, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.73579311, + "num_input_tokens_seen": 50367965, + "step": 2325, + "time_per_iteration": 2.5722134113311768 + }, + { + "auxiliary_loss_clip": 0.01148013, + "auxiliary_loss_mlp": 0.00748876, + "balance_loss_clip": 1.00237, + "balance_loss_mlp": 1.00028718, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 2.366313328244553, + "language_loss": 0.81752563, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83649457, + "num_input_tokens_seen": 50385605, + "step": 2326, + "time_per_iteration": 5.376426696777344 + }, + { + "auxiliary_loss_clip": 0.01163229, + "auxiliary_loss_mlp": 0.01161778, + "balance_loss_clip": 1.00209212, + "balance_loss_mlp": 1.00108385, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.7070305004801165, + "language_loss": 0.89075768, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91400772, + "num_input_tokens_seen": 50403985, + "step": 2327, + "time_per_iteration": 2.6029255390167236 + }, + { + "auxiliary_loss_clip": 0.01130781, + "auxiliary_loss_mlp": 0.01162115, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00123012, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.478384692966244, + "language_loss": 0.85100102, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87392998, + "num_input_tokens_seen": 50421590, + "step": 2328, + "time_per_iteration": 4.069833993911743 + }, + { + "auxiliary_loss_clip": 0.01163279, + "auxiliary_loss_mlp": 0.01162384, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.00111794, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.5290818113480955, + "language_loss": 0.73800558, + "learning_rate": 3.874347585064131e-06, + "loss": 0.7612623, + "num_input_tokens_seen": 50443945, + "step": 2329, + "time_per_iteration": 2.632697343826294 + }, + { + "auxiliary_loss_clip": 0.01163238, + "auxiliary_loss_mlp": 0.01162234, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00115836, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 2.0705458264898433, + "language_loss": 0.78467309, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80792779, + "num_input_tokens_seen": 50462065, + "step": 2330, + "time_per_iteration": 4.010467529296875 + }, + { + "auxiliary_loss_clip": 0.01148112, + "auxiliary_loss_mlp": 0.01162152, + "balance_loss_clip": 1.00208771, + "balance_loss_mlp": 1.00107622, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 1.8672731600941677, + "language_loss": 0.71951246, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74261504, + "num_input_tokens_seen": 50479565, + "step": 2331, + "time_per_iteration": 2.575437545776367 + }, + { + "auxiliary_loss_clip": 0.0117959, + "auxiliary_loss_mlp": 0.01162337, + "balance_loss_clip": 1.00220299, + "balance_loss_mlp": 1.00145221, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.4920422709134176, + "language_loss": 0.72601712, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74943638, + "num_input_tokens_seen": 50497305, + "step": 2332, + "time_per_iteration": 2.5001325607299805 + }, + { + "auxiliary_loss_clip": 0.01161224, + "auxiliary_loss_mlp": 0.01154958, + "balance_loss_clip": 1.00218391, + "balance_loss_mlp": 1.00008106, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8271903001868658, + "language_loss": 0.56107581, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58423764, + "num_input_tokens_seen": 50549735, + "step": 2333, + "time_per_iteration": 2.9869470596313477 + }, + { + "auxiliary_loss_clip": 0.01129935, + "auxiliary_loss_mlp": 0.01161902, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00101757, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.6641569533458627, + "language_loss": 0.82543296, + "learning_rate": 3.873667353183016e-06, + "loss": 0.84835136, + "num_input_tokens_seen": 50570100, + "step": 2334, + "time_per_iteration": 2.679779052734375 + }, + { + "auxiliary_loss_clip": 0.01130871, + "auxiliary_loss_mlp": 0.01161525, + "balance_loss_clip": 1.00187576, + "balance_loss_mlp": 1.00092649, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.9632619098104456, + "language_loss": 0.80969071, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83261466, + "num_input_tokens_seen": 50589185, + "step": 2335, + "time_per_iteration": 2.6860971450805664 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01162135, + "balance_loss_clip": 1.00176966, + "balance_loss_mlp": 1.00105953, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.5313302036313063, + "language_loss": 0.81820488, + "learning_rate": 3.873394763046862e-06, + "loss": 0.8409766, + "num_input_tokens_seen": 50609645, + "step": 2336, + "time_per_iteration": 2.692777633666992 + }, + { + "auxiliary_loss_clip": 0.01163572, + "auxiliary_loss_mlp": 0.01162284, + "balance_loss_clip": 1.00220776, + "balance_loss_mlp": 1.00139928, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.9335965627139728, + "language_loss": 0.80464649, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82790506, + "num_input_tokens_seen": 50628385, + "step": 2337, + "time_per_iteration": 2.596147298812866 + }, + { + "auxiliary_loss_clip": 0.01162993, + "auxiliary_loss_mlp": 0.01161834, + "balance_loss_clip": 1.00203657, + "balance_loss_mlp": 1.00104475, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 1.9634322920373182, + "language_loss": 0.79275918, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81600749, + "num_input_tokens_seen": 50647260, + "step": 2338, + "time_per_iteration": 2.568483352661133 + }, + { + "auxiliary_loss_clip": 0.01163221, + "auxiliary_loss_mlp": 0.01162057, + "balance_loss_clip": 1.00221789, + "balance_loss_mlp": 1.00098133, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.080300900764305, + "language_loss": 0.80116183, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82441461, + "num_input_tokens_seen": 50666130, + "step": 2339, + "time_per_iteration": 2.576690673828125 + }, + { + "auxiliary_loss_clip": 0.01100015, + "auxiliary_loss_mlp": 0.01161953, + "balance_loss_clip": 1.00207877, + "balance_loss_mlp": 1.00116384, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.9035358428955904, + "language_loss": 0.65748417, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68010378, + "num_input_tokens_seen": 50687440, + "step": 2340, + "time_per_iteration": 2.850553274154663 + }, + { + "auxiliary_loss_clip": 0.01163121, + "auxiliary_loss_mlp": 0.01161549, + "balance_loss_clip": 1.00206852, + "balance_loss_mlp": 1.00104582, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.8211730517831515, + "language_loss": 0.78248763, + "learning_rate": 3.87271204460899e-06, + "loss": 0.8057344, + "num_input_tokens_seen": 50704030, + "step": 2341, + "time_per_iteration": 2.619164228439331 + }, + { + "auxiliary_loss_clip": 0.01179625, + "auxiliary_loss_mlp": 0.011619, + "balance_loss_clip": 1.00215888, + "balance_loss_mlp": 1.00120568, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 2.120928700050816, + "language_loss": 0.80006039, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.8234756, + "num_input_tokens_seen": 50723305, + "step": 2342, + "time_per_iteration": 2.501171112060547 + }, + { + "auxiliary_loss_clip": 0.01162942, + "auxiliary_loss_mlp": 0.01161668, + "balance_loss_clip": 1.0021708, + "balance_loss_mlp": 1.00106966, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.8881507816633414, + "language_loss": 0.77958369, + "learning_rate": 3.87243846010358e-06, + "loss": 0.8028298, + "num_input_tokens_seen": 50743270, + "step": 2343, + "time_per_iteration": 2.6249217987060547 + }, + { + "auxiliary_loss_clip": 0.01145418, + "auxiliary_loss_mlp": 0.0115417, + "balance_loss_clip": 1.0021553, + "balance_loss_mlp": 1.00005579, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8386339351222458, + "language_loss": 0.61517668, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63817257, + "num_input_tokens_seen": 50802710, + "step": 2344, + "time_per_iteration": 3.143862247467041 + }, + { + "auxiliary_loss_clip": 0.01163536, + "auxiliary_loss_mlp": 0.01161791, + "balance_loss_clip": 1.00202167, + "balance_loss_mlp": 1.0010972, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.5213503648028197, + "language_loss": 0.64847922, + "learning_rate": 3.872164591585956e-06, + "loss": 0.67173254, + "num_input_tokens_seen": 50822625, + "step": 2345, + "time_per_iteration": 2.64886212348938 + }, + { + "auxiliary_loss_clip": 0.01164012, + "auxiliary_loss_mlp": 0.01161997, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00073099, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.408584270909962, + "language_loss": 0.73443747, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.75769758, + "num_input_tokens_seen": 50842330, + "step": 2346, + "time_per_iteration": 2.578582286834717 + }, + { + "auxiliary_loss_clip": 0.01163107, + "auxiliary_loss_mlp": 0.0116188, + "balance_loss_clip": 1.00214255, + "balance_loss_mlp": 1.0009948, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 1.821028481615988, + "language_loss": 0.77222443, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.79547429, + "num_input_tokens_seen": 50861035, + "step": 2347, + "time_per_iteration": 2.5859880447387695 + }, + { + "auxiliary_loss_clip": 0.01179696, + "auxiliary_loss_mlp": 0.01162098, + "balance_loss_clip": 1.00218475, + "balance_loss_mlp": 1.0013088, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.6850470030599625, + "language_loss": 0.771438, + "learning_rate": 3.8717532563775e-06, + "loss": 0.79485595, + "num_input_tokens_seen": 50880105, + "step": 2348, + "time_per_iteration": 2.5747287273406982 + }, + { + "auxiliary_loss_clip": 0.011639, + "auxiliary_loss_mlp": 0.01161415, + "balance_loss_clip": 1.00215721, + "balance_loss_mlp": 1.0010072, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.5832806481871968, + "language_loss": 0.86590028, + "learning_rate": 3.871616002680272e-06, + "loss": 0.88915348, + "num_input_tokens_seen": 50897720, + "step": 2349, + "time_per_iteration": 2.5604453086853027 + }, + { + "auxiliary_loss_clip": 0.01162808, + "auxiliary_loss_mlp": 0.01162067, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00118184, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.6547580341623036, + "language_loss": 0.89364487, + "learning_rate": 3.871478678011177e-06, + "loss": 0.9168936, + "num_input_tokens_seen": 50918385, + "step": 2350, + "time_per_iteration": 2.6150898933410645 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01162004, + "balance_loss_clip": 1.00207591, + "balance_loss_mlp": 1.00102341, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.6942351899976644, + "language_loss": 0.80953366, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83262718, + "num_input_tokens_seen": 50938270, + "step": 2351, + "time_per_iteration": 2.6375842094421387 + }, + { + "auxiliary_loss_clip": 0.01164037, + "auxiliary_loss_mlp": 0.01161963, + "balance_loss_clip": 1.00213027, + "balance_loss_mlp": 1.00098264, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.7363003302161553, + "language_loss": 0.83185238, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85511231, + "num_input_tokens_seen": 50958155, + "step": 2352, + "time_per_iteration": 2.622180938720703 + }, + { + "auxiliary_loss_clip": 0.01162067, + "auxiliary_loss_mlp": 0.01154404, + "balance_loss_clip": 1.00240922, + "balance_loss_mlp": 1.0002898, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.918294244489875, + "language_loss": 0.62000346, + "learning_rate": 3.87106627822478e-06, + "loss": 0.64316821, + "num_input_tokens_seen": 51020705, + "step": 2353, + "time_per_iteration": 3.0835742950439453 + }, + { + "auxiliary_loss_clip": 0.01146159, + "auxiliary_loss_mlp": 0.01161961, + "balance_loss_clip": 1.00193191, + "balance_loss_mlp": 1.00126708, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.5896382259469286, + "language_loss": 0.8715663, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89464748, + "num_input_tokens_seen": 51039995, + "step": 2354, + "time_per_iteration": 2.620642900466919 + }, + { + "auxiliary_loss_clip": 0.01129885, + "auxiliary_loss_mlp": 0.01161951, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00097084, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.639835220374508, + "language_loss": 0.74671471, + "learning_rate": 3.870790990270057e-06, + "loss": 0.76963305, + "num_input_tokens_seen": 51059075, + "step": 2355, + "time_per_iteration": 2.6169509887695312 + }, + { + "auxiliary_loss_clip": 0.01161972, + "auxiliary_loss_mlp": 0.01154328, + "balance_loss_clip": 1.00235963, + "balance_loss_mlp": 1.00021422, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6772395077808141, + "language_loss": 0.51858306, + "learning_rate": 3.870653239879212e-06, + "loss": 0.54174602, + "num_input_tokens_seen": 51120380, + "step": 2356, + "time_per_iteration": 3.0591371059417725 + }, + { + "auxiliary_loss_clip": 0.01179523, + "auxiliary_loss_mlp": 0.01161978, + "balance_loss_clip": 1.00217581, + "balance_loss_mlp": 1.00137937, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 1.8182238750236628, + "language_loss": 0.70660603, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.730021, + "num_input_tokens_seen": 51136950, + "step": 2357, + "time_per_iteration": 2.5259134769439697 + }, + { + "auxiliary_loss_clip": 0.01132083, + "auxiliary_loss_mlp": 0.01162401, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.00123024, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.7946027231253543, + "language_loss": 0.81985986, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84280467, + "num_input_tokens_seen": 51155175, + "step": 2358, + "time_per_iteration": 2.6960208415985107 + }, + { + "auxiliary_loss_clip": 0.0114663, + "auxiliary_loss_mlp": 0.01162187, + "balance_loss_clip": 1.00202739, + "balance_loss_mlp": 1.0010159, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 2.2259896889368425, + "language_loss": 0.71882892, + "learning_rate": 3.870239563115436e-06, + "loss": 0.74191701, + "num_input_tokens_seen": 51174500, + "step": 2359, + "time_per_iteration": 2.6198694705963135 + }, + { + "auxiliary_loss_clip": 0.01118289, + "auxiliary_loss_mlp": 0.00748969, + "balance_loss_clip": 1.00313365, + "balance_loss_mlp": 1.00041556, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 2.5077072930169124, + "language_loss": 0.75762808, + "learning_rate": 3.870101529014526e-06, + "loss": 0.77630067, + "num_input_tokens_seen": 51194270, + "step": 2360, + "time_per_iteration": 2.700552225112915 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.0116161, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00091577, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 1.9968207593262033, + "language_loss": 0.8167904, + "learning_rate": 3.869963423999178e-06, + "loss": 0.83955157, + "num_input_tokens_seen": 51211850, + "step": 2361, + "time_per_iteration": 2.665844440460205 + }, + { + "auxiliary_loss_clip": 0.01162867, + "auxiliary_loss_mlp": 0.01162269, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00128841, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 1.6697638042268794, + "language_loss": 0.74004889, + "learning_rate": 3.86982524807463e-06, + "loss": 0.7633003, + "num_input_tokens_seen": 51233545, + "step": 2362, + "time_per_iteration": 2.6604955196380615 + }, + { + "auxiliary_loss_clip": 0.01163036, + "auxiliary_loss_mlp": 0.01161554, + "balance_loss_clip": 1.00209594, + "balance_loss_mlp": 1.00133657, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 1.5967612283465957, + "language_loss": 0.73911494, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76236087, + "num_input_tokens_seen": 51257615, + "step": 2363, + "time_per_iteration": 4.173407793045044 + }, + { + "auxiliary_loss_clip": 0.01132233, + "auxiliary_loss_mlp": 0.0116182, + "balance_loss_clip": 1.00213552, + "balance_loss_mlp": 1.00112605, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.6710102054370581, + "language_loss": 0.72980595, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75274652, + "num_input_tokens_seen": 51279645, + "step": 2364, + "time_per_iteration": 4.116501808166504 + }, + { + "auxiliary_loss_clip": 0.01147426, + "auxiliary_loss_mlp": 0.01161857, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.0013535, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 1.8051112198641894, + "language_loss": 0.9065572, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92965007, + "num_input_tokens_seen": 51299775, + "step": 2365, + "time_per_iteration": 2.658543348312378 + }, + { + "auxiliary_loss_clip": 0.011484, + "auxiliary_loss_mlp": 0.01161903, + "balance_loss_clip": 1.00220132, + "balance_loss_mlp": 1.00120914, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.7933308932168661, + "language_loss": 0.65430045, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67740345, + "num_input_tokens_seen": 51319430, + "step": 2366, + "time_per_iteration": 3.999436378479004 + }, + { + "auxiliary_loss_clip": 0.01152204, + "auxiliary_loss_mlp": 0.01161491, + "balance_loss_clip": 1.00348401, + "balance_loss_mlp": 1.00098789, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.245818913662251, + "language_loss": 0.80588496, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82902193, + "num_input_tokens_seen": 51336045, + "step": 2367, + "time_per_iteration": 4.0127272605896 + }, + { + "auxiliary_loss_clip": 0.01148395, + "auxiliary_loss_mlp": 0.01162155, + "balance_loss_clip": 1.00218487, + "balance_loss_mlp": 1.00117528, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 1.917084345755208, + "language_loss": 0.82443744, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84754294, + "num_input_tokens_seen": 51357030, + "step": 2368, + "time_per_iteration": 2.6982173919677734 + }, + { + "auxiliary_loss_clip": 0.01131416, + "auxiliary_loss_mlp": 0.01162155, + "balance_loss_clip": 1.00209343, + "balance_loss_mlp": 1.00098419, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.4521797057063686, + "language_loss": 0.86797827, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89091396, + "num_input_tokens_seen": 51374890, + "step": 2369, + "time_per_iteration": 2.7557265758514404 + }, + { + "auxiliary_loss_clip": 0.01131079, + "auxiliary_loss_mlp": 0.011623, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00131929, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.5209777457474125, + "language_loss": 0.75959492, + "learning_rate": 3.868717288576354e-06, + "loss": 0.78252864, + "num_input_tokens_seen": 51398100, + "step": 2370, + "time_per_iteration": 2.7147374153137207 + }, + { + "auxiliary_loss_clip": 0.01163712, + "auxiliary_loss_mlp": 0.00748968, + "balance_loss_clip": 1.0020442, + "balance_loss_mlp": 1.00045753, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.6091219563000232, + "language_loss": 0.83181, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85093677, + "num_input_tokens_seen": 51418745, + "step": 2371, + "time_per_iteration": 2.581564426422119 + }, + { + "auxiliary_loss_clip": 0.01179698, + "auxiliary_loss_mlp": 0.01162438, + "balance_loss_clip": 1.00227666, + "balance_loss_mlp": 1.0011723, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.7887226457166139, + "language_loss": 0.828565, + "learning_rate": 3.868439589977181e-06, + "loss": 0.85198629, + "num_input_tokens_seen": 51437455, + "step": 2372, + "time_per_iteration": 2.500910758972168 + }, + { + "auxiliary_loss_clip": 0.01179588, + "auxiliary_loss_mlp": 0.01161878, + "balance_loss_clip": 1.00221574, + "balance_loss_mlp": 1.00118375, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.0259396241614605, + "language_loss": 0.85054147, + "learning_rate": 3.868300634397836e-06, + "loss": 0.87395608, + "num_input_tokens_seen": 51455710, + "step": 2373, + "time_per_iteration": 2.522905111312866 + }, + { + "auxiliary_loss_clip": 0.01147356, + "auxiliary_loss_mlp": 0.0116214, + "balance_loss_clip": 1.00216138, + "balance_loss_mlp": 1.00154138, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.0358556664703205, + "language_loss": 0.85906041, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88215536, + "num_input_tokens_seen": 51471270, + "step": 2374, + "time_per_iteration": 2.5705275535583496 + }, + { + "auxiliary_loss_clip": 0.0116356, + "auxiliary_loss_mlp": 0.0116159, + "balance_loss_clip": 1.00212848, + "balance_loss_mlp": 1.00099111, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 1.776365013640241, + "language_loss": 0.79439789, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81764936, + "num_input_tokens_seen": 51492705, + "step": 2375, + "time_per_iteration": 2.6297147274017334 + }, + { + "auxiliary_loss_clip": 0.01163622, + "auxiliary_loss_mlp": 0.01162008, + "balance_loss_clip": 1.00220537, + "balance_loss_mlp": 1.00131392, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.08769154305321, + "language_loss": 0.7649281, + "learning_rate": 3.867883342604009e-06, + "loss": 0.7881844, + "num_input_tokens_seen": 51510780, + "step": 2376, + "time_per_iteration": 2.5385329723358154 + }, + { + "auxiliary_loss_clip": 0.01163666, + "auxiliary_loss_mlp": 0.0116191, + "balance_loss_clip": 1.00224781, + "balance_loss_mlp": 1.00093007, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 1.7345758423165885, + "language_loss": 0.93062735, + "learning_rate": 3.867744103671717e-06, + "loss": 0.95388305, + "num_input_tokens_seen": 51531400, + "step": 2377, + "time_per_iteration": 2.610672950744629 + }, + { + "auxiliary_loss_clip": 0.01147272, + "auxiliary_loss_mlp": 0.01161791, + "balance_loss_clip": 1.00205541, + "balance_loss_mlp": 1.0010016, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.7651043604269747, + "language_loss": 0.91679859, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93988925, + "num_input_tokens_seen": 51548215, + "step": 2378, + "time_per_iteration": 2.59808087348938 + }, + { + "auxiliary_loss_clip": 0.01162994, + "auxiliary_loss_mlp": 0.01161903, + "balance_loss_clip": 1.00207019, + "balance_loss_mlp": 1.00120866, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.8412088106194686, + "language_loss": 0.7384032, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76165217, + "num_input_tokens_seen": 51566820, + "step": 2379, + "time_per_iteration": 2.562154769897461 + }, + { + "auxiliary_loss_clip": 0.01130207, + "auxiliary_loss_mlp": 0.01161777, + "balance_loss_clip": 1.00197136, + "balance_loss_mlp": 1.00127387, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.788880326956046, + "language_loss": 0.78653097, + "learning_rate": 3.867325961945714e-06, + "loss": 0.80945075, + "num_input_tokens_seen": 51585075, + "step": 2380, + "time_per_iteration": 2.656400680541992 + }, + { + "auxiliary_loss_clip": 0.01130985, + "auxiliary_loss_mlp": 0.01162092, + "balance_loss_clip": 1.00226879, + "balance_loss_mlp": 1.00111222, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.0329597000500934, + "language_loss": 0.88204312, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90497392, + "num_input_tokens_seen": 51603185, + "step": 2381, + "time_per_iteration": 2.616591215133667 + }, + { + "auxiliary_loss_clip": 0.01147449, + "auxiliary_loss_mlp": 0.01161874, + "balance_loss_clip": 1.00215399, + "balance_loss_mlp": 1.00108457, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.2160197925663456, + "language_loss": 0.76827478, + "learning_rate": 3.867046846740299e-06, + "loss": 0.79136801, + "num_input_tokens_seen": 51620880, + "step": 2382, + "time_per_iteration": 2.646763563156128 + }, + { + "auxiliary_loss_clip": 0.01129899, + "auxiliary_loss_mlp": 0.0116192, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00103521, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.350645534101902, + "language_loss": 0.76221406, + "learning_rate": 3.866907182937039e-06, + "loss": 0.78513229, + "num_input_tokens_seen": 51640170, + "step": 2383, + "time_per_iteration": 2.6782665252685547 + }, + { + "auxiliary_loss_clip": 0.01131095, + "auxiliary_loss_mlp": 0.01162225, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00114906, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.7092646952253214, + "language_loss": 0.88165021, + "learning_rate": 3.866767448340471e-06, + "loss": 0.90458339, + "num_input_tokens_seen": 51656580, + "step": 2384, + "time_per_iteration": 2.6410393714904785 + }, + { + "auxiliary_loss_clip": 0.01163111, + "auxiliary_loss_mlp": 0.01162442, + "balance_loss_clip": 1.00207591, + "balance_loss_mlp": 1.00127077, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 3.820906609505425, + "language_loss": 0.79593539, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81919098, + "num_input_tokens_seen": 51674645, + "step": 2385, + "time_per_iteration": 2.538398265838623 + }, + { + "auxiliary_loss_clip": 0.01163965, + "auxiliary_loss_mlp": 0.01161376, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.00115919, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.58847563580127, + "language_loss": 0.75512606, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77837944, + "num_input_tokens_seen": 51695770, + "step": 2386, + "time_per_iteration": 2.6416189670562744 + }, + { + "auxiliary_loss_clip": 0.0117968, + "auxiliary_loss_mlp": 0.01161611, + "balance_loss_clip": 1.00228965, + "balance_loss_mlp": 1.00091684, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 1.944558134080662, + "language_loss": 0.78729236, + "learning_rate": 3.866347819843925e-06, + "loss": 0.81070524, + "num_input_tokens_seen": 51714165, + "step": 2387, + "time_per_iteration": 2.522610664367676 + }, + { + "auxiliary_loss_clip": 0.01146461, + "auxiliary_loss_mlp": 0.011619, + "balance_loss_clip": 1.00187886, + "balance_loss_mlp": 1.00111079, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.3044629764225437, + "language_loss": 0.82155824, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84464186, + "num_input_tokens_seen": 51734440, + "step": 2388, + "time_per_iteration": 2.672879695892334 + }, + { + "auxiliary_loss_clip": 0.01164021, + "auxiliary_loss_mlp": 0.01162005, + "balance_loss_clip": 1.00227952, + "balance_loss_mlp": 1.00112009, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.2754811457544557, + "language_loss": 0.82322258, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84648281, + "num_input_tokens_seen": 51753730, + "step": 2389, + "time_per_iteration": 2.649017810821533 + }, + { + "auxiliary_loss_clip": 0.01147348, + "auxiliary_loss_mlp": 0.0116225, + "balance_loss_clip": 1.00196707, + "balance_loss_mlp": 1.00117409, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.011531421948951, + "language_loss": 0.82679552, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.84989154, + "num_input_tokens_seen": 51771195, + "step": 2390, + "time_per_iteration": 2.595837116241455 + }, + { + "auxiliary_loss_clip": 0.01163566, + "auxiliary_loss_mlp": 0.01162194, + "balance_loss_clip": 1.00228405, + "balance_loss_mlp": 1.00130868, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 1.7726202697581435, + "language_loss": 0.75327408, + "learning_rate": 3.865787324397324e-06, + "loss": 0.7765317, + "num_input_tokens_seen": 51792290, + "step": 2391, + "time_per_iteration": 2.6511714458465576 + }, + { + "auxiliary_loss_clip": 0.01129165, + "auxiliary_loss_mlp": 0.01154345, + "balance_loss_clip": 1.00187862, + "balance_loss_mlp": 1.00023174, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8668858246231079, + "language_loss": 0.61832297, + "learning_rate": 3.865647023645277e-06, + "loss": 0.64115798, + "num_input_tokens_seen": 51843675, + "step": 2392, + "time_per_iteration": 3.05810546875 + }, + { + "auxiliary_loss_clip": 0.01162968, + "auxiliary_loss_mlp": 0.01162403, + "balance_loss_clip": 1.00203359, + "balance_loss_mlp": 1.00142241, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.3031255493456095, + "language_loss": 0.76932061, + "learning_rate": 3.865506652147709e-06, + "loss": 0.79257429, + "num_input_tokens_seen": 51860285, + "step": 2393, + "time_per_iteration": 2.515394449234009 + }, + { + "auxiliary_loss_clip": 0.0117959, + "auxiliary_loss_mlp": 0.01161836, + "balance_loss_clip": 1.0021832, + "balance_loss_mlp": 1.00123739, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.7327752494634663, + "language_loss": 0.76399112, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78740537, + "num_input_tokens_seen": 51880105, + "step": 2394, + "time_per_iteration": 2.566645860671997 + }, + { + "auxiliary_loss_clip": 0.01179494, + "auxiliary_loss_mlp": 0.01161855, + "balance_loss_clip": 1.00218034, + "balance_loss_mlp": 1.0011605, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.5966532104210225, + "language_loss": 0.85912549, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88253903, + "num_input_tokens_seen": 51905175, + "step": 2395, + "time_per_iteration": 2.7077012062072754 + }, + { + "auxiliary_loss_clip": 0.01136467, + "auxiliary_loss_mlp": 0.01161643, + "balance_loss_clip": 1.00262165, + "balance_loss_mlp": 1.00104451, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5153081261630752, + "language_loss": 0.83150846, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85448956, + "num_input_tokens_seen": 51924490, + "step": 2396, + "time_per_iteration": 2.663546562194824 + }, + { + "auxiliary_loss_clip": 0.01148182, + "auxiliary_loss_mlp": 0.00748893, + "balance_loss_clip": 1.00213075, + "balance_loss_mlp": 1.00040436, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 9.80596098017989, + "language_loss": 0.82943147, + "learning_rate": 3.864944458808712e-06, + "loss": 0.8484022, + "num_input_tokens_seen": 51940490, + "step": 2397, + "time_per_iteration": 2.5848398208618164 + }, + { + "auxiliary_loss_clip": 0.01179577, + "auxiliary_loss_mlp": 0.01161899, + "balance_loss_clip": 1.00220191, + "balance_loss_mlp": 1.00101447, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 2.0462153835773065, + "language_loss": 0.80333501, + "learning_rate": 3.86480373366343e-06, + "loss": 0.8267498, + "num_input_tokens_seen": 51957910, + "step": 2398, + "time_per_iteration": 2.5063648223876953 + }, + { + "auxiliary_loss_clip": 0.01163815, + "auxiliary_loss_mlp": 0.01161451, + "balance_loss_clip": 1.002213, + "balance_loss_mlp": 1.00123358, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 1.764769805466545, + "language_loss": 0.64780819, + "learning_rate": 3.864662937804603e-06, + "loss": 0.6710608, + "num_input_tokens_seen": 51978010, + "step": 2399, + "time_per_iteration": 2.6085283756256104 + }, + { + "auxiliary_loss_clip": 0.01148215, + "auxiliary_loss_mlp": 0.01161808, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.00120878, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.541551831680603, + "language_loss": 0.82310152, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84620178, + "num_input_tokens_seen": 51998515, + "step": 2400, + "time_per_iteration": 2.618917226791382 + }, + { + "auxiliary_loss_clip": 0.01146934, + "auxiliary_loss_mlp": 0.01161797, + "balance_loss_clip": 1.00200772, + "balance_loss_mlp": 1.00129366, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.5426252691391267, + "language_loss": 0.74782884, + "learning_rate": 3.864381133967676e-06, + "loss": 0.7709161, + "num_input_tokens_seen": 52019270, + "step": 2401, + "time_per_iteration": 4.155958890914917 + }, + { + "auxiliary_loss_clip": 0.01147308, + "auxiliary_loss_mlp": 0.01161052, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00102532, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.4973644470156533, + "language_loss": 0.80652833, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82961202, + "num_input_tokens_seen": 52039315, + "step": 2402, + "time_per_iteration": 2.6009323596954346 + }, + { + "auxiliary_loss_clip": 0.0113117, + "auxiliary_loss_mlp": 0.01161572, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00106895, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.871713179150071, + "language_loss": 0.84370559, + "learning_rate": 3.864099047340673e-06, + "loss": 0.86663306, + "num_input_tokens_seen": 52056555, + "step": 2403, + "time_per_iteration": 4.019186735153198 + }, + { + "auxiliary_loss_clip": 0.01135154, + "auxiliary_loss_mlp": 0.00749004, + "balance_loss_clip": 1.00238419, + "balance_loss_mlp": 1.00040424, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 1.6917291657070912, + "language_loss": 0.70011777, + "learning_rate": 3.863957897994262e-06, + "loss": 0.71895933, + "num_input_tokens_seen": 52075800, + "step": 2404, + "time_per_iteration": 2.672536611557007 + }, + { + "auxiliary_loss_clip": 0.01147114, + "auxiliary_loss_mlp": 0.01161221, + "balance_loss_clip": 1.00198209, + "balance_loss_mlp": 1.00109911, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.062035356537528, + "language_loss": 0.73535287, + "learning_rate": 3.863816677966381e-06, + "loss": 0.7584362, + "num_input_tokens_seen": 52092585, + "step": 2405, + "time_per_iteration": 2.574206829071045 + }, + { + "auxiliary_loss_clip": 0.01115663, + "auxiliary_loss_mlp": 0.01161152, + "balance_loss_clip": 1.00201857, + "balance_loss_mlp": 1.00103045, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.1237477461646836, + "language_loss": 0.72780383, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75057197, + "num_input_tokens_seen": 52108990, + "step": 2406, + "time_per_iteration": 4.083793640136719 + }, + { + "auxiliary_loss_clip": 0.01162975, + "auxiliary_loss_mlp": 0.01161799, + "balance_loss_clip": 1.00197947, + "balance_loss_mlp": 1.0012002, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 2.4112408138162063, + "language_loss": 0.75771159, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.78095931, + "num_input_tokens_seen": 52125385, + "step": 2407, + "time_per_iteration": 2.615443706512451 + }, + { + "auxiliary_loss_clip": 0.01179178, + "auxiliary_loss_mlp": 0.01161067, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.00104094, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.581086004197449, + "language_loss": 0.79188812, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81529057, + "num_input_tokens_seen": 52144985, + "step": 2408, + "time_per_iteration": 2.531451463699341 + }, + { + "auxiliary_loss_clip": 0.01162765, + "auxiliary_loss_mlp": 0.01161542, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00132465, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 1.7903085542542296, + "language_loss": 0.82068789, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84393096, + "num_input_tokens_seen": 52163885, + "step": 2409, + "time_per_iteration": 2.5606496334075928 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.0116174, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00104618, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.9872081192457105, + "language_loss": 0.74715912, + "learning_rate": 3.863109517792446e-06, + "loss": 0.76993239, + "num_input_tokens_seen": 52184325, + "step": 2410, + "time_per_iteration": 2.7843120098114014 + }, + { + "auxiliary_loss_clip": 0.01179433, + "auxiliary_loss_mlp": 0.01161293, + "balance_loss_clip": 1.00222504, + "balance_loss_mlp": 1.00098026, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 1.849611360767686, + "language_loss": 0.81434745, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83775473, + "num_input_tokens_seen": 52202740, + "step": 2411, + "time_per_iteration": 2.53597092628479 + }, + { + "auxiliary_loss_clip": 0.01151192, + "auxiliary_loss_mlp": 0.0116145, + "balance_loss_clip": 1.00248885, + "balance_loss_mlp": 1.00132775, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.9846169557026758, + "language_loss": 0.70049679, + "learning_rate": 3.862826159140214e-06, + "loss": 0.72362322, + "num_input_tokens_seen": 52223100, + "step": 2412, + "time_per_iteration": 2.6879050731658936 + }, + { + "auxiliary_loss_clip": 0.01163308, + "auxiliary_loss_mlp": 0.01161609, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.00120139, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.157491861057881, + "language_loss": 0.77079082, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79403996, + "num_input_tokens_seen": 52239690, + "step": 2413, + "time_per_iteration": 2.537616729736328 + }, + { + "auxiliary_loss_clip": 0.01160621, + "auxiliary_loss_mlp": 0.01153453, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00010228, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9141596299185843, + "language_loss": 0.58877939, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.61192012, + "num_input_tokens_seen": 52296705, + "step": 2414, + "time_per_iteration": 3.0779740810394287 + }, + { + "auxiliary_loss_clip": 0.01143336, + "auxiliary_loss_mlp": 0.01153354, + "balance_loss_clip": 1.00167692, + "balance_loss_mlp": 1.00000334, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8549998904887383, + "language_loss": 0.62230903, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64527595, + "num_input_tokens_seen": 52361830, + "step": 2415, + "time_per_iteration": 3.149965763092041 + }, + { + "auxiliary_loss_clip": 0.01162692, + "auxiliary_loss_mlp": 0.01161116, + "balance_loss_clip": 1.00201988, + "balance_loss_mlp": 1.00108981, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.858323347391317, + "language_loss": 0.72103328, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74427134, + "num_input_tokens_seen": 52379420, + "step": 2416, + "time_per_iteration": 2.5565104484558105 + }, + { + "auxiliary_loss_clip": 0.01129271, + "auxiliary_loss_mlp": 0.01152541, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 0.99995291, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.7071631147061872, + "language_loss": 0.60393202, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62675011, + "num_input_tokens_seen": 52446290, + "step": 2417, + "time_per_iteration": 3.2265377044677734 + }, + { + "auxiliary_loss_clip": 0.01179322, + "auxiliary_loss_mlp": 0.01161386, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.00097823, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.593897755359951, + "language_loss": 0.79157519, + "learning_rate": 3.861974388030356e-06, + "loss": 0.8149823, + "num_input_tokens_seen": 52467295, + "step": 2418, + "time_per_iteration": 2.5983893871307373 + }, + { + "auxiliary_loss_clip": 0.01132203, + "auxiliary_loss_mlp": 0.01160703, + "balance_loss_clip": 1.0019505, + "balance_loss_mlp": 1.00096273, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.7413627279544959, + "language_loss": 0.71582854, + "learning_rate": 3.861832179025394e-06, + "loss": 0.73875761, + "num_input_tokens_seen": 52487295, + "step": 2419, + "time_per_iteration": 2.6633825302124023 + }, + { + "auxiliary_loss_clip": 0.01150757, + "auxiliary_loss_mlp": 0.01161126, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00090861, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.2511438046427017, + "language_loss": 0.90124524, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92436409, + "num_input_tokens_seen": 52504220, + "step": 2420, + "time_per_iteration": 2.617574453353882 + }, + { + "auxiliary_loss_clip": 0.01163302, + "auxiliary_loss_mlp": 0.01160967, + "balance_loss_clip": 1.00206065, + "balance_loss_mlp": 1.00103533, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 1.7896733954712816, + "language_loss": 0.82860941, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85185212, + "num_input_tokens_seen": 52521900, + "step": 2421, + "time_per_iteration": 2.5517923831939697 + }, + { + "auxiliary_loss_clip": 0.01115804, + "auxiliary_loss_mlp": 0.01160787, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.00104702, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.4408213052007814, + "language_loss": 0.81557769, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83834362, + "num_input_tokens_seen": 52540495, + "step": 2422, + "time_per_iteration": 2.7049903869628906 + }, + { + "auxiliary_loss_clip": 0.01130197, + "auxiliary_loss_mlp": 0.00748327, + "balance_loss_clip": 1.00250912, + "balance_loss_mlp": 0.999578, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9311539675295366, + "language_loss": 0.6333009, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65208614, + "num_input_tokens_seen": 52603305, + "step": 2423, + "time_per_iteration": 3.2341504096984863 + }, + { + "auxiliary_loss_clip": 0.01113638, + "auxiliary_loss_mlp": 0.00748934, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00047505, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.7992339122765764, + "language_loss": 0.82620633, + "learning_rate": 3.861120075095585e-06, + "loss": 0.844832, + "num_input_tokens_seen": 52623435, + "step": 2424, + "time_per_iteration": 2.7287745475769043 + }, + { + "auxiliary_loss_clip": 0.01146105, + "auxiliary_loss_mlp": 0.01160803, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00106215, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.280318092946193, + "language_loss": 0.78603721, + "learning_rate": 3.860977442566429e-06, + "loss": 0.80910629, + "num_input_tokens_seen": 52642255, + "step": 2425, + "time_per_iteration": 2.6164841651916504 + }, + { + "auxiliary_loss_clip": 0.01162861, + "auxiliary_loss_mlp": 0.01161073, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00114179, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.4959014973827323, + "language_loss": 0.83284378, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85608315, + "num_input_tokens_seen": 52658700, + "step": 2426, + "time_per_iteration": 2.575772762298584 + }, + { + "auxiliary_loss_clip": 0.01179439, + "auxiliary_loss_mlp": 0.01160772, + "balance_loss_clip": 1.00223041, + "balance_loss_mlp": 1.00122261, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 2.0884317463561377, + "language_loss": 0.87400061, + "learning_rate": 3.860691965808173e-06, + "loss": 0.8974027, + "num_input_tokens_seen": 52678140, + "step": 2427, + "time_per_iteration": 2.5514492988586426 + }, + { + "auxiliary_loss_clip": 0.01132186, + "auxiliary_loss_mlp": 0.01161153, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00112653, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.8168495047382176, + "language_loss": 0.66989803, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69283146, + "num_input_tokens_seen": 52696825, + "step": 2428, + "time_per_iteration": 2.6342196464538574 + }, + { + "auxiliary_loss_clip": 0.01163681, + "auxiliary_loss_mlp": 0.01160852, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00111187, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 2.8626820496963754, + "language_loss": 0.83870137, + "learning_rate": 3.860406206819417e-06, + "loss": 0.8619467, + "num_input_tokens_seen": 52715125, + "step": 2429, + "time_per_iteration": 2.5814826488494873 + }, + { + "auxiliary_loss_clip": 0.0113097, + "auxiliary_loss_mlp": 0.01160821, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.001176, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.6213517920622957, + "language_loss": 0.79098654, + "learning_rate": 3.860263221502145e-06, + "loss": 0.8139044, + "num_input_tokens_seen": 52734015, + "step": 2430, + "time_per_iteration": 2.6392173767089844 + }, + { + "auxiliary_loss_clip": 0.01179673, + "auxiliary_loss_mlp": 0.01161318, + "balance_loss_clip": 1.00236583, + "balance_loss_mlp": 1.00129128, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 3.143519923416739, + "language_loss": 0.82519281, + "learning_rate": 3.860120165643504e-06, + "loss": 0.84860277, + "num_input_tokens_seen": 52753025, + "step": 2431, + "time_per_iteration": 2.5602478981018066 + }, + { + "auxiliary_loss_clip": 0.01163933, + "auxiliary_loss_mlp": 0.01161314, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.00119174, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 2.1646280924771033, + "language_loss": 0.7888912, + "learning_rate": 3.859977039248921e-06, + "loss": 0.81214368, + "num_input_tokens_seen": 52773420, + "step": 2432, + "time_per_iteration": 2.5658929347991943 + }, + { + "auxiliary_loss_clip": 0.01179439, + "auxiliary_loss_mlp": 0.00749037, + "balance_loss_clip": 1.00212741, + "balance_loss_mlp": 1.0004586, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 2.1231300815163445, + "language_loss": 0.79968727, + "learning_rate": 3.859833842323822e-06, + "loss": 0.81897199, + "num_input_tokens_seen": 52792870, + "step": 2433, + "time_per_iteration": 2.61244797706604 + }, + { + "auxiliary_loss_clip": 0.0113113, + "auxiliary_loss_mlp": 0.01160874, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00103807, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.976476450127108, + "language_loss": 0.78005081, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80297083, + "num_input_tokens_seen": 52811615, + "step": 2434, + "time_per_iteration": 2.621415138244629 + }, + { + "auxiliary_loss_clip": 0.01132957, + "auxiliary_loss_mlp": 0.01152593, + "balance_loss_clip": 1.00233865, + "balance_loss_mlp": 1.00000477, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8406370177517063, + "language_loss": 0.58414602, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60700154, + "num_input_tokens_seen": 52873230, + "step": 2435, + "time_per_iteration": 3.199378252029419 + }, + { + "auxiliary_loss_clip": 0.01179073, + "auxiliary_loss_mlp": 0.01160377, + "balance_loss_clip": 1.00203872, + "balance_loss_mlp": 1.00101817, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.466897556424509, + "language_loss": 0.8809132, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90430772, + "num_input_tokens_seen": 52889325, + "step": 2436, + "time_per_iteration": 2.551884174346924 + }, + { + "auxiliary_loss_clip": 0.01162741, + "auxiliary_loss_mlp": 0.00748975, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.0005182, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 1.8916498103825887, + "language_loss": 0.74595487, + "learning_rate": 3.85926034942691e-06, + "loss": 0.76507199, + "num_input_tokens_seen": 52909705, + "step": 2437, + "time_per_iteration": 2.576550006866455 + }, + { + "auxiliary_loss_clip": 0.01179327, + "auxiliary_loss_mlp": 0.01160824, + "balance_loss_clip": 1.00203919, + "balance_loss_mlp": 1.00098825, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 3.68225842588486, + "language_loss": 0.74041826, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76381975, + "num_input_tokens_seen": 52930300, + "step": 2438, + "time_per_iteration": 2.575773239135742 + }, + { + "auxiliary_loss_clip": 0.01162801, + "auxiliary_loss_mlp": 0.0116084, + "balance_loss_clip": 1.00214195, + "balance_loss_mlp": 1.00100422, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.9136354109490603, + "language_loss": 0.74772906, + "learning_rate": 3.858973179936668e-06, + "loss": 0.77096546, + "num_input_tokens_seen": 52949955, + "step": 2439, + "time_per_iteration": 4.06809139251709 + }, + { + "auxiliary_loss_clip": 0.0116274, + "auxiliary_loss_mlp": 0.01160871, + "balance_loss_clip": 1.00203443, + "balance_loss_mlp": 1.00122571, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 4.586536207667379, + "language_loss": 0.74412966, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76736581, + "num_input_tokens_seen": 52972905, + "step": 2440, + "time_per_iteration": 2.7217442989349365 + }, + { + "auxiliary_loss_clip": 0.01179115, + "auxiliary_loss_mlp": 0.01160708, + "balance_loss_clip": 1.00201631, + "balance_loss_mlp": 1.00106311, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.5984496959825165, + "language_loss": 0.82964504, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85304332, + "num_input_tokens_seen": 52994850, + "step": 2441, + "time_per_iteration": 3.9458234310150146 + }, + { + "auxiliary_loss_clip": 0.01163441, + "auxiliary_loss_mlp": 0.01161521, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00101781, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 2.104428409000799, + "language_loss": 0.72251511, + "learning_rate": 3.858541897021563e-06, + "loss": 0.74576473, + "num_input_tokens_seen": 53014740, + "step": 2442, + "time_per_iteration": 2.5544636249542236 + }, + { + "auxiliary_loss_clip": 0.01134741, + "auxiliary_loss_mlp": 0.01161017, + "balance_loss_clip": 1.0023123, + "balance_loss_mlp": 1.0008955, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 2.502949242659379, + "language_loss": 0.80504709, + "learning_rate": 3.8583979950904e-06, + "loss": 0.82800472, + "num_input_tokens_seen": 53029780, + "step": 2443, + "time_per_iteration": 4.076063394546509 + }, + { + "auxiliary_loss_clip": 0.011636, + "auxiliary_loss_mlp": 0.01160853, + "balance_loss_clip": 1.00207341, + "balance_loss_mlp": 1.00092161, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 1.6456136709695168, + "language_loss": 0.83435112, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85759562, + "num_input_tokens_seen": 53048620, + "step": 2444, + "time_per_iteration": 2.5665066242218018 + }, + { + "auxiliary_loss_clip": 0.01146265, + "auxiliary_loss_mlp": 0.01161052, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00121629, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.6121087475433498, + "language_loss": 0.71098447, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73405761, + "num_input_tokens_seen": 53070055, + "step": 2445, + "time_per_iteration": 2.6441054344177246 + }, + { + "auxiliary_loss_clip": 0.01176478, + "auxiliary_loss_mlp": 0.01152683, + "balance_loss_clip": 1.00163782, + "balance_loss_mlp": 1.00009501, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.823436862977031, + "language_loss": 0.63119972, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65449131, + "num_input_tokens_seen": 53126945, + "step": 2446, + "time_per_iteration": 3.0249524116516113 + }, + { + "auxiliary_loss_clip": 0.01117954, + "auxiliary_loss_mlp": 0.01160795, + "balance_loss_clip": 1.00228477, + "balance_loss_mlp": 1.00114965, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.557043584747521, + "language_loss": 0.74841487, + "learning_rate": 3.857821682713975e-06, + "loss": 0.77120233, + "num_input_tokens_seen": 53149130, + "step": 2447, + "time_per_iteration": 2.7619595527648926 + }, + { + "auxiliary_loss_clip": 0.01179367, + "auxiliary_loss_mlp": 0.01160916, + "balance_loss_clip": 1.0021677, + "balance_loss_mlp": 1.0011754, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 1.8731140475598855, + "language_loss": 0.85382915, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87723202, + "num_input_tokens_seen": 53167120, + "step": 2448, + "time_per_iteration": 2.581453800201416 + }, + { + "auxiliary_loss_clip": 0.0117644, + "auxiliary_loss_mlp": 0.01151871, + "balance_loss_clip": 1.00162816, + "balance_loss_mlp": 1.00004661, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7648500056283163, + "language_loss": 0.56878346, + "learning_rate": 3.857533103811195e-06, + "loss": 0.59206659, + "num_input_tokens_seen": 53227945, + "step": 2449, + "time_per_iteration": 3.008134365081787 + }, + { + "auxiliary_loss_clip": 0.01147817, + "auxiliary_loss_mlp": 0.01160435, + "balance_loss_clip": 1.00192392, + "balance_loss_mlp": 1.00107574, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.9537344401279417, + "language_loss": 0.85175723, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87483972, + "num_input_tokens_seen": 53244615, + "step": 2450, + "time_per_iteration": 2.5715103149414062 + }, + { + "auxiliary_loss_clip": 0.01163576, + "auxiliary_loss_mlp": 0.0116072, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00117028, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.0033809487013707, + "language_loss": 0.74623203, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76947498, + "num_input_tokens_seen": 53262205, + "step": 2451, + "time_per_iteration": 2.5315184593200684 + }, + { + "auxiliary_loss_clip": 0.01146938, + "auxiliary_loss_mlp": 0.01160088, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00082445, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6368648046197831, + "language_loss": 0.82218707, + "learning_rate": 3.85709970718691e-06, + "loss": 0.84525728, + "num_input_tokens_seen": 53282445, + "step": 2452, + "time_per_iteration": 2.6575071811676025 + }, + { + "auxiliary_loss_clip": 0.01085962, + "auxiliary_loss_mlp": 0.01160346, + "balance_loss_clip": 1.00233459, + "balance_loss_mlp": 1.00089145, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.5092903649749436, + "language_loss": 0.7437278, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76619095, + "num_input_tokens_seen": 53299060, + "step": 2453, + "time_per_iteration": 2.7231130599975586 + }, + { + "auxiliary_loss_clip": 0.01146309, + "auxiliary_loss_mlp": 0.01160985, + "balance_loss_clip": 1.00184941, + "balance_loss_mlp": 1.00124478, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.2356609251526303, + "language_loss": 0.75674939, + "learning_rate": 3.856810423987889e-06, + "loss": 0.77982223, + "num_input_tokens_seen": 53315970, + "step": 2454, + "time_per_iteration": 2.5755484104156494 + }, + { + "auxiliary_loss_clip": 0.01147026, + "auxiliary_loss_mlp": 0.01160584, + "balance_loss_clip": 1.00189698, + "balance_loss_mlp": 1.00103426, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.1371541568869183, + "language_loss": 0.82753557, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85061169, + "num_input_tokens_seen": 53332940, + "step": 2455, + "time_per_iteration": 2.5708096027374268 + }, + { + "auxiliary_loss_clip": 0.01119395, + "auxiliary_loss_mlp": 0.01160864, + "balance_loss_clip": 1.00210607, + "balance_loss_mlp": 1.00112343, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.184351597012592, + "language_loss": 0.83835757, + "learning_rate": 3.85652085914712e-06, + "loss": 0.86116016, + "num_input_tokens_seen": 53353295, + "step": 2456, + "time_per_iteration": 2.7429018020629883 + }, + { + "auxiliary_loss_clip": 0.01167391, + "auxiliary_loss_mlp": 0.01159777, + "balance_loss_clip": 1.00233364, + "balance_loss_mlp": 1.00070405, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 2.061210324601764, + "language_loss": 0.84494936, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86822104, + "num_input_tokens_seen": 53373410, + "step": 2457, + "time_per_iteration": 2.562596321105957 + }, + { + "auxiliary_loss_clip": 0.01163601, + "auxiliary_loss_mlp": 0.01159923, + "balance_loss_clip": 1.00212979, + "balance_loss_mlp": 1.00085056, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.8558678867812592, + "language_loss": 0.75700909, + "learning_rate": 3.856231012708527e-06, + "loss": 0.78024435, + "num_input_tokens_seen": 53391430, + "step": 2458, + "time_per_iteration": 2.6037192344665527 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.011608, + "balance_loss_clip": 1.001917, + "balance_loss_mlp": 1.00096428, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 1.9698502601826704, + "language_loss": 0.83309293, + "learning_rate": 3.856085983903782e-06, + "loss": 0.85585272, + "num_input_tokens_seen": 53409960, + "step": 2459, + "time_per_iteration": 2.696606397628784 + }, + { + "auxiliary_loss_clip": 0.01146929, + "auxiliary_loss_mlp": 0.01160315, + "balance_loss_clip": 1.00200367, + "balance_loss_mlp": 1.00105143, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.4073963960150664, + "language_loss": 0.75416982, + "learning_rate": 3.855940884716071e-06, + "loss": 0.7772423, + "num_input_tokens_seen": 53426160, + "step": 2460, + "time_per_iteration": 2.5710668563842773 + }, + { + "auxiliary_loss_clip": 0.01129659, + "auxiliary_loss_mlp": 0.01160762, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00092661, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6812246105723625, + "language_loss": 0.81731355, + "learning_rate": 3.855795715150896e-06, + "loss": 0.84021771, + "num_input_tokens_seen": 53448530, + "step": 2461, + "time_per_iteration": 2.693571090698242 + }, + { + "auxiliary_loss_clip": 0.01163658, + "auxiliary_loss_mlp": 0.01160662, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00120759, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 2.3548091824694226, + "language_loss": 0.65905178, + "learning_rate": 3.855650475213761e-06, + "loss": 0.68229496, + "num_input_tokens_seen": 53465915, + "step": 2462, + "time_per_iteration": 2.521380662918091 + }, + { + "auxiliary_loss_clip": 0.01146181, + "auxiliary_loss_mlp": 0.0116062, + "balance_loss_clip": 1.00183308, + "balance_loss_mlp": 1.00107074, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.9991345710092026, + "language_loss": 0.67233992, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69540799, + "num_input_tokens_seen": 53496055, + "step": 2463, + "time_per_iteration": 2.895329475402832 + }, + { + "auxiliary_loss_clip": 0.01163248, + "auxiliary_loss_mlp": 0.01160624, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.00116968, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 1.8105788137353904, + "language_loss": 0.76653725, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78977597, + "num_input_tokens_seen": 53513790, + "step": 2464, + "time_per_iteration": 2.561471939086914 + }, + { + "auxiliary_loss_clip": 0.01151828, + "auxiliary_loss_mlp": 0.01159999, + "balance_loss_clip": 1.00251985, + "balance_loss_mlp": 1.00092638, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.6138833568856605, + "language_loss": 0.79863954, + "learning_rate": 3.855214333225688e-06, + "loss": 0.82175779, + "num_input_tokens_seen": 53533410, + "step": 2465, + "time_per_iteration": 2.631986141204834 + }, + { + "auxiliary_loss_clip": 0.01179403, + "auxiliary_loss_mlp": 0.01160778, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00094175, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 1.543101814320736, + "language_loss": 0.7623533, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78575504, + "num_input_tokens_seen": 53554775, + "step": 2466, + "time_per_iteration": 2.5499680042266846 + }, + { + "auxiliary_loss_clip": 0.01095039, + "auxiliary_loss_mlp": 0.01151796, + "balance_loss_clip": 1.001266, + "balance_loss_mlp": 0.99997121, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.8040844019747424, + "language_loss": 0.60071045, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62317884, + "num_input_tokens_seen": 53609675, + "step": 2467, + "time_per_iteration": 3.487740993499756 + }, + { + "auxiliary_loss_clip": 0.01147828, + "auxiliary_loss_mlp": 0.01160666, + "balance_loss_clip": 1.00195456, + "balance_loss_mlp": 1.00111651, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.330044817281745, + "language_loss": 0.87566549, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89875042, + "num_input_tokens_seen": 53626950, + "step": 2468, + "time_per_iteration": 3.136906862258911 + }, + { + "auxiliary_loss_clip": 0.01134931, + "auxiliary_loss_mlp": 0.01160914, + "balance_loss_clip": 1.00200832, + "balance_loss_mlp": 1.00098228, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 1.869086642523428, + "language_loss": 0.75941861, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78237712, + "num_input_tokens_seen": 53644200, + "step": 2469, + "time_per_iteration": 2.7085494995117188 + }, + { + "auxiliary_loss_clip": 0.01130253, + "auxiliary_loss_mlp": 0.01160345, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.0011766, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 3.1637972699371093, + "language_loss": 0.75496995, + "learning_rate": 3.854486022987603e-06, + "loss": 0.7778759, + "num_input_tokens_seen": 53659650, + "step": 2470, + "time_per_iteration": 2.636593818664551 + }, + { + "auxiliary_loss_clip": 0.01179043, + "auxiliary_loss_mlp": 0.01160322, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00086725, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 2.1109004601673402, + "language_loss": 0.72172058, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74511427, + "num_input_tokens_seen": 53680275, + "step": 2471, + "time_per_iteration": 2.5520882606506348 + }, + { + "auxiliary_loss_clip": 0.01147292, + "auxiliary_loss_mlp": 0.01160976, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00104463, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 1.906736038049347, + "language_loss": 0.89515233, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91823494, + "num_input_tokens_seen": 53698270, + "step": 2472, + "time_per_iteration": 2.6033504009246826 + }, + { + "auxiliary_loss_clip": 0.01131247, + "auxiliary_loss_mlp": 0.01160245, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00088644, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 2.4771634871188315, + "language_loss": 0.80217057, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82508552, + "num_input_tokens_seen": 53716845, + "step": 2473, + "time_per_iteration": 2.6367685794830322 + }, + { + "auxiliary_loss_clip": 0.01162823, + "auxiliary_loss_mlp": 0.01160999, + "balance_loss_clip": 1.00201511, + "balance_loss_mlp": 1.00135374, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.425532608585439, + "language_loss": 0.77440882, + "learning_rate": 3.853902108962709e-06, + "loss": 0.797647, + "num_input_tokens_seen": 53734970, + "step": 2474, + "time_per_iteration": 2.5925068855285645 + }, + { + "auxiliary_loss_clip": 0.01130183, + "auxiliary_loss_mlp": 0.01160934, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00128865, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.8566994313600176, + "language_loss": 0.82245344, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84536463, + "num_input_tokens_seen": 53753415, + "step": 2475, + "time_per_iteration": 2.632139205932617 + }, + { + "auxiliary_loss_clip": 0.01114629, + "auxiliary_loss_mlp": 0.0116047, + "balance_loss_clip": 1.00207698, + "balance_loss_mlp": 1.00111103, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.6989403677605184, + "language_loss": 0.80778277, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83053374, + "num_input_tokens_seen": 53770305, + "step": 2476, + "time_per_iteration": 2.6916613578796387 + }, + { + "auxiliary_loss_clip": 0.01162995, + "auxiliary_loss_mlp": 0.0116006, + "balance_loss_clip": 1.00201178, + "balance_loss_mlp": 1.00098693, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.5714550195233554, + "language_loss": 0.77504528, + "learning_rate": 3.853463435273058e-06, + "loss": 0.79827583, + "num_input_tokens_seen": 53788895, + "step": 2477, + "time_per_iteration": 5.528183460235596 + }, + { + "auxiliary_loss_clip": 0.01145266, + "auxiliary_loss_mlp": 0.01151858, + "balance_loss_clip": 1.00167561, + "balance_loss_mlp": 1.00003326, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.7955753394660501, + "language_loss": 0.60188818, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62485945, + "num_input_tokens_seen": 53850260, + "step": 2478, + "time_per_iteration": 3.210221767425537 + }, + { + "auxiliary_loss_clip": 0.01114959, + "auxiliary_loss_mlp": 0.01160374, + "balance_loss_clip": 1.00194228, + "balance_loss_mlp": 1.00111055, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.401968337059026, + "language_loss": 0.70847124, + "learning_rate": 3.853170634719787e-06, + "loss": 0.7312246, + "num_input_tokens_seen": 53867520, + "step": 2479, + "time_per_iteration": 4.180497884750366 + }, + { + "auxiliary_loss_clip": 0.01145979, + "auxiliary_loss_mlp": 0.01160149, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00107634, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.5358782038058183, + "language_loss": 0.81186712, + "learning_rate": 3.853024129031751e-06, + "loss": 0.83492839, + "num_input_tokens_seen": 53886620, + "step": 2480, + "time_per_iteration": 2.6179871559143066 + }, + { + "auxiliary_loss_clip": 0.01131032, + "auxiliary_loss_mlp": 0.0116048, + "balance_loss_clip": 1.00186074, + "balance_loss_mlp": 1.00131202, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.0137590202741213, + "language_loss": 0.83950353, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86241865, + "num_input_tokens_seen": 53902230, + "step": 2481, + "time_per_iteration": 4.068188428878784 + }, + { + "auxiliary_loss_clip": 0.01162592, + "auxiliary_loss_mlp": 0.01160609, + "balance_loss_clip": 1.0020262, + "balance_loss_mlp": 1.00115514, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 1.8593080485578959, + "language_loss": 0.77250409, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79573607, + "num_input_tokens_seen": 53919475, + "step": 2482, + "time_per_iteration": 2.5772430896759033 + }, + { + "auxiliary_loss_clip": 0.01130499, + "auxiliary_loss_mlp": 0.01160883, + "balance_loss_clip": 1.00190485, + "balance_loss_mlp": 1.00104713, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.180829701373215, + "language_loss": 0.7868073, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80972117, + "num_input_tokens_seen": 53939150, + "step": 2483, + "time_per_iteration": 2.7184109687805176 + }, + { + "auxiliary_loss_clip": 0.01163091, + "auxiliary_loss_mlp": 0.00749032, + "balance_loss_clip": 1.00208485, + "balance_loss_mlp": 1.0008626, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.5189204299761232, + "language_loss": 0.70370507, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72282624, + "num_input_tokens_seen": 53958735, + "step": 2484, + "time_per_iteration": 2.5733578205108643 + }, + { + "auxiliary_loss_clip": 0.01146854, + "auxiliary_loss_mlp": 0.00749144, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.0007515, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 1.8496778451747864, + "language_loss": 0.84452617, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86348617, + "num_input_tokens_seen": 53975065, + "step": 2485, + "time_per_iteration": 2.6273505687713623 + }, + { + "auxiliary_loss_clip": 0.01163737, + "auxiliary_loss_mlp": 0.0116075, + "balance_loss_clip": 1.0021528, + "balance_loss_mlp": 1.00119996, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 1.919210759714938, + "language_loss": 0.8460629, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.86930776, + "num_input_tokens_seen": 53993330, + "step": 2486, + "time_per_iteration": 2.5391011238098145 + }, + { + "auxiliary_loss_clip": 0.01162464, + "auxiliary_loss_mlp": 0.01160129, + "balance_loss_clip": 1.00195456, + "balance_loss_mlp": 1.00115132, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.0288368920333206, + "language_loss": 0.74764144, + "learning_rate": 3.851996622054842e-06, + "loss": 0.77086735, + "num_input_tokens_seen": 54010515, + "step": 2487, + "time_per_iteration": 2.5518174171447754 + }, + { + "auxiliary_loss_clip": 0.01163629, + "auxiliary_loss_mlp": 0.01160154, + "balance_loss_clip": 1.00210309, + "balance_loss_mlp": 1.00108171, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 1.9216053171850451, + "language_loss": 0.71573162, + "learning_rate": 3.8518495543877e-06, + "loss": 0.73896945, + "num_input_tokens_seen": 54031315, + "step": 2488, + "time_per_iteration": 2.668926477432251 + }, + { + "auxiliary_loss_clip": 0.0114604, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00106335, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 3.3395615761198822, + "language_loss": 0.70507157, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72814095, + "num_input_tokens_seen": 54045965, + "step": 2489, + "time_per_iteration": 2.5540475845336914 + }, + { + "auxiliary_loss_clip": 0.01147076, + "auxiliary_loss_mlp": 0.01160877, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00094604, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 2.4399165859533505, + "language_loss": 0.80943596, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83251554, + "num_input_tokens_seen": 54059960, + "step": 2490, + "time_per_iteration": 2.5803935527801514 + }, + { + "auxiliary_loss_clip": 0.01131252, + "auxiliary_loss_mlp": 0.01160863, + "balance_loss_clip": 1.00210142, + "balance_loss_mlp": 1.00112295, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.668132850580101, + "language_loss": 0.79983962, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82276076, + "num_input_tokens_seen": 54079330, + "step": 2491, + "time_per_iteration": 2.775907516479492 + }, + { + "auxiliary_loss_clip": 0.01163721, + "auxiliary_loss_mlp": 0.01160263, + "balance_loss_clip": 1.00210047, + "balance_loss_mlp": 1.00099897, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 1.8988697219862267, + "language_loss": 0.90427011, + "learning_rate": 3.851260581551727e-06, + "loss": 0.92750996, + "num_input_tokens_seen": 54097555, + "step": 2492, + "time_per_iteration": 2.6088852882385254 + }, + { + "auxiliary_loss_clip": 0.01162769, + "auxiliary_loss_mlp": 0.01160911, + "balance_loss_clip": 1.00205517, + "balance_loss_mlp": 1.00136161, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.3494457039511447, + "language_loss": 0.78651595, + "learning_rate": 3.851113162828802e-06, + "loss": 0.8097527, + "num_input_tokens_seen": 54115600, + "step": 2493, + "time_per_iteration": 2.522459030151367 + }, + { + "auxiliary_loss_clip": 0.01162398, + "auxiliary_loss_mlp": 0.01160487, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00084138, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 1.704036436056264, + "language_loss": 0.79981899, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82304788, + "num_input_tokens_seen": 54135220, + "step": 2494, + "time_per_iteration": 2.555748224258423 + }, + { + "auxiliary_loss_clip": 0.01150815, + "auxiliary_loss_mlp": 0.01160411, + "balance_loss_clip": 1.00235105, + "balance_loss_mlp": 1.0010519, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 2.116742293282108, + "language_loss": 0.66200799, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68512022, + "num_input_tokens_seen": 54161065, + "step": 2495, + "time_per_iteration": 2.9865472316741943 + }, + { + "auxiliary_loss_clip": 0.01159933, + "auxiliary_loss_mlp": 0.01151083, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00002122, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.8814826567706089, + "language_loss": 0.59518957, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61829972, + "num_input_tokens_seen": 54225095, + "step": 2496, + "time_per_iteration": 3.1411774158477783 + }, + { + "auxiliary_loss_clip": 0.01179227, + "auxiliary_loss_mlp": 0.01161209, + "balance_loss_clip": 1.0020839, + "balance_loss_mlp": 1.00118303, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 1.9825838360553765, + "language_loss": 0.65651011, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67991453, + "num_input_tokens_seen": 54243750, + "step": 2497, + "time_per_iteration": 2.492001533508301 + }, + { + "auxiliary_loss_clip": 0.01148196, + "auxiliary_loss_mlp": 0.01161126, + "balance_loss_clip": 1.00230229, + "balance_loss_mlp": 1.00148082, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4350073998099466, + "language_loss": 0.75096977, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77406299, + "num_input_tokens_seen": 54266185, + "step": 2498, + "time_per_iteration": 2.6659529209136963 + }, + { + "auxiliary_loss_clip": 0.01129712, + "auxiliary_loss_mlp": 0.01160589, + "balance_loss_clip": 1.00195754, + "balance_loss_mlp": 1.00084853, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.08493753780965, + "language_loss": 0.72461677, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74751979, + "num_input_tokens_seen": 54283940, + "step": 2499, + "time_per_iteration": 2.6307263374328613 + }, + { + "auxiliary_loss_clip": 0.01147106, + "auxiliary_loss_mlp": 0.01161066, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00132537, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.9915673247729864, + "language_loss": 0.71818537, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74126709, + "num_input_tokens_seen": 54304830, + "step": 2500, + "time_per_iteration": 2.674388885498047 + }, + { + "auxiliary_loss_clip": 0.01130794, + "auxiliary_loss_mlp": 0.01160573, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00140464, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 1.812348759264706, + "language_loss": 0.65051687, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67343056, + "num_input_tokens_seen": 54325595, + "step": 2501, + "time_per_iteration": 2.738760471343994 + }, + { + "auxiliary_loss_clip": 0.01146036, + "auxiliary_loss_mlp": 0.01160355, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00128174, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 1.9917514755429326, + "language_loss": 0.83159292, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85465682, + "num_input_tokens_seen": 54342180, + "step": 2502, + "time_per_iteration": 2.6413917541503906 + }, + { + "auxiliary_loss_clip": 0.0113012, + "auxiliary_loss_mlp": 0.01160449, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00128126, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 1.8967918426369632, + "language_loss": 0.76957631, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79248196, + "num_input_tokens_seen": 54360255, + "step": 2503, + "time_per_iteration": 2.6236038208007812 + }, + { + "auxiliary_loss_clip": 0.01179003, + "auxiliary_loss_mlp": 0.01160035, + "balance_loss_clip": 1.00211573, + "balance_loss_mlp": 1.00086725, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 1.9185984860685155, + "language_loss": 0.85405886, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87744915, + "num_input_tokens_seen": 54378260, + "step": 2504, + "time_per_iteration": 2.526231050491333 + }, + { + "auxiliary_loss_clip": 0.01162272, + "auxiliary_loss_mlp": 0.01160133, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00106001, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.65754243989342, + "language_loss": 0.83121812, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85444218, + "num_input_tokens_seen": 54399745, + "step": 2505, + "time_per_iteration": 2.5754902362823486 + }, + { + "auxiliary_loss_clip": 0.01131409, + "auxiliary_loss_mlp": 0.01160117, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.00094843, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 1.923048997247959, + "language_loss": 0.76600873, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78892398, + "num_input_tokens_seen": 54417105, + "step": 2506, + "time_per_iteration": 2.610800266265869 + }, + { + "auxiliary_loss_clip": 0.01179303, + "auxiliary_loss_mlp": 0.01160327, + "balance_loss_clip": 1.0021379, + "balance_loss_mlp": 1.00087309, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 5.25319795199477, + "language_loss": 0.75995946, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78335571, + "num_input_tokens_seen": 54433920, + "step": 2507, + "time_per_iteration": 2.514761447906494 + }, + { + "auxiliary_loss_clip": 0.01162443, + "auxiliary_loss_mlp": 0.0116015, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00117254, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.1825846591205207, + "language_loss": 0.69315332, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71637928, + "num_input_tokens_seen": 54451540, + "step": 2508, + "time_per_iteration": 2.592609405517578 + }, + { + "auxiliary_loss_clip": 0.01147153, + "auxiliary_loss_mlp": 0.01160727, + "balance_loss_clip": 1.00219381, + "balance_loss_mlp": 1.00117767, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 2.0881057276554946, + "language_loss": 0.77752376, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.80060261, + "num_input_tokens_seen": 54470800, + "step": 2509, + "time_per_iteration": 2.596858501434326 + }, + { + "auxiliary_loss_clip": 0.01146916, + "auxiliary_loss_mlp": 0.00749093, + "balance_loss_clip": 1.00203228, + "balance_loss_mlp": 1.00071323, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.2335823301404565, + "language_loss": 0.79903901, + "learning_rate": 3.848596309368246e-06, + "loss": 0.81799906, + "num_input_tokens_seen": 54486525, + "step": 2510, + "time_per_iteration": 2.6211659908294678 + }, + { + "auxiliary_loss_clip": 0.01162593, + "auxiliary_loss_mlp": 0.0116062, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00107074, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 1.7815261409896215, + "language_loss": 0.73426652, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.75749862, + "num_input_tokens_seen": 54503795, + "step": 2511, + "time_per_iteration": 2.5394387245178223 + }, + { + "auxiliary_loss_clip": 0.01098054, + "auxiliary_loss_mlp": 0.01160332, + "balance_loss_clip": 1.00180638, + "balance_loss_mlp": 1.00097322, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 1.9444655610487835, + "language_loss": 0.69435734, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71694118, + "num_input_tokens_seen": 54523025, + "step": 2512, + "time_per_iteration": 2.757922649383545 + }, + { + "auxiliary_loss_clip": 0.01163412, + "auxiliary_loss_mlp": 0.0116047, + "balance_loss_clip": 1.0021472, + "balance_loss_mlp": 1.00101614, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 2.06690364392115, + "language_loss": 0.73819739, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76143628, + "num_input_tokens_seen": 54545025, + "step": 2513, + "time_per_iteration": 2.6457746028900146 + }, + { + "auxiliary_loss_clip": 0.01096185, + "auxiliary_loss_mlp": 0.01151804, + "balance_loss_clip": 1.00153196, + "balance_loss_mlp": 0.9999795, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8679138286651709, + "language_loss": 0.6482901, + "learning_rate": 3.84800116337411e-06, + "loss": 0.67076999, + "num_input_tokens_seen": 54604545, + "step": 2514, + "time_per_iteration": 3.264509439468384 + }, + { + "auxiliary_loss_clip": 0.01162351, + "auxiliary_loss_mlp": 0.01160461, + "balance_loss_clip": 1.00208271, + "balance_loss_mlp": 1.0010066, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 3.8430800513169965, + "language_loss": 0.73193502, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75516319, + "num_input_tokens_seen": 54620590, + "step": 2515, + "time_per_iteration": 4.034433841705322 + }, + { + "auxiliary_loss_clip": 0.01146921, + "auxiliary_loss_mlp": 0.0116023, + "balance_loss_clip": 1.00195193, + "balance_loss_mlp": 1.00125265, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 2.132956328659328, + "language_loss": 0.77619225, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79926378, + "num_input_tokens_seen": 54640410, + "step": 2516, + "time_per_iteration": 5.811364412307739 + }, + { + "auxiliary_loss_clip": 0.01160707, + "auxiliary_loss_mlp": 0.01151795, + "balance_loss_clip": 1.00176144, + "balance_loss_mlp": 0.9999699, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7206020294657398, + "language_loss": 0.54610765, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.5692327, + "num_input_tokens_seen": 54701430, + "step": 2517, + "time_per_iteration": 3.164226531982422 + }, + { + "auxiliary_loss_clip": 0.01131876, + "auxiliary_loss_mlp": 0.01160302, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00103891, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 1.7542052096652636, + "language_loss": 0.78708166, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.8100034, + "num_input_tokens_seen": 54720845, + "step": 2518, + "time_per_iteration": 2.648865222930908 + }, + { + "auxiliary_loss_clip": 0.0116369, + "auxiliary_loss_mlp": 0.01160548, + "balance_loss_clip": 1.00214124, + "balance_loss_mlp": 1.00099802, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.3557563064825384, + "language_loss": 0.70617878, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72942114, + "num_input_tokens_seen": 54740495, + "step": 2519, + "time_per_iteration": 4.044745445251465 + }, + { + "auxiliary_loss_clip": 0.01162605, + "auxiliary_loss_mlp": 0.01160311, + "balance_loss_clip": 1.00196183, + "balance_loss_mlp": 1.00095177, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 2.174439181504825, + "language_loss": 0.78810918, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81133837, + "num_input_tokens_seen": 54758415, + "step": 2520, + "time_per_iteration": 2.553826093673706 + }, + { + "auxiliary_loss_clip": 0.01147023, + "auxiliary_loss_mlp": 0.01160796, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00124598, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 1.6594691144290503, + "language_loss": 0.74631882, + "learning_rate": 3.846956960161114e-06, + "loss": 0.76939702, + "num_input_tokens_seen": 54779355, + "step": 2521, + "time_per_iteration": 2.645254611968994 + }, + { + "auxiliary_loss_clip": 0.01146932, + "auxiliary_loss_mlp": 0.01160786, + "balance_loss_clip": 1.00197303, + "balance_loss_mlp": 1.00123668, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 1.9080112253538015, + "language_loss": 0.81766993, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84074712, + "num_input_tokens_seen": 54799465, + "step": 2522, + "time_per_iteration": 2.611036777496338 + }, + { + "auxiliary_loss_clip": 0.01110382, + "auxiliary_loss_mlp": 0.01151035, + "balance_loss_clip": 1.00099909, + "balance_loss_mlp": 0.99997294, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.8169287354811233, + "language_loss": 0.57921731, + "learning_rate": 3.846657985969922e-06, + "loss": 0.60183144, + "num_input_tokens_seen": 54857665, + "step": 2523, + "time_per_iteration": 3.1983985900878906 + }, + { + "auxiliary_loss_clip": 0.01163119, + "auxiliary_loss_mlp": 0.01160109, + "balance_loss_clip": 1.00204027, + "balance_loss_mlp": 1.00103593, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.5950089730399681, + "language_loss": 0.74781692, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.7710492, + "num_input_tokens_seen": 54879895, + "step": 2524, + "time_per_iteration": 2.6434099674224854 + }, + { + "auxiliary_loss_clip": 0.01147108, + "auxiliary_loss_mlp": 0.01160494, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00103927, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.6227715032593921, + "language_loss": 0.75003284, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.77310884, + "num_input_tokens_seen": 54898245, + "step": 2525, + "time_per_iteration": 2.5872042179107666 + }, + { + "auxiliary_loss_clip": 0.01163471, + "auxiliary_loss_mlp": 0.01160366, + "balance_loss_clip": 1.00217295, + "balance_loss_mlp": 1.00091195, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.927297714258138, + "language_loss": 0.80087835, + "learning_rate": 3.846208999506402e-06, + "loss": 0.82411671, + "num_input_tokens_seen": 54917060, + "step": 2526, + "time_per_iteration": 2.563417434692383 + }, + { + "auxiliary_loss_clip": 0.01146279, + "auxiliary_loss_mlp": 0.01160166, + "balance_loss_clip": 1.00189745, + "balance_loss_mlp": 1.00109339, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.670844645493059, + "language_loss": 0.85106325, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87412763, + "num_input_tokens_seen": 54936365, + "step": 2527, + "time_per_iteration": 2.6164450645446777 + }, + { + "auxiliary_loss_clip": 0.01146767, + "auxiliary_loss_mlp": 0.01160506, + "balance_loss_clip": 1.00208938, + "balance_loss_mlp": 1.00095582, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 1.8931646301520033, + "language_loss": 0.68963444, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71270716, + "num_input_tokens_seen": 54961365, + "step": 2528, + "time_per_iteration": 2.7753748893737793 + }, + { + "auxiliary_loss_clip": 0.01147427, + "auxiliary_loss_mlp": 0.0116069, + "balance_loss_clip": 1.00215948, + "balance_loss_mlp": 1.00123596, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 1.9389168155642926, + "language_loss": 0.8719995, + "learning_rate": 3.845759382967026e-06, + "loss": 0.89508069, + "num_input_tokens_seen": 54980750, + "step": 2529, + "time_per_iteration": 2.608715295791626 + }, + { + "auxiliary_loss_clip": 0.01145675, + "auxiliary_loss_mlp": 0.01160131, + "balance_loss_clip": 1.00178492, + "balance_loss_mlp": 1.00096297, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.7327615121607962, + "language_loss": 0.83728045, + "learning_rate": 3.845609370796893e-06, + "loss": 0.86033851, + "num_input_tokens_seen": 54999675, + "step": 2530, + "time_per_iteration": 2.6529364585876465 + }, + { + "auxiliary_loss_clip": 0.01146996, + "auxiliary_loss_mlp": 0.01160313, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00114512, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 1.69573334236041, + "language_loss": 0.79990029, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82297337, + "num_input_tokens_seen": 55018295, + "step": 2531, + "time_per_iteration": 2.5975654125213623 + }, + { + "auxiliary_loss_clip": 0.01163409, + "auxiliary_loss_mlp": 0.01160158, + "balance_loss_clip": 1.00210547, + "balance_loss_mlp": 1.00098968, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 1.790424083394276, + "language_loss": 0.7895062, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81274188, + "num_input_tokens_seen": 55037975, + "step": 2532, + "time_per_iteration": 2.611640214920044 + }, + { + "auxiliary_loss_clip": 0.01162937, + "auxiliary_loss_mlp": 0.01160419, + "balance_loss_clip": 1.00206327, + "balance_loss_mlp": 1.00105977, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.7660531498618974, + "language_loss": 0.87172484, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89495838, + "num_input_tokens_seen": 55057135, + "step": 2533, + "time_per_iteration": 2.5818281173706055 + }, + { + "auxiliary_loss_clip": 0.01119335, + "auxiliary_loss_mlp": 0.01159922, + "balance_loss_clip": 1.00237298, + "balance_loss_mlp": 1.00103974, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.0056320071066094, + "language_loss": 0.78859413, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81138676, + "num_input_tokens_seen": 55075525, + "step": 2534, + "time_per_iteration": 2.6712427139282227 + }, + { + "auxiliary_loss_clip": 0.01179014, + "auxiliary_loss_mlp": 0.01160447, + "balance_loss_clip": 1.00209081, + "balance_loss_mlp": 1.00099301, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.2193850275567, + "language_loss": 0.77043802, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79383266, + "num_input_tokens_seen": 55090845, + "step": 2535, + "time_per_iteration": 2.491711139678955 + }, + { + "auxiliary_loss_clip": 0.01151681, + "auxiliary_loss_mlp": 0.01160566, + "balance_loss_clip": 1.00238407, + "balance_loss_mlp": 1.00092125, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 1.903910822197129, + "language_loss": 0.78350449, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80662704, + "num_input_tokens_seen": 55108750, + "step": 2536, + "time_per_iteration": 2.590712308883667 + }, + { + "auxiliary_loss_clip": 0.01147386, + "auxiliary_loss_mlp": 0.01161143, + "balance_loss_clip": 1.00219834, + "balance_loss_mlp": 1.00168908, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 1.9694780227671016, + "language_loss": 0.76135814, + "learning_rate": 3.844557326325461e-06, + "loss": 0.7844435, + "num_input_tokens_seen": 55126750, + "step": 2537, + "time_per_iteration": 2.5826215744018555 + }, + { + "auxiliary_loss_clip": 0.011624, + "auxiliary_loss_mlp": 0.01160752, + "balance_loss_clip": 1.00199854, + "balance_loss_mlp": 1.00129819, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.5611530862007745, + "language_loss": 0.77475345, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79798496, + "num_input_tokens_seen": 55144690, + "step": 2538, + "time_per_iteration": 2.520307779312134 + }, + { + "auxiliary_loss_clip": 0.01114182, + "auxiliary_loss_mlp": 0.01159545, + "balance_loss_clip": 1.0017134, + "balance_loss_mlp": 1.00094926, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 4.398794262276793, + "language_loss": 0.89832997, + "learning_rate": 3.844256112593029e-06, + "loss": 0.92106724, + "num_input_tokens_seen": 55166055, + "step": 2539, + "time_per_iteration": 2.737549066543579 + }, + { + "auxiliary_loss_clip": 0.01162494, + "auxiliary_loss_mlp": 0.01160337, + "balance_loss_clip": 1.00193417, + "balance_loss_mlp": 1.00097823, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 1.8378779111942816, + "language_loss": 0.93332946, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95655781, + "num_input_tokens_seen": 55186285, + "step": 2540, + "time_per_iteration": 2.6102232933044434 + }, + { + "auxiliary_loss_clip": 0.01146404, + "auxiliary_loss_mlp": 0.01160246, + "balance_loss_clip": 1.00193393, + "balance_loss_mlp": 1.00117314, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.6542463452085185, + "language_loss": 0.75030339, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77336991, + "num_input_tokens_seen": 55207915, + "step": 2541, + "time_per_iteration": 2.7050716876983643 + }, + { + "auxiliary_loss_clip": 0.01133559, + "auxiliary_loss_mlp": 0.01160073, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00128615, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.5671384169478726, + "language_loss": 0.81137133, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83430767, + "num_input_tokens_seen": 55227860, + "step": 2542, + "time_per_iteration": 2.640001058578491 + }, + { + "auxiliary_loss_clip": 0.01178984, + "auxiliary_loss_mlp": 0.01160333, + "balance_loss_clip": 1.0021466, + "balance_loss_mlp": 1.00106931, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.4754611492746355, + "language_loss": 0.77464986, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79804301, + "num_input_tokens_seen": 55247330, + "step": 2543, + "time_per_iteration": 2.572479009628296 + }, + { + "auxiliary_loss_clip": 0.01162315, + "auxiliary_loss_mlp": 0.01159956, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00107408, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.047455646347454, + "language_loss": 0.86160004, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88482273, + "num_input_tokens_seen": 55266195, + "step": 2544, + "time_per_iteration": 2.583655834197998 + }, + { + "auxiliary_loss_clip": 0.0116247, + "auxiliary_loss_mlp": 0.01160948, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00130308, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.0594639594845034, + "language_loss": 0.82829809, + "learning_rate": 3.843350793153673e-06, + "loss": 0.85153228, + "num_input_tokens_seen": 55283305, + "step": 2545, + "time_per_iteration": 2.5578250885009766 + }, + { + "auxiliary_loss_clip": 0.0117903, + "auxiliary_loss_mlp": 0.01160531, + "balance_loss_clip": 1.00218654, + "balance_loss_mlp": 1.0011723, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.2074682950608455, + "language_loss": 0.71285337, + "learning_rate": 3.843199661896884e-06, + "loss": 0.73624909, + "num_input_tokens_seen": 55303035, + "step": 2546, + "time_per_iteration": 2.5847740173339844 + }, + { + "auxiliary_loss_clip": 0.01147409, + "auxiliary_loss_mlp": 0.01160684, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.00103855, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.8353380252638882, + "language_loss": 0.7737757, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79685658, + "num_input_tokens_seen": 55327570, + "step": 2547, + "time_per_iteration": 2.812929630279541 + }, + { + "auxiliary_loss_clip": 0.01118521, + "auxiliary_loss_mlp": 0.01160239, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00097573, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.18833151891855, + "language_loss": 0.74635172, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76913929, + "num_input_tokens_seen": 55351090, + "step": 2548, + "time_per_iteration": 2.7882771492004395 + }, + { + "auxiliary_loss_clip": 0.01145894, + "auxiliary_loss_mlp": 0.01159969, + "balance_loss_clip": 1.0017935, + "balance_loss_mlp": 1.00108659, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.4142488847485672, + "language_loss": 0.80871266, + "learning_rate": 3.842745848783558e-06, + "loss": 0.83177131, + "num_input_tokens_seen": 55371050, + "step": 2549, + "time_per_iteration": 2.6332030296325684 + }, + { + "auxiliary_loss_clip": 0.01162248, + "auxiliary_loss_mlp": 0.01160198, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.0011251, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.531077414413715, + "language_loss": 0.74698275, + "learning_rate": 3.842594437983917e-06, + "loss": 0.77020723, + "num_input_tokens_seen": 55390375, + "step": 2550, + "time_per_iteration": 2.5284643173217773 + }, + { + "auxiliary_loss_clip": 0.01167186, + "auxiliary_loss_mlp": 0.01160166, + "balance_loss_clip": 1.00223303, + "balance_loss_mlp": 1.00090218, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.2846399420650765, + "language_loss": 0.77097666, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79425019, + "num_input_tokens_seen": 55408890, + "step": 2551, + "time_per_iteration": 2.5543034076690674 + }, + { + "auxiliary_loss_clip": 0.01160531, + "auxiliary_loss_mlp": 0.01151061, + "balance_loss_clip": 1.00168371, + "balance_loss_mlp": 0.99999875, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9486371273392923, + "language_loss": 0.56673741, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58985341, + "num_input_tokens_seen": 55463815, + "step": 2552, + "time_per_iteration": 3.0772206783294678 + }, + { + "auxiliary_loss_clip": 0.01117115, + "auxiliary_loss_mlp": 0.011601, + "balance_loss_clip": 1.00184786, + "balance_loss_mlp": 1.00102711, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 2.0761897868808257, + "language_loss": 0.88666117, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90943336, + "num_input_tokens_seen": 55481050, + "step": 2553, + "time_per_iteration": 4.145695209503174 + }, + { + "auxiliary_loss_clip": 0.01162415, + "auxiliary_loss_mlp": 0.01160605, + "balance_loss_clip": 1.00201654, + "balance_loss_mlp": 1.00134087, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.535496467809863, + "language_loss": 0.78110874, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80433893, + "num_input_tokens_seen": 55500050, + "step": 2554, + "time_per_iteration": 3.958388090133667 + }, + { + "auxiliary_loss_clip": 0.01098212, + "auxiliary_loss_mlp": 0.01160145, + "balance_loss_clip": 1.00181413, + "balance_loss_mlp": 1.00107217, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.5228008123347387, + "language_loss": 0.78277385, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80535746, + "num_input_tokens_seen": 55518125, + "step": 2555, + "time_per_iteration": 2.687955141067505 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.01159796, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00119996, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.4248611100678468, + "language_loss": 0.76979858, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79270065, + "num_input_tokens_seen": 55540960, + "step": 2556, + "time_per_iteration": 4.111367225646973 + }, + { + "auxiliary_loss_clip": 0.01162788, + "auxiliary_loss_mlp": 0.00749141, + "balance_loss_clip": 1.00212467, + "balance_loss_mlp": 1.00091314, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.791002028219827, + "language_loss": 0.898844, + "learning_rate": 3.84153260631005e-06, + "loss": 0.91796327, + "num_input_tokens_seen": 55559210, + "step": 2557, + "time_per_iteration": 2.5683798789978027 + }, + { + "auxiliary_loss_clip": 0.01150486, + "auxiliary_loss_mlp": 0.01160067, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00108957, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 3.4902323248236615, + "language_loss": 0.70852685, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73163241, + "num_input_tokens_seen": 55578925, + "step": 2558, + "time_per_iteration": 2.632453441619873 + }, + { + "auxiliary_loss_clip": 0.01145801, + "auxiliary_loss_mlp": 0.0116014, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00097156, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 1.9002381955205592, + "language_loss": 0.91845179, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94151115, + "num_input_tokens_seen": 55597255, + "step": 2559, + "time_per_iteration": 2.5759172439575195 + }, + { + "auxiliary_loss_clip": 0.01146543, + "auxiliary_loss_mlp": 0.01161052, + "balance_loss_clip": 1.0021528, + "balance_loss_mlp": 1.00121582, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 2.379891052385847, + "language_loss": 0.63334632, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65642226, + "num_input_tokens_seen": 55619515, + "step": 2560, + "time_per_iteration": 2.6553103923797607 + }, + { + "auxiliary_loss_clip": 0.01145684, + "auxiliary_loss_mlp": 0.01160335, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00107193, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.6851198870416932, + "language_loss": 0.87932837, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90238857, + "num_input_tokens_seen": 55640050, + "step": 2561, + "time_per_iteration": 2.6189608573913574 + }, + { + "auxiliary_loss_clip": 0.01162129, + "auxiliary_loss_mlp": 0.01159405, + "balance_loss_clip": 1.00210023, + "balance_loss_mlp": 1.00099945, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.6043873990868103, + "language_loss": 0.83151805, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85473335, + "num_input_tokens_seen": 55658695, + "step": 2562, + "time_per_iteration": 2.5454413890838623 + }, + { + "auxiliary_loss_clip": 0.01147095, + "auxiliary_loss_mlp": 0.00749238, + "balance_loss_clip": 1.00219655, + "balance_loss_mlp": 1.00095606, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.8824905613304412, + "language_loss": 0.74574685, + "learning_rate": 3.840619741387832e-06, + "loss": 0.76471019, + "num_input_tokens_seen": 55676340, + "step": 2563, + "time_per_iteration": 2.565671682357788 + }, + { + "auxiliary_loss_clip": 0.01113456, + "auxiliary_loss_mlp": 0.01160007, + "balance_loss_clip": 1.00180352, + "balance_loss_mlp": 1.00083899, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 1.9866021594193115, + "language_loss": 0.75854182, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78127646, + "num_input_tokens_seen": 55698890, + "step": 2564, + "time_per_iteration": 2.7633767127990723 + }, + { + "auxiliary_loss_clip": 0.01146264, + "auxiliary_loss_mlp": 0.01160121, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.0012387, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.7124975011807826, + "language_loss": 0.70535278, + "learning_rate": 3.840314894646969e-06, + "loss": 0.72841662, + "num_input_tokens_seen": 55718535, + "step": 2565, + "time_per_iteration": 2.662217378616333 + }, + { + "auxiliary_loss_clip": 0.01163082, + "auxiliary_loss_mlp": 0.01160189, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00111651, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 1.9182918129503344, + "language_loss": 0.71476471, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73799741, + "num_input_tokens_seen": 55738970, + "step": 2566, + "time_per_iteration": 2.6024580001831055 + }, + { + "auxiliary_loss_clip": 0.01178659, + "auxiliary_loss_mlp": 0.01159936, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00114954, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.6159346033876596, + "language_loss": 0.84880829, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87219429, + "num_input_tokens_seen": 55759585, + "step": 2567, + "time_per_iteration": 2.5298612117767334 + }, + { + "auxiliary_loss_clip": 0.01131411, + "auxiliary_loss_mlp": 0.01159897, + "balance_loss_clip": 1.00211, + "balance_loss_mlp": 1.00111043, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 1.7640411126274316, + "language_loss": 0.77987951, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80279261, + "num_input_tokens_seen": 55779250, + "step": 2568, + "time_per_iteration": 2.6784372329711914 + }, + { + "auxiliary_loss_clip": 0.01146978, + "auxiliary_loss_mlp": 0.01159796, + "balance_loss_clip": 1.00220823, + "balance_loss_mlp": 1.0009141, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 18.578636415107336, + "language_loss": 0.70260048, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72566831, + "num_input_tokens_seen": 55800470, + "step": 2569, + "time_per_iteration": 2.6108558177948 + }, + { + "auxiliary_loss_clip": 0.01145495, + "auxiliary_loss_mlp": 0.01159992, + "balance_loss_clip": 1.00189984, + "balance_loss_mlp": 1.00101483, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.8498548724667814, + "language_loss": 0.76863611, + "learning_rate": 3.839551556659884e-06, + "loss": 0.79169095, + "num_input_tokens_seen": 55817795, + "step": 2570, + "time_per_iteration": 2.5971407890319824 + }, + { + "auxiliary_loss_clip": 0.01162254, + "auxiliary_loss_mlp": 0.01160177, + "balance_loss_clip": 1.00209451, + "balance_loss_mlp": 1.00091326, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 2.056536877221583, + "language_loss": 0.773785, + "learning_rate": 3.839398679771359e-06, + "loss": 0.79700935, + "num_input_tokens_seen": 55836125, + "step": 2571, + "time_per_iteration": 2.5369701385498047 + }, + { + "auxiliary_loss_clip": 0.01150263, + "auxiliary_loss_mlp": 0.0115995, + "balance_loss_clip": 1.00210571, + "balance_loss_mlp": 1.0008775, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 1.8433813908357157, + "language_loss": 0.82131445, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84441662, + "num_input_tokens_seen": 55855280, + "step": 2572, + "time_per_iteration": 2.638321876525879 + }, + { + "auxiliary_loss_clip": 0.0117886, + "auxiliary_loss_mlp": 0.01160489, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00122523, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.5836345600953137, + "language_loss": 0.90437156, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92776501, + "num_input_tokens_seen": 55875695, + "step": 2573, + "time_per_iteration": 2.51229190826416 + }, + { + "auxiliary_loss_clip": 0.01114626, + "auxiliary_loss_mlp": 0.01159868, + "balance_loss_clip": 1.00187004, + "balance_loss_mlp": 1.00108135, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.6892904128499502, + "language_loss": 0.69957399, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72231895, + "num_input_tokens_seen": 55894575, + "step": 2574, + "time_per_iteration": 2.622622489929199 + }, + { + "auxiliary_loss_clip": 0.01146486, + "auxiliary_loss_mlp": 0.01160221, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00114775, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.645413756670183, + "language_loss": 0.8249653, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84803236, + "num_input_tokens_seen": 55912855, + "step": 2575, + "time_per_iteration": 2.6182830333709717 + }, + { + "auxiliary_loss_clip": 0.01145398, + "auxiliary_loss_mlp": 0.01159759, + "balance_loss_clip": 1.00179076, + "balance_loss_mlp": 1.00097203, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 1.7175106147648846, + "language_loss": 0.84771252, + "learning_rate": 3.838633249192036e-06, + "loss": 0.87076402, + "num_input_tokens_seen": 55932375, + "step": 2576, + "time_per_iteration": 2.6764278411865234 + }, + { + "auxiliary_loss_clip": 0.01178672, + "auxiliary_loss_mlp": 0.01159734, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00094748, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 2.193023086475751, + "language_loss": 0.81790525, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84128928, + "num_input_tokens_seen": 55953970, + "step": 2577, + "time_per_iteration": 2.5720131397247314 + }, + { + "auxiliary_loss_clip": 0.01129676, + "auxiliary_loss_mlp": 0.01160271, + "balance_loss_clip": 1.00201714, + "balance_loss_mlp": 1.00110316, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.138949950290415, + "language_loss": 0.76412749, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.787027, + "num_input_tokens_seen": 55973120, + "step": 2578, + "time_per_iteration": 2.692513942718506 + }, + { + "auxiliary_loss_clip": 0.01130554, + "auxiliary_loss_mlp": 0.01159693, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00100207, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 1.8430295540843378, + "language_loss": 0.82920754, + "learning_rate": 3.83817315414411e-06, + "loss": 0.85211003, + "num_input_tokens_seen": 55993260, + "step": 2579, + "time_per_iteration": 2.629488229751587 + }, + { + "auxiliary_loss_clip": 0.01146648, + "auxiliary_loss_mlp": 0.01159962, + "balance_loss_clip": 1.00212312, + "balance_loss_mlp": 1.00117564, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.5691224349342339, + "language_loss": 0.8077147, + "learning_rate": 3.838019649712958e-06, + "loss": 0.8307808, + "num_input_tokens_seen": 56012130, + "step": 2580, + "time_per_iteration": 2.601409435272217 + }, + { + "auxiliary_loss_clip": 0.01159065, + "auxiliary_loss_mlp": 0.01151054, + "balance_loss_clip": 1.00140595, + "balance_loss_mlp": 0.99999249, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.852026651303336, + "language_loss": 0.5884788, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.61158001, + "num_input_tokens_seen": 56079045, + "step": 2581, + "time_per_iteration": 3.253737449645996 + }, + { + "auxiliary_loss_clip": 0.01130797, + "auxiliary_loss_mlp": 0.01159726, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00093925, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.86814491849248, + "language_loss": 0.85524011, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87814534, + "num_input_tokens_seen": 56098745, + "step": 2582, + "time_per_iteration": 2.6528127193450928 + }, + { + "auxiliary_loss_clip": 0.01162234, + "auxiliary_loss_mlp": 0.01160511, + "balance_loss_clip": 1.00204241, + "balance_loss_mlp": 1.00143802, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 3.9841550164705346, + "language_loss": 0.78716916, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.81039661, + "num_input_tokens_seen": 56117655, + "step": 2583, + "time_per_iteration": 2.6035561561584473 + }, + { + "auxiliary_loss_clip": 0.01163016, + "auxiliary_loss_mlp": 0.01160039, + "balance_loss_clip": 1.00199771, + "balance_loss_mlp": 1.00115728, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.487949350225995, + "language_loss": 0.76255411, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78578466, + "num_input_tokens_seen": 56141960, + "step": 2584, + "time_per_iteration": 2.662303924560547 + }, + { + "auxiliary_loss_clip": 0.01163197, + "auxiliary_loss_mlp": 0.01159706, + "balance_loss_clip": 1.00213909, + "balance_loss_mlp": 1.00082386, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 1.7045015756593258, + "language_loss": 0.75708103, + "learning_rate": 3.837251082205368e-06, + "loss": 0.78031003, + "num_input_tokens_seen": 56161430, + "step": 2585, + "time_per_iteration": 2.5822877883911133 + }, + { + "auxiliary_loss_clip": 0.01128819, + "auxiliary_loss_mlp": 0.01159773, + "balance_loss_clip": 1.00172007, + "balance_loss_mlp": 1.00089049, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 1.7875664539287837, + "language_loss": 0.61610067, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63898659, + "num_input_tokens_seen": 56179390, + "step": 2586, + "time_per_iteration": 2.627779245376587 + }, + { + "auxiliary_loss_clip": 0.01147148, + "auxiliary_loss_mlp": 0.01160033, + "balance_loss_clip": 1.00191534, + "balance_loss_mlp": 1.00105596, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.6499345504523972, + "language_loss": 0.8115412, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83461303, + "num_input_tokens_seen": 56198020, + "step": 2587, + "time_per_iteration": 2.605316162109375 + }, + { + "auxiliary_loss_clip": 0.01178839, + "auxiliary_loss_mlp": 0.01160488, + "balance_loss_clip": 1.00204897, + "balance_loss_mlp": 1.00112939, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.7246165892200163, + "language_loss": 0.88437158, + "learning_rate": 3.836789105629236e-06, + "loss": 0.90776479, + "num_input_tokens_seen": 56218165, + "step": 2588, + "time_per_iteration": 2.5645272731781006 + }, + { + "auxiliary_loss_clip": 0.01114558, + "auxiliary_loss_mlp": 0.01159975, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.0011878, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.822140351136954, + "language_loss": 0.64805508, + "learning_rate": 3.83663497412695e-06, + "loss": 0.67080045, + "num_input_tokens_seen": 56237160, + "step": 2589, + "time_per_iteration": 2.690765619277954 + }, + { + "auxiliary_loss_clip": 0.01131103, + "auxiliary_loss_mlp": 0.01160032, + "balance_loss_clip": 1.00206101, + "balance_loss_mlp": 1.001055, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.8269303536232775, + "language_loss": 0.82837212, + "learning_rate": 3.836480772979281e-06, + "loss": 0.85128349, + "num_input_tokens_seen": 56257610, + "step": 2590, + "time_per_iteration": 4.161813020706177 + }, + { + "auxiliary_loss_clip": 0.01129922, + "auxiliary_loss_mlp": 0.011598, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00110841, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 1.9478284146168925, + "language_loss": 0.79249954, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81539673, + "num_input_tokens_seen": 56275215, + "step": 2591, + "time_per_iteration": 4.187891006469727 + }, + { + "auxiliary_loss_clip": 0.01162161, + "auxiliary_loss_mlp": 0.01159914, + "balance_loss_clip": 1.00195062, + "balance_loss_mlp": 1.00122297, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 2.6025886275278856, + "language_loss": 0.64476264, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66798341, + "num_input_tokens_seen": 56297130, + "step": 2592, + "time_per_iteration": 4.082520246505737 + }, + { + "auxiliary_loss_clip": 0.01145771, + "auxiliary_loss_mlp": 0.01160412, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00105321, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 1.8377995206645916, + "language_loss": 0.81962806, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84268987, + "num_input_tokens_seen": 56314995, + "step": 2593, + "time_per_iteration": 2.5728776454925537 + }, + { + "auxiliary_loss_clip": 0.01161999, + "auxiliary_loss_mlp": 0.01159762, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00107062, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.2503595314293294, + "language_loss": 0.73304713, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.75626481, + "num_input_tokens_seen": 56334005, + "step": 2594, + "time_per_iteration": 3.993400812149048 + }, + { + "auxiliary_loss_clip": 0.01147094, + "auxiliary_loss_mlp": 0.01159509, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00091267, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 2.946609948969393, + "language_loss": 0.81442356, + "learning_rate": 3.835708722764952e-06, + "loss": 0.8374896, + "num_input_tokens_seen": 56353795, + "step": 2595, + "time_per_iteration": 2.6294636726379395 + }, + { + "auxiliary_loss_clip": 0.01178729, + "auxiliary_loss_mlp": 0.01159542, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.0009464, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 1.8426352318582826, + "language_loss": 0.86641204, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88979471, + "num_input_tokens_seen": 56373195, + "step": 2596, + "time_per_iteration": 2.5001399517059326 + }, + { + "auxiliary_loss_clip": 0.011626, + "auxiliary_loss_mlp": 0.01159075, + "balance_loss_clip": 1.00204825, + "balance_loss_mlp": 1.0011462, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.8405815349618515, + "language_loss": 0.68264067, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70585746, + "num_input_tokens_seen": 56391525, + "step": 2597, + "time_per_iteration": 2.5604135990142822 + }, + { + "auxiliary_loss_clip": 0.01145935, + "auxiliary_loss_mlp": 0.01159627, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.00093567, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.588620336365239, + "language_loss": 0.80059385, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82364953, + "num_input_tokens_seen": 56410715, + "step": 2598, + "time_per_iteration": 2.5939745903015137 + }, + { + "auxiliary_loss_clip": 0.01146326, + "auxiliary_loss_mlp": 0.00749065, + "balance_loss_clip": 1.00188863, + "balance_loss_mlp": 1.00071597, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 1.748988050483676, + "language_loss": 0.82357264, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84252656, + "num_input_tokens_seen": 56429170, + "step": 2599, + "time_per_iteration": 2.557370662689209 + }, + { + "auxiliary_loss_clip": 0.01178754, + "auxiliary_loss_mlp": 0.01160204, + "balance_loss_clip": 1.0021143, + "balance_loss_mlp": 1.0012269, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 2.1006959389896407, + "language_loss": 0.81960917, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84299874, + "num_input_tokens_seen": 56445685, + "step": 2600, + "time_per_iteration": 2.4695494174957275 + }, + { + "auxiliary_loss_clip": 0.01178672, + "auxiliary_loss_mlp": 0.00749136, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.00076985, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8942022394077767, + "language_loss": 0.88604385, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90532196, + "num_input_tokens_seen": 56465900, + "step": 2601, + "time_per_iteration": 2.532010793685913 + }, + { + "auxiliary_loss_clip": 0.01178793, + "auxiliary_loss_mlp": 0.01160351, + "balance_loss_clip": 1.00214267, + "balance_loss_mlp": 1.00137389, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 2.951997188518163, + "language_loss": 0.78525102, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80864245, + "num_input_tokens_seen": 56485020, + "step": 2602, + "time_per_iteration": 2.520256757736206 + }, + { + "auxiliary_loss_clip": 0.01130146, + "auxiliary_loss_mlp": 0.01159898, + "balance_loss_clip": 1.00194514, + "balance_loss_mlp": 1.00111079, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.8623271067528009, + "language_loss": 0.7399931, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76289356, + "num_input_tokens_seen": 56505205, + "step": 2603, + "time_per_iteration": 2.6550583839416504 + }, + { + "auxiliary_loss_clip": 0.01162906, + "auxiliary_loss_mlp": 0.01159972, + "balance_loss_clip": 1.00200427, + "balance_loss_mlp": 1.00108981, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 2.598019085062349, + "language_loss": 0.87456387, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.8977927, + "num_input_tokens_seen": 56521495, + "step": 2604, + "time_per_iteration": 2.535491466522217 + }, + { + "auxiliary_loss_clip": 0.01162166, + "auxiliary_loss_mlp": 0.01160246, + "balance_loss_clip": 1.00196385, + "balance_loss_mlp": 1.00117338, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 3.246129285387669, + "language_loss": 0.85373783, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87696201, + "num_input_tokens_seen": 56540665, + "step": 2605, + "time_per_iteration": 2.5931191444396973 + }, + { + "auxiliary_loss_clip": 0.011633, + "auxiliary_loss_mlp": 0.01160066, + "balance_loss_clip": 1.00208032, + "balance_loss_mlp": 1.00099373, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.0267727253571595, + "language_loss": 0.73232377, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75555742, + "num_input_tokens_seen": 56560805, + "step": 2606, + "time_per_iteration": 2.60868501663208 + }, + { + "auxiliary_loss_clip": 0.01178611, + "auxiliary_loss_mlp": 0.01160043, + "balance_loss_clip": 1.00216532, + "balance_loss_mlp": 1.00116086, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 2.1039182237034906, + "language_loss": 0.76142174, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78480828, + "num_input_tokens_seen": 56576335, + "step": 2607, + "time_per_iteration": 2.472825527191162 + }, + { + "auxiliary_loss_clip": 0.01129572, + "auxiliary_loss_mlp": 0.01159863, + "balance_loss_clip": 1.00201559, + "balance_loss_mlp": 1.00098038, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.6079869226914778, + "language_loss": 0.81880069, + "learning_rate": 3.833693249639615e-06, + "loss": 0.84169507, + "num_input_tokens_seen": 56595880, + "step": 2608, + "time_per_iteration": 2.6678333282470703 + }, + { + "auxiliary_loss_clip": 0.01146629, + "auxiliary_loss_mlp": 0.01159958, + "balance_loss_clip": 1.00207555, + "balance_loss_mlp": 1.00117159, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.7142908953246554, + "language_loss": 0.72430116, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74736708, + "num_input_tokens_seen": 56615130, + "step": 2609, + "time_per_iteration": 2.61429500579834 + }, + { + "auxiliary_loss_clip": 0.0116308, + "auxiliary_loss_mlp": 0.01159317, + "balance_loss_clip": 1.00204897, + "balance_loss_mlp": 1.00091171, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 2.488870959108303, + "language_loss": 0.71884751, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74207151, + "num_input_tokens_seen": 56634005, + "step": 2610, + "time_per_iteration": 2.5655229091644287 + }, + { + "auxiliary_loss_clip": 0.01178698, + "auxiliary_loss_mlp": 0.01160151, + "balance_loss_clip": 1.00204122, + "balance_loss_mlp": 1.00107825, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 1.845883550436779, + "language_loss": 0.72450471, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74789321, + "num_input_tokens_seen": 56653480, + "step": 2611, + "time_per_iteration": 2.532677173614502 + }, + { + "auxiliary_loss_clip": 0.01163084, + "auxiliary_loss_mlp": 0.01159532, + "balance_loss_clip": 1.00211394, + "balance_loss_mlp": 1.00093627, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.005674387977833, + "language_loss": 0.70169616, + "learning_rate": 3.833070739311887e-06, + "loss": 0.7249223, + "num_input_tokens_seen": 56672270, + "step": 2612, + "time_per_iteration": 2.58686900138855 + }, + { + "auxiliary_loss_clip": 0.01134775, + "auxiliary_loss_mlp": 0.01160121, + "balance_loss_clip": 1.00217152, + "balance_loss_mlp": 1.00123906, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.9273856130692282, + "language_loss": 0.7620573, + "learning_rate": 3.83291493793963e-06, + "loss": 0.78500628, + "num_input_tokens_seen": 56691510, + "step": 2613, + "time_per_iteration": 2.6448729038238525 + }, + { + "auxiliary_loss_clip": 0.01129997, + "auxiliary_loss_mlp": 0.0115992, + "balance_loss_clip": 1.00193214, + "balance_loss_mlp": 1.00141966, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.7358948126378504, + "language_loss": 0.66125107, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68415022, + "num_input_tokens_seen": 56712230, + "step": 2614, + "time_per_iteration": 2.661977529525757 + }, + { + "auxiliary_loss_clip": 0.01162394, + "auxiliary_loss_mlp": 0.01159892, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.00110519, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.1707709778508693, + "language_loss": 0.75539297, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77861583, + "num_input_tokens_seen": 56727490, + "step": 2615, + "time_per_iteration": 2.53027081489563 + }, + { + "auxiliary_loss_clip": 0.01162886, + "auxiliary_loss_mlp": 0.01159706, + "balance_loss_clip": 1.00221419, + "balance_loss_mlp": 1.00149131, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.665869544239123, + "language_loss": 0.73284066, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75606656, + "num_input_tokens_seen": 56747385, + "step": 2616, + "time_per_iteration": 2.546687364578247 + }, + { + "auxiliary_loss_clip": 0.01145437, + "auxiliary_loss_mlp": 0.01159588, + "balance_loss_clip": 1.00204599, + "balance_loss_mlp": 1.00108743, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 2.3795391433640267, + "language_loss": 0.72327769, + "learning_rate": 3.832291037466539e-06, + "loss": 0.746328, + "num_input_tokens_seen": 56768055, + "step": 2617, + "time_per_iteration": 2.59744930267334 + }, + { + "auxiliary_loss_clip": 0.01162989, + "auxiliary_loss_mlp": 0.01159495, + "balance_loss_clip": 1.00225568, + "balance_loss_mlp": 1.0008986, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.1815902318083493, + "language_loss": 0.7396127, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76283753, + "num_input_tokens_seen": 56785110, + "step": 2618, + "time_per_iteration": 2.5549204349517822 + }, + { + "auxiliary_loss_clip": 0.01178898, + "auxiliary_loss_mlp": 0.01160321, + "balance_loss_clip": 1.00216675, + "balance_loss_mlp": 1.00105715, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 1.88272003197498, + "language_loss": 0.78482449, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80821669, + "num_input_tokens_seen": 56804975, + "step": 2619, + "time_per_iteration": 2.525761365890503 + }, + { + "auxiliary_loss_clip": 0.01146515, + "auxiliary_loss_mlp": 0.01159548, + "balance_loss_clip": 1.00212538, + "balance_loss_mlp": 1.00114298, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.7342921338656725, + "language_loss": 0.76685333, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78991401, + "num_input_tokens_seen": 56822470, + "step": 2620, + "time_per_iteration": 2.58660626411438 + }, + { + "auxiliary_loss_clip": 0.01150504, + "auxiliary_loss_mlp": 0.01159855, + "balance_loss_clip": 1.00232136, + "balance_loss_mlp": 1.00125921, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.7202138658767867, + "language_loss": 0.70842111, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73152477, + "num_input_tokens_seen": 56842100, + "step": 2621, + "time_per_iteration": 2.6575372219085693 + }, + { + "auxiliary_loss_clip": 0.01114067, + "auxiliary_loss_mlp": 0.0115987, + "balance_loss_clip": 1.00197232, + "balance_loss_mlp": 1.00108325, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.1097074744442756, + "language_loss": 0.72196782, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74470711, + "num_input_tokens_seen": 56865920, + "step": 2622, + "time_per_iteration": 2.9619269371032715 + }, + { + "auxiliary_loss_clip": 0.01114943, + "auxiliary_loss_mlp": 0.01159534, + "balance_loss_clip": 1.00199866, + "balance_loss_mlp": 1.00122428, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.8009601011648269, + "language_loss": 0.8784858, + "learning_rate": 3.831353102455684e-06, + "loss": 0.90123063, + "num_input_tokens_seen": 56885265, + "step": 2623, + "time_per_iteration": 2.7092275619506836 + }, + { + "auxiliary_loss_clip": 0.01178615, + "auxiliary_loss_mlp": 0.01159737, + "balance_loss_clip": 1.00216973, + "balance_loss_mlp": 1.00114119, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.6687512920395582, + "language_loss": 0.81618202, + "learning_rate": 3.831196536861448e-06, + "loss": 0.83956552, + "num_input_tokens_seen": 56906710, + "step": 2624, + "time_per_iteration": 2.5855212211608887 + }, + { + "auxiliary_loss_clip": 0.01129443, + "auxiliary_loss_mlp": 0.01159453, + "balance_loss_clip": 1.00186241, + "balance_loss_mlp": 1.00114357, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.2274796924076465, + "language_loss": 0.80022883, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82311773, + "num_input_tokens_seen": 56924275, + "step": 2625, + "time_per_iteration": 2.6547763347625732 + }, + { + "auxiliary_loss_clip": 0.01178639, + "auxiliary_loss_mlp": 0.0115992, + "balance_loss_clip": 1.00214779, + "balance_loss_mlp": 1.00141978, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.391891337566535, + "language_loss": 0.80656803, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82995367, + "num_input_tokens_seen": 56941525, + "step": 2626, + "time_per_iteration": 2.5533621311187744 + }, + { + "auxiliary_loss_clip": 0.0111495, + "auxiliary_loss_mlp": 0.01159489, + "balance_loss_clip": 1.00217557, + "balance_loss_mlp": 1.00108337, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.949672656055478, + "language_loss": 0.73598015, + "learning_rate": 3.830726423467561e-06, + "loss": 0.75872457, + "num_input_tokens_seen": 56962145, + "step": 2627, + "time_per_iteration": 2.7664988040924072 + }, + { + "auxiliary_loss_clip": 0.01129959, + "auxiliary_loss_mlp": 0.01159282, + "balance_loss_clip": 1.00204587, + "balance_loss_mlp": 1.00116277, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.330013844688894, + "language_loss": 0.84720939, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87010181, + "num_input_tokens_seen": 56977505, + "step": 2628, + "time_per_iteration": 4.151933908462524 + }, + { + "auxiliary_loss_clip": 0.01145301, + "auxiliary_loss_mlp": 0.01159333, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00111878, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.7663347706192432, + "language_loss": 0.76501429, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78806067, + "num_input_tokens_seen": 56996770, + "step": 2629, + "time_per_iteration": 4.107591390609741 + }, + { + "auxiliary_loss_clip": 0.01162061, + "auxiliary_loss_mlp": 0.01160246, + "balance_loss_clip": 1.0021342, + "balance_loss_mlp": 1.00117302, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.206120350839995, + "language_loss": 0.73778713, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.76101017, + "num_input_tokens_seen": 57014970, + "step": 2630, + "time_per_iteration": 2.5388338565826416 + }, + { + "auxiliary_loss_clip": 0.01162754, + "auxiliary_loss_mlp": 0.01160247, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.00117445, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 1.8682297722770105, + "language_loss": 0.84125793, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.86448789, + "num_input_tokens_seen": 57034045, + "step": 2631, + "time_per_iteration": 5.4359142780303955 + }, + { + "auxiliary_loss_clip": 0.01178563, + "auxiliary_loss_mlp": 0.01158949, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.00092542, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.6471279795703524, + "language_loss": 0.78356934, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80694437, + "num_input_tokens_seen": 57053695, + "step": 2632, + "time_per_iteration": 2.551711082458496 + }, + { + "auxiliary_loss_clip": 0.01161987, + "auxiliary_loss_mlp": 0.01160222, + "balance_loss_clip": 1.00204241, + "balance_loss_mlp": 1.00134039, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 1.9736804715180793, + "language_loss": 0.83361006, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85683215, + "num_input_tokens_seen": 57071290, + "step": 2633, + "time_per_iteration": 2.536363363265991 + }, + { + "auxiliary_loss_clip": 0.01178893, + "auxiliary_loss_mlp": 0.01159684, + "balance_loss_clip": 1.00228465, + "balance_loss_mlp": 1.00118351, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.719321238057926, + "language_loss": 0.77466702, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79805285, + "num_input_tokens_seen": 57091465, + "step": 2634, + "time_per_iteration": 2.5756747722625732 + }, + { + "auxiliary_loss_clip": 0.01129219, + "auxiliary_loss_mlp": 0.00749071, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.0007602, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 1.8631769820017263, + "language_loss": 0.88289773, + "learning_rate": 3.829469733648552e-06, + "loss": 0.90168071, + "num_input_tokens_seen": 57110075, + "step": 2635, + "time_per_iteration": 2.6357951164245605 + }, + { + "auxiliary_loss_clip": 0.01097601, + "auxiliary_loss_mlp": 0.01159959, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00145841, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.0303766155369507, + "language_loss": 0.75734729, + "learning_rate": 3.829312335177034e-06, + "loss": 0.77992284, + "num_input_tokens_seen": 57128945, + "step": 2636, + "time_per_iteration": 2.718099594116211 + }, + { + "auxiliary_loss_clip": 0.01128854, + "auxiliary_loss_mlp": 0.01159545, + "balance_loss_clip": 1.00178957, + "balance_loss_mlp": 1.00114012, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 3.0487421445345455, + "language_loss": 0.72106057, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74394453, + "num_input_tokens_seen": 57152385, + "step": 2637, + "time_per_iteration": 2.773409366607666 + }, + { + "auxiliary_loss_clip": 0.01162188, + "auxiliary_loss_mlp": 0.01159088, + "balance_loss_clip": 1.00204968, + "balance_loss_mlp": 1.00106454, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.8242169930093772, + "language_loss": 0.77936256, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80257529, + "num_input_tokens_seen": 57172620, + "step": 2638, + "time_per_iteration": 2.5947301387786865 + }, + { + "auxiliary_loss_clip": 0.01129592, + "auxiliary_loss_mlp": 0.01159368, + "balance_loss_clip": 1.00194943, + "balance_loss_mlp": 1.00134468, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.64957869982407, + "language_loss": 0.755871, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77876067, + "num_input_tokens_seen": 57194680, + "step": 2639, + "time_per_iteration": 2.6862144470214844 + }, + { + "auxiliary_loss_clip": 0.01096499, + "auxiliary_loss_mlp": 0.01159593, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00147343, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.0069358098393506, + "language_loss": 0.81087548, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83343637, + "num_input_tokens_seen": 57214675, + "step": 2640, + "time_per_iteration": 2.717611312866211 + }, + { + "auxiliary_loss_clip": 0.01146822, + "auxiliary_loss_mlp": 0.01159444, + "balance_loss_clip": 1.00204217, + "balance_loss_mlp": 1.00142002, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.688015424226334, + "language_loss": 0.67021227, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69327492, + "num_input_tokens_seen": 57235830, + "step": 2641, + "time_per_iteration": 2.659694194793701 + }, + { + "auxiliary_loss_clip": 0.0114728, + "auxiliary_loss_mlp": 0.01160044, + "balance_loss_clip": 1.00206995, + "balance_loss_mlp": 1.00125766, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.0387022202595744, + "language_loss": 0.74958706, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77266037, + "num_input_tokens_seen": 57255970, + "step": 2642, + "time_per_iteration": 2.615374803543091 + }, + { + "auxiliary_loss_clip": 0.01162659, + "auxiliary_loss_mlp": 0.01158913, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00127089, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 1.937729812878359, + "language_loss": 0.70420605, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72742176, + "num_input_tokens_seen": 57274435, + "step": 2643, + "time_per_iteration": 2.57774019241333 + }, + { + "auxiliary_loss_clip": 0.01178626, + "auxiliary_loss_mlp": 0.01158666, + "balance_loss_clip": 1.00226378, + "balance_loss_mlp": 1.00083292, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.114307107952317, + "language_loss": 0.78668702, + "learning_rate": 3.828050650669353e-06, + "loss": 0.81005991, + "num_input_tokens_seen": 57293115, + "step": 2644, + "time_per_iteration": 2.5372257232666016 + }, + { + "auxiliary_loss_clip": 0.01162163, + "auxiliary_loss_mlp": 0.01159594, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00128412, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.8472389387929065, + "language_loss": 0.821657, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84487462, + "num_input_tokens_seen": 57312565, + "step": 2645, + "time_per_iteration": 2.615232229232788 + }, + { + "auxiliary_loss_clip": 0.01178699, + "auxiliary_loss_mlp": 0.01159247, + "balance_loss_clip": 1.0020957, + "balance_loss_mlp": 1.00112796, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 1.8450282710575863, + "language_loss": 0.6972509, + "learning_rate": 3.827734536224087e-06, + "loss": 0.72063029, + "num_input_tokens_seen": 57333360, + "step": 2646, + "time_per_iteration": 2.645468235015869 + }, + { + "auxiliary_loss_clip": 0.01145881, + "auxiliary_loss_mlp": 0.01159337, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00131285, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.1784023929456207, + "language_loss": 0.62202811, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64508021, + "num_input_tokens_seen": 57350575, + "step": 2647, + "time_per_iteration": 2.5983269214630127 + }, + { + "auxiliary_loss_clip": 0.0117863, + "auxiliary_loss_mlp": 0.01158679, + "balance_loss_clip": 1.00223041, + "balance_loss_mlp": 1.00094116, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 3.7821129193631853, + "language_loss": 0.89503706, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91841012, + "num_input_tokens_seen": 57367570, + "step": 2648, + "time_per_iteration": 2.493136167526245 + }, + { + "auxiliary_loss_clip": 0.01178663, + "auxiliary_loss_mlp": 0.01159157, + "balance_loss_clip": 1.00227726, + "balance_loss_mlp": 1.00113297, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8303178138704916, + "language_loss": 0.91402012, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93739831, + "num_input_tokens_seen": 57383980, + "step": 2649, + "time_per_iteration": 2.5328149795532227 + }, + { + "auxiliary_loss_clip": 0.01081224, + "auxiliary_loss_mlp": 0.01160028, + "balance_loss_clip": 1.00181663, + "balance_loss_mlp": 1.00114608, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 8.713858997050107, + "language_loss": 0.71784878, + "learning_rate": 3.827101475687033e-06, + "loss": 0.74026126, + "num_input_tokens_seen": 57400840, + "step": 2650, + "time_per_iteration": 2.744565725326538 + }, + { + "auxiliary_loss_clip": 0.01162901, + "auxiliary_loss_mlp": 0.01158842, + "balance_loss_clip": 1.00214744, + "balance_loss_mlp": 1.00110376, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 1.8356344534448041, + "language_loss": 0.71197969, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73519713, + "num_input_tokens_seen": 57419230, + "step": 2651, + "time_per_iteration": 2.5511927604675293 + }, + { + "auxiliary_loss_clip": 0.01130091, + "auxiliary_loss_mlp": 0.00748987, + "balance_loss_clip": 1.0021342, + "balance_loss_mlp": 1.00069809, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.7716029837078466, + "language_loss": 0.79753768, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.81632847, + "num_input_tokens_seen": 57439315, + "step": 2652, + "time_per_iteration": 2.665005922317505 + }, + { + "auxiliary_loss_clip": 0.01145921, + "auxiliary_loss_mlp": 0.00748979, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.0007633, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.3809046306196677, + "language_loss": 0.7005887, + "learning_rate": 3.826625952782601e-06, + "loss": 0.71953768, + "num_input_tokens_seen": 57454635, + "step": 2653, + "time_per_iteration": 2.632460832595825 + }, + { + "auxiliary_loss_clip": 0.01161995, + "auxiliary_loss_mlp": 0.01158697, + "balance_loss_clip": 1.00204599, + "balance_loss_mlp": 1.00076842, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.392560048615962, + "language_loss": 0.76660216, + "learning_rate": 3.826467306608095e-06, + "loss": 0.78980911, + "num_input_tokens_seen": 57476805, + "step": 2654, + "time_per_iteration": 2.630722761154175 + }, + { + "auxiliary_loss_clip": 0.01129671, + "auxiliary_loss_mlp": 0.01158674, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00103176, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.7801119151722433, + "language_loss": 0.81569278, + "learning_rate": 3.826308591173765e-06, + "loss": 0.83857626, + "num_input_tokens_seen": 57496400, + "step": 2655, + "time_per_iteration": 2.637256383895874 + }, + { + "auxiliary_loss_clip": 0.011297, + "auxiliary_loss_mlp": 0.01158607, + "balance_loss_clip": 1.00184381, + "balance_loss_mlp": 1.00096488, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 1.938609020736278, + "language_loss": 0.73623407, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75911713, + "num_input_tokens_seen": 57513700, + "step": 2656, + "time_per_iteration": 2.611971855163574 + }, + { + "auxiliary_loss_clip": 0.01128551, + "auxiliary_loss_mlp": 0.01158166, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00100088, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 2.2253084699202197, + "language_loss": 0.77962208, + "learning_rate": 3.825990952549713e-06, + "loss": 0.80248928, + "num_input_tokens_seen": 57536180, + "step": 2657, + "time_per_iteration": 2.895571231842041 + }, + { + "auxiliary_loss_clip": 0.01162944, + "auxiliary_loss_mlp": 0.01158829, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.00109124, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.7070595694984754, + "language_loss": 0.74861825, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77183592, + "num_input_tokens_seen": 57555025, + "step": 2658, + "time_per_iteration": 2.534395694732666 + }, + { + "auxiliary_loss_clip": 0.01129464, + "auxiliary_loss_mlp": 0.01158447, + "balance_loss_clip": 1.00183213, + "balance_loss_mlp": 1.00090003, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.661729098306574, + "language_loss": 0.7500453, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77292436, + "num_input_tokens_seen": 57577660, + "step": 2659, + "time_per_iteration": 2.749446153640747 + }, + { + "auxiliary_loss_clip": 0.01130463, + "auxiliary_loss_mlp": 0.01159087, + "balance_loss_clip": 1.00209749, + "balance_loss_mlp": 1.00125384, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.2047653730412504, + "language_loss": 0.90585381, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92874932, + "num_input_tokens_seen": 57596335, + "step": 2660, + "time_per_iteration": 2.671752691268921 + }, + { + "auxiliary_loss_clip": 0.01113187, + "auxiliary_loss_mlp": 0.01158933, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00110018, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 1.5613346350716273, + "language_loss": 0.77907258, + "learning_rate": 3.82535484444872e-06, + "loss": 0.80179375, + "num_input_tokens_seen": 57616830, + "step": 2661, + "time_per_iteration": 2.771223306655884 + }, + { + "auxiliary_loss_clip": 0.01146553, + "auxiliary_loss_mlp": 0.00748944, + "balance_loss_clip": 1.0020045, + "balance_loss_mlp": 1.00065136, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.9487349839370876, + "language_loss": 0.73960257, + "learning_rate": 3.825195644364292e-06, + "loss": 0.75855756, + "num_input_tokens_seen": 57635515, + "step": 2662, + "time_per_iteration": 2.6512258052825928 + }, + { + "auxiliary_loss_clip": 0.0114627, + "auxiliary_loss_mlp": 0.00748968, + "balance_loss_clip": 1.00200403, + "balance_loss_mlp": 1.00076938, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 1.676081605346406, + "language_loss": 0.81915879, + "learning_rate": 3.825036375068263e-06, + "loss": 0.83811116, + "num_input_tokens_seen": 57654250, + "step": 2663, + "time_per_iteration": 2.638129949569702 + }, + { + "auxiliary_loss_clip": 0.01114217, + "auxiliary_loss_mlp": 0.0115885, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00101709, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.164048711646517, + "language_loss": 0.79406548, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81679612, + "num_input_tokens_seen": 57672645, + "step": 2664, + "time_per_iteration": 2.68300724029541 + }, + { + "auxiliary_loss_clip": 0.01163032, + "auxiliary_loss_mlp": 0.01158534, + "balance_loss_clip": 1.00214291, + "balance_loss_mlp": 1.00117731, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 3.35871983064864, + "language_loss": 0.93969059, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96290624, + "num_input_tokens_seen": 57691055, + "step": 2665, + "time_per_iteration": 3.953596830368042 + }, + { + "auxiliary_loss_clip": 0.01129485, + "auxiliary_loss_mlp": 0.01158155, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00089443, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 2.6328831754904183, + "language_loss": 0.8517167, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87459314, + "num_input_tokens_seen": 57707235, + "step": 2666, + "time_per_iteration": 2.6092658042907715 + }, + { + "auxiliary_loss_clip": 0.0114639, + "auxiliary_loss_mlp": 0.00749051, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.00072336, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.8088682064312676, + "language_loss": 0.81485045, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83380485, + "num_input_tokens_seen": 57724190, + "step": 2667, + "time_per_iteration": 3.9994277954101562 + }, + { + "auxiliary_loss_clip": 0.0117852, + "auxiliary_loss_mlp": 0.01158844, + "balance_loss_clip": 1.00216699, + "balance_loss_mlp": 1.00091577, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 1.8702089298973008, + "language_loss": 0.73793626, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76130986, + "num_input_tokens_seen": 57743620, + "step": 2668, + "time_per_iteration": 2.551220655441284 + }, + { + "auxiliary_loss_clip": 0.01161902, + "auxiliary_loss_mlp": 0.01158726, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.00108337, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.7608815724004865, + "language_loss": 0.77178538, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79499161, + "num_input_tokens_seen": 57764810, + "step": 2669, + "time_per_iteration": 4.149597406387329 + }, + { + "auxiliary_loss_clip": 0.01159363, + "auxiliary_loss_mlp": 0.01148948, + "balance_loss_clip": 1.00182986, + "balance_loss_mlp": 1.00017476, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.7989912640260444, + "language_loss": 0.55577058, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57885367, + "num_input_tokens_seen": 57824390, + "step": 2670, + "time_per_iteration": 3.0291662216186523 + }, + { + "auxiliary_loss_clip": 0.0116194, + "auxiliary_loss_mlp": 0.01157948, + "balance_loss_clip": 1.00194252, + "balance_loss_mlp": 1.00087762, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 2.0384218246314996, + "language_loss": 0.77683282, + "learning_rate": 3.82375972980766e-06, + "loss": 0.80003172, + "num_input_tokens_seen": 57843665, + "step": 2671, + "time_per_iteration": 2.6025242805480957 + }, + { + "auxiliary_loss_clip": 0.01163033, + "auxiliary_loss_mlp": 0.01158532, + "balance_loss_clip": 1.00216246, + "balance_loss_mlp": 1.00108016, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 2.2033826702092503, + "language_loss": 0.64789784, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.67111349, + "num_input_tokens_seen": 57863305, + "step": 2672, + "time_per_iteration": 2.677393674850464 + }, + { + "auxiliary_loss_clip": 0.01162639, + "auxiliary_loss_mlp": 0.01158909, + "balance_loss_clip": 1.0022459, + "balance_loss_mlp": 1.00088561, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 2.180619120659405, + "language_loss": 0.85707974, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.88029528, + "num_input_tokens_seen": 57883025, + "step": 2673, + "time_per_iteration": 2.566199541091919 + }, + { + "auxiliary_loss_clip": 0.01129901, + "auxiliary_loss_mlp": 0.01158154, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00108421, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.50066722675937, + "language_loss": 0.72467613, + "learning_rate": 3.823279846575403e-06, + "loss": 0.74755663, + "num_input_tokens_seen": 57901430, + "step": 2674, + "time_per_iteration": 2.61944317817688 + }, + { + "auxiliary_loss_clip": 0.01161876, + "auxiliary_loss_mlp": 0.01158306, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00094974, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 1.6524032342388164, + "language_loss": 0.84351945, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86672121, + "num_input_tokens_seen": 57919550, + "step": 2675, + "time_per_iteration": 2.5418200492858887 + }, + { + "auxiliary_loss_clip": 0.0113095, + "auxiliary_loss_mlp": 0.01159219, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.0012908, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.8054262485565513, + "language_loss": 0.82603776, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84893948, + "num_input_tokens_seen": 57939890, + "step": 2676, + "time_per_iteration": 2.738948106765747 + }, + { + "auxiliary_loss_clip": 0.01161873, + "auxiliary_loss_mlp": 0.01158101, + "balance_loss_clip": 1.00214887, + "balance_loss_mlp": 1.00103068, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 2.2593760369906883, + "language_loss": 0.73514652, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75834632, + "num_input_tokens_seen": 57957410, + "step": 2677, + "time_per_iteration": 2.5309064388275146 + }, + { + "auxiliary_loss_clip": 0.01150397, + "auxiliary_loss_mlp": 0.01157997, + "balance_loss_clip": 1.00257373, + "balance_loss_mlp": 1.0010221, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 1.7364247209770034, + "language_loss": 0.76753902, + "learning_rate": 3.822639034348728e-06, + "loss": 0.79062295, + "num_input_tokens_seen": 57977900, + "step": 2678, + "time_per_iteration": 2.7039055824279785 + }, + { + "auxiliary_loss_clip": 0.01161854, + "auxiliary_loss_mlp": 0.01158177, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00091624, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.702293866681386, + "language_loss": 0.70246482, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72566509, + "num_input_tokens_seen": 57998210, + "step": 2679, + "time_per_iteration": 2.6632370948791504 + }, + { + "auxiliary_loss_clip": 0.01126918, + "auxiliary_loss_mlp": 0.0074836, + "balance_loss_clip": 1.00169957, + "balance_loss_mlp": 0.99970478, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.776626172961329, + "language_loss": 0.51804543, + "learning_rate": 3.822318213523154e-06, + "loss": 0.53679824, + "num_input_tokens_seen": 58059420, + "step": 2680, + "time_per_iteration": 3.2866427898406982 + }, + { + "auxiliary_loss_clip": 0.0114598, + "auxiliary_loss_mlp": 0.01158229, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00087237, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.9807191631474474, + "language_loss": 0.80097556, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82401764, + "num_input_tokens_seen": 58078370, + "step": 2681, + "time_per_iteration": 2.5971431732177734 + }, + { + "auxiliary_loss_clip": 0.01145476, + "auxiliary_loss_mlp": 0.01158444, + "balance_loss_clip": 1.00199735, + "balance_loss_mlp": 1.00146949, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.7084554768087097, + "language_loss": 0.6895234, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71256268, + "num_input_tokens_seen": 58097395, + "step": 2682, + "time_per_iteration": 2.6439552307128906 + }, + { + "auxiliary_loss_clip": 0.0114699, + "auxiliary_loss_mlp": 0.01158393, + "balance_loss_clip": 1.00218844, + "balance_loss_mlp": 1.00084615, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.7622293834055816, + "language_loss": 0.87464833, + "learning_rate": 3.821836464031348e-06, + "loss": 0.8977021, + "num_input_tokens_seen": 58115630, + "step": 2683, + "time_per_iteration": 2.6059508323669434 + }, + { + "auxiliary_loss_clip": 0.01178567, + "auxiliary_loss_mlp": 0.01158809, + "balance_loss_clip": 1.00222158, + "balance_loss_mlp": 1.00126183, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 1.7696806230167315, + "language_loss": 0.74369752, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76707125, + "num_input_tokens_seen": 58138655, + "step": 2684, + "time_per_iteration": 2.6513113975524902 + }, + { + "auxiliary_loss_clip": 0.01146475, + "auxiliary_loss_mlp": 0.00749023, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00071573, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.8973947183794813, + "language_loss": 0.70423436, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72318935, + "num_input_tokens_seen": 58157440, + "step": 2685, + "time_per_iteration": 2.7110202312469482 + }, + { + "auxiliary_loss_clip": 0.01130268, + "auxiliary_loss_mlp": 0.01158164, + "balance_loss_clip": 1.00212955, + "balance_loss_mlp": 1.0010941, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 2.1922270191295663, + "language_loss": 0.71411824, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73700255, + "num_input_tokens_seen": 58176660, + "step": 2686, + "time_per_iteration": 2.677645206451416 + }, + { + "auxiliary_loss_clip": 0.01166912, + "auxiliary_loss_mlp": 0.01158439, + "balance_loss_clip": 1.00237274, + "balance_loss_mlp": 1.00108254, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.857428311934207, + "language_loss": 0.816594, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83984745, + "num_input_tokens_seen": 58195085, + "step": 2687, + "time_per_iteration": 2.5446035861968994 + }, + { + "auxiliary_loss_clip": 0.01161842, + "auxiliary_loss_mlp": 0.01158501, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00104892, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.9291113450781023, + "language_loss": 0.71666801, + "learning_rate": 3.821032166608568e-06, + "loss": 0.73987144, + "num_input_tokens_seen": 58213540, + "step": 2688, + "time_per_iteration": 2.567110776901245 + }, + { + "auxiliary_loss_clip": 0.01130079, + "auxiliary_loss_mlp": 0.01158292, + "balance_loss_clip": 1.00200725, + "balance_loss_mlp": 1.00093555, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 1.6640033985534188, + "language_loss": 0.75818157, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78106523, + "num_input_tokens_seen": 58236995, + "step": 2689, + "time_per_iteration": 2.746265411376953 + }, + { + "auxiliary_loss_clip": 0.0117857, + "auxiliary_loss_mlp": 0.01158241, + "balance_loss_clip": 1.00230479, + "balance_loss_mlp": 1.00136113, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 2.2246235660836247, + "language_loss": 0.87415087, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89751899, + "num_input_tokens_seen": 58257230, + "step": 2690, + "time_per_iteration": 2.5402743816375732 + }, + { + "auxiliary_loss_clip": 0.01162132, + "auxiliary_loss_mlp": 0.01158191, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00131118, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.552097945167404, + "language_loss": 0.88139451, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.9045977, + "num_input_tokens_seen": 58277080, + "step": 2691, + "time_per_iteration": 2.577801465988159 + }, + { + "auxiliary_loss_clip": 0.0116195, + "auxiliary_loss_mlp": 0.01158981, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00105274, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.1489686158084567, + "language_loss": 0.82506394, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84827328, + "num_input_tokens_seen": 58294815, + "step": 2692, + "time_per_iteration": 2.5796074867248535 + }, + { + "auxiliary_loss_clip": 0.01178684, + "auxiliary_loss_mlp": 0.01158674, + "balance_loss_clip": 1.00218844, + "balance_loss_mlp": 1.00103152, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 2.878880432944381, + "language_loss": 0.81524259, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83861619, + "num_input_tokens_seen": 58313215, + "step": 2693, + "time_per_iteration": 2.561912775039673 + }, + { + "auxiliary_loss_clip": 0.01178437, + "auxiliary_loss_mlp": 0.01158463, + "balance_loss_clip": 1.00224781, + "balance_loss_mlp": 1.00148821, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.5285993386579946, + "language_loss": 0.83712828, + "learning_rate": 3.820064730995783e-06, + "loss": 0.8604973, + "num_input_tokens_seen": 58333215, + "step": 2694, + "time_per_iteration": 2.5257387161254883 + }, + { + "auxiliary_loss_clip": 0.01130279, + "auxiliary_loss_mlp": 0.01158678, + "balance_loss_clip": 1.00199008, + "balance_loss_mlp": 1.00132203, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.8480807150924874, + "language_loss": 0.69271016, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71559978, + "num_input_tokens_seen": 58351160, + "step": 2695, + "time_per_iteration": 2.6450259685516357 + }, + { + "auxiliary_loss_clip": 0.01162177, + "auxiliary_loss_mlp": 0.01158992, + "balance_loss_clip": 1.00222695, + "balance_loss_mlp": 1.00106323, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 3.5203434349383462, + "language_loss": 0.82227141, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84548306, + "num_input_tokens_seen": 58368505, + "step": 2696, + "time_per_iteration": 2.5498104095458984 + }, + { + "auxiliary_loss_clip": 0.01178749, + "auxiliary_loss_mlp": 0.01158987, + "balance_loss_clip": 1.00222874, + "balance_loss_mlp": 1.00124979, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 1.97124083588704, + "language_loss": 0.88352227, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90689957, + "num_input_tokens_seen": 58385085, + "step": 2697, + "time_per_iteration": 2.4914605617523193 + }, + { + "auxiliary_loss_clip": 0.01178483, + "auxiliary_loss_mlp": 0.01158484, + "balance_loss_clip": 1.00219369, + "balance_loss_mlp": 1.00141382, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.3829155449094357, + "language_loss": 0.80563003, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82899964, + "num_input_tokens_seen": 58406985, + "step": 2698, + "time_per_iteration": 2.6059062480926514 + }, + { + "auxiliary_loss_clip": 0.01161925, + "auxiliary_loss_mlp": 0.01158194, + "balance_loss_clip": 1.00220227, + "balance_loss_mlp": 1.00112367, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.6455065930289214, + "language_loss": 0.7748307, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79803193, + "num_input_tokens_seen": 58426205, + "step": 2699, + "time_per_iteration": 2.5785257816314697 + }, + { + "auxiliary_loss_clip": 0.01150217, + "auxiliary_loss_mlp": 0.01158156, + "balance_loss_clip": 1.00223184, + "balance_loss_mlp": 1.00099063, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 1.915275795353709, + "language_loss": 0.85957289, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88265669, + "num_input_tokens_seen": 58443830, + "step": 2700, + "time_per_iteration": 2.577047109603882 + }, + { + "auxiliary_loss_clip": 0.01162874, + "auxiliary_loss_mlp": 0.00748936, + "balance_loss_clip": 1.00215602, + "balance_loss_mlp": 1.00068831, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.5685134113524173, + "language_loss": 0.80542535, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82454342, + "num_input_tokens_seen": 58464405, + "step": 2701, + "time_per_iteration": 2.620896816253662 + }, + { + "auxiliary_loss_clip": 0.01150113, + "auxiliary_loss_mlp": 0.01158651, + "balance_loss_clip": 1.0022428, + "balance_loss_mlp": 1.00119972, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.6394558693902042, + "language_loss": 0.73133731, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75442493, + "num_input_tokens_seen": 58483295, + "step": 2702, + "time_per_iteration": 2.575890064239502 + }, + { + "auxiliary_loss_clip": 0.01162006, + "auxiliary_loss_mlp": 0.01158757, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00120997, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 1.8905046484292853, + "language_loss": 0.72506618, + "learning_rate": 3.81860891934076e-06, + "loss": 0.74827385, + "num_input_tokens_seen": 58501205, + "step": 2703, + "time_per_iteration": 3.9641284942626953 + }, + { + "auxiliary_loss_clip": 0.0117865, + "auxiliary_loss_mlp": 0.01158878, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.00114059, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.9851928728010009, + "language_loss": 0.70764351, + "learning_rate": 3.818446817599176e-06, + "loss": 0.73101872, + "num_input_tokens_seen": 58522315, + "step": 2704, + "time_per_iteration": 2.5901756286621094 + }, + { + "auxiliary_loss_clip": 0.01125405, + "auxiliary_loss_mlp": 0.0114879, + "balance_loss_clip": 1.0013932, + "balance_loss_mlp": 1.00001645, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7758642456028709, + "language_loss": 0.53398103, + "learning_rate": 3.818284646901907e-06, + "loss": 0.556723, + "num_input_tokens_seen": 58586695, + "step": 2705, + "time_per_iteration": 3.1997358798980713 + }, + { + "auxiliary_loss_clip": 0.0114644, + "auxiliary_loss_mlp": 0.00749063, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00072682, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.8144375369459707, + "language_loss": 0.76578903, + "learning_rate": 3.818122407255102e-06, + "loss": 0.78474408, + "num_input_tokens_seen": 58602435, + "step": 2706, + "time_per_iteration": 5.375585556030273 + }, + { + "auxiliary_loss_clip": 0.01135747, + "auxiliary_loss_mlp": 0.01158237, + "balance_loss_clip": 1.0020988, + "balance_loss_mlp": 1.00116634, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.7204398328742097, + "language_loss": 0.71915692, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74209672, + "num_input_tokens_seen": 58621275, + "step": 2707, + "time_per_iteration": 4.1275506019592285 + }, + { + "auxiliary_loss_clip": 0.01150256, + "auxiliary_loss_mlp": 0.01158492, + "balance_loss_clip": 1.00220478, + "balance_loss_mlp": 1.00113511, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 2.7158834180810976, + "language_loss": 0.8361817, + "learning_rate": 3.817797721137495e-06, + "loss": 0.8592692, + "num_input_tokens_seen": 58637550, + "step": 2708, + "time_per_iteration": 2.616546392440796 + }, + { + "auxiliary_loss_clip": 0.01096982, + "auxiliary_loss_mlp": 0.00749035, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.0008018, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.6143146830581885, + "language_loss": 0.8610124, + "learning_rate": 3.817635274679006e-06, + "loss": 0.87947261, + "num_input_tokens_seen": 58654135, + "step": 2709, + "time_per_iteration": 2.7357754707336426 + }, + { + "auxiliary_loss_clip": 0.0114602, + "auxiliary_loss_mlp": 0.00749009, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.0008055, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.5879993123685514, + "language_loss": 0.91415536, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93310565, + "num_input_tokens_seen": 58674320, + "step": 2710, + "time_per_iteration": 2.627392530441284 + }, + { + "auxiliary_loss_clip": 0.01130597, + "auxiliary_loss_mlp": 0.01159344, + "balance_loss_clip": 1.00230515, + "balance_loss_mlp": 1.00151074, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 2.298316909337273, + "language_loss": 0.81623304, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83913243, + "num_input_tokens_seen": 58691000, + "step": 2711, + "time_per_iteration": 2.632642984390259 + }, + { + "auxiliary_loss_clip": 0.01147116, + "auxiliary_loss_mlp": 0.011588, + "balance_loss_clip": 1.00203395, + "balance_loss_mlp": 1.00087154, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.1051129321718403, + "language_loss": 0.81207728, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83513647, + "num_input_tokens_seen": 58710230, + "step": 2712, + "time_per_iteration": 2.666746139526367 + }, + { + "auxiliary_loss_clip": 0.01178783, + "auxiliary_loss_mlp": 0.01159182, + "balance_loss_clip": 1.0022707, + "balance_loss_mlp": 1.00134873, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 1.9939558591363304, + "language_loss": 0.76528502, + "learning_rate": 3.816984799657568e-06, + "loss": 0.7886647, + "num_input_tokens_seen": 58728610, + "step": 2713, + "time_per_iteration": 2.5307857990264893 + }, + { + "auxiliary_loss_clip": 0.01162307, + "auxiliary_loss_mlp": 0.01158231, + "balance_loss_clip": 1.00237012, + "balance_loss_mlp": 1.00135159, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 5.191384084871747, + "language_loss": 0.79103947, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.81424487, + "num_input_tokens_seen": 58744385, + "step": 2714, + "time_per_iteration": 2.5379672050476074 + }, + { + "auxiliary_loss_clip": 0.01163089, + "auxiliary_loss_mlp": 0.01159085, + "balance_loss_clip": 1.00233459, + "balance_loss_mlp": 1.00144243, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 1.6335792888505518, + "language_loss": 0.78265524, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80587697, + "num_input_tokens_seen": 58763905, + "step": 2715, + "time_per_iteration": 2.5900566577911377 + }, + { + "auxiliary_loss_clip": 0.01146638, + "auxiliary_loss_mlp": 0.01158407, + "balance_loss_clip": 1.00205457, + "balance_loss_mlp": 1.00105119, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 1.9080264012720576, + "language_loss": 0.81974316, + "learning_rate": 3.816496219917336e-06, + "loss": 0.8427937, + "num_input_tokens_seen": 58785580, + "step": 2716, + "time_per_iteration": 2.6251208782196045 + }, + { + "auxiliary_loss_clip": 0.01145456, + "auxiliary_loss_mlp": 0.01158715, + "balance_loss_clip": 1.00214314, + "balance_loss_mlp": 1.00116801, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 2.35897145322912, + "language_loss": 0.86118066, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88422239, + "num_input_tokens_seen": 58806075, + "step": 2717, + "time_per_iteration": 2.6784629821777344 + }, + { + "auxiliary_loss_clip": 0.01145433, + "auxiliary_loss_mlp": 0.0115823, + "balance_loss_clip": 1.00215781, + "balance_loss_mlp": 1.00096917, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.676566578641238, + "language_loss": 0.76572806, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78876466, + "num_input_tokens_seen": 58827405, + "step": 2718, + "time_per_iteration": 2.6587345600128174 + }, + { + "auxiliary_loss_clip": 0.01145548, + "auxiliary_loss_mlp": 0.01158784, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00133252, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 1.7623443171343576, + "language_loss": 0.73311502, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75615835, + "num_input_tokens_seen": 58847205, + "step": 2719, + "time_per_iteration": 2.6278886795043945 + }, + { + "auxiliary_loss_clip": 0.01135863, + "auxiliary_loss_mlp": 0.01158698, + "balance_loss_clip": 1.0023638, + "balance_loss_mlp": 1.00105619, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.7341734613857829, + "language_loss": 0.72145236, + "learning_rate": 3.815843815948507e-06, + "loss": 0.744398, + "num_input_tokens_seen": 58866865, + "step": 2720, + "time_per_iteration": 2.6282753944396973 + }, + { + "auxiliary_loss_clip": 0.01131629, + "auxiliary_loss_mlp": 0.01158999, + "balance_loss_clip": 1.00236881, + "balance_loss_mlp": 1.00116563, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 1.9852531430574496, + "language_loss": 0.74567902, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.76858532, + "num_input_tokens_seen": 58885200, + "step": 2721, + "time_per_iteration": 2.6264946460723877 + }, + { + "auxiliary_loss_clip": 0.01114843, + "auxiliary_loss_mlp": 0.01158627, + "balance_loss_clip": 1.0020659, + "balance_loss_mlp": 1.00108004, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.7308035573524296, + "language_loss": 0.78972518, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81245995, + "num_input_tokens_seen": 58906385, + "step": 2722, + "time_per_iteration": 2.7207274436950684 + }, + { + "auxiliary_loss_clip": 0.01163, + "auxiliary_loss_mlp": 0.00749002, + "balance_loss_clip": 1.00220704, + "balance_loss_mlp": 1.00069416, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 1.9274530337048208, + "language_loss": 0.84797084, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86709088, + "num_input_tokens_seen": 58925040, + "step": 2723, + "time_per_iteration": 2.5952112674713135 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.01158063, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00118351, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 1.7147283909539621, + "language_loss": 0.71020603, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73292506, + "num_input_tokens_seen": 58944790, + "step": 2724, + "time_per_iteration": 2.7558436393737793 + }, + { + "auxiliary_loss_clip": 0.01129745, + "auxiliary_loss_mlp": 0.01158854, + "balance_loss_clip": 1.0020318, + "balance_loss_mlp": 1.00149751, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.2043050539235893, + "language_loss": 0.70488328, + "learning_rate": 3.815026761751955e-06, + "loss": 0.72776926, + "num_input_tokens_seen": 58962500, + "step": 2725, + "time_per_iteration": 2.6939985752105713 + }, + { + "auxiliary_loss_clip": 0.01130874, + "auxiliary_loss_mlp": 0.01158222, + "balance_loss_clip": 1.0022006, + "balance_loss_mlp": 1.00124681, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 2.2172456207323012, + "language_loss": 0.8832038, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90609473, + "num_input_tokens_seen": 58980355, + "step": 2726, + "time_per_iteration": 2.651059627532959 + }, + { + "auxiliary_loss_clip": 0.01162036, + "auxiliary_loss_mlp": 0.01158451, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.00109494, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.808145122597959, + "language_loss": 0.73863262, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76183748, + "num_input_tokens_seen": 58999505, + "step": 2727, + "time_per_iteration": 2.575982093811035 + }, + { + "auxiliary_loss_clip": 0.01161857, + "auxiliary_loss_mlp": 0.0115844, + "balance_loss_clip": 1.00224328, + "balance_loss_mlp": 1.00127494, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.813613937413472, + "language_loss": 0.82551748, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84872043, + "num_input_tokens_seen": 59017930, + "step": 2728, + "time_per_iteration": 2.566319227218628 + }, + { + "auxiliary_loss_clip": 0.01161959, + "auxiliary_loss_mlp": 0.01158556, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00110459, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.2615579495067943, + "language_loss": 0.8499403, + "learning_rate": 3.814371879489633e-06, + "loss": 0.87314534, + "num_input_tokens_seen": 59035130, + "step": 2729, + "time_per_iteration": 2.5435125827789307 + }, + { + "auxiliary_loss_clip": 0.01178534, + "auxiliary_loss_mlp": 0.01158555, + "balance_loss_clip": 1.00229251, + "balance_loss_mlp": 1.00119877, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.746144155454442, + "language_loss": 0.72496474, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74833566, + "num_input_tokens_seen": 59053080, + "step": 2730, + "time_per_iteration": 2.5023131370544434 + }, + { + "auxiliary_loss_clip": 0.01145858, + "auxiliary_loss_mlp": 0.01158499, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00104785, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 1.872849119508444, + "language_loss": 0.74760556, + "learning_rate": 3.814044025526651e-06, + "loss": 0.77064908, + "num_input_tokens_seen": 59075610, + "step": 2731, + "time_per_iteration": 2.8363845348358154 + }, + { + "auxiliary_loss_clip": 0.01131758, + "auxiliary_loss_mlp": 0.01158712, + "balance_loss_clip": 1.00218821, + "balance_loss_mlp": 1.00107002, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 1.867509262836013, + "language_loss": 0.78862369, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.81152838, + "num_input_tokens_seen": 59094555, + "step": 2732, + "time_per_iteration": 2.6775200366973877 + }, + { + "auxiliary_loss_clip": 0.0114749, + "auxiliary_loss_mlp": 0.0115902, + "balance_loss_clip": 1.00220656, + "balance_loss_mlp": 1.00118709, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 1.9455432429138246, + "language_loss": 0.68989909, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71296424, + "num_input_tokens_seen": 59113515, + "step": 2733, + "time_per_iteration": 2.631305456161499 + }, + { + "auxiliary_loss_clip": 0.01146111, + "auxiliary_loss_mlp": 0.0115817, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00100493, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 1.948935351811013, + "language_loss": 0.80888498, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.83192778, + "num_input_tokens_seen": 59133275, + "step": 2734, + "time_per_iteration": 2.658803939819336 + }, + { + "auxiliary_loss_clip": 0.01145923, + "auxiliary_loss_mlp": 0.01158173, + "balance_loss_clip": 1.00205779, + "balance_loss_mlp": 1.00110269, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 2.525740930942223, + "language_loss": 0.81927025, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.8423112, + "num_input_tokens_seen": 59154095, + "step": 2735, + "time_per_iteration": 2.705595016479492 + }, + { + "auxiliary_loss_clip": 0.01080875, + "auxiliary_loss_mlp": 0.01158224, + "balance_loss_clip": 1.00165558, + "balance_loss_mlp": 1.00115395, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.0614797077222073, + "language_loss": 0.78724432, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80963534, + "num_input_tokens_seen": 59173795, + "step": 2736, + "time_per_iteration": 2.7549240589141846 + }, + { + "auxiliary_loss_clip": 0.01150315, + "auxiliary_loss_mlp": 0.01158886, + "balance_loss_clip": 1.00236368, + "balance_loss_mlp": 1.00152957, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.6503400496845984, + "language_loss": 0.81607074, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83916277, + "num_input_tokens_seen": 59191610, + "step": 2737, + "time_per_iteration": 2.6355979442596436 + }, + { + "auxiliary_loss_clip": 0.01162674, + "auxiliary_loss_mlp": 0.01158113, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.00104332, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8848340979801887, + "language_loss": 0.87251306, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89572096, + "num_input_tokens_seen": 59213000, + "step": 2738, + "time_per_iteration": 2.618058919906616 + }, + { + "auxiliary_loss_clip": 0.01146101, + "auxiliary_loss_mlp": 0.01158423, + "balance_loss_clip": 1.0021069, + "balance_loss_mlp": 1.00116241, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 1.9914056657990815, + "language_loss": 0.72470558, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74775082, + "num_input_tokens_seen": 59232340, + "step": 2739, + "time_per_iteration": 2.628175973892212 + }, + { + "auxiliary_loss_clip": 0.01161957, + "auxiliary_loss_mlp": 0.01158346, + "balance_loss_clip": 1.0020895, + "balance_loss_mlp": 1.00118065, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.6053833573415845, + "language_loss": 0.81596643, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.8391695, + "num_input_tokens_seen": 59253950, + "step": 2740, + "time_per_iteration": 2.6115479469299316 + }, + { + "auxiliary_loss_clip": 0.01131856, + "auxiliary_loss_mlp": 0.01159176, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00124729, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.2987092018910307, + "language_loss": 0.69085205, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.7137624, + "num_input_tokens_seen": 59275545, + "step": 2741, + "time_per_iteration": 4.257767915725708 + }, + { + "auxiliary_loss_clip": 0.01178478, + "auxiliary_loss_mlp": 0.01158189, + "balance_loss_clip": 1.00228989, + "balance_loss_mlp": 1.00092793, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 1.7237696605505552, + "language_loss": 0.79761612, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82098281, + "num_input_tokens_seen": 59293480, + "step": 2742, + "time_per_iteration": 2.504474639892578 + }, + { + "auxiliary_loss_clip": 0.01145131, + "auxiliary_loss_mlp": 0.01157997, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.00102174, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 1.8149544593552764, + "language_loss": 0.84101379, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86404502, + "num_input_tokens_seen": 59313435, + "step": 2743, + "time_per_iteration": 2.6024038791656494 + }, + { + "auxiliary_loss_clip": 0.01178212, + "auxiliary_loss_mlp": 0.01158192, + "balance_loss_clip": 1.0021683, + "balance_loss_mlp": 1.00102687, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.5623576438253635, + "language_loss": 0.85797513, + "learning_rate": 3.811906270092265e-06, + "loss": 0.88133919, + "num_input_tokens_seen": 59331535, + "step": 2744, + "time_per_iteration": 5.316013813018799 + }, + { + "auxiliary_loss_clip": 0.01145663, + "auxiliary_loss_mlp": 0.01157332, + "balance_loss_clip": 1.00218725, + "balance_loss_mlp": 1.00121582, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.6702799850245815, + "language_loss": 0.82747281, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85050273, + "num_input_tokens_seen": 59350680, + "step": 2745, + "time_per_iteration": 4.084433078765869 + }, + { + "auxiliary_loss_clip": 0.01128617, + "auxiliary_loss_mlp": 0.01158121, + "balance_loss_clip": 1.00204062, + "balance_loss_mlp": 1.00105047, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 2.053612140353463, + "language_loss": 0.76730448, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.7901718, + "num_input_tokens_seen": 59367020, + "step": 2746, + "time_per_iteration": 2.62445330619812 + }, + { + "auxiliary_loss_clip": 0.01178292, + "auxiliary_loss_mlp": 0.0115831, + "balance_loss_clip": 1.00213909, + "balance_loss_mlp": 1.00124025, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5434187511565556, + "language_loss": 0.807594, + "learning_rate": 3.811411292431592e-06, + "loss": 0.83095998, + "num_input_tokens_seen": 59386075, + "step": 2747, + "time_per_iteration": 2.511539936065674 + }, + { + "auxiliary_loss_clip": 0.01161791, + "auxiliary_loss_mlp": 0.01158229, + "balance_loss_clip": 1.00213432, + "balance_loss_mlp": 1.0012542, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 2.094760386832385, + "language_loss": 0.69533753, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71853769, + "num_input_tokens_seen": 59402690, + "step": 2748, + "time_per_iteration": 2.542318820953369 + }, + { + "auxiliary_loss_clip": 0.01178375, + "auxiliary_loss_mlp": 0.00748982, + "balance_loss_clip": 1.00226271, + "balance_loss_mlp": 1.00085449, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.118267258982717, + "language_loss": 0.88136256, + "learning_rate": 3.811080963869561e-06, + "loss": 0.90063614, + "num_input_tokens_seen": 59421130, + "step": 2749, + "time_per_iteration": 2.570962905883789 + }, + { + "auxiliary_loss_clip": 0.01161558, + "auxiliary_loss_mlp": 0.01158004, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00121987, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.9481891902279305, + "language_loss": 0.79113448, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81433004, + "num_input_tokens_seen": 59438970, + "step": 2750, + "time_per_iteration": 2.558952808380127 + }, + { + "auxiliary_loss_clip": 0.01161762, + "auxiliary_loss_mlp": 0.01158005, + "balance_loss_clip": 1.00221348, + "balance_loss_mlp": 1.00112581, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.65048089624204, + "language_loss": 0.94991666, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.97311437, + "num_input_tokens_seen": 59458510, + "step": 2751, + "time_per_iteration": 2.5810651779174805 + }, + { + "auxiliary_loss_clip": 0.01082921, + "auxiliary_loss_mlp": 0.01158261, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.0011903, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 2.1773032619627277, + "language_loss": 0.70956802, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73197985, + "num_input_tokens_seen": 59477110, + "step": 2752, + "time_per_iteration": 2.823864459991455 + }, + { + "auxiliary_loss_clip": 0.01160162, + "auxiliary_loss_mlp": 0.011488, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.00002694, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7659667915067448, + "language_loss": 0.54138738, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56447697, + "num_input_tokens_seen": 59541155, + "step": 2753, + "time_per_iteration": 3.269618272781372 + }, + { + "auxiliary_loss_clip": 0.01178359, + "auxiliary_loss_mlp": 0.00748952, + "balance_loss_clip": 1.00230408, + "balance_loss_mlp": 1.00076938, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.5969033084863071, + "language_loss": 0.75344002, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77271318, + "num_input_tokens_seen": 59561155, + "step": 2754, + "time_per_iteration": 2.5569679737091064 + }, + { + "auxiliary_loss_clip": 0.01146089, + "auxiliary_loss_mlp": 0.01158925, + "balance_loss_clip": 1.00207508, + "balance_loss_mlp": 1.00137794, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 2.272877467571084, + "language_loss": 0.8649776, + "learning_rate": 3.810088330151188e-06, + "loss": 0.88802767, + "num_input_tokens_seen": 59580460, + "step": 2755, + "time_per_iteration": 2.588829755783081 + }, + { + "auxiliary_loss_clip": 0.01130389, + "auxiliary_loss_mlp": 0.01158184, + "balance_loss_clip": 1.00205874, + "balance_loss_mlp": 1.00111413, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.710469967192616, + "language_loss": 0.72944969, + "learning_rate": 3.80992265092595e-06, + "loss": 0.75233543, + "num_input_tokens_seen": 59600025, + "step": 2756, + "time_per_iteration": 2.695261240005493 + }, + { + "auxiliary_loss_clip": 0.01129826, + "auxiliary_loss_mlp": 0.01157598, + "balance_loss_clip": 1.00210321, + "balance_loss_mlp": 1.00109971, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 3.028658351712777, + "language_loss": 0.74857366, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.7714479, + "num_input_tokens_seen": 59620600, + "step": 2757, + "time_per_iteration": 2.7301652431488037 + }, + { + "auxiliary_loss_clip": 0.01145115, + "auxiliary_loss_mlp": 0.01158045, + "balance_loss_clip": 1.00219548, + "balance_loss_mlp": 1.00097513, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 1.5725114081028198, + "language_loss": 0.85336959, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.87640119, + "num_input_tokens_seen": 59641385, + "step": 2758, + "time_per_iteration": 2.650641441345215 + }, + { + "auxiliary_loss_clip": 0.01178426, + "auxiliary_loss_mlp": 0.01158392, + "balance_loss_clip": 1.00233233, + "balance_loss_mlp": 1.00132227, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 1.8134686592891978, + "language_loss": 0.79184145, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81520963, + "num_input_tokens_seen": 59659865, + "step": 2759, + "time_per_iteration": 2.56972599029541 + }, + { + "auxiliary_loss_clip": 0.01097354, + "auxiliary_loss_mlp": 0.01158233, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00097179, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 2.104589544751717, + "language_loss": 0.75026226, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77281809, + "num_input_tokens_seen": 59678780, + "step": 2760, + "time_per_iteration": 2.697948455810547 + }, + { + "auxiliary_loss_clip": 0.01113631, + "auxiliary_loss_mlp": 0.01158056, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.0009861, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 2.7424068463976075, + "language_loss": 0.73126572, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75398266, + "num_input_tokens_seen": 59698795, + "step": 2761, + "time_per_iteration": 2.6901843547821045 + }, + { + "auxiliary_loss_clip": 0.01146333, + "auxiliary_loss_mlp": 0.01158067, + "balance_loss_clip": 1.00218761, + "balance_loss_mlp": 1.00109172, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 1.7023999889379733, + "language_loss": 0.88831788, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91136193, + "num_input_tokens_seen": 59718795, + "step": 2762, + "time_per_iteration": 2.649021863937378 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01157901, + "balance_loss_clip": 1.00179553, + "balance_loss_mlp": 1.00111639, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.6807140694586205, + "language_loss": 0.87947512, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90218407, + "num_input_tokens_seen": 59737555, + "step": 2763, + "time_per_iteration": 2.6776912212371826 + }, + { + "auxiliary_loss_clip": 0.01175449, + "auxiliary_loss_mlp": 0.01148086, + "balance_loss_clip": 1.00195801, + "balance_loss_mlp": 1.00007558, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.8109887581811741, + "language_loss": 0.59815109, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.62138641, + "num_input_tokens_seen": 59800915, + "step": 2764, + "time_per_iteration": 3.1457464694976807 + }, + { + "auxiliary_loss_clip": 0.01162772, + "auxiliary_loss_mlp": 0.01158451, + "balance_loss_clip": 1.00229788, + "balance_loss_mlp": 1.00138092, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.7696228852681786, + "language_loss": 0.82061684, + "learning_rate": 3.808428450193401e-06, + "loss": 0.8438291, + "num_input_tokens_seen": 59822910, + "step": 2765, + "time_per_iteration": 2.636336088180542 + }, + { + "auxiliary_loss_clip": 0.01178659, + "auxiliary_loss_mlp": 0.01158675, + "balance_loss_clip": 1.00229239, + "balance_loss_mlp": 1.00131893, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.472216064543952, + "language_loss": 0.70134473, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72471809, + "num_input_tokens_seen": 59838805, + "step": 2766, + "time_per_iteration": 2.531141996383667 + }, + { + "auxiliary_loss_clip": 0.01161783, + "auxiliary_loss_mlp": 0.01158178, + "balance_loss_clip": 1.00230718, + "balance_loss_mlp": 1.00110793, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.643168372126546, + "language_loss": 0.88476712, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90796673, + "num_input_tokens_seen": 59855345, + "step": 2767, + "time_per_iteration": 2.5283901691436768 + }, + { + "auxiliary_loss_clip": 0.01158811, + "auxiliary_loss_mlp": 0.01148051, + "balance_loss_clip": 1.00186467, + "balance_loss_mlp": 1.00004089, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.7255332112568433, + "language_loss": 0.52893186, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.55200046, + "num_input_tokens_seen": 59917710, + "step": 2768, + "time_per_iteration": 3.2418477535247803 + }, + { + "auxiliary_loss_clip": 0.01145264, + "auxiliary_loss_mlp": 0.01158528, + "balance_loss_clip": 1.00186896, + "balance_loss_mlp": 1.00107622, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 3.1210101115677236, + "language_loss": 0.85081244, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87385035, + "num_input_tokens_seen": 59935105, + "step": 2769, + "time_per_iteration": 2.5957486629486084 + }, + { + "auxiliary_loss_clip": 0.01143143, + "auxiliary_loss_mlp": 0.0114803, + "balance_loss_clip": 1.00185287, + "balance_loss_mlp": 1.00001931, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.8089322370652191, + "language_loss": 0.57508081, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59799254, + "num_input_tokens_seen": 59984085, + "step": 2770, + "time_per_iteration": 3.02292799949646 + }, + { + "auxiliary_loss_clip": 0.01127466, + "auxiliary_loss_mlp": 0.01148045, + "balance_loss_clip": 1.00191844, + "balance_loss_mlp": 1.00003457, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8628557158194862, + "language_loss": 0.5620997, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58485478, + "num_input_tokens_seen": 60043470, + "step": 2771, + "time_per_iteration": 3.037550210952759 + }, + { + "auxiliary_loss_clip": 0.01112779, + "auxiliary_loss_mlp": 0.01158368, + "balance_loss_clip": 1.00192952, + "balance_loss_mlp": 1.00120211, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.2808088743060444, + "language_loss": 0.7081145, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.73082602, + "num_input_tokens_seen": 60063045, + "step": 2772, + "time_per_iteration": 2.6904280185699463 + }, + { + "auxiliary_loss_clip": 0.01162376, + "auxiliary_loss_mlp": 0.01158051, + "balance_loss_clip": 1.00220895, + "balance_loss_mlp": 1.00107646, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.7001450809982903, + "language_loss": 0.85740441, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88060868, + "num_input_tokens_seen": 60081945, + "step": 2773, + "time_per_iteration": 2.6537129878997803 + }, + { + "auxiliary_loss_clip": 0.01114214, + "auxiliary_loss_mlp": 0.01157481, + "balance_loss_clip": 1.00209498, + "balance_loss_mlp": 1.00107813, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.1291563438986914, + "language_loss": 0.81606114, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.83877814, + "num_input_tokens_seen": 60096820, + "step": 2774, + "time_per_iteration": 2.6723885536193848 + }, + { + "auxiliary_loss_clip": 0.01130157, + "auxiliary_loss_mlp": 0.01158174, + "balance_loss_clip": 1.00204778, + "balance_loss_mlp": 1.00110376, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.2873804424525486, + "language_loss": 0.82701725, + "learning_rate": 3.806761712658952e-06, + "loss": 0.84990054, + "num_input_tokens_seen": 60116140, + "step": 2775, + "time_per_iteration": 2.6531569957733154 + }, + { + "auxiliary_loss_clip": 0.01161763, + "auxiliary_loss_mlp": 0.0115803, + "balance_loss_clip": 1.00223422, + "balance_loss_mlp": 1.00115085, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.6968019207962504, + "language_loss": 0.80666226, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82986021, + "num_input_tokens_seen": 60134235, + "step": 2776, + "time_per_iteration": 2.546463966369629 + }, + { + "auxiliary_loss_clip": 0.01162308, + "auxiliary_loss_mlp": 0.01158392, + "balance_loss_clip": 1.00242591, + "balance_loss_mlp": 1.00132203, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 2.0918358031680873, + "language_loss": 0.80270815, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82591516, + "num_input_tokens_seen": 60153275, + "step": 2777, + "time_per_iteration": 2.550689935684204 + }, + { + "auxiliary_loss_clip": 0.01162731, + "auxiliary_loss_mlp": 0.01158107, + "balance_loss_clip": 1.00213993, + "balance_loss_mlp": 1.00113177, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.841782017696038, + "language_loss": 0.85547602, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87868446, + "num_input_tokens_seen": 60173215, + "step": 2778, + "time_per_iteration": 3.9256811141967773 + }, + { + "auxiliary_loss_clip": 0.01145305, + "auxiliary_loss_mlp": 0.01157687, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00090337, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 6.218129935749245, + "language_loss": 0.74056983, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76359975, + "num_input_tokens_seen": 60190515, + "step": 2779, + "time_per_iteration": 2.63885760307312 + }, + { + "auxiliary_loss_clip": 0.011174, + "auxiliary_loss_mlp": 0.00748833, + "balance_loss_clip": 1.0020597, + "balance_loss_mlp": 1.00060236, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.058990456446311, + "language_loss": 0.65362746, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67228979, + "num_input_tokens_seen": 60211655, + "step": 2780, + "time_per_iteration": 2.7320358753204346 + }, + { + "auxiliary_loss_clip": 0.01146201, + "auxiliary_loss_mlp": 0.01157437, + "balance_loss_clip": 1.00226557, + "balance_loss_mlp": 1.00103498, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.238624865804456, + "language_loss": 0.78503102, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80806744, + "num_input_tokens_seen": 60230860, + "step": 2781, + "time_per_iteration": 3.9953551292419434 + }, + { + "auxiliary_loss_clip": 0.01114137, + "auxiliary_loss_mlp": 0.01157767, + "balance_loss_clip": 1.00200045, + "balance_loss_mlp": 1.00098324, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.3589679108224457, + "language_loss": 0.75084102, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77356005, + "num_input_tokens_seen": 60250535, + "step": 2782, + "time_per_iteration": 5.534957408905029 + }, + { + "auxiliary_loss_clip": 0.01134469, + "auxiliary_loss_mlp": 0.01158289, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00112343, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 1.9827129048744032, + "language_loss": 0.68232024, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70524776, + "num_input_tokens_seen": 60269530, + "step": 2783, + "time_per_iteration": 2.7158284187316895 + }, + { + "auxiliary_loss_clip": 0.01178444, + "auxiliary_loss_mlp": 0.01157848, + "balance_loss_clip": 1.00233984, + "balance_loss_mlp": 1.00125456, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 1.6217663855336972, + "language_loss": 0.69894642, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72230935, + "num_input_tokens_seen": 60289900, + "step": 2784, + "time_per_iteration": 2.5589871406555176 + }, + { + "auxiliary_loss_clip": 0.01146015, + "auxiliary_loss_mlp": 0.01158292, + "balance_loss_clip": 1.00203526, + "balance_loss_mlp": 1.00122166, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 2.2850788380090665, + "language_loss": 0.60451543, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62755853, + "num_input_tokens_seen": 60310025, + "step": 2785, + "time_per_iteration": 2.6456072330474854 + }, + { + "auxiliary_loss_clip": 0.01142774, + "auxiliary_loss_mlp": 0.01148013, + "balance_loss_clip": 1.00172853, + "balance_loss_mlp": 1.0000031, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.7788911653293507, + "language_loss": 0.58842027, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.61132812, + "num_input_tokens_seen": 60377800, + "step": 2786, + "time_per_iteration": 3.222403049468994 + }, + { + "auxiliary_loss_clip": 0.01162645, + "auxiliary_loss_mlp": 0.01158129, + "balance_loss_clip": 1.00223267, + "balance_loss_mlp": 1.00134504, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 2.4811430394384093, + "language_loss": 0.75860119, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78180891, + "num_input_tokens_seen": 60398215, + "step": 2787, + "time_per_iteration": 2.5927600860595703 + }, + { + "auxiliary_loss_clip": 0.01161477, + "auxiliary_loss_mlp": 0.01157761, + "balance_loss_clip": 1.00210404, + "balance_loss_mlp": 1.0009768, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 1.763107151722889, + "language_loss": 0.77944833, + "learning_rate": 3.804584712183972e-06, + "loss": 0.80264068, + "num_input_tokens_seen": 60416910, + "step": 2788, + "time_per_iteration": 2.572211265563965 + }, + { + "auxiliary_loss_clip": 0.01143155, + "auxiliary_loss_mlp": 0.0114801, + "balance_loss_clip": 1.00164914, + "balance_loss_mlp": 0.9999997, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8618959396623419, + "language_loss": 0.59356427, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61647594, + "num_input_tokens_seen": 60468660, + "step": 2789, + "time_per_iteration": 3.0385544300079346 + }, + { + "auxiliary_loss_clip": 0.0116166, + "auxiliary_loss_mlp": 0.01157865, + "balance_loss_clip": 1.00211215, + "balance_loss_mlp": 1.00127196, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.3741300657358355, + "language_loss": 0.7006042, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72379947, + "num_input_tokens_seen": 60492370, + "step": 2790, + "time_per_iteration": 2.7260451316833496 + }, + { + "auxiliary_loss_clip": 0.01129942, + "auxiliary_loss_mlp": 0.01157872, + "balance_loss_clip": 1.00201261, + "balance_loss_mlp": 1.00108826, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.5928652403282637, + "language_loss": 0.78981578, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81269383, + "num_input_tokens_seen": 60512655, + "step": 2791, + "time_per_iteration": 2.6638107299804688 + }, + { + "auxiliary_loss_clip": 0.01146339, + "auxiliary_loss_mlp": 0.01158013, + "balance_loss_clip": 1.00211918, + "balance_loss_mlp": 1.00113344, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.665673858129539, + "language_loss": 0.7139172, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73696077, + "num_input_tokens_seen": 60533090, + "step": 2792, + "time_per_iteration": 2.7047760486602783 + }, + { + "auxiliary_loss_clip": 0.01129459, + "auxiliary_loss_mlp": 0.01157648, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.00105429, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 3.0353307928778825, + "language_loss": 0.71856618, + "learning_rate": 3.803744324194691e-06, + "loss": 0.7414372, + "num_input_tokens_seen": 60553190, + "step": 2793, + "time_per_iteration": 2.6303584575653076 + }, + { + "auxiliary_loss_clip": 0.01161888, + "auxiliary_loss_mlp": 0.0115786, + "balance_loss_clip": 1.00219643, + "balance_loss_mlp": 1.00107598, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.7226228779471175, + "language_loss": 0.76985812, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79305553, + "num_input_tokens_seen": 60571995, + "step": 2794, + "time_per_iteration": 2.555168390274048 + }, + { + "auxiliary_loss_clip": 0.01150094, + "auxiliary_loss_mlp": 0.01157483, + "balance_loss_clip": 1.00241375, + "balance_loss_mlp": 1.0008893, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.28459175258828, + "language_loss": 0.71510112, + "learning_rate": 3.803407690167187e-06, + "loss": 0.73817682, + "num_input_tokens_seen": 60591275, + "step": 2795, + "time_per_iteration": 2.65320086479187 + }, + { + "auxiliary_loss_clip": 0.01145323, + "auxiliary_loss_mlp": 0.01157317, + "balance_loss_clip": 1.00195408, + "balance_loss_mlp": 1.00110555, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.703910405919781, + "language_loss": 0.84377682, + "learning_rate": 3.803239270572142e-06, + "loss": 0.86680317, + "num_input_tokens_seen": 60609235, + "step": 2796, + "time_per_iteration": 2.5662178993225098 + }, + { + "auxiliary_loss_clip": 0.01097003, + "auxiliary_loss_mlp": 0.01157776, + "balance_loss_clip": 1.0017395, + "balance_loss_mlp": 1.00099194, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6514620793612633, + "language_loss": 0.81292319, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83547097, + "num_input_tokens_seen": 60629880, + "step": 2797, + "time_per_iteration": 2.727694511413574 + }, + { + "auxiliary_loss_clip": 0.01162083, + "auxiliary_loss_mlp": 0.0115727, + "balance_loss_clip": 1.00223184, + "balance_loss_mlp": 1.00115323, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.3450923082413169, + "language_loss": 0.74759483, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77078831, + "num_input_tokens_seen": 60651175, + "step": 2798, + "time_per_iteration": 2.578336715698242 + }, + { + "auxiliary_loss_clip": 0.01178303, + "auxiliary_loss_mlp": 0.01157935, + "balance_loss_clip": 1.00235677, + "balance_loss_mlp": 1.00115073, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.4739911413602835, + "language_loss": 0.79311073, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81647307, + "num_input_tokens_seen": 60670210, + "step": 2799, + "time_per_iteration": 2.552340030670166 + }, + { + "auxiliary_loss_clip": 0.01081482, + "auxiliary_loss_mlp": 0.01158029, + "balance_loss_clip": 1.00185621, + "balance_loss_mlp": 1.00086331, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 1.9833036949738958, + "language_loss": 0.70822287, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.730618, + "num_input_tokens_seen": 60690895, + "step": 2800, + "time_per_iteration": 3.081599712371826 + }, + { + "auxiliary_loss_clip": 0.0113081, + "auxiliary_loss_mlp": 0.00748874, + "balance_loss_clip": 1.00213408, + "balance_loss_mlp": 1.00062895, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 2.28868806696054, + "language_loss": 0.83940923, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85820603, + "num_input_tokens_seen": 60708280, + "step": 2801, + "time_per_iteration": 2.9188051223754883 + }, + { + "auxiliary_loss_clip": 0.01146041, + "auxiliary_loss_mlp": 0.01157851, + "balance_loss_clip": 1.00212157, + "balance_loss_mlp": 1.00116229, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.1384745610976053, + "language_loss": 0.82292145, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84596038, + "num_input_tokens_seen": 60724150, + "step": 2802, + "time_per_iteration": 2.574138879776001 + }, + { + "auxiliary_loss_clip": 0.01161555, + "auxiliary_loss_mlp": 0.01157664, + "balance_loss_clip": 1.00218952, + "balance_loss_mlp": 1.00097489, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.52720264492262, + "language_loss": 0.80769563, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83088785, + "num_input_tokens_seen": 60746485, + "step": 2803, + "time_per_iteration": 2.6433582305908203 + }, + { + "auxiliary_loss_clip": 0.01162533, + "auxiliary_loss_mlp": 0.01157405, + "balance_loss_clip": 1.00229573, + "balance_loss_mlp": 1.00109756, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.3555297058992575, + "language_loss": 0.76207459, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78527403, + "num_input_tokens_seen": 60762875, + "step": 2804, + "time_per_iteration": 2.641453742980957 + }, + { + "auxiliary_loss_clip": 0.01127557, + "auxiliary_loss_mlp": 0.01147235, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 0.9999882, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8440592187876466, + "language_loss": 0.55432719, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57707512, + "num_input_tokens_seen": 60825510, + "step": 2805, + "time_per_iteration": 3.187316417694092 + }, + { + "auxiliary_loss_clip": 0.01162355, + "auxiliary_loss_mlp": 0.01157374, + "balance_loss_clip": 1.00215936, + "balance_loss_mlp": 1.00097167, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.7077235348105937, + "language_loss": 0.72736275, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75056005, + "num_input_tokens_seen": 60844440, + "step": 2806, + "time_per_iteration": 2.5917932987213135 + }, + { + "auxiliary_loss_clip": 0.01128223, + "auxiliary_loss_mlp": 0.01157335, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00121832, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.8693143865023358, + "language_loss": 0.6993525, + "learning_rate": 3.80138214341862e-06, + "loss": 0.72220802, + "num_input_tokens_seen": 60863210, + "step": 2807, + "time_per_iteration": 2.642756462097168 + }, + { + "auxiliary_loss_clip": 0.01146954, + "auxiliary_loss_mlp": 0.01157532, + "balance_loss_clip": 1.00204229, + "balance_loss_mlp": 1.00141585, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 3.3313697755817118, + "language_loss": 0.69995123, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72299612, + "num_input_tokens_seen": 60882510, + "step": 2808, + "time_per_iteration": 2.6746153831481934 + }, + { + "auxiliary_loss_clip": 0.01128485, + "auxiliary_loss_mlp": 0.0115693, + "balance_loss_clip": 1.00179648, + "balance_loss_mlp": 1.00090873, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.293713340559986, + "language_loss": 0.80244422, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.82529837, + "num_input_tokens_seen": 60901105, + "step": 2809, + "time_per_iteration": 2.63569712638855 + }, + { + "auxiliary_loss_clip": 0.01162157, + "auxiliary_loss_mlp": 0.01157332, + "balance_loss_clip": 1.00218463, + "balance_loss_mlp": 1.00112045, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.0585409779651465, + "language_loss": 0.87735116, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.90054613, + "num_input_tokens_seen": 60915340, + "step": 2810, + "time_per_iteration": 2.6074612140655518 + }, + { + "auxiliary_loss_clip": 0.01161677, + "auxiliary_loss_mlp": 0.01157175, + "balance_loss_clip": 1.00212133, + "balance_loss_mlp": 1.00115407, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.25018467636564, + "language_loss": 0.92652518, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94971371, + "num_input_tokens_seen": 60933735, + "step": 2811, + "time_per_iteration": 2.559990882873535 + }, + { + "auxiliary_loss_clip": 0.01162716, + "auxiliary_loss_mlp": 0.01157546, + "balance_loss_clip": 1.00229657, + "balance_loss_mlp": 1.00104845, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 1.7511845910277626, + "language_loss": 0.78398371, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80718625, + "num_input_tokens_seen": 60953105, + "step": 2812, + "time_per_iteration": 2.6167001724243164 + }, + { + "auxiliary_loss_clip": 0.01161453, + "auxiliary_loss_mlp": 0.0115764, + "balance_loss_clip": 1.00214374, + "balance_loss_mlp": 1.00095153, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.666646385658321, + "language_loss": 0.75361824, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.77680922, + "num_input_tokens_seen": 60969150, + "step": 2813, + "time_per_iteration": 2.5297298431396484 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01157174, + "balance_loss_clip": 1.00212514, + "balance_loss_mlp": 1.00096178, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 3.5661156350782064, + "language_loss": 0.69236702, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71539515, + "num_input_tokens_seen": 60982825, + "step": 2814, + "time_per_iteration": 2.573124885559082 + }, + { + "auxiliary_loss_clip": 0.01178066, + "auxiliary_loss_mlp": 0.0115757, + "balance_loss_clip": 1.00224698, + "balance_loss_mlp": 1.00116801, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 1.6863155331018578, + "language_loss": 0.61518693, + "learning_rate": 3.800026313549776e-06, + "loss": 0.63854331, + "num_input_tokens_seen": 61000875, + "step": 2815, + "time_per_iteration": 2.5000741481781006 + }, + { + "auxiliary_loss_clip": 0.01145596, + "auxiliary_loss_mlp": 0.01156656, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00101614, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.6876500847142537, + "language_loss": 0.82012385, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84314638, + "num_input_tokens_seen": 61021940, + "step": 2816, + "time_per_iteration": 2.6584742069244385 + }, + { + "auxiliary_loss_clip": 0.01144908, + "auxiliary_loss_mlp": 0.0115718, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00106299, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.4893719930024525, + "language_loss": 0.87266076, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89568162, + "num_input_tokens_seen": 61040285, + "step": 2817, + "time_per_iteration": 3.9411840438842773 + }, + { + "auxiliary_loss_clip": 0.01146073, + "auxiliary_loss_mlp": 0.011571, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00136566, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.7313047855602555, + "language_loss": 0.81734133, + "learning_rate": 3.799516750928672e-06, + "loss": 0.84037304, + "num_input_tokens_seen": 61059020, + "step": 2818, + "time_per_iteration": 2.5909411907196045 + }, + { + "auxiliary_loss_clip": 0.01178063, + "auxiliary_loss_mlp": 0.0115718, + "balance_loss_clip": 1.00225449, + "balance_loss_mlp": 1.00115895, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 4.591002704060319, + "language_loss": 0.80869913, + "learning_rate": 3.799346760237336e-06, + "loss": 0.83205152, + "num_input_tokens_seen": 61074245, + "step": 2819, + "time_per_iteration": 3.916301727294922 + }, + { + "auxiliary_loss_clip": 0.01143371, + "auxiliary_loss_mlp": 0.01147286, + "balance_loss_clip": 1.00154877, + "balance_loss_mlp": 1.00003874, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9449025922121416, + "language_loss": 0.61061341, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63352001, + "num_input_tokens_seen": 61127080, + "step": 2820, + "time_per_iteration": 4.605437278747559 + }, + { + "auxiliary_loss_clip": 0.01130043, + "auxiliary_loss_mlp": 0.01156863, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00093699, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 1.8534890855263668, + "language_loss": 0.78694081, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.80980992, + "num_input_tokens_seen": 61146955, + "step": 2821, + "time_per_iteration": 2.7235710620880127 + }, + { + "auxiliary_loss_clip": 0.01162703, + "auxiliary_loss_mlp": 0.01157187, + "balance_loss_clip": 1.00229537, + "balance_loss_mlp": 1.00116634, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.000805567976879, + "language_loss": 0.78724653, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.81044543, + "num_input_tokens_seen": 61166605, + "step": 2822, + "time_per_iteration": 2.5961408615112305 + }, + { + "auxiliary_loss_clip": 0.01161324, + "auxiliary_loss_mlp": 0.00748921, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.00070572, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.9707747503931226, + "language_loss": 0.75207525, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77117777, + "num_input_tokens_seen": 61186535, + "step": 2823, + "time_per_iteration": 2.590468406677246 + }, + { + "auxiliary_loss_clip": 0.01144974, + "auxiliary_loss_mlp": 0.01157496, + "balance_loss_clip": 1.00218177, + "balance_loss_mlp": 1.00118923, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.6658359869157875, + "language_loss": 0.5992415, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62226617, + "num_input_tokens_seen": 61208965, + "step": 2824, + "time_per_iteration": 2.757960081100464 + }, + { + "auxiliary_loss_clip": 0.01149773, + "auxiliary_loss_mlp": 0.01156825, + "balance_loss_clip": 1.0022471, + "balance_loss_mlp": 1.00109005, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 2.003499765408392, + "language_loss": 0.7341783, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75724423, + "num_input_tokens_seen": 61230670, + "step": 2825, + "time_per_iteration": 2.6772301197052 + }, + { + "auxiliary_loss_clip": 0.01178096, + "auxiliary_loss_mlp": 0.01157556, + "balance_loss_clip": 1.00219035, + "balance_loss_mlp": 1.00115347, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.704886740934503, + "language_loss": 0.85616279, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87951928, + "num_input_tokens_seen": 61249510, + "step": 2826, + "time_per_iteration": 2.528477668762207 + }, + { + "auxiliary_loss_clip": 0.01162723, + "auxiliary_loss_mlp": 0.01157239, + "balance_loss_clip": 1.00231528, + "balance_loss_mlp": 1.00102687, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 2.3258558162857694, + "language_loss": 0.82422268, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.8474223, + "num_input_tokens_seen": 61269440, + "step": 2827, + "time_per_iteration": 2.5624465942382812 + }, + { + "auxiliary_loss_clip": 0.01146048, + "auxiliary_loss_mlp": 0.01157187, + "balance_loss_clip": 1.00199485, + "balance_loss_mlp": 1.00097537, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 1.8003174577932715, + "language_loss": 0.74003536, + "learning_rate": 3.797813774376267e-06, + "loss": 0.76306772, + "num_input_tokens_seen": 61288195, + "step": 2828, + "time_per_iteration": 2.620159864425659 + }, + { + "auxiliary_loss_clip": 0.01127437, + "auxiliary_loss_mlp": 0.01147356, + "balance_loss_clip": 1.00158453, + "balance_loss_mlp": 1.00010872, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.7512454120637196, + "language_loss": 0.56434846, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58709633, + "num_input_tokens_seen": 61350850, + "step": 2829, + "time_per_iteration": 3.2778878211975098 + }, + { + "auxiliary_loss_clip": 0.01135072, + "auxiliary_loss_mlp": 0.01157079, + "balance_loss_clip": 1.00195324, + "balance_loss_mlp": 1.00115359, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7300574549557532, + "language_loss": 0.83409625, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85701776, + "num_input_tokens_seen": 61370765, + "step": 2830, + "time_per_iteration": 2.6679840087890625 + }, + { + "auxiliary_loss_clip": 0.01129298, + "auxiliary_loss_mlp": 0.01156956, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.0009346, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 7.38576205625503, + "language_loss": 0.78487968, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80774218, + "num_input_tokens_seen": 61388935, + "step": 2831, + "time_per_iteration": 2.7454335689544678 + }, + { + "auxiliary_loss_clip": 0.0112835, + "auxiliary_loss_mlp": 0.01157367, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.00125027, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.103702820234587, + "language_loss": 0.79561806, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81847525, + "num_input_tokens_seen": 61407350, + "step": 2832, + "time_per_iteration": 2.6590735912323 + }, + { + "auxiliary_loss_clip": 0.01145808, + "auxiliary_loss_mlp": 0.01157172, + "balance_loss_clip": 1.00215816, + "balance_loss_mlp": 1.00134218, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.8223458134819395, + "language_loss": 0.8885901, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.9116199, + "num_input_tokens_seen": 61429010, + "step": 2833, + "time_per_iteration": 2.7047219276428223 + }, + { + "auxiliary_loss_clip": 0.01177904, + "auxiliary_loss_mlp": 0.01156873, + "balance_loss_clip": 1.00217152, + "balance_loss_mlp": 1.00113845, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.0510474132144623, + "language_loss": 0.72108889, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74443674, + "num_input_tokens_seen": 61450040, + "step": 2834, + "time_per_iteration": 2.6880643367767334 + }, + { + "auxiliary_loss_clip": 0.01129166, + "auxiliary_loss_mlp": 0.01157104, + "balance_loss_clip": 1.00211501, + "balance_loss_mlp": 1.00117826, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 2.0555220349861325, + "language_loss": 0.86756539, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.89042807, + "num_input_tokens_seen": 61468585, + "step": 2835, + "time_per_iteration": 2.6739821434020996 + }, + { + "auxiliary_loss_clip": 0.01161647, + "auxiliary_loss_mlp": 0.01157087, + "balance_loss_clip": 1.0020982, + "balance_loss_mlp": 1.00087547, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 4.187006311000087, + "language_loss": 0.74129981, + "learning_rate": 3.796446484348989e-06, + "loss": 0.76448715, + "num_input_tokens_seen": 61486330, + "step": 2836, + "time_per_iteration": 2.545790672302246 + }, + { + "auxiliary_loss_clip": 0.01113906, + "auxiliary_loss_mlp": 0.01156994, + "balance_loss_clip": 1.00203466, + "balance_loss_mlp": 1.00097239, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.0371048882640266, + "language_loss": 0.80059677, + "learning_rate": 3.796275266481036e-06, + "loss": 0.82330579, + "num_input_tokens_seen": 61503950, + "step": 2837, + "time_per_iteration": 2.65044903755188 + }, + { + "auxiliary_loss_clip": 0.01161866, + "auxiliary_loss_mlp": 0.01156424, + "balance_loss_clip": 1.00222731, + "balance_loss_mlp": 1.00097466, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 1.8265170522520895, + "language_loss": 0.83211684, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85529971, + "num_input_tokens_seen": 61523550, + "step": 2838, + "time_per_iteration": 2.569121837615967 + }, + { + "auxiliary_loss_clip": 0.01112713, + "auxiliary_loss_mlp": 0.01156732, + "balance_loss_clip": 1.00200737, + "balance_loss_mlp": 1.00109196, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.9039014762441577, + "language_loss": 0.93411434, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95680881, + "num_input_tokens_seen": 61542720, + "step": 2839, + "time_per_iteration": 2.7097935676574707 + }, + { + "auxiliary_loss_clip": 0.01145841, + "auxiliary_loss_mlp": 0.01156532, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00108314, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.0831272337908593, + "language_loss": 0.83609903, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.85912275, + "num_input_tokens_seen": 61563040, + "step": 2840, + "time_per_iteration": 2.6383581161499023 + }, + { + "auxiliary_loss_clip": 0.01162382, + "auxiliary_loss_mlp": 0.0115693, + "balance_loss_clip": 1.00219202, + "balance_loss_mlp": 1.00109959, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 1.643951385264992, + "language_loss": 0.76004565, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78323877, + "num_input_tokens_seen": 61581890, + "step": 2841, + "time_per_iteration": 2.5518548488616943 + }, + { + "auxiliary_loss_clip": 0.01149763, + "auxiliary_loss_mlp": 0.01156784, + "balance_loss_clip": 1.0022788, + "balance_loss_mlp": 1.0010494, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.7460419205107038, + "language_loss": 0.76872385, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79178935, + "num_input_tokens_seen": 61602095, + "step": 2842, + "time_per_iteration": 2.6582210063934326 + }, + { + "auxiliary_loss_clip": 0.0117782, + "auxiliary_loss_mlp": 0.01156493, + "balance_loss_clip": 1.0022049, + "balance_loss_mlp": 1.0010438, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 1.8585896735483804, + "language_loss": 0.85823011, + "learning_rate": 3.795246529087043e-06, + "loss": 0.8815732, + "num_input_tokens_seen": 61620400, + "step": 2843, + "time_per_iteration": 2.503406286239624 + }, + { + "auxiliary_loss_clip": 0.01178109, + "auxiliary_loss_mlp": 0.01156787, + "balance_loss_clip": 1.00248516, + "balance_loss_mlp": 1.00114751, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.6117525956021268, + "language_loss": 0.68154776, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70489669, + "num_input_tokens_seen": 61637680, + "step": 2844, + "time_per_iteration": 2.5012176036834717 + }, + { + "auxiliary_loss_clip": 0.01145767, + "auxiliary_loss_mlp": 0.00748968, + "balance_loss_clip": 1.0021255, + "balance_loss_mlp": 1.00069618, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.7794155849338074, + "language_loss": 0.78259206, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80153948, + "num_input_tokens_seen": 61655630, + "step": 2845, + "time_per_iteration": 2.624894618988037 + }, + { + "auxiliary_loss_clip": 0.01161385, + "auxiliary_loss_mlp": 0.01156721, + "balance_loss_clip": 1.00218177, + "balance_loss_mlp": 1.00098634, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.6424825594981165, + "language_loss": 0.78160274, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.80478382, + "num_input_tokens_seen": 61673475, + "step": 2846, + "time_per_iteration": 2.5540671348571777 + }, + { + "auxiliary_loss_clip": 0.01161294, + "auxiliary_loss_mlp": 0.01156437, + "balance_loss_clip": 1.00209165, + "balance_loss_mlp": 1.00117886, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7144940384978131, + "language_loss": 0.79745668, + "learning_rate": 3.794559342552472e-06, + "loss": 0.82063395, + "num_input_tokens_seen": 61693370, + "step": 2847, + "time_per_iteration": 2.620105504989624 + }, + { + "auxiliary_loss_clip": 0.01160963, + "auxiliary_loss_mlp": 0.01156696, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00105691, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.033267401968529, + "language_loss": 0.86524987, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.88842648, + "num_input_tokens_seen": 61710820, + "step": 2848, + "time_per_iteration": 2.5670580863952637 + }, + { + "auxiliary_loss_clip": 0.01128964, + "auxiliary_loss_mlp": 0.01156315, + "balance_loss_clip": 1.00191975, + "balance_loss_mlp": 1.00105715, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.918212571712097, + "language_loss": 0.75335616, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77620894, + "num_input_tokens_seen": 61729855, + "step": 2849, + "time_per_iteration": 2.6708810329437256 + }, + { + "auxiliary_loss_clip": 0.01131077, + "auxiliary_loss_mlp": 0.01145703, + "balance_loss_clip": 1.00174379, + "balance_loss_mlp": 0.9999814, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7969812264699676, + "language_loss": 0.57523072, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.5979985, + "num_input_tokens_seen": 61790290, + "step": 2850, + "time_per_iteration": 3.1891493797302246 + }, + { + "auxiliary_loss_clip": 0.01128025, + "auxiliary_loss_mlp": 0.01155877, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00100017, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.57304650972152, + "language_loss": 0.81240737, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83524638, + "num_input_tokens_seen": 61809265, + "step": 2851, + "time_per_iteration": 2.668788194656372 + }, + { + "auxiliary_loss_clip": 0.01128747, + "auxiliary_loss_mlp": 0.01156364, + "balance_loss_clip": 1.00207329, + "balance_loss_mlp": 1.00120151, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 1.9422331745856147, + "language_loss": 0.92993414, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95278525, + "num_input_tokens_seen": 61828980, + "step": 2852, + "time_per_iteration": 2.6804466247558594 + }, + { + "auxiliary_loss_clip": 0.01129125, + "auxiliary_loss_mlp": 0.01156954, + "balance_loss_clip": 1.00193012, + "balance_loss_mlp": 1.00102806, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.7772334076522074, + "language_loss": 0.69175935, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71462011, + "num_input_tokens_seen": 61847915, + "step": 2853, + "time_per_iteration": 2.64534330368042 + }, + { + "auxiliary_loss_clip": 0.01113594, + "auxiliary_loss_mlp": 0.01156374, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00111568, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 1.9059729800939609, + "language_loss": 0.66828543, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.69098508, + "num_input_tokens_seen": 61865570, + "step": 2854, + "time_per_iteration": 4.03536319732666 + }, + { + "auxiliary_loss_clip": 0.0114553, + "auxiliary_loss_mlp": 0.01155617, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00102687, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.5740919818620436, + "language_loss": 0.89109415, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91410559, + "num_input_tokens_seen": 61883340, + "step": 2855, + "time_per_iteration": 2.5974855422973633 + }, + { + "auxiliary_loss_clip": 0.01177813, + "auxiliary_loss_mlp": 0.01156434, + "balance_loss_clip": 1.00215912, + "balance_loss_mlp": 1.00108075, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.1927250592895517, + "language_loss": 0.83666897, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86001146, + "num_input_tokens_seen": 61900610, + "step": 2856, + "time_per_iteration": 2.5445363521575928 + }, + { + "auxiliary_loss_clip": 0.01161255, + "auxiliary_loss_mlp": 0.01156304, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00104618, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 2.213045578199767, + "language_loss": 0.8644222, + "learning_rate": 3.792836613639026e-06, + "loss": 0.8875978, + "num_input_tokens_seen": 61916795, + "step": 2857, + "time_per_iteration": 5.3565990924835205 + }, + { + "auxiliary_loss_clip": 0.01161344, + "auxiliary_loss_mlp": 0.01156454, + "balance_loss_clip": 1.00222528, + "balance_loss_mlp": 1.00119567, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.1048479270962765, + "language_loss": 0.78275907, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80593705, + "num_input_tokens_seen": 61936665, + "step": 2858, + "time_per_iteration": 4.030694961547852 + }, + { + "auxiliary_loss_clip": 0.01150489, + "auxiliary_loss_mlp": 0.01156273, + "balance_loss_clip": 1.00240612, + "balance_loss_mlp": 1.00120521, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 1.7012912512989655, + "language_loss": 0.77374685, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.7968145, + "num_input_tokens_seen": 61954415, + "step": 2859, + "time_per_iteration": 2.58188796043396 + }, + { + "auxiliary_loss_clip": 0.01116906, + "auxiliary_loss_mlp": 0.01155603, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00082123, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 2.198254483739871, + "language_loss": 0.76496768, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.78769273, + "num_input_tokens_seen": 61973940, + "step": 2860, + "time_per_iteration": 2.6812479496002197 + }, + { + "auxiliary_loss_clip": 0.01161187, + "auxiliary_loss_mlp": 0.01156784, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00123954, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 1.8777808238738585, + "language_loss": 0.8167963, + "learning_rate": 3.792145618140317e-06, + "loss": 0.83997607, + "num_input_tokens_seen": 61991845, + "step": 2861, + "time_per_iteration": 2.5482370853424072 + }, + { + "auxiliary_loss_clip": 0.01145709, + "auxiliary_loss_mlp": 0.01156249, + "balance_loss_clip": 1.00217056, + "balance_loss_mlp": 1.00099063, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 1.8321827569642684, + "language_loss": 0.84793282, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87095249, + "num_input_tokens_seen": 62009395, + "step": 2862, + "time_per_iteration": 2.5966029167175293 + }, + { + "auxiliary_loss_clip": 0.01128686, + "auxiliary_loss_mlp": 0.01155977, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00100493, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 1.6500346816819158, + "language_loss": 0.77826476, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.8011114, + "num_input_tokens_seen": 62029005, + "step": 2863, + "time_per_iteration": 2.671844959259033 + }, + { + "auxiliary_loss_clip": 0.01128975, + "auxiliary_loss_mlp": 0.00748892, + "balance_loss_clip": 1.00191176, + "balance_loss_mlp": 1.0006609, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.922841339929855, + "language_loss": 0.72423196, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74301064, + "num_input_tokens_seen": 62048730, + "step": 2864, + "time_per_iteration": 2.678057909011841 + }, + { + "auxiliary_loss_clip": 0.01129125, + "auxiliary_loss_mlp": 0.01156779, + "balance_loss_clip": 1.00198162, + "balance_loss_mlp": 1.00133002, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.9691046396882568, + "language_loss": 0.72551888, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.74837792, + "num_input_tokens_seen": 62069000, + "step": 2865, + "time_per_iteration": 2.6671485900878906 + }, + { + "auxiliary_loss_clip": 0.01161397, + "auxiliary_loss_mlp": 0.00748897, + "balance_loss_clip": 1.00221777, + "balance_loss_mlp": 1.00079584, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 2.4628257541404106, + "language_loss": 0.79171836, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81082135, + "num_input_tokens_seen": 62086750, + "step": 2866, + "time_per_iteration": 2.5638391971588135 + }, + { + "auxiliary_loss_clip": 0.01177862, + "auxiliary_loss_mlp": 0.01156197, + "balance_loss_clip": 1.00225353, + "balance_loss_mlp": 1.00112915, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 2.8938096437086465, + "language_loss": 0.80032831, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82366896, + "num_input_tokens_seen": 62106240, + "step": 2867, + "time_per_iteration": 2.5535895824432373 + }, + { + "auxiliary_loss_clip": 0.01144492, + "auxiliary_loss_mlp": 0.0115601, + "balance_loss_clip": 1.00196087, + "balance_loss_mlp": 1.00084734, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.9938547140470646, + "language_loss": 0.79507118, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81807619, + "num_input_tokens_seen": 62124895, + "step": 2868, + "time_per_iteration": 2.605181932449341 + }, + { + "auxiliary_loss_clip": 0.01117379, + "auxiliary_loss_mlp": 0.01157008, + "balance_loss_clip": 1.00246835, + "balance_loss_mlp": 1.00117779, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.7549586047297747, + "language_loss": 0.83823079, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86097467, + "num_input_tokens_seen": 62143510, + "step": 2869, + "time_per_iteration": 2.656085729598999 + }, + { + "auxiliary_loss_clip": 0.01144697, + "auxiliary_loss_mlp": 0.01156161, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.00109363, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 1.9256743338931572, + "language_loss": 0.77529866, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79830718, + "num_input_tokens_seen": 62162285, + "step": 2870, + "time_per_iteration": 2.6132218837738037 + }, + { + "auxiliary_loss_clip": 0.01177777, + "auxiliary_loss_mlp": 0.01155583, + "balance_loss_clip": 1.00235343, + "balance_loss_mlp": 1.00089741, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.7434165559109192, + "language_loss": 0.76968896, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79302251, + "num_input_tokens_seen": 62180970, + "step": 2871, + "time_per_iteration": 2.5439646244049072 + }, + { + "auxiliary_loss_clip": 0.01145779, + "auxiliary_loss_mlp": 0.01156633, + "balance_loss_clip": 1.00218892, + "balance_loss_mlp": 1.00118399, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.384688095409687, + "language_loss": 0.74244177, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76546592, + "num_input_tokens_seen": 62198965, + "step": 2872, + "time_per_iteration": 2.654280185699463 + }, + { + "auxiliary_loss_clip": 0.01177613, + "auxiliary_loss_mlp": 0.01155989, + "balance_loss_clip": 1.00212789, + "balance_loss_mlp": 1.00101697, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.6348642454932636, + "language_loss": 0.82527673, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84861279, + "num_input_tokens_seen": 62219890, + "step": 2873, + "time_per_iteration": 2.529205560684204 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01155589, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00090289, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 1.8633808562037082, + "language_loss": 0.7487331, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.7715801, + "num_input_tokens_seen": 62237140, + "step": 2874, + "time_per_iteration": 2.624030113220215 + }, + { + "auxiliary_loss_clip": 0.01177811, + "auxiliary_loss_mlp": 0.01156058, + "balance_loss_clip": 1.00219691, + "balance_loss_mlp": 1.00079942, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 1.9874513841233106, + "language_loss": 0.8080622, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.83140087, + "num_input_tokens_seen": 62255405, + "step": 2875, + "time_per_iteration": 2.5227887630462646 + }, + { + "auxiliary_loss_clip": 0.01144337, + "auxiliary_loss_mlp": 0.01156408, + "balance_loss_clip": 1.00194037, + "balance_loss_mlp": 1.00105429, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.889920190796729, + "language_loss": 0.87434196, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89734948, + "num_input_tokens_seen": 62271280, + "step": 2876, + "time_per_iteration": 2.5474698543548584 + }, + { + "auxiliary_loss_clip": 0.01144825, + "auxiliary_loss_mlp": 0.01156294, + "balance_loss_clip": 1.00208569, + "balance_loss_mlp": 1.00122678, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 2.1332098083035462, + "language_loss": 0.84254301, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86555421, + "num_input_tokens_seen": 62289140, + "step": 2877, + "time_per_iteration": 2.5809011459350586 + }, + { + "auxiliary_loss_clip": 0.01133347, + "auxiliary_loss_mlp": 0.01155932, + "balance_loss_clip": 1.00257432, + "balance_loss_mlp": 1.0007695, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 2.947188032546332, + "language_loss": 0.79276431, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81565714, + "num_input_tokens_seen": 62307490, + "step": 2878, + "time_per_iteration": 2.641350507736206 + }, + { + "auxiliary_loss_clip": 0.01145129, + "auxiliary_loss_mlp": 0.01156037, + "balance_loss_clip": 1.00203717, + "balance_loss_mlp": 1.00106525, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.659905567706698, + "language_loss": 0.70221841, + "learning_rate": 3.78902268871344e-06, + "loss": 0.7252301, + "num_input_tokens_seen": 62328570, + "step": 2879, + "time_per_iteration": 2.6468474864959717 + }, + { + "auxiliary_loss_clip": 0.01144666, + "auxiliary_loss_mlp": 0.01156362, + "balance_loss_clip": 1.00197875, + "balance_loss_mlp": 1.00100863, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 1.9208989095650597, + "language_loss": 0.8332302, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85624051, + "num_input_tokens_seen": 62345735, + "step": 2880, + "time_per_iteration": 2.5588996410369873 + }, + { + "auxiliary_loss_clip": 0.01113436, + "auxiliary_loss_mlp": 0.01155762, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00098085, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.9110838506109245, + "language_loss": 0.81166124, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.83435321, + "num_input_tokens_seen": 62365525, + "step": 2881, + "time_per_iteration": 2.6800432205200195 + }, + { + "auxiliary_loss_clip": 0.01145262, + "auxiliary_loss_mlp": 0.01156478, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00112414, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 1.8359415243026327, + "language_loss": 0.7680338, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79105121, + "num_input_tokens_seen": 62385160, + "step": 2882, + "time_per_iteration": 2.625682830810547 + }, + { + "auxiliary_loss_clip": 0.01113009, + "auxiliary_loss_mlp": 0.0115563, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00103974, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.8456798859276498, + "language_loss": 0.76007581, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78276217, + "num_input_tokens_seen": 62405280, + "step": 2883, + "time_per_iteration": 2.7719085216522217 + }, + { + "auxiliary_loss_clip": 0.01128817, + "auxiliary_loss_mlp": 0.01155563, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00087714, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 2.1349267992345364, + "language_loss": 0.8594687, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.88231254, + "num_input_tokens_seen": 62423665, + "step": 2884, + "time_per_iteration": 2.6738202571868896 + }, + { + "auxiliary_loss_clip": 0.01145372, + "auxiliary_loss_mlp": 0.00748865, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.00058508, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.5043488547783717, + "language_loss": 0.74216533, + "learning_rate": 3.787976825866055e-06, + "loss": 0.76110768, + "num_input_tokens_seen": 62445170, + "step": 2885, + "time_per_iteration": 2.666344404220581 + }, + { + "auxiliary_loss_clip": 0.01145677, + "auxiliary_loss_mlp": 0.0115578, + "balance_loss_clip": 1.0021435, + "balance_loss_mlp": 1.00099874, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.4614095777254748, + "language_loss": 0.7092945, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.7323091, + "num_input_tokens_seen": 62466135, + "step": 2886, + "time_per_iteration": 2.6630187034606934 + }, + { + "auxiliary_loss_clip": 0.01161076, + "auxiliary_loss_mlp": 0.01155312, + "balance_loss_clip": 1.00207484, + "balance_loss_mlp": 1.0010078, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 2.8495305243905444, + "language_loss": 0.69104737, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.71421123, + "num_input_tokens_seen": 62483910, + "step": 2887, + "time_per_iteration": 2.568570613861084 + }, + { + "auxiliary_loss_clip": 0.01130001, + "auxiliary_loss_mlp": 0.01155372, + "balance_loss_clip": 1.00214899, + "balance_loss_mlp": 1.00106752, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.6564061824791827, + "language_loss": 0.85572684, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87858057, + "num_input_tokens_seen": 62501530, + "step": 2888, + "time_per_iteration": 2.6460068225860596 + }, + { + "auxiliary_loss_clip": 0.01096457, + "auxiliary_loss_mlp": 0.01156074, + "balance_loss_clip": 1.00183058, + "balance_loss_mlp": 1.00110221, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 2.7601650522611214, + "language_loss": 0.78557622, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80810153, + "num_input_tokens_seen": 62521295, + "step": 2889, + "time_per_iteration": 2.759873390197754 + }, + { + "auxiliary_loss_clip": 0.01127499, + "auxiliary_loss_mlp": 0.00748856, + "balance_loss_clip": 1.00187647, + "balance_loss_mlp": 1.00069284, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.5618211336880745, + "language_loss": 0.8380307, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.85679424, + "num_input_tokens_seen": 62539615, + "step": 2890, + "time_per_iteration": 2.656940460205078 + }, + { + "auxiliary_loss_clip": 0.01161291, + "auxiliary_loss_mlp": 0.0115615, + "balance_loss_clip": 1.00204515, + "balance_loss_mlp": 1.00127316, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.9568837603669196, + "language_loss": 0.82175761, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84493202, + "num_input_tokens_seen": 62556820, + "step": 2891, + "time_per_iteration": 2.543262004852295 + }, + { + "auxiliary_loss_clip": 0.01131058, + "auxiliary_loss_mlp": 0.01155942, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00096965, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.0120411810920906, + "language_loss": 0.81094688, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.83381683, + "num_input_tokens_seen": 62572450, + "step": 2892, + "time_per_iteration": 3.9745850563049316 + }, + { + "auxiliary_loss_clip": 0.01161359, + "auxiliary_loss_mlp": 0.0115631, + "balance_loss_clip": 1.00228167, + "balance_loss_mlp": 1.00133824, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 1.8131514450685662, + "language_loss": 0.74294233, + "learning_rate": 3.786578545502627e-06, + "loss": 0.766119, + "num_input_tokens_seen": 62592580, + "step": 2893, + "time_per_iteration": 2.6231300830841064 + }, + { + "auxiliary_loss_clip": 0.01145858, + "auxiliary_loss_mlp": 0.01155694, + "balance_loss_clip": 1.00203884, + "balance_loss_mlp": 1.0010078, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.334492654667561, + "language_loss": 0.82612473, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.84914023, + "num_input_tokens_seen": 62611220, + "step": 2894, + "time_per_iteration": 3.991180419921875 + }, + { + "auxiliary_loss_clip": 0.01129335, + "auxiliary_loss_mlp": 0.01156075, + "balance_loss_clip": 1.00215173, + "balance_loss_mlp": 1.0010072, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.010864207905066, + "language_loss": 0.74405998, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76691401, + "num_input_tokens_seen": 62629185, + "step": 2895, + "time_per_iteration": 5.56524658203125 + }, + { + "auxiliary_loss_clip": 0.01110496, + "auxiliary_loss_mlp": 0.01144977, + "balance_loss_clip": 1.00153291, + "balance_loss_mlp": 1.00001836, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8705202713974333, + "language_loss": 0.62860644, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.65116119, + "num_input_tokens_seen": 62691895, + "step": 2896, + "time_per_iteration": 3.3161356449127197 + }, + { + "auxiliary_loss_clip": 0.01145073, + "auxiliary_loss_mlp": 0.00748808, + "balance_loss_clip": 1.00198758, + "balance_loss_mlp": 1.00057626, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 3.071174886393027, + "language_loss": 0.7597152, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77865398, + "num_input_tokens_seen": 62713790, + "step": 2897, + "time_per_iteration": 2.6748011112213135 + }, + { + "auxiliary_loss_clip": 0.01160973, + "auxiliary_loss_mlp": 0.01155232, + "balance_loss_clip": 1.00204873, + "balance_loss_mlp": 1.00102305, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9175927763936798, + "language_loss": 0.69136798, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71452999, + "num_input_tokens_seen": 62736285, + "step": 2898, + "time_per_iteration": 2.6763675212860107 + }, + { + "auxiliary_loss_clip": 0.01145721, + "auxiliary_loss_mlp": 0.01156164, + "balance_loss_clip": 1.00223589, + "balance_loss_mlp": 1.00119162, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.270193024595862, + "language_loss": 0.7642777, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78729659, + "num_input_tokens_seen": 62756240, + "step": 2899, + "time_per_iteration": 2.658013105392456 + }, + { + "auxiliary_loss_clip": 0.01112013, + "auxiliary_loss_mlp": 0.01155141, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00112283, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.8121242197138308, + "language_loss": 0.72867632, + "learning_rate": 3.785351493339121e-06, + "loss": 0.7513479, + "num_input_tokens_seen": 62775910, + "step": 2900, + "time_per_iteration": 2.698397397994995 + }, + { + "auxiliary_loss_clip": 0.01129468, + "auxiliary_loss_mlp": 0.00748823, + "balance_loss_clip": 1.00195408, + "balance_loss_mlp": 1.00049484, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.4908088223619151, + "language_loss": 0.69837153, + "learning_rate": 3.785175929316863e-06, + "loss": 0.71715444, + "num_input_tokens_seen": 62799385, + "step": 2901, + "time_per_iteration": 2.8140878677368164 + }, + { + "auxiliary_loss_clip": 0.01145833, + "auxiliary_loss_mlp": 0.01155905, + "balance_loss_clip": 1.00218117, + "balance_loss_mlp": 1.00102854, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.8061019018935165, + "language_loss": 0.76314187, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78615928, + "num_input_tokens_seen": 62819380, + "step": 2902, + "time_per_iteration": 2.6464531421661377 + }, + { + "auxiliary_loss_clip": 0.01161666, + "auxiliary_loss_mlp": 0.01155921, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00123501, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.0222245334021944, + "language_loss": 0.81638205, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.83955789, + "num_input_tokens_seen": 62836205, + "step": 2903, + "time_per_iteration": 2.558373212814331 + }, + { + "auxiliary_loss_clip": 0.01144447, + "auxiliary_loss_mlp": 0.01155533, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.0010376, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.139346130654192, + "language_loss": 0.73189873, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75489849, + "num_input_tokens_seen": 62854045, + "step": 2904, + "time_per_iteration": 2.5711259841918945 + }, + { + "auxiliary_loss_clip": 0.01101813, + "auxiliary_loss_mlp": 0.01155066, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00095272, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.8195248648689322, + "language_loss": 0.63811994, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66068864, + "num_input_tokens_seen": 62873075, + "step": 2905, + "time_per_iteration": 2.7825591564178467 + }, + { + "auxiliary_loss_clip": 0.01128975, + "auxiliary_loss_mlp": 0.01155885, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.00100803, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.7877583194387165, + "language_loss": 0.79666883, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81951743, + "num_input_tokens_seen": 62892675, + "step": 2906, + "time_per_iteration": 2.711221694946289 + }, + { + "auxiliary_loss_clip": 0.01161174, + "auxiliary_loss_mlp": 0.01155663, + "balance_loss_clip": 1.00224662, + "balance_loss_mlp": 1.00107241, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7858119728921993, + "language_loss": 0.81425351, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83742189, + "num_input_tokens_seen": 62910675, + "step": 2907, + "time_per_iteration": 2.554975986480713 + }, + { + "auxiliary_loss_clip": 0.01161097, + "auxiliary_loss_mlp": 0.0115554, + "balance_loss_clip": 1.00218558, + "balance_loss_mlp": 1.00114059, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.9542451355078176, + "language_loss": 0.80747747, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83064389, + "num_input_tokens_seen": 62928130, + "step": 2908, + "time_per_iteration": 2.550060987472534 + }, + { + "auxiliary_loss_clip": 0.01145574, + "auxiliary_loss_mlp": 0.01155753, + "balance_loss_clip": 1.00212526, + "balance_loss_mlp": 1.00135291, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.497924491763793, + "language_loss": 0.80558681, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82860005, + "num_input_tokens_seen": 62944290, + "step": 2909, + "time_per_iteration": 2.5805959701538086 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.01156159, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00109196, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.9870964764095274, + "language_loss": 0.76709157, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78979194, + "num_input_tokens_seen": 62963505, + "step": 2910, + "time_per_iteration": 2.6681759357452393 + }, + { + "auxiliary_loss_clip": 0.01177711, + "auxiliary_loss_mlp": 0.01155932, + "balance_loss_clip": 1.00226188, + "balance_loss_mlp": 1.00115049, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 2.960473790852061, + "language_loss": 0.87149882, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89483523, + "num_input_tokens_seen": 62985020, + "step": 2911, + "time_per_iteration": 2.6187641620635986 + }, + { + "auxiliary_loss_clip": 0.01177661, + "auxiliary_loss_mlp": 0.00748768, + "balance_loss_clip": 1.00218272, + "balance_loss_mlp": 1.00052357, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.092060548175764, + "language_loss": 0.89738059, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91664493, + "num_input_tokens_seen": 63001745, + "step": 2912, + "time_per_iteration": 2.535135269165039 + }, + { + "auxiliary_loss_clip": 0.01161822, + "auxiliary_loss_mlp": 0.01156116, + "balance_loss_clip": 1.00212395, + "balance_loss_mlp": 1.00095272, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 1.742783066098964, + "language_loss": 0.72273284, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74591219, + "num_input_tokens_seen": 63019750, + "step": 2913, + "time_per_iteration": 2.5551815032958984 + }, + { + "auxiliary_loss_clip": 0.01146181, + "auxiliary_loss_mlp": 0.01155423, + "balance_loss_clip": 1.00216937, + "balance_loss_mlp": 1.00102282, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 1.6548069921868844, + "language_loss": 0.69127882, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71429485, + "num_input_tokens_seen": 63039500, + "step": 2914, + "time_per_iteration": 2.6323554515838623 + }, + { + "auxiliary_loss_clip": 0.01161004, + "auxiliary_loss_mlp": 0.01155809, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00102735, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.750425960798799, + "language_loss": 0.93664336, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95981145, + "num_input_tokens_seen": 63059785, + "step": 2915, + "time_per_iteration": 2.584087371826172 + }, + { + "auxiliary_loss_clip": 0.01132519, + "auxiliary_loss_mlp": 0.0115534, + "balance_loss_clip": 1.00204051, + "balance_loss_mlp": 1.0011313, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.997354162083397, + "language_loss": 0.81179249, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83467114, + "num_input_tokens_seen": 63079385, + "step": 2916, + "time_per_iteration": 2.6680002212524414 + }, + { + "auxiliary_loss_clip": 0.01161296, + "auxiliary_loss_mlp": 0.01156104, + "balance_loss_clip": 1.0021987, + "balance_loss_mlp": 1.00113225, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.5409904324831445, + "language_loss": 0.73839003, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76156402, + "num_input_tokens_seen": 63098970, + "step": 2917, + "time_per_iteration": 2.6045122146606445 + }, + { + "auxiliary_loss_clip": 0.0116152, + "auxiliary_loss_mlp": 0.01155341, + "balance_loss_clip": 1.00227928, + "balance_loss_mlp": 1.00094151, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 1.7916268441500924, + "language_loss": 0.76676917, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.78993773, + "num_input_tokens_seen": 63118750, + "step": 2918, + "time_per_iteration": 2.5784900188446045 + }, + { + "auxiliary_loss_clip": 0.01096146, + "auxiliary_loss_mlp": 0.01155462, + "balance_loss_clip": 1.00181687, + "balance_loss_mlp": 1.00087178, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 2.0181027947913184, + "language_loss": 0.738249, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76076514, + "num_input_tokens_seen": 63136865, + "step": 2919, + "time_per_iteration": 2.7524173259735107 + }, + { + "auxiliary_loss_clip": 0.01145873, + "auxiliary_loss_mlp": 0.01155475, + "balance_loss_clip": 1.002177, + "balance_loss_mlp": 1.00107527, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 2.392279168978431, + "language_loss": 0.74571896, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76873243, + "num_input_tokens_seen": 63158325, + "step": 2920, + "time_per_iteration": 2.684485912322998 + }, + { + "auxiliary_loss_clip": 0.01146245, + "auxiliary_loss_mlp": 0.01155042, + "balance_loss_clip": 1.00206399, + "balance_loss_mlp": 1.00083268, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.5722026329008476, + "language_loss": 0.79545105, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81846398, + "num_input_tokens_seen": 63173115, + "step": 2921, + "time_per_iteration": 2.5950050354003906 + }, + { + "auxiliary_loss_clip": 0.01128973, + "auxiliary_loss_mlp": 0.01156076, + "balance_loss_clip": 1.00191247, + "balance_loss_mlp": 1.00129437, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 2.1789527883432647, + "language_loss": 0.87842524, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.90127575, + "num_input_tokens_seen": 63192880, + "step": 2922, + "time_per_iteration": 2.6788370609283447 + }, + { + "auxiliary_loss_clip": 0.01161053, + "auxiliary_loss_mlp": 0.01155531, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00113177, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 3.071263105756313, + "language_loss": 0.62944877, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.65261459, + "num_input_tokens_seen": 63214395, + "step": 2923, + "time_per_iteration": 2.6431612968444824 + }, + { + "auxiliary_loss_clip": 0.01129679, + "auxiliary_loss_mlp": 0.01155739, + "balance_loss_clip": 1.00208449, + "balance_loss_mlp": 1.00105321, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.2855020644925914, + "language_loss": 0.81295931, + "learning_rate": 3.78111928675413e-06, + "loss": 0.83581346, + "num_input_tokens_seen": 63231020, + "step": 2924, + "time_per_iteration": 2.6036479473114014 + }, + { + "auxiliary_loss_clip": 0.01144336, + "auxiliary_loss_mlp": 0.01156315, + "balance_loss_clip": 1.00199711, + "balance_loss_mlp": 1.00134277, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 2.0618387002156293, + "language_loss": 0.71411252, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73711902, + "num_input_tokens_seen": 63246245, + "step": 2925, + "time_per_iteration": 2.5617871284484863 + }, + { + "auxiliary_loss_clip": 0.01127914, + "auxiliary_loss_mlp": 0.0115496, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00094151, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.719108291560385, + "language_loss": 0.72011858, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74294734, + "num_input_tokens_seen": 63267790, + "step": 2926, + "time_per_iteration": 2.66562819480896 + }, + { + "auxiliary_loss_clip": 0.01130119, + "auxiliary_loss_mlp": 0.01155379, + "balance_loss_clip": 1.00206029, + "balance_loss_mlp": 1.00088406, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 2.7335608752679934, + "language_loss": 0.84592843, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86878347, + "num_input_tokens_seen": 63286830, + "step": 2927, + "time_per_iteration": 2.644731044769287 + }, + { + "auxiliary_loss_clip": 0.01116658, + "auxiliary_loss_mlp": 0.0115564, + "balance_loss_clip": 1.00205457, + "balance_loss_mlp": 1.00104976, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.8582674588323842, + "language_loss": 0.71609467, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.73881769, + "num_input_tokens_seen": 63308870, + "step": 2928, + "time_per_iteration": 2.7925009727478027 + }, + { + "auxiliary_loss_clip": 0.01130594, + "auxiliary_loss_mlp": 0.01154935, + "balance_loss_clip": 1.00216079, + "balance_loss_mlp": 1.00091684, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 1.7907769632640431, + "language_loss": 0.83223784, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85509306, + "num_input_tokens_seen": 63329005, + "step": 2929, + "time_per_iteration": 4.2108259201049805 + }, + { + "auxiliary_loss_clip": 0.01145448, + "auxiliary_loss_mlp": 0.01155241, + "balance_loss_clip": 1.00208807, + "balance_loss_mlp": 1.00093699, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 2.5917913805294, + "language_loss": 0.79110193, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81410885, + "num_input_tokens_seen": 63349390, + "step": 2930, + "time_per_iteration": 2.680847644805908 + }, + { + "auxiliary_loss_clip": 0.0117773, + "auxiliary_loss_mlp": 0.01155551, + "balance_loss_clip": 1.00232792, + "balance_loss_mlp": 1.00096083, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 1.8598641155157714, + "language_loss": 0.76036835, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78370118, + "num_input_tokens_seen": 63368835, + "step": 2931, + "time_per_iteration": 4.109360456466675 + }, + { + "auxiliary_loss_clip": 0.01081111, + "auxiliary_loss_mlp": 0.01155264, + "balance_loss_clip": 1.00184238, + "balance_loss_mlp": 1.00086451, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.242303301026872, + "language_loss": 0.75183117, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7741949, + "num_input_tokens_seen": 63385220, + "step": 2932, + "time_per_iteration": 4.185896158218384 + }, + { + "auxiliary_loss_clip": 0.01162322, + "auxiliary_loss_mlp": 0.01155696, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.0008198, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.320373263959151, + "language_loss": 0.9022209, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92540109, + "num_input_tokens_seen": 63400865, + "step": 2933, + "time_per_iteration": 4.154932737350464 + }, + { + "auxiliary_loss_clip": 0.01177643, + "auxiliary_loss_mlp": 0.0115538, + "balance_loss_clip": 1.00229621, + "balance_loss_mlp": 1.00107539, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.7816315425999563, + "language_loss": 0.88336217, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90669245, + "num_input_tokens_seen": 63421390, + "step": 2934, + "time_per_iteration": 2.563962459564209 + }, + { + "auxiliary_loss_clip": 0.01144201, + "auxiliary_loss_mlp": 0.01155093, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00117028, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.5635661665364833, + "language_loss": 0.70597506, + "learning_rate": 3.779166518324077e-06, + "loss": 0.72896802, + "num_input_tokens_seen": 63444715, + "step": 2935, + "time_per_iteration": 2.885829210281372 + }, + { + "auxiliary_loss_clip": 0.01128678, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_clip": 1.00197232, + "balance_loss_mlp": 1.00080943, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 1.92808360104758, + "language_loss": 0.69420534, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.7170518, + "num_input_tokens_seen": 63465525, + "step": 2936, + "time_per_iteration": 2.659447193145752 + }, + { + "auxiliary_loss_clip": 0.0111335, + "auxiliary_loss_mlp": 0.01155491, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00099599, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.8232102440724314, + "language_loss": 0.7164607, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73914921, + "num_input_tokens_seen": 63485815, + "step": 2937, + "time_per_iteration": 2.7129323482513428 + }, + { + "auxiliary_loss_clip": 0.01144715, + "auxiliary_loss_mlp": 0.01155799, + "balance_loss_clip": 1.00217927, + "balance_loss_mlp": 1.00101805, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.1333247370560455, + "language_loss": 0.75762796, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78063315, + "num_input_tokens_seen": 63503905, + "step": 2938, + "time_per_iteration": 2.658005475997925 + }, + { + "auxiliary_loss_clip": 0.01161048, + "auxiliary_loss_mlp": 0.01155228, + "balance_loss_clip": 1.00215864, + "balance_loss_mlp": 1.00092387, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 1.9918700075629463, + "language_loss": 0.70760274, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73076546, + "num_input_tokens_seen": 63521985, + "step": 2939, + "time_per_iteration": 2.5717194080352783 + }, + { + "auxiliary_loss_clip": 0.01177821, + "auxiliary_loss_mlp": 0.01155123, + "balance_loss_clip": 1.00238144, + "balance_loss_mlp": 1.00081849, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 2.210254018898836, + "language_loss": 0.73587656, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.75920594, + "num_input_tokens_seen": 63539830, + "step": 2940, + "time_per_iteration": 2.5221757888793945 + }, + { + "auxiliary_loss_clip": 0.01127651, + "auxiliary_loss_mlp": 0.01155874, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00099707, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.0629680291467287, + "language_loss": 0.85245359, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87528884, + "num_input_tokens_seen": 63555495, + "step": 2941, + "time_per_iteration": 2.613079309463501 + }, + { + "auxiliary_loss_clip": 0.01177733, + "auxiliary_loss_mlp": 0.0115483, + "balance_loss_clip": 1.00228596, + "balance_loss_mlp": 1.0009073, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.4351777015410927, + "language_loss": 0.76754665, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79087234, + "num_input_tokens_seen": 63575290, + "step": 2942, + "time_per_iteration": 2.5492289066314697 + }, + { + "auxiliary_loss_clip": 0.01130623, + "auxiliary_loss_mlp": 0.00748799, + "balance_loss_clip": 1.00218391, + "balance_loss_mlp": 1.00052667, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.8181653526428072, + "language_loss": 0.80563545, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82442969, + "num_input_tokens_seen": 63594670, + "step": 2943, + "time_per_iteration": 2.68373966217041 + }, + { + "auxiliary_loss_clip": 0.01146042, + "auxiliary_loss_mlp": 0.01155647, + "balance_loss_clip": 1.00217354, + "balance_loss_mlp": 1.00105667, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 1.79179536797272, + "language_loss": 0.80990094, + "learning_rate": 3.777562726341155e-06, + "loss": 0.83291781, + "num_input_tokens_seen": 63614780, + "step": 2944, + "time_per_iteration": 2.676882743835449 + }, + { + "auxiliary_loss_clip": 0.01177537, + "auxiliary_loss_mlp": 0.01155694, + "balance_loss_clip": 1.00215888, + "balance_loss_mlp": 1.0013895, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.913025813577399, + "language_loss": 0.73521984, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75855213, + "num_input_tokens_seen": 63637190, + "step": 2945, + "time_per_iteration": 2.703463315963745 + }, + { + "auxiliary_loss_clip": 0.01160992, + "auxiliary_loss_mlp": 0.01155776, + "balance_loss_clip": 1.00221634, + "balance_loss_mlp": 1.00118554, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 17.82965748739452, + "language_loss": 0.77278608, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.79595375, + "num_input_tokens_seen": 63652140, + "step": 2946, + "time_per_iteration": 2.513993740081787 + }, + { + "auxiliary_loss_clip": 0.01130253, + "auxiliary_loss_mlp": 0.01155639, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00133467, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.7500988692765749, + "language_loss": 0.76407039, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78692925, + "num_input_tokens_seen": 63671700, + "step": 2947, + "time_per_iteration": 2.672938585281372 + }, + { + "auxiliary_loss_clip": 0.01162008, + "auxiliary_loss_mlp": 0.01155524, + "balance_loss_clip": 1.0022136, + "balance_loss_mlp": 1.00093377, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.1853460253084873, + "language_loss": 0.72647452, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74964976, + "num_input_tokens_seen": 63691685, + "step": 2948, + "time_per_iteration": 2.6758623123168945 + }, + { + "auxiliary_loss_clip": 0.01165956, + "auxiliary_loss_mlp": 0.01154933, + "balance_loss_clip": 1.00223386, + "balance_loss_mlp": 1.00101054, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.6648255411023203, + "language_loss": 0.81841958, + "learning_rate": 3.776669371292171e-06, + "loss": 0.84162843, + "num_input_tokens_seen": 63711720, + "step": 2949, + "time_per_iteration": 2.603508710861206 + }, + { + "auxiliary_loss_clip": 0.01158982, + "auxiliary_loss_mlp": 0.01144708, + "balance_loss_clip": 1.0015974, + "balance_loss_mlp": 1.000512, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7420180103889313, + "language_loss": 0.64995414, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67299104, + "num_input_tokens_seen": 63776280, + "step": 2950, + "time_per_iteration": 3.2243032455444336 + }, + { + "auxiliary_loss_clip": 0.01127314, + "auxiliary_loss_mlp": 0.01154923, + "balance_loss_clip": 1.00184429, + "balance_loss_mlp": 1.00099981, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.8535888556971802, + "language_loss": 0.83704233, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85986471, + "num_input_tokens_seen": 63797535, + "step": 2951, + "time_per_iteration": 2.6976823806762695 + }, + { + "auxiliary_loss_clip": 0.0114555, + "auxiliary_loss_mlp": 0.0115544, + "balance_loss_clip": 1.0020932, + "balance_loss_mlp": 1.00094473, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 2.20282697602832, + "language_loss": 0.80193794, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82494783, + "num_input_tokens_seen": 63817045, + "step": 2952, + "time_per_iteration": 2.613416910171509 + }, + { + "auxiliary_loss_clip": 0.01177591, + "auxiliary_loss_mlp": 0.01154944, + "balance_loss_clip": 1.00230384, + "balance_loss_mlp": 1.00102067, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.414211140113296, + "language_loss": 0.79101396, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.81433934, + "num_input_tokens_seen": 63837665, + "step": 2953, + "time_per_iteration": 2.5929458141326904 + }, + { + "auxiliary_loss_clip": 0.01129591, + "auxiliary_loss_mlp": 0.01155465, + "balance_loss_clip": 1.00210381, + "balance_loss_mlp": 1.00106525, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.9118168834813092, + "language_loss": 0.87858152, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.9014321, + "num_input_tokens_seen": 63858455, + "step": 2954, + "time_per_iteration": 2.738909959793091 + }, + { + "auxiliary_loss_clip": 0.0114455, + "auxiliary_loss_mlp": 0.01155797, + "balance_loss_clip": 1.00201941, + "balance_loss_mlp": 1.00101566, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 2.1495525114325305, + "language_loss": 0.85076034, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.87376374, + "num_input_tokens_seen": 63876935, + "step": 2955, + "time_per_iteration": 2.6534459590911865 + }, + { + "auxiliary_loss_clip": 0.01145252, + "auxiliary_loss_mlp": 0.01155351, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00104618, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 1.9056787411420955, + "language_loss": 0.70994055, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73294663, + "num_input_tokens_seen": 63896815, + "step": 2956, + "time_per_iteration": 2.639592409133911 + }, + { + "auxiliary_loss_clip": 0.01160842, + "auxiliary_loss_mlp": 0.01155433, + "balance_loss_clip": 1.00209308, + "balance_loss_mlp": 1.00141466, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.6860636631489692, + "language_loss": 0.82840097, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85156363, + "num_input_tokens_seen": 63916140, + "step": 2957, + "time_per_iteration": 2.5965542793273926 + }, + { + "auxiliary_loss_clip": 0.01114292, + "auxiliary_loss_mlp": 0.01154662, + "balance_loss_clip": 1.00203478, + "balance_loss_mlp": 1.00092995, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.5180896811155724, + "language_loss": 0.74806511, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.77075464, + "num_input_tokens_seen": 63935220, + "step": 2958, + "time_per_iteration": 2.6964550018310547 + }, + { + "auxiliary_loss_clip": 0.01145503, + "auxiliary_loss_mlp": 0.01155534, + "balance_loss_clip": 1.00211203, + "balance_loss_mlp": 1.00094342, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.0219979341672962, + "language_loss": 0.79964006, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.82265043, + "num_input_tokens_seen": 63954550, + "step": 2959, + "time_per_iteration": 2.602936267852783 + }, + { + "auxiliary_loss_clip": 0.01177705, + "auxiliary_loss_mlp": 0.0115586, + "balance_loss_clip": 1.00224161, + "balance_loss_mlp": 1.00107908, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.2350830550252243, + "language_loss": 0.52026027, + "learning_rate": 3.774698062689362e-06, + "loss": 0.54359591, + "num_input_tokens_seen": 63972425, + "step": 2960, + "time_per_iteration": 2.546372890472412 + }, + { + "auxiliary_loss_clip": 0.01113079, + "auxiliary_loss_mlp": 0.01155426, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00102627, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.8783364754605067, + "language_loss": 0.89145529, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.9141404, + "num_input_tokens_seen": 63992165, + "step": 2961, + "time_per_iteration": 2.6853432655334473 + }, + { + "auxiliary_loss_clip": 0.0112964, + "auxiliary_loss_mlp": 0.01155732, + "balance_loss_clip": 1.00217354, + "balance_loss_mlp": 1.0010457, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 1.7000844173118035, + "language_loss": 0.79172671, + "learning_rate": 3.774338767820631e-06, + "loss": 0.8145805, + "num_input_tokens_seen": 64013470, + "step": 2962, + "time_per_iteration": 2.6643497943878174 + }, + { + "auxiliary_loss_clip": 0.01161866, + "auxiliary_loss_mlp": 0.01155714, + "balance_loss_clip": 1.00213981, + "balance_loss_mlp": 1.00102806, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.9457671739160527, + "language_loss": 0.74388349, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76705933, + "num_input_tokens_seen": 64030975, + "step": 2963, + "time_per_iteration": 2.5596940517425537 + }, + { + "auxiliary_loss_clip": 0.01144427, + "auxiliary_loss_mlp": 0.01156035, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00115824, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.5123938905210923, + "language_loss": 0.78646016, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80946481, + "num_input_tokens_seen": 64050075, + "step": 2964, + "time_per_iteration": 2.6176745891571045 + }, + { + "auxiliary_loss_clip": 0.01160888, + "auxiliary_loss_mlp": 0.00748845, + "balance_loss_clip": 1.0021261, + "balance_loss_mlp": 1.00049198, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.8685485033547604, + "language_loss": 0.81258184, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83167917, + "num_input_tokens_seen": 64071920, + "step": 2965, + "time_per_iteration": 2.612391233444214 + }, + { + "auxiliary_loss_clip": 0.0116078, + "auxiliary_loss_mlp": 0.01155755, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00106907, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.2633285476327023, + "language_loss": 0.94413984, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96730518, + "num_input_tokens_seen": 64086835, + "step": 2966, + "time_per_iteration": 2.543665885925293 + }, + { + "auxiliary_loss_clip": 0.01113332, + "auxiliary_loss_mlp": 0.00748822, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00059569, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.3166162857734403, + "language_loss": 0.7277472, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74636877, + "num_input_tokens_seen": 64107360, + "step": 2967, + "time_per_iteration": 4.184880971908569 + }, + { + "auxiliary_loss_clip": 0.01146625, + "auxiliary_loss_mlp": 0.01155082, + "balance_loss_clip": 1.00221992, + "balance_loss_mlp": 1.00106359, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 1.8741584319407127, + "language_loss": 0.76730865, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79032576, + "num_input_tokens_seen": 64124690, + "step": 2968, + "time_per_iteration": 2.5899808406829834 + }, + { + "auxiliary_loss_clip": 0.01098038, + "auxiliary_loss_mlp": 0.01155677, + "balance_loss_clip": 1.00205684, + "balance_loss_mlp": 1.00108647, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 1.8223008330582984, + "language_loss": 0.75967383, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78221101, + "num_input_tokens_seen": 64146315, + "step": 2969, + "time_per_iteration": 4.169270277023315 + }, + { + "auxiliary_loss_clip": 0.01143862, + "auxiliary_loss_mlp": 0.01145169, + "balance_loss_clip": 1.00222218, + "balance_loss_mlp": 1.00021064, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8500439370304357, + "language_loss": 0.69125438, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71414471, + "num_input_tokens_seen": 64210875, + "step": 2970, + "time_per_iteration": 6.127662658691406 + }, + { + "auxiliary_loss_clip": 0.01145504, + "auxiliary_loss_mlp": 0.01155162, + "balance_loss_clip": 1.00214052, + "balance_loss_mlp": 1.00085759, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 3.8330268167284167, + "language_loss": 0.67150086, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69450754, + "num_input_tokens_seen": 64230740, + "step": 2971, + "time_per_iteration": 2.7328736782073975 + }, + { + "auxiliary_loss_clip": 0.01117781, + "auxiliary_loss_mlp": 0.01155735, + "balance_loss_clip": 1.0022397, + "balance_loss_mlp": 1.00095356, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.8229193248680013, + "language_loss": 0.89732349, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.92005861, + "num_input_tokens_seen": 64252300, + "step": 2972, + "time_per_iteration": 2.7118518352508545 + }, + { + "auxiliary_loss_clip": 0.01129818, + "auxiliary_loss_mlp": 0.01155711, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.00121593, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.142426165903813, + "language_loss": 0.88261873, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90547407, + "num_input_tokens_seen": 64270105, + "step": 2973, + "time_per_iteration": 2.6526455879211426 + }, + { + "auxiliary_loss_clip": 0.01177535, + "auxiliary_loss_mlp": 0.01155702, + "balance_loss_clip": 1.00223863, + "balance_loss_mlp": 1.00130212, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.504632481191839, + "language_loss": 0.76266003, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78599238, + "num_input_tokens_seen": 64287250, + "step": 2974, + "time_per_iteration": 2.5615897178649902 + }, + { + "auxiliary_loss_clip": 0.01144655, + "auxiliary_loss_mlp": 0.01155206, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00099707, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.354440330852215, + "language_loss": 0.74893403, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.7719326, + "num_input_tokens_seen": 64307140, + "step": 2975, + "time_per_iteration": 2.622481107711792 + }, + { + "auxiliary_loss_clip": 0.01161576, + "auxiliary_loss_mlp": 0.01155287, + "balance_loss_clip": 1.00210369, + "balance_loss_mlp": 1.00126839, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.4755966173273156, + "language_loss": 0.7323575, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75552619, + "num_input_tokens_seen": 64328760, + "step": 2976, + "time_per_iteration": 2.592168092727661 + }, + { + "auxiliary_loss_clip": 0.01161941, + "auxiliary_loss_mlp": 0.01154524, + "balance_loss_clip": 1.00222564, + "balance_loss_mlp": 1.00126886, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.4363545411493381, + "language_loss": 0.77773088, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.80089545, + "num_input_tokens_seen": 64348800, + "step": 2977, + "time_per_iteration": 2.592054605484009 + }, + { + "auxiliary_loss_clip": 0.01132576, + "auxiliary_loss_mlp": 0.01155529, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00112867, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 1.8761692436130382, + "language_loss": 0.79531205, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81819308, + "num_input_tokens_seen": 64367955, + "step": 2978, + "time_per_iteration": 2.651846170425415 + }, + { + "auxiliary_loss_clip": 0.01145421, + "auxiliary_loss_mlp": 0.01155865, + "balance_loss_clip": 1.00216532, + "balance_loss_mlp": 1.00117946, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.4883384873635457, + "language_loss": 0.76306295, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78607577, + "num_input_tokens_seen": 64389805, + "step": 2979, + "time_per_iteration": 2.670811653137207 + }, + { + "auxiliary_loss_clip": 0.01128574, + "auxiliary_loss_mlp": 0.01155244, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00132084, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 1.943728739965368, + "language_loss": 0.69545245, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.71829069, + "num_input_tokens_seen": 64408220, + "step": 2980, + "time_per_iteration": 2.630131959915161 + }, + { + "auxiliary_loss_clip": 0.01161722, + "auxiliary_loss_mlp": 0.01155295, + "balance_loss_clip": 1.00222135, + "balance_loss_mlp": 1.00099015, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.7411217231591705, + "language_loss": 0.70651418, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72968435, + "num_input_tokens_seen": 64426380, + "step": 2981, + "time_per_iteration": 2.552910327911377 + }, + { + "auxiliary_loss_clip": 0.01144784, + "auxiliary_loss_mlp": 0.01156222, + "balance_loss_clip": 1.00218582, + "balance_loss_mlp": 1.00134492, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.717244323365387, + "language_loss": 0.82161903, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84462905, + "num_input_tokens_seen": 64444355, + "step": 2982, + "time_per_iteration": 2.5720810890197754 + }, + { + "auxiliary_loss_clip": 0.01177547, + "auxiliary_loss_mlp": 0.01155025, + "balance_loss_clip": 1.00233197, + "balance_loss_mlp": 1.00110221, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 1.5625780818619672, + "language_loss": 0.82762182, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85094744, + "num_input_tokens_seen": 64467800, + "step": 2983, + "time_per_iteration": 2.609494209289551 + }, + { + "auxiliary_loss_clip": 0.01161689, + "auxiliary_loss_mlp": 0.01155755, + "balance_loss_clip": 1.00215685, + "balance_loss_mlp": 1.00116467, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 1.7690744776580734, + "language_loss": 0.85321856, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87639302, + "num_input_tokens_seen": 64487230, + "step": 2984, + "time_per_iteration": 2.5783605575561523 + }, + { + "auxiliary_loss_clip": 0.01127757, + "auxiliary_loss_mlp": 0.01155163, + "balance_loss_clip": 1.00194979, + "balance_loss_mlp": 1.00085855, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 2.8423100181181566, + "language_loss": 0.89142871, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91425788, + "num_input_tokens_seen": 64509165, + "step": 2985, + "time_per_iteration": 2.69427752494812 + }, + { + "auxiliary_loss_clip": 0.01177303, + "auxiliary_loss_mlp": 0.01154821, + "balance_loss_clip": 1.00220776, + "balance_loss_mlp": 1.00137472, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.9947853379027367, + "language_loss": 0.69678676, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72010803, + "num_input_tokens_seen": 64527940, + "step": 2986, + "time_per_iteration": 2.5442557334899902 + }, + { + "auxiliary_loss_clip": 0.01177495, + "auxiliary_loss_mlp": 0.00748855, + "balance_loss_clip": 1.00224841, + "balance_loss_mlp": 1.00063348, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.1010068002179527, + "language_loss": 0.77533805, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79460156, + "num_input_tokens_seen": 64545230, + "step": 2987, + "time_per_iteration": 2.5865378379821777 + }, + { + "auxiliary_loss_clip": 0.01177607, + "auxiliary_loss_mlp": 0.01155049, + "balance_loss_clip": 1.00230932, + "balance_loss_mlp": 1.00083947, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 1.8898330613960752, + "language_loss": 0.77896678, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80229336, + "num_input_tokens_seen": 64563820, + "step": 2988, + "time_per_iteration": 2.5439703464508057 + }, + { + "auxiliary_loss_clip": 0.01109402, + "auxiliary_loss_mlp": 0.00748202, + "balance_loss_clip": 1.00190818, + "balance_loss_mlp": 0.99982452, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7499535881819255, + "language_loss": 0.62663829, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64521432, + "num_input_tokens_seen": 64621315, + "step": 2989, + "time_per_iteration": 3.187042713165283 + }, + { + "auxiliary_loss_clip": 0.01144849, + "auxiliary_loss_mlp": 0.01154851, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00092781, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 1.8932977765740853, + "language_loss": 0.70503777, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72803473, + "num_input_tokens_seen": 64639885, + "step": 2990, + "time_per_iteration": 2.6158463954925537 + }, + { + "auxiliary_loss_clip": 0.01144374, + "auxiliary_loss_mlp": 0.01155437, + "balance_loss_clip": 1.00205064, + "balance_loss_mlp": 1.00122786, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.92951869336661, + "language_loss": 0.6931653, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.71616346, + "num_input_tokens_seen": 64661220, + "step": 2991, + "time_per_iteration": 2.7523996829986572 + }, + { + "auxiliary_loss_clip": 0.0111175, + "auxiliary_loss_mlp": 0.01155004, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00108075, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.7380730391564037, + "language_loss": 0.83139849, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.85406601, + "num_input_tokens_seen": 64682530, + "step": 2992, + "time_per_iteration": 2.762208938598633 + }, + { + "auxiliary_loss_clip": 0.01160701, + "auxiliary_loss_mlp": 0.01154136, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00097597, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.6546197078087217, + "language_loss": 0.81849802, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84164637, + "num_input_tokens_seen": 64701025, + "step": 2993, + "time_per_iteration": 2.54477858543396 + }, + { + "auxiliary_loss_clip": 0.01162028, + "auxiliary_loss_mlp": 0.0115469, + "balance_loss_clip": 1.00212204, + "balance_loss_mlp": 1.00086236, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7137670204053603, + "language_loss": 0.78200662, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80517381, + "num_input_tokens_seen": 64719570, + "step": 2994, + "time_per_iteration": 2.631709337234497 + }, + { + "auxiliary_loss_clip": 0.0117749, + "auxiliary_loss_mlp": 0.01154951, + "balance_loss_clip": 1.0022366, + "balance_loss_mlp": 1.00102782, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 2.1270881227887384, + "language_loss": 0.80469143, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82801586, + "num_input_tokens_seen": 64738110, + "step": 2995, + "time_per_iteration": 2.5072360038757324 + }, + { + "auxiliary_loss_clip": 0.01161086, + "auxiliary_loss_mlp": 0.01155441, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00132763, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 2.2019939981872, + "language_loss": 0.84404647, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86721182, + "num_input_tokens_seen": 64756345, + "step": 2996, + "time_per_iteration": 2.581179618835449 + }, + { + "auxiliary_loss_clip": 0.01129313, + "auxiliary_loss_mlp": 0.0115449, + "balance_loss_clip": 1.00203454, + "balance_loss_mlp": 1.00094855, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 1.6594114197370478, + "language_loss": 0.88177603, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90461409, + "num_input_tokens_seen": 64776375, + "step": 2997, + "time_per_iteration": 2.701669931411743 + }, + { + "auxiliary_loss_clip": 0.01144999, + "auxiliary_loss_mlp": 0.01155304, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00099969, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.1144181379653957, + "language_loss": 0.84919024, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87219328, + "num_input_tokens_seen": 64796210, + "step": 2998, + "time_per_iteration": 2.6841416358947754 + }, + { + "auxiliary_loss_clip": 0.01177461, + "auxiliary_loss_mlp": 0.0115488, + "balance_loss_clip": 1.00235009, + "balance_loss_mlp": 1.00105238, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 2.0391796614042517, + "language_loss": 0.8455888, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86891222, + "num_input_tokens_seen": 64818590, + "step": 2999, + "time_per_iteration": 2.6000494956970215 + }, + { + "auxiliary_loss_clip": 0.01162033, + "auxiliary_loss_mlp": 0.01154879, + "balance_loss_clip": 1.00219059, + "balance_loss_mlp": 1.00095654, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 3.244697288744346, + "language_loss": 0.75089449, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77406359, + "num_input_tokens_seen": 64838350, + "step": 3000, + "time_per_iteration": 2.6357712745666504 + }, + { + "auxiliary_loss_clip": 0.01144923, + "auxiliary_loss_mlp": 0.0074889, + "balance_loss_clip": 1.00214362, + "balance_loss_mlp": 1.00074601, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 1.8610473130056997, + "language_loss": 0.70801717, + "learning_rate": 3.76727879248177e-06, + "loss": 0.72695529, + "num_input_tokens_seen": 64858065, + "step": 3001, + "time_per_iteration": 2.6327829360961914 + }, + { + "auxiliary_loss_clip": 0.01160905, + "auxiliary_loss_mlp": 0.01154818, + "balance_loss_clip": 1.0022428, + "balance_loss_mlp": 1.00108624, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.6199928727582513, + "language_loss": 0.88195378, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90511101, + "num_input_tokens_seen": 64877305, + "step": 3002, + "time_per_iteration": 2.5711100101470947 + }, + { + "auxiliary_loss_clip": 0.0117748, + "auxiliary_loss_mlp": 0.01154569, + "balance_loss_clip": 1.00225616, + "balance_loss_mlp": 1.00102735, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6206221198244373, + "language_loss": 0.80477411, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.8280946, + "num_input_tokens_seen": 64896955, + "step": 3003, + "time_per_iteration": 2.5123674869537354 + }, + { + "auxiliary_loss_clip": 0.01177494, + "auxiliary_loss_mlp": 0.01155354, + "balance_loss_clip": 1.00222671, + "balance_loss_mlp": 1.0013361, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.02382885663805, + "language_loss": 0.67323077, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69655919, + "num_input_tokens_seen": 64917080, + "step": 3004, + "time_per_iteration": 2.5715479850769043 + }, + { + "auxiliary_loss_clip": 0.01160843, + "auxiliary_loss_mlp": 0.01155391, + "balance_loss_clip": 1.00214386, + "balance_loss_mlp": 1.00108671, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 2.2142820140257222, + "language_loss": 0.85267448, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87583685, + "num_input_tokens_seen": 64935215, + "step": 3005, + "time_per_iteration": 2.551530361175537 + }, + { + "auxiliary_loss_clip": 0.01160727, + "auxiliary_loss_mlp": 0.01154452, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00091004, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 3.7179095709526635, + "language_loss": 0.83056504, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85371685, + "num_input_tokens_seen": 64956275, + "step": 3006, + "time_per_iteration": 4.0658416748046875 + }, + { + "auxiliary_loss_clip": 0.01146345, + "auxiliary_loss_mlp": 0.01155419, + "balance_loss_clip": 1.00211394, + "balance_loss_mlp": 1.00101948, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.702084015754156, + "language_loss": 0.77185231, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79486996, + "num_input_tokens_seen": 64979390, + "step": 3007, + "time_per_iteration": 4.058492422103882 + }, + { + "auxiliary_loss_clip": 0.01142799, + "auxiliary_loss_mlp": 0.01143753, + "balance_loss_clip": 1.00162721, + "balance_loss_mlp": 1.00032067, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8381539509862374, + "language_loss": 0.56949347, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59235901, + "num_input_tokens_seen": 65043135, + "step": 3008, + "time_per_iteration": 4.676165342330933 + }, + { + "auxiliary_loss_clip": 0.01145375, + "auxiliary_loss_mlp": 0.01155072, + "balance_loss_clip": 1.00214648, + "balance_loss_mlp": 1.00114918, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 2.004989283574114, + "language_loss": 0.67526007, + "learning_rate": 3.765817980138021e-06, + "loss": 0.6982646, + "num_input_tokens_seen": 65062845, + "step": 3009, + "time_per_iteration": 4.068760633468628 + }, + { + "auxiliary_loss_clip": 0.01177449, + "auxiliary_loss_mlp": 0.01155, + "balance_loss_clip": 1.00232291, + "balance_loss_mlp": 1.00098181, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.8558903593537996, + "language_loss": 0.75534463, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77866912, + "num_input_tokens_seen": 65082110, + "step": 3010, + "time_per_iteration": 2.5595662593841553 + }, + { + "auxiliary_loss_clip": 0.0114376, + "auxiliary_loss_mlp": 0.01154046, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00088584, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.745659794738249, + "language_loss": 0.6711331, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.69411117, + "num_input_tokens_seen": 65101985, + "step": 3011, + "time_per_iteration": 2.600503921508789 + }, + { + "auxiliary_loss_clip": 0.01129679, + "auxiliary_loss_mlp": 0.00748772, + "balance_loss_clip": 1.00199306, + "balance_loss_mlp": 1.00075352, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 1.783508547408837, + "language_loss": 0.71572113, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73450565, + "num_input_tokens_seen": 65129295, + "step": 3012, + "time_per_iteration": 2.958226203918457 + }, + { + "auxiliary_loss_clip": 0.01149438, + "auxiliary_loss_mlp": 0.01154795, + "balance_loss_clip": 1.0022943, + "balance_loss_mlp": 1.00125384, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.117827784472674, + "language_loss": 0.62249506, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64553738, + "num_input_tokens_seen": 65150625, + "step": 3013, + "time_per_iteration": 2.7406954765319824 + }, + { + "auxiliary_loss_clip": 0.0114524, + "auxiliary_loss_mlp": 0.01155172, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.00124884, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.604257430161411, + "language_loss": 0.75469577, + "learning_rate": 3.764902795998309e-06, + "loss": 0.77769989, + "num_input_tokens_seen": 65170880, + "step": 3014, + "time_per_iteration": 2.6521623134613037 + }, + { + "auxiliary_loss_clip": 0.01177632, + "auxiliary_loss_mlp": 0.01155886, + "balance_loss_clip": 1.00232363, + "balance_loss_mlp": 1.00100875, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.8541538406152833, + "language_loss": 0.65876794, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.68210304, + "num_input_tokens_seen": 65192530, + "step": 3015, + "time_per_iteration": 2.5780811309814453 + }, + { + "auxiliary_loss_clip": 0.01144242, + "auxiliary_loss_mlp": 0.00748825, + "balance_loss_clip": 1.00208664, + "balance_loss_mlp": 1.00066352, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.6593455606194019, + "language_loss": 0.77311701, + "learning_rate": 3.764536253816785e-06, + "loss": 0.79204768, + "num_input_tokens_seen": 65211675, + "step": 3016, + "time_per_iteration": 2.6062216758728027 + }, + { + "auxiliary_loss_clip": 0.01162121, + "auxiliary_loss_mlp": 0.01155488, + "balance_loss_clip": 1.00236404, + "balance_loss_mlp": 1.00118351, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 2.1621302651480723, + "language_loss": 0.83643049, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85960656, + "num_input_tokens_seen": 65231185, + "step": 3017, + "time_per_iteration": 2.552908420562744 + }, + { + "auxiliary_loss_clip": 0.01161923, + "auxiliary_loss_mlp": 0.01154568, + "balance_loss_clip": 1.00226831, + "balance_loss_mlp": 1.00083542, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.2921053776653166, + "language_loss": 0.67271888, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69588387, + "num_input_tokens_seen": 65251645, + "step": 3018, + "time_per_iteration": 2.7075912952423096 + }, + { + "auxiliary_loss_clip": 0.01160745, + "auxiliary_loss_mlp": 0.00748898, + "balance_loss_clip": 1.0021435, + "balance_loss_mlp": 1.0007503, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.0844490140139955, + "language_loss": 0.76015764, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77925408, + "num_input_tokens_seen": 65271125, + "step": 3019, + "time_per_iteration": 2.56750226020813 + }, + { + "auxiliary_loss_clip": 0.01128871, + "auxiliary_loss_mlp": 0.0115543, + "balance_loss_clip": 1.00226271, + "balance_loss_mlp": 1.00103009, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.096982785907663, + "language_loss": 0.8142125, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.8370555, + "num_input_tokens_seen": 65290600, + "step": 3020, + "time_per_iteration": 2.6722466945648193 + }, + { + "auxiliary_loss_clip": 0.01144076, + "auxiliary_loss_mlp": 0.01154866, + "balance_loss_clip": 1.00209939, + "balance_loss_mlp": 1.00094342, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 1.9730964700370839, + "language_loss": 0.77595931, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79894876, + "num_input_tokens_seen": 65311040, + "step": 3021, + "time_per_iteration": 2.642331838607788 + }, + { + "auxiliary_loss_clip": 0.01160344, + "auxiliary_loss_mlp": 0.01154117, + "balance_loss_clip": 1.00201714, + "balance_loss_mlp": 1.00114751, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.747854036177732, + "language_loss": 0.84781897, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87096363, + "num_input_tokens_seen": 65332115, + "step": 3022, + "time_per_iteration": 2.5970888137817383 + }, + { + "auxiliary_loss_clip": 0.0112933, + "auxiliary_loss_mlp": 0.01154738, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00091028, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 2.9828258810172263, + "language_loss": 0.69217944, + "learning_rate": 3.763251248837859e-06, + "loss": 0.71502018, + "num_input_tokens_seen": 65352210, + "step": 3023, + "time_per_iteration": 2.69760799407959 + }, + { + "auxiliary_loss_clip": 0.01145937, + "auxiliary_loss_mlp": 0.01154489, + "balance_loss_clip": 1.00208187, + "balance_loss_mlp": 1.00113821, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.611550527322817, + "language_loss": 0.73840213, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76140636, + "num_input_tokens_seen": 65370600, + "step": 3024, + "time_per_iteration": 2.6735737323760986 + }, + { + "auxiliary_loss_clip": 0.01160563, + "auxiliary_loss_mlp": 0.01154667, + "balance_loss_clip": 1.00209665, + "balance_loss_mlp": 1.00093508, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 1.9777414609987098, + "language_loss": 0.88256288, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90571529, + "num_input_tokens_seen": 65387270, + "step": 3025, + "time_per_iteration": 2.525301218032837 + }, + { + "auxiliary_loss_clip": 0.01144676, + "auxiliary_loss_mlp": 0.01154862, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00151086, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 1.9339327048778254, + "language_loss": 0.79355806, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81655335, + "num_input_tokens_seen": 65406550, + "step": 3026, + "time_per_iteration": 2.5955417156219482 + }, + { + "auxiliary_loss_clip": 0.01144588, + "auxiliary_loss_mlp": 0.01155446, + "balance_loss_clip": 1.00208259, + "balance_loss_mlp": 1.00123644, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.9543494056430601, + "language_loss": 0.75972289, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78272325, + "num_input_tokens_seen": 65425955, + "step": 3027, + "time_per_iteration": 2.6271157264709473 + }, + { + "auxiliary_loss_clip": 0.0117751, + "auxiliary_loss_mlp": 0.01155376, + "balance_loss_clip": 1.00217032, + "balance_loss_mlp": 1.00116694, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.9101486912230985, + "language_loss": 0.85569483, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87902367, + "num_input_tokens_seen": 65442820, + "step": 3028, + "time_per_iteration": 2.4999468326568604 + }, + { + "auxiliary_loss_clip": 0.01177429, + "auxiliary_loss_mlp": 0.01154812, + "balance_loss_clip": 1.00225317, + "balance_loss_mlp": 1.00098431, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.6670216945001737, + "language_loss": 0.82531905, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.8486414, + "num_input_tokens_seen": 65461825, + "step": 3029, + "time_per_iteration": 2.5421667098999023 + }, + { + "auxiliary_loss_clip": 0.01129621, + "auxiliary_loss_mlp": 0.01155294, + "balance_loss_clip": 1.0021739, + "balance_loss_mlp": 1.0010848, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.6057695376242984, + "language_loss": 0.77845448, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80130374, + "num_input_tokens_seen": 65479480, + "step": 3030, + "time_per_iteration": 2.6009292602539062 + }, + { + "auxiliary_loss_clip": 0.01162095, + "auxiliary_loss_mlp": 0.01154773, + "balance_loss_clip": 1.00226617, + "balance_loss_mlp": 1.00104034, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 1.9133016180312978, + "language_loss": 0.84479129, + "learning_rate": 3.761778660099352e-06, + "loss": 0.86795998, + "num_input_tokens_seen": 65497775, + "step": 3031, + "time_per_iteration": 2.5646438598632812 + }, + { + "auxiliary_loss_clip": 0.01128695, + "auxiliary_loss_mlp": 0.00748837, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00072479, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.6167560263946121, + "language_loss": 0.7990371, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81781244, + "num_input_tokens_seen": 65516505, + "step": 3032, + "time_per_iteration": 2.62300968170166 + }, + { + "auxiliary_loss_clip": 0.01177603, + "auxiliary_loss_mlp": 0.01155388, + "balance_loss_clip": 1.00227249, + "balance_loss_mlp": 1.00098836, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 1.792850935853295, + "language_loss": 0.81174266, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83507264, + "num_input_tokens_seen": 65536160, + "step": 3033, + "time_per_iteration": 2.5466835498809814 + }, + { + "auxiliary_loss_clip": 0.01095411, + "auxiliary_loss_mlp": 0.01144224, + "balance_loss_clip": 1.00142741, + "balance_loss_mlp": 1.00002885, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8731686937739098, + "language_loss": 0.63467342, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65706974, + "num_input_tokens_seen": 65589375, + "step": 3034, + "time_per_iteration": 3.2009336948394775 + }, + { + "auxiliary_loss_clip": 0.01129538, + "auxiliary_loss_mlp": 0.01154885, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00096202, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 1.926848536537367, + "language_loss": 0.79106784, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81391203, + "num_input_tokens_seen": 65606720, + "step": 3035, + "time_per_iteration": 2.608513593673706 + }, + { + "auxiliary_loss_clip": 0.01144314, + "auxiliary_loss_mlp": 0.01154512, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00125647, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 2.1892034263728566, + "language_loss": 0.84938711, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87237537, + "num_input_tokens_seen": 65625495, + "step": 3036, + "time_per_iteration": 2.6213808059692383 + }, + { + "auxiliary_loss_clip": 0.01160676, + "auxiliary_loss_mlp": 0.01154356, + "balance_loss_clip": 1.00228, + "balance_loss_mlp": 1.00110078, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 1.956164147887205, + "language_loss": 0.79977518, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82292551, + "num_input_tokens_seen": 65643515, + "step": 3037, + "time_per_iteration": 2.533792018890381 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.00748816, + "balance_loss_clip": 1.00210738, + "balance_loss_mlp": 1.00059342, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.966674568372144, + "language_loss": 0.79472673, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81365907, + "num_input_tokens_seen": 65658155, + "step": 3038, + "time_per_iteration": 2.563483715057373 + }, + { + "auxiliary_loss_clip": 0.01145046, + "auxiliary_loss_mlp": 0.01154583, + "balance_loss_clip": 1.00209641, + "balance_loss_mlp": 1.00094616, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 1.9079002100126283, + "language_loss": 0.67582405, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69882035, + "num_input_tokens_seen": 65679310, + "step": 3039, + "time_per_iteration": 2.6884360313415527 + }, + { + "auxiliary_loss_clip": 0.011461, + "auxiliary_loss_mlp": 0.01154851, + "balance_loss_clip": 1.00230777, + "balance_loss_mlp": 1.00102377, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 2.5460367261108616, + "language_loss": 0.73422801, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75723755, + "num_input_tokens_seen": 65705235, + "step": 3040, + "time_per_iteration": 2.874387502670288 + }, + { + "auxiliary_loss_clip": 0.01160858, + "auxiliary_loss_mlp": 0.01154854, + "balance_loss_clip": 1.00216115, + "balance_loss_mlp": 1.00102592, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 1.996318930778006, + "language_loss": 0.60215867, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62531579, + "num_input_tokens_seen": 65727575, + "step": 3041, + "time_per_iteration": 2.6267356872558594 + }, + { + "auxiliary_loss_clip": 0.01128855, + "auxiliary_loss_mlp": 0.01154858, + "balance_loss_clip": 1.00200748, + "balance_loss_mlp": 1.00112581, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.5465905433633038, + "language_loss": 0.59936416, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.62220126, + "num_input_tokens_seen": 65751370, + "step": 3042, + "time_per_iteration": 2.936156988143921 + }, + { + "auxiliary_loss_clip": 0.01145591, + "auxiliary_loss_mlp": 0.01154588, + "balance_loss_clip": 1.00226164, + "balance_loss_mlp": 1.00114191, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 2.8574715972288685, + "language_loss": 0.87291789, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89591968, + "num_input_tokens_seen": 65771040, + "step": 3043, + "time_per_iteration": 4.091348648071289 + }, + { + "auxiliary_loss_clip": 0.01065437, + "auxiliary_loss_mlp": 0.01154381, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00112522, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 1.9616004992386216, + "language_loss": 0.70997673, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.73217493, + "num_input_tokens_seen": 65789345, + "step": 3044, + "time_per_iteration": 4.262147903442383 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.01154966, + "balance_loss_clip": 1.00202107, + "balance_loss_mlp": 1.00094724, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.163353136632935, + "language_loss": 0.64435685, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66704303, + "num_input_tokens_seen": 65810990, + "step": 3045, + "time_per_iteration": 2.797917127609253 + }, + { + "auxiliary_loss_clip": 0.01177474, + "auxiliary_loss_mlp": 0.01154616, + "balance_loss_clip": 1.00237334, + "balance_loss_mlp": 1.00097871, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.278245009781785, + "language_loss": 0.78940749, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.8127284, + "num_input_tokens_seen": 65827230, + "step": 3046, + "time_per_iteration": 3.9817845821380615 + }, + { + "auxiliary_loss_clip": 0.01128368, + "auxiliary_loss_mlp": 0.01154539, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00090182, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 1.7600278285780406, + "language_loss": 0.79232073, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81514978, + "num_input_tokens_seen": 65845900, + "step": 3047, + "time_per_iteration": 4.072145700454712 + }, + { + "auxiliary_loss_clip": 0.0116088, + "auxiliary_loss_mlp": 0.01154017, + "balance_loss_clip": 1.00220323, + "balance_loss_mlp": 1.00085747, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.4725175985522219, + "language_loss": 0.80753946, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83068848, + "num_input_tokens_seen": 65868730, + "step": 3048, + "time_per_iteration": 2.697690486907959 + }, + { + "auxiliary_loss_clip": 0.01161691, + "auxiliary_loss_mlp": 0.01154353, + "balance_loss_clip": 1.00222313, + "balance_loss_mlp": 1.00090718, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8892025829213916, + "language_loss": 0.86397076, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88713121, + "num_input_tokens_seen": 65888420, + "step": 3049, + "time_per_iteration": 2.569035530090332 + }, + { + "auxiliary_loss_clip": 0.01161931, + "auxiliary_loss_mlp": 0.01154593, + "balance_loss_clip": 1.00224185, + "balance_loss_mlp": 1.00086081, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 3.1852033683110075, + "language_loss": 0.77390659, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79707181, + "num_input_tokens_seen": 65905840, + "step": 3050, + "time_per_iteration": 2.549532413482666 + }, + { + "auxiliary_loss_clip": 0.01145235, + "auxiliary_loss_mlp": 0.01154104, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.0009439, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.0254683205780135, + "language_loss": 0.99177814, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01477146, + "num_input_tokens_seen": 65922845, + "step": 3051, + "time_per_iteration": 2.6100401878356934 + }, + { + "auxiliary_loss_clip": 0.01146167, + "auxiliary_loss_mlp": 0.01153887, + "balance_loss_clip": 1.00226974, + "balance_loss_mlp": 1.00082278, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.5469607698146033, + "language_loss": 0.85810077, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88110137, + "num_input_tokens_seen": 65945555, + "step": 3052, + "time_per_iteration": 2.6861255168914795 + }, + { + "auxiliary_loss_clip": 0.01177393, + "auxiliary_loss_mlp": 0.01154478, + "balance_loss_clip": 1.00229442, + "balance_loss_mlp": 1.001127, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.733056704223496, + "language_loss": 0.73245108, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75576979, + "num_input_tokens_seen": 65963965, + "step": 3053, + "time_per_iteration": 2.52901554107666 + }, + { + "auxiliary_loss_clip": 0.01177488, + "auxiliary_loss_mlp": 0.01154783, + "balance_loss_clip": 1.00231242, + "balance_loss_mlp": 1.00105095, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 2.2915214883852277, + "language_loss": 0.61647463, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.63979733, + "num_input_tokens_seen": 65985965, + "step": 3054, + "time_per_iteration": 2.597677707672119 + }, + { + "auxiliary_loss_clip": 0.01111038, + "auxiliary_loss_mlp": 0.01154141, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00098062, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.132084257531632, + "language_loss": 0.7828269, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80547869, + "num_input_tokens_seen": 66005645, + "step": 3055, + "time_per_iteration": 2.6924874782562256 + }, + { + "auxiliary_loss_clip": 0.01111722, + "auxiliary_loss_mlp": 0.011543, + "balance_loss_clip": 1.00194895, + "balance_loss_mlp": 1.00114036, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.804528236312625, + "language_loss": 0.69629335, + "learning_rate": 3.757149278859014e-06, + "loss": 0.71895355, + "num_input_tokens_seen": 66025675, + "step": 3056, + "time_per_iteration": 2.7277979850769043 + }, + { + "auxiliary_loss_clip": 0.01160701, + "auxiliary_loss_mlp": 0.01154016, + "balance_loss_clip": 1.00211632, + "balance_loss_mlp": 1.00114179, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.446660056964137, + "language_loss": 0.80546159, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82860875, + "num_input_tokens_seen": 66046125, + "step": 3057, + "time_per_iteration": 2.5557539463043213 + }, + { + "auxiliary_loss_clip": 0.01160901, + "auxiliary_loss_mlp": 0.011547, + "balance_loss_clip": 1.00215578, + "balance_loss_mlp": 1.00096798, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.2773844377118553, + "language_loss": 0.82298976, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84614575, + "num_input_tokens_seen": 66064375, + "step": 3058, + "time_per_iteration": 2.5639724731445312 + }, + { + "auxiliary_loss_clip": 0.01129451, + "auxiliary_loss_mlp": 0.00748923, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.00067735, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.103483440597024, + "language_loss": 0.86144596, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88022971, + "num_input_tokens_seen": 66084590, + "step": 3059, + "time_per_iteration": 2.702643871307373 + }, + { + "auxiliary_loss_clip": 0.01177258, + "auxiliary_loss_mlp": 0.00748777, + "balance_loss_clip": 1.00223553, + "balance_loss_mlp": 1.00057065, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 2.402923345004009, + "language_loss": 0.72944587, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74870622, + "num_input_tokens_seen": 66107105, + "step": 3060, + "time_per_iteration": 2.609729290008545 + }, + { + "auxiliary_loss_clip": 0.01161932, + "auxiliary_loss_mlp": 0.01154151, + "balance_loss_clip": 1.00224471, + "balance_loss_mlp": 1.00089574, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.4821226213129959, + "language_loss": 0.73059392, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.75375473, + "num_input_tokens_seen": 66129295, + "step": 3061, + "time_per_iteration": 2.6155409812927246 + }, + { + "auxiliary_loss_clip": 0.01161759, + "auxiliary_loss_mlp": 0.01154528, + "balance_loss_clip": 1.00224435, + "balance_loss_mlp": 1.00089145, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.6653661486864326, + "language_loss": 0.81882215, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.84198505, + "num_input_tokens_seen": 66146910, + "step": 3062, + "time_per_iteration": 2.5797624588012695 + }, + { + "auxiliary_loss_clip": 0.0116096, + "auxiliary_loss_mlp": 0.01154657, + "balance_loss_clip": 1.0020721, + "balance_loss_mlp": 1.00111508, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 2.2527454784093583, + "language_loss": 0.73227882, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.75543493, + "num_input_tokens_seen": 66165370, + "step": 3063, + "time_per_iteration": 2.568887710571289 + }, + { + "auxiliary_loss_clip": 0.0116113, + "auxiliary_loss_mlp": 0.01154226, + "balance_loss_clip": 1.00214648, + "balance_loss_mlp": 1.00097036, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 1.6412736159880834, + "language_loss": 0.65295517, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.67610878, + "num_input_tokens_seen": 66186210, + "step": 3064, + "time_per_iteration": 2.5858914852142334 + }, + { + "auxiliary_loss_clip": 0.01160682, + "auxiliary_loss_mlp": 0.01154432, + "balance_loss_clip": 1.00211048, + "balance_loss_mlp": 1.0011766, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.725842585891004, + "language_loss": 0.68646979, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70962089, + "num_input_tokens_seen": 66204800, + "step": 3065, + "time_per_iteration": 2.635742425918579 + }, + { + "auxiliary_loss_clip": 0.0114429, + "auxiliary_loss_mlp": 0.01154112, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.00095201, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 2.217908770331613, + "language_loss": 0.73032832, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.75331235, + "num_input_tokens_seen": 66222195, + "step": 3066, + "time_per_iteration": 2.666647434234619 + }, + { + "auxiliary_loss_clip": 0.01146211, + "auxiliary_loss_mlp": 0.01154479, + "balance_loss_clip": 1.00225699, + "balance_loss_mlp": 1.00103331, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 2.499560739986276, + "language_loss": 0.82125384, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84426075, + "num_input_tokens_seen": 66239505, + "step": 3067, + "time_per_iteration": 2.5846269130706787 + }, + { + "auxiliary_loss_clip": 0.01174062, + "auxiliary_loss_mlp": 0.00748095, + "balance_loss_clip": 1.00152421, + "balance_loss_mlp": 0.99966449, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7827331207932444, + "language_loss": 0.59730792, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61652946, + "num_input_tokens_seen": 66295695, + "step": 3068, + "time_per_iteration": 2.959101676940918 + }, + { + "auxiliary_loss_clip": 0.01144799, + "auxiliary_loss_mlp": 0.01154049, + "balance_loss_clip": 1.00228572, + "balance_loss_mlp": 1.00098479, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8226246963557073, + "language_loss": 0.76148456, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78447306, + "num_input_tokens_seen": 66315315, + "step": 3069, + "time_per_iteration": 2.6116623878479004 + }, + { + "auxiliary_loss_clip": 0.01161486, + "auxiliary_loss_mlp": 0.01154381, + "balance_loss_clip": 1.00226307, + "balance_loss_mlp": 1.00074434, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.6474563934587536, + "language_loss": 0.85119605, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.87435472, + "num_input_tokens_seen": 66333675, + "step": 3070, + "time_per_iteration": 2.57538104057312 + }, + { + "auxiliary_loss_clip": 0.01129378, + "auxiliary_loss_mlp": 0.01154767, + "balance_loss_clip": 1.00210071, + "balance_loss_mlp": 1.00093925, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 1.8805293256429132, + "language_loss": 0.7762109, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79905236, + "num_input_tokens_seen": 66354075, + "step": 3071, + "time_per_iteration": 2.6770246028900146 + }, + { + "auxiliary_loss_clip": 0.01129156, + "auxiliary_loss_mlp": 0.01154973, + "balance_loss_clip": 1.00225186, + "balance_loss_mlp": 1.00076401, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 1.991317151558303, + "language_loss": 0.76900524, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79184651, + "num_input_tokens_seen": 66372520, + "step": 3072, + "time_per_iteration": 2.6347081661224365 + }, + { + "auxiliary_loss_clip": 0.01161766, + "auxiliary_loss_mlp": 0.01154235, + "balance_loss_clip": 1.00212753, + "balance_loss_mlp": 1.00097978, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.8600747379513554, + "language_loss": 0.86232865, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88548875, + "num_input_tokens_seen": 66390745, + "step": 3073, + "time_per_iteration": 2.566329002380371 + }, + { + "auxiliary_loss_clip": 0.0117739, + "auxiliary_loss_mlp": 0.01154286, + "balance_loss_clip": 1.00231957, + "balance_loss_mlp": 1.00122094, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.2069870857869525, + "language_loss": 0.91582072, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.93913746, + "num_input_tokens_seen": 66410525, + "step": 3074, + "time_per_iteration": 2.563784599304199 + }, + { + "auxiliary_loss_clip": 0.01130855, + "auxiliary_loss_mlp": 0.01154537, + "balance_loss_clip": 1.00216508, + "balance_loss_mlp": 1.001091, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.7706432120649926, + "language_loss": 0.65063, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67348391, + "num_input_tokens_seen": 66432535, + "step": 3075, + "time_per_iteration": 2.7031092643737793 + }, + { + "auxiliary_loss_clip": 0.01144061, + "auxiliary_loss_mlp": 0.01154226, + "balance_loss_clip": 1.00218916, + "balance_loss_mlp": 1.00135159, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.9901421201411393, + "language_loss": 0.72539902, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74838185, + "num_input_tokens_seen": 66450620, + "step": 3076, + "time_per_iteration": 2.5792245864868164 + }, + { + "auxiliary_loss_clip": 0.01128319, + "auxiliary_loss_mlp": 0.01154202, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.000947, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.319890157161931, + "language_loss": 0.81149167, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83431697, + "num_input_tokens_seen": 66467865, + "step": 3077, + "time_per_iteration": 2.613947629928589 + }, + { + "auxiliary_loss_clip": 0.01145201, + "auxiliary_loss_mlp": 0.01154628, + "balance_loss_clip": 1.00221372, + "balance_loss_mlp": 1.00118208, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.6781305031309097, + "language_loss": 0.78968501, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.81268328, + "num_input_tokens_seen": 66486245, + "step": 3078, + "time_per_iteration": 2.6272313594818115 + }, + { + "auxiliary_loss_clip": 0.01177332, + "auxiliary_loss_mlp": 0.01154411, + "balance_loss_clip": 1.0023644, + "balance_loss_mlp": 1.00096464, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.8274052177923001, + "language_loss": 0.77667356, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79999101, + "num_input_tokens_seen": 66506510, + "step": 3079, + "time_per_iteration": 2.5555853843688965 + }, + { + "auxiliary_loss_clip": 0.01145496, + "auxiliary_loss_mlp": 0.01153902, + "balance_loss_clip": 1.00216603, + "balance_loss_mlp": 1.00093305, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 1.9563584331773065, + "language_loss": 0.81986749, + "learning_rate": 3.752665892369369e-06, + "loss": 0.84286153, + "num_input_tokens_seen": 66530960, + "step": 3080, + "time_per_iteration": 2.7894561290740967 + }, + { + "auxiliary_loss_clip": 0.0111278, + "auxiliary_loss_mlp": 0.01154104, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00084829, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 1.989331566692629, + "language_loss": 0.73832119, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76099002, + "num_input_tokens_seen": 66550275, + "step": 3081, + "time_per_iteration": 4.115151643753052 + }, + { + "auxiliary_loss_clip": 0.01144585, + "auxiliary_loss_mlp": 0.01154552, + "balance_loss_clip": 1.00213766, + "balance_loss_mlp": 1.00110555, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 1.954859102763673, + "language_loss": 0.71339905, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73639041, + "num_input_tokens_seen": 66569040, + "step": 3082, + "time_per_iteration": 4.11580491065979 + }, + { + "auxiliary_loss_clip": 0.0112787, + "auxiliary_loss_mlp": 0.01153907, + "balance_loss_clip": 1.00217175, + "balance_loss_mlp": 1.00084198, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 2.063128215003623, + "language_loss": 0.69645941, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71927714, + "num_input_tokens_seen": 66587775, + "step": 3083, + "time_per_iteration": 2.612100601196289 + }, + { + "auxiliary_loss_clip": 0.01145383, + "auxiliary_loss_mlp": 0.01153998, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00112414, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 1.835227146421941, + "language_loss": 0.68505526, + "learning_rate": 3.751914936806767e-06, + "loss": 0.70804906, + "num_input_tokens_seen": 66610800, + "step": 3084, + "time_per_iteration": 4.1655144691467285 + }, + { + "auxiliary_loss_clip": 0.01177207, + "auxiliary_loss_mlp": 0.01153384, + "balance_loss_clip": 1.0022893, + "balance_loss_mlp": 1.00079656, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.5765702004102922, + "language_loss": 0.77789849, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80120438, + "num_input_tokens_seen": 66630960, + "step": 3085, + "time_per_iteration": 2.5642025470733643 + }, + { + "auxiliary_loss_clip": 0.01177207, + "auxiliary_loss_mlp": 0.011544, + "balance_loss_clip": 1.0022099, + "balance_loss_mlp": 1.00124002, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.7652644038565015, + "language_loss": 0.73554564, + "learning_rate": 3.751539060400244e-06, + "loss": 0.75886172, + "num_input_tokens_seen": 66650585, + "step": 3086, + "time_per_iteration": 2.5704193115234375 + }, + { + "auxiliary_loss_clip": 0.0116065, + "auxiliary_loss_mlp": 0.01154037, + "balance_loss_clip": 1.00213766, + "balance_loss_mlp": 1.00116324, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 2.183965635132149, + "language_loss": 0.69797444, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72112131, + "num_input_tokens_seen": 66670045, + "step": 3087, + "time_per_iteration": 2.5725841522216797 + }, + { + "auxiliary_loss_clip": 0.01129287, + "auxiliary_loss_mlp": 0.01153934, + "balance_loss_clip": 1.00213587, + "balance_loss_mlp": 1.00105989, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 8.587342756627086, + "language_loss": 0.73212361, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.75495577, + "num_input_tokens_seen": 66688790, + "step": 3088, + "time_per_iteration": 2.6141104698181152 + }, + { + "auxiliary_loss_clip": 0.01146093, + "auxiliary_loss_mlp": 0.01153913, + "balance_loss_clip": 1.00216842, + "balance_loss_mlp": 1.00103879, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.120668591601745, + "language_loss": 0.91946197, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94246197, + "num_input_tokens_seen": 66708090, + "step": 3089, + "time_per_iteration": 2.6304385662078857 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01153692, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.00100911, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 3.5460817107423352, + "language_loss": 0.57973653, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.60241115, + "num_input_tokens_seen": 66727320, + "step": 3090, + "time_per_iteration": 2.80023455619812 + }, + { + "auxiliary_loss_clip": 0.01145659, + "auxiliary_loss_mlp": 0.01153662, + "balance_loss_clip": 1.00206804, + "balance_loss_mlp": 1.00097847, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.6969563893093325, + "language_loss": 0.8181268, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84111995, + "num_input_tokens_seen": 66747505, + "step": 3091, + "time_per_iteration": 2.7133004665374756 + }, + { + "auxiliary_loss_clip": 0.01098133, + "auxiliary_loss_mlp": 0.01154193, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00084174, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.4323968973445624, + "language_loss": 0.84201002, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.86453325, + "num_input_tokens_seen": 66766425, + "step": 3092, + "time_per_iteration": 2.746880054473877 + }, + { + "auxiliary_loss_clip": 0.01144607, + "auxiliary_loss_mlp": 0.01154066, + "balance_loss_clip": 1.00205898, + "balance_loss_mlp": 1.00090611, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 2.0991167454600954, + "language_loss": 0.92644817, + "learning_rate": 3.750221401168038e-06, + "loss": 0.94943488, + "num_input_tokens_seen": 66781130, + "step": 3093, + "time_per_iteration": 2.6593010425567627 + }, + { + "auxiliary_loss_clip": 0.01145757, + "auxiliary_loss_mlp": 0.01153556, + "balance_loss_clip": 1.00220084, + "balance_loss_mlp": 1.00096786, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.6110948887528362, + "language_loss": 0.77560556, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79859871, + "num_input_tokens_seen": 66797535, + "step": 3094, + "time_per_iteration": 2.596107006072998 + }, + { + "auxiliary_loss_clip": 0.01095542, + "auxiliary_loss_mlp": 0.01154003, + "balance_loss_clip": 1.00203288, + "balance_loss_mlp": 1.00112867, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.4689139092006231, + "language_loss": 0.6952945, + "learning_rate": 3.749844329677425e-06, + "loss": 0.71778989, + "num_input_tokens_seen": 66821720, + "step": 3095, + "time_per_iteration": 2.9811441898345947 + }, + { + "auxiliary_loss_clip": 0.0112966, + "auxiliary_loss_mlp": 0.01154223, + "balance_loss_clip": 1.00222206, + "balance_loss_mlp": 1.00106311, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.974017605141366, + "language_loss": 0.80882221, + "learning_rate": 3.749655694397135e-06, + "loss": 0.83166105, + "num_input_tokens_seen": 66839060, + "step": 3096, + "time_per_iteration": 2.642463445663452 + }, + { + "auxiliary_loss_clip": 0.01160888, + "auxiliary_loss_mlp": 0.01153978, + "balance_loss_clip": 1.00217557, + "balance_loss_mlp": 1.00100899, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 1.8556568755622387, + "language_loss": 0.75041437, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77356303, + "num_input_tokens_seen": 66857760, + "step": 3097, + "time_per_iteration": 2.581224203109741 + }, + { + "auxiliary_loss_clip": 0.01144027, + "auxiliary_loss_mlp": 0.01153887, + "balance_loss_clip": 1.00214267, + "balance_loss_mlp": 1.0010128, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.3047061376384885, + "language_loss": 0.66534245, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68832165, + "num_input_tokens_seen": 66876460, + "step": 3098, + "time_per_iteration": 2.5756008625030518 + }, + { + "auxiliary_loss_clip": 0.01177348, + "auxiliary_loss_mlp": 0.01154491, + "balance_loss_clip": 1.00227904, + "balance_loss_mlp": 1.00123501, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.5333327571649318, + "language_loss": 0.69673204, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72005039, + "num_input_tokens_seen": 66897960, + "step": 3099, + "time_per_iteration": 2.601970911026001 + }, + { + "auxiliary_loss_clip": 0.0116057, + "auxiliary_loss_mlp": 0.01154291, + "balance_loss_clip": 1.00218105, + "balance_loss_mlp": 1.00122631, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 3.0704660713919485, + "language_loss": 0.7167927, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73994136, + "num_input_tokens_seen": 66917675, + "step": 3100, + "time_per_iteration": 2.581172466278076 + }, + { + "auxiliary_loss_clip": 0.01145803, + "auxiliary_loss_mlp": 0.01154059, + "balance_loss_clip": 1.00222909, + "balance_loss_mlp": 1.00099456, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 1.653591952161636, + "language_loss": 0.80032206, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82332069, + "num_input_tokens_seen": 66936000, + "step": 3101, + "time_per_iteration": 2.6556320190429688 + }, + { + "auxiliary_loss_clip": 0.0112863, + "auxiliary_loss_mlp": 0.01153747, + "balance_loss_clip": 1.00209332, + "balance_loss_mlp": 1.00115955, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 2.139835901658982, + "language_loss": 0.76956677, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79239058, + "num_input_tokens_seen": 66955700, + "step": 3102, + "time_per_iteration": 2.683870553970337 + }, + { + "auxiliary_loss_clip": 0.01161644, + "auxiliary_loss_mlp": 0.01153554, + "balance_loss_clip": 1.00213408, + "balance_loss_mlp": 1.00087094, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.135531546876327, + "language_loss": 0.76824504, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.79139698, + "num_input_tokens_seen": 66972815, + "step": 3103, + "time_per_iteration": 2.537095546722412 + }, + { + "auxiliary_loss_clip": 0.01144102, + "auxiliary_loss_mlp": 0.01153765, + "balance_loss_clip": 1.00212836, + "balance_loss_mlp": 1.00089121, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.653695424716071, + "language_loss": 0.7872076, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81018633, + "num_input_tokens_seen": 66992280, + "step": 3104, + "time_per_iteration": 2.5754029750823975 + }, + { + "auxiliary_loss_clip": 0.01129662, + "auxiliary_loss_mlp": 0.0115379, + "balance_loss_clip": 1.00222731, + "balance_loss_mlp": 1.00101161, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.9123715992585333, + "language_loss": 0.85046226, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87329674, + "num_input_tokens_seen": 67012220, + "step": 3105, + "time_per_iteration": 2.665842294692993 + }, + { + "auxiliary_loss_clip": 0.01134097, + "auxiliary_loss_mlp": 0.01153987, + "balance_loss_clip": 1.00217533, + "balance_loss_mlp": 1.00101805, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.9843179417010386, + "language_loss": 0.86774278, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89062357, + "num_input_tokens_seen": 67032030, + "step": 3106, + "time_per_iteration": 2.689514398574829 + }, + { + "auxiliary_loss_clip": 0.011621, + "auxiliary_loss_mlp": 0.01154226, + "balance_loss_clip": 1.00240469, + "balance_loss_mlp": 1.00087547, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 2.1545277469093476, + "language_loss": 0.78045553, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80361879, + "num_input_tokens_seen": 67048920, + "step": 3107, + "time_per_iteration": 2.5652477741241455 + }, + { + "auxiliary_loss_clip": 0.01161313, + "auxiliary_loss_mlp": 0.01154036, + "balance_loss_clip": 1.00216651, + "balance_loss_mlp": 1.00125706, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 2.3568714029978786, + "language_loss": 0.73940337, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76255685, + "num_input_tokens_seen": 67068645, + "step": 3108, + "time_per_iteration": 2.622173547744751 + }, + { + "auxiliary_loss_clip": 0.01111536, + "auxiliary_loss_mlp": 0.01153659, + "balance_loss_clip": 1.00185609, + "balance_loss_mlp": 1.00107121, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6116758656024597, + "language_loss": 0.74320209, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76585406, + "num_input_tokens_seen": 67087075, + "step": 3109, + "time_per_iteration": 2.6959006786346436 + }, + { + "auxiliary_loss_clip": 0.01160629, + "auxiliary_loss_mlp": 0.01153858, + "balance_loss_clip": 1.00218153, + "balance_loss_mlp": 1.00088835, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.3893089938062175, + "language_loss": 0.84412211, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86726695, + "num_input_tokens_seen": 67108040, + "step": 3110, + "time_per_iteration": 2.600036382675171 + }, + { + "auxiliary_loss_clip": 0.01160485, + "auxiliary_loss_mlp": 0.01153634, + "balance_loss_clip": 1.0022105, + "balance_loss_mlp": 1.00095105, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.6700079353701516, + "language_loss": 0.84565413, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86879528, + "num_input_tokens_seen": 67127605, + "step": 3111, + "time_per_iteration": 2.597327947616577 + }, + { + "auxiliary_loss_clip": 0.01144364, + "auxiliary_loss_mlp": 0.01153945, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.00088072, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 2.030395637528924, + "language_loss": 0.76760703, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.79059011, + "num_input_tokens_seen": 67145785, + "step": 3112, + "time_per_iteration": 2.5906944274902344 + }, + { + "auxiliary_loss_clip": 0.01160607, + "auxiliary_loss_mlp": 0.01153401, + "balance_loss_clip": 1.00213099, + "balance_loss_mlp": 1.00100374, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.850270696664874, + "language_loss": 0.643502, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66664207, + "num_input_tokens_seen": 67165930, + "step": 3113, + "time_per_iteration": 2.6154301166534424 + }, + { + "auxiliary_loss_clip": 0.01161703, + "auxiliary_loss_mlp": 0.01153745, + "balance_loss_clip": 1.00224245, + "balance_loss_mlp": 1.00096595, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 1.9673824936902702, + "language_loss": 0.81379223, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83694673, + "num_input_tokens_seen": 67185830, + "step": 3114, + "time_per_iteration": 2.6167101860046387 + }, + { + "auxiliary_loss_clip": 0.01111983, + "auxiliary_loss_mlp": 0.01153535, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00094676, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.2370665828840353, + "language_loss": 0.5798521, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60250723, + "num_input_tokens_seen": 67206930, + "step": 3115, + "time_per_iteration": 2.7123055458068848 + }, + { + "auxiliary_loss_clip": 0.0113025, + "auxiliary_loss_mlp": 0.01154081, + "balance_loss_clip": 1.00217724, + "balance_loss_mlp": 1.00130248, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.932519172887217, + "language_loss": 0.70951068, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73235399, + "num_input_tokens_seen": 67226290, + "step": 3116, + "time_per_iteration": 2.641526699066162 + }, + { + "auxiliary_loss_clip": 0.011771, + "auxiliary_loss_mlp": 0.01153082, + "balance_loss_clip": 1.00225079, + "balance_loss_mlp": 1.00078058, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.918653932471556, + "language_loss": 0.78835964, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81166148, + "num_input_tokens_seen": 67244410, + "step": 3117, + "time_per_iteration": 2.490994453430176 + }, + { + "auxiliary_loss_clip": 0.01145104, + "auxiliary_loss_mlp": 0.01153819, + "balance_loss_clip": 1.00226164, + "balance_loss_mlp": 1.00123096, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.6962297480694177, + "language_loss": 0.84075618, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86374545, + "num_input_tokens_seen": 67264470, + "step": 3118, + "time_per_iteration": 2.73361873626709 + }, + { + "auxiliary_loss_clip": 0.01160524, + "auxiliary_loss_mlp": 0.01153354, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.00095665, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.6013379708251225, + "language_loss": 0.76319045, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78632927, + "num_input_tokens_seen": 67284315, + "step": 3119, + "time_per_iteration": 3.9350266456604004 + }, + { + "auxiliary_loss_clip": 0.01177177, + "auxiliary_loss_mlp": 0.01153539, + "balance_loss_clip": 1.00221443, + "balance_loss_mlp": 1.00114167, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.5296000676081607, + "language_loss": 0.82187068, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84517789, + "num_input_tokens_seen": 67302780, + "step": 3120, + "time_per_iteration": 3.8889644145965576 + }, + { + "auxiliary_loss_clip": 0.01145235, + "auxiliary_loss_mlp": 0.01153079, + "balance_loss_clip": 1.00216639, + "balance_loss_mlp": 1.00087285, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.5845286340798481, + "language_loss": 0.85037684, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.87335992, + "num_input_tokens_seen": 67323405, + "step": 3121, + "time_per_iteration": 2.690891742706299 + }, + { + "auxiliary_loss_clip": 0.01113868, + "auxiliary_loss_mlp": 0.01153429, + "balance_loss_clip": 1.00224829, + "balance_loss_mlp": 1.00084138, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 1.763842041685743, + "language_loss": 0.70392531, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72659826, + "num_input_tokens_seen": 67345800, + "step": 3122, + "time_per_iteration": 4.17210841178894 + }, + { + "auxiliary_loss_clip": 0.01177208, + "auxiliary_loss_mlp": 0.0115318, + "balance_loss_clip": 1.00228834, + "balance_loss_mlp": 1.00097322, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 9.607690842620984, + "language_loss": 0.70774472, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.73104858, + "num_input_tokens_seen": 67363575, + "step": 3123, + "time_per_iteration": 2.6893558502197266 + }, + { + "auxiliary_loss_clip": 0.0116584, + "auxiliary_loss_mlp": 0.01153223, + "balance_loss_clip": 1.00244832, + "balance_loss_mlp": 1.00111187, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.260374200431732, + "language_loss": 0.74084681, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76403743, + "num_input_tokens_seen": 67381765, + "step": 3124, + "time_per_iteration": 2.626067876815796 + }, + { + "auxiliary_loss_clip": 0.01177088, + "auxiliary_loss_mlp": 0.01152803, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00097823, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.9123916846544262, + "language_loss": 0.80683565, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.83013457, + "num_input_tokens_seen": 67405000, + "step": 3125, + "time_per_iteration": 2.670936107635498 + }, + { + "auxiliary_loss_clip": 0.01111555, + "auxiliary_loss_mlp": 0.0114276, + "balance_loss_clip": 1.00160956, + "balance_loss_mlp": 1.00009, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9515510138391299, + "language_loss": 0.63607752, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65862072, + "num_input_tokens_seen": 67467140, + "step": 3126, + "time_per_iteration": 3.280707359313965 + }, + { + "auxiliary_loss_clip": 0.01143833, + "auxiliary_loss_mlp": 0.0115321, + "balance_loss_clip": 1.00217783, + "balance_loss_mlp": 1.00090802, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.5731273945856747, + "language_loss": 0.81563294, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83860338, + "num_input_tokens_seen": 67487980, + "step": 3127, + "time_per_iteration": 2.6704137325286865 + }, + { + "auxiliary_loss_clip": 0.01174042, + "auxiliary_loss_mlp": 0.011429, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 1.00023031, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7661617688843874, + "language_loss": 0.61897862, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64214814, + "num_input_tokens_seen": 67552500, + "step": 3128, + "time_per_iteration": 3.1709980964660645 + }, + { + "auxiliary_loss_clip": 0.01114721, + "auxiliary_loss_mlp": 0.01153593, + "balance_loss_clip": 1.00200367, + "balance_loss_mlp": 1.00071907, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 1.8811857757071206, + "language_loss": 0.70959514, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73227823, + "num_input_tokens_seen": 67573295, + "step": 3129, + "time_per_iteration": 2.761338472366333 + }, + { + "auxiliary_loss_clip": 0.01177123, + "auxiliary_loss_mlp": 0.0115351, + "balance_loss_clip": 1.00224555, + "balance_loss_mlp": 1.00101781, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 1.7588141602580774, + "language_loss": 0.85499346, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87829983, + "num_input_tokens_seen": 67590010, + "step": 3130, + "time_per_iteration": 2.508319854736328 + }, + { + "auxiliary_loss_clip": 0.01128752, + "auxiliary_loss_mlp": 0.01153291, + "balance_loss_clip": 1.00207758, + "balance_loss_mlp": 1.00089359, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 2.0870470363024864, + "language_loss": 0.77006263, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.79288304, + "num_input_tokens_seen": 67611110, + "step": 3131, + "time_per_iteration": 2.695817232131958 + }, + { + "auxiliary_loss_clip": 0.01129096, + "auxiliary_loss_mlp": 0.011532, + "balance_loss_clip": 1.00208092, + "balance_loss_mlp": 1.00099409, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.7486760897462454, + "language_loss": 0.81336844, + "learning_rate": 3.74282069289017e-06, + "loss": 0.83619148, + "num_input_tokens_seen": 67631990, + "step": 3132, + "time_per_iteration": 2.7337639331817627 + }, + { + "auxiliary_loss_clip": 0.0111292, + "auxiliary_loss_mlp": 0.00748567, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00035334, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 1.854892771865091, + "language_loss": 0.79533923, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81395411, + "num_input_tokens_seen": 67650490, + "step": 3133, + "time_per_iteration": 2.7398500442504883 + }, + { + "auxiliary_loss_clip": 0.01129167, + "auxiliary_loss_mlp": 0.01152966, + "balance_loss_clip": 1.00207376, + "balance_loss_mlp": 1.0012362, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 1.7704661293732773, + "language_loss": 0.82728171, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85010308, + "num_input_tokens_seen": 67668860, + "step": 3134, + "time_per_iteration": 2.6989831924438477 + }, + { + "auxiliary_loss_clip": 0.01145924, + "auxiliary_loss_mlp": 0.01153471, + "balance_loss_clip": 1.00211716, + "balance_loss_mlp": 1.00116897, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.4001612434237138, + "language_loss": 0.82895172, + "learning_rate": 3.742247238639684e-06, + "loss": 0.85194558, + "num_input_tokens_seen": 67690220, + "step": 3135, + "time_per_iteration": 2.6546120643615723 + }, + { + "auxiliary_loss_clip": 0.01160634, + "auxiliary_loss_mlp": 0.01153204, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00090253, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.7697578159059233, + "language_loss": 0.7849341, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.80807251, + "num_input_tokens_seen": 67709820, + "step": 3136, + "time_per_iteration": 2.654787063598633 + }, + { + "auxiliary_loss_clip": 0.0115025, + "auxiliary_loss_mlp": 0.01154008, + "balance_loss_clip": 1.0023725, + "balance_loss_mlp": 1.00094318, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.773923059437938, + "language_loss": 0.81381202, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83685464, + "num_input_tokens_seen": 67729490, + "step": 3137, + "time_per_iteration": 2.656524181365967 + }, + { + "auxiliary_loss_clip": 0.01177501, + "auxiliary_loss_mlp": 0.01154097, + "balance_loss_clip": 1.00250411, + "balance_loss_mlp": 1.00103223, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.7768731244495242, + "language_loss": 0.80715299, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83046889, + "num_input_tokens_seen": 67749665, + "step": 3138, + "time_per_iteration": 2.5635507106781006 + }, + { + "auxiliary_loss_clip": 0.01160811, + "auxiliary_loss_mlp": 0.01153269, + "balance_loss_clip": 1.00214887, + "balance_loss_mlp": 1.0009675, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 1.8070946594820572, + "language_loss": 0.63569582, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.6588366, + "num_input_tokens_seen": 67776230, + "step": 3139, + "time_per_iteration": 2.7242870330810547 + }, + { + "auxiliary_loss_clip": 0.01177119, + "auxiliary_loss_mlp": 0.01153771, + "balance_loss_clip": 1.0022999, + "balance_loss_mlp": 1.00089717, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 3.6272143717531606, + "language_loss": 0.71722162, + "learning_rate": 3.741290160328514e-06, + "loss": 0.74053055, + "num_input_tokens_seen": 67795080, + "step": 3140, + "time_per_iteration": 2.5268800258636475 + }, + { + "auxiliary_loss_clip": 0.01177037, + "auxiliary_loss_mlp": 0.01153277, + "balance_loss_clip": 1.00213385, + "balance_loss_mlp": 1.00088, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 11.706943250474936, + "language_loss": 0.86998475, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.8932879, + "num_input_tokens_seen": 67813110, + "step": 3141, + "time_per_iteration": 2.4789810180664062 + }, + { + "auxiliary_loss_clip": 0.01146207, + "auxiliary_loss_mlp": 0.0115405, + "balance_loss_clip": 1.00220394, + "balance_loss_mlp": 1.00108051, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 1.748074251562166, + "language_loss": 0.77302575, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79602838, + "num_input_tokens_seen": 67831070, + "step": 3142, + "time_per_iteration": 2.563438653945923 + }, + { + "auxiliary_loss_clip": 0.01143792, + "auxiliary_loss_mlp": 0.01153369, + "balance_loss_clip": 1.00212276, + "balance_loss_mlp": 1.00097156, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 2.719452777366166, + "language_loss": 0.7868551, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80982673, + "num_input_tokens_seen": 67852170, + "step": 3143, + "time_per_iteration": 2.6499698162078857 + }, + { + "auxiliary_loss_clip": 0.0112855, + "auxiliary_loss_mlp": 0.01153912, + "balance_loss_clip": 1.00193727, + "balance_loss_mlp": 1.00113356, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 2.1090443325949844, + "language_loss": 0.71381533, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73663998, + "num_input_tokens_seen": 67869945, + "step": 3144, + "time_per_iteration": 2.6594178676605225 + }, + { + "auxiliary_loss_clip": 0.0114534, + "auxiliary_loss_mlp": 0.01153998, + "balance_loss_clip": 1.00226963, + "balance_loss_mlp": 1.00112414, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.4718284510751225, + "language_loss": 0.73774678, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.7607401, + "num_input_tokens_seen": 67890240, + "step": 3145, + "time_per_iteration": 2.6393978595733643 + }, + { + "auxiliary_loss_clip": 0.01129437, + "auxiliary_loss_mlp": 0.01153236, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00093436, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.4608304536055923, + "language_loss": 0.7652064, + "learning_rate": 3.740139487448616e-06, + "loss": 0.78803313, + "num_input_tokens_seen": 67907825, + "step": 3146, + "time_per_iteration": 2.622793674468994 + }, + { + "auxiliary_loss_clip": 0.01113352, + "auxiliary_loss_mlp": 0.01153854, + "balance_loss_clip": 1.00202477, + "balance_loss_mlp": 1.00117099, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 2.126124121747654, + "language_loss": 0.78674209, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80941415, + "num_input_tokens_seen": 67926670, + "step": 3147, + "time_per_iteration": 2.700117349624634 + }, + { + "auxiliary_loss_clip": 0.01160623, + "auxiliary_loss_mlp": 0.01153779, + "balance_loss_clip": 1.00221038, + "balance_loss_mlp": 1.00090516, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 2.4867173526604645, + "language_loss": 0.66786158, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69100559, + "num_input_tokens_seen": 67943645, + "step": 3148, + "time_per_iteration": 2.5865182876586914 + }, + { + "auxiliary_loss_clip": 0.0112911, + "auxiliary_loss_mlp": 0.01153263, + "balance_loss_clip": 1.00204849, + "balance_loss_mlp": 1.00067472, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 2.244549396485429, + "language_loss": 0.7590363, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78185999, + "num_input_tokens_seen": 67962345, + "step": 3149, + "time_per_iteration": 2.6557581424713135 + }, + { + "auxiliary_loss_clip": 0.01148882, + "auxiliary_loss_mlp": 0.01153176, + "balance_loss_clip": 1.00214207, + "balance_loss_mlp": 1.00116086, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.286094304654652, + "language_loss": 0.80710089, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83012146, + "num_input_tokens_seen": 67979760, + "step": 3150, + "time_per_iteration": 2.5693740844726562 + }, + { + "auxiliary_loss_clip": 0.01161984, + "auxiliary_loss_mlp": 0.01153654, + "balance_loss_clip": 1.00242448, + "balance_loss_mlp": 1.00116158, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 1.9770074286247896, + "language_loss": 0.85160995, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87476635, + "num_input_tokens_seen": 67996895, + "step": 3151, + "time_per_iteration": 2.5595548152923584 + }, + { + "auxiliary_loss_clip": 0.01127457, + "auxiliary_loss_mlp": 0.01153433, + "balance_loss_clip": 1.00202107, + "balance_loss_mlp": 1.0011313, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.6193429981458576, + "language_loss": 0.74280405, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76561296, + "num_input_tokens_seen": 68018365, + "step": 3152, + "time_per_iteration": 2.681119918823242 + }, + { + "auxiliary_loss_clip": 0.0113379, + "auxiliary_loss_mlp": 0.01153807, + "balance_loss_clip": 1.00222492, + "balance_loss_mlp": 1.00102878, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.7292429741718007, + "language_loss": 0.75420356, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77707958, + "num_input_tokens_seen": 68037985, + "step": 3153, + "time_per_iteration": 2.6787848472595215 + }, + { + "auxiliary_loss_clip": 0.01177168, + "auxiliary_loss_mlp": 0.01153223, + "balance_loss_clip": 1.00228715, + "balance_loss_mlp": 1.00082636, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.871299959455362, + "language_loss": 0.79262936, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81593329, + "num_input_tokens_seen": 68057975, + "step": 3154, + "time_per_iteration": 2.5318188667297363 + }, + { + "auxiliary_loss_clip": 0.01129113, + "auxiliary_loss_mlp": 0.01154086, + "balance_loss_clip": 1.00208831, + "balance_loss_mlp": 1.00111651, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.357625896092402, + "language_loss": 0.73261708, + "learning_rate": 3.738409024548223e-06, + "loss": 0.75544906, + "num_input_tokens_seen": 68074175, + "step": 3155, + "time_per_iteration": 2.662454843521118 + }, + { + "auxiliary_loss_clip": 0.01143577, + "auxiliary_loss_mlp": 0.01152975, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00086379, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.585246414895585, + "language_loss": 0.74204397, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76500946, + "num_input_tokens_seen": 68095230, + "step": 3156, + "time_per_iteration": 4.071953773498535 + }, + { + "auxiliary_loss_clip": 0.01177158, + "auxiliary_loss_mlp": 0.01153252, + "balance_loss_clip": 1.00227571, + "balance_loss_mlp": 1.00104523, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.6946168800425727, + "language_loss": 0.68224454, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70554864, + "num_input_tokens_seen": 68113805, + "step": 3157, + "time_per_iteration": 2.5295653343200684 + }, + { + "auxiliary_loss_clip": 0.01128257, + "auxiliary_loss_mlp": 0.01153744, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.00106084, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 2.200608803697092, + "language_loss": 0.79990363, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82272363, + "num_input_tokens_seen": 68133190, + "step": 3158, + "time_per_iteration": 2.6768243312835693 + }, + { + "auxiliary_loss_clip": 0.01177413, + "auxiliary_loss_mlp": 0.01154001, + "balance_loss_clip": 1.00234914, + "balance_loss_mlp": 1.00084066, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 2.0977337472710977, + "language_loss": 0.72366321, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74697733, + "num_input_tokens_seen": 68152330, + "step": 3159, + "time_per_iteration": 5.471362352371216 + }, + { + "auxiliary_loss_clip": 0.0116071, + "auxiliary_loss_mlp": 0.01154048, + "balance_loss_clip": 1.00230122, + "balance_loss_mlp": 1.00126886, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.8636924238464265, + "language_loss": 0.84923577, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.87238336, + "num_input_tokens_seen": 68170185, + "step": 3160, + "time_per_iteration": 3.932436227798462 + }, + { + "auxiliary_loss_clip": 0.01148951, + "auxiliary_loss_mlp": 0.0115333, + "balance_loss_clip": 1.00221694, + "balance_loss_mlp": 1.0012188, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.7449987855714768, + "language_loss": 0.7327581, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75578094, + "num_input_tokens_seen": 68191665, + "step": 3161, + "time_per_iteration": 2.646564245223999 + }, + { + "auxiliary_loss_clip": 0.01161397, + "auxiliary_loss_mlp": 0.0115318, + "balance_loss_clip": 1.00220942, + "balance_loss_mlp": 1.00097334, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.5542345242818147, + "language_loss": 0.80930018, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83244598, + "num_input_tokens_seen": 68214635, + "step": 3162, + "time_per_iteration": 2.6919450759887695 + }, + { + "auxiliary_loss_clip": 0.01177152, + "auxiliary_loss_mlp": 0.01153463, + "balance_loss_clip": 1.00232339, + "balance_loss_mlp": 1.00087547, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 2.3725761403840933, + "language_loss": 0.75022364, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77352977, + "num_input_tokens_seen": 68232150, + "step": 3163, + "time_per_iteration": 2.496166944503784 + }, + { + "auxiliary_loss_clip": 0.01111651, + "auxiliary_loss_mlp": 0.01152971, + "balance_loss_clip": 1.00207043, + "balance_loss_mlp": 1.00105047, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.600971657639097, + "language_loss": 0.74232996, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76497614, + "num_input_tokens_seen": 68253370, + "step": 3164, + "time_per_iteration": 2.806999444961548 + }, + { + "auxiliary_loss_clip": 0.01160665, + "auxiliary_loss_mlp": 0.01153452, + "balance_loss_clip": 1.00226176, + "balance_loss_mlp": 1.00076854, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.6736690101936307, + "language_loss": 0.6692543, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69239551, + "num_input_tokens_seen": 68278895, + "step": 3165, + "time_per_iteration": 2.931097984313965 + }, + { + "auxiliary_loss_clip": 0.01161576, + "auxiliary_loss_mlp": 0.01153599, + "balance_loss_clip": 1.00229716, + "balance_loss_mlp": 1.00101125, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 1.980651972015549, + "language_loss": 0.74315506, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76630676, + "num_input_tokens_seen": 68294880, + "step": 3166, + "time_per_iteration": 2.5504350662231445 + }, + { + "auxiliary_loss_clip": 0.01129778, + "auxiliary_loss_mlp": 0.01142329, + "balance_loss_clip": 1.00154555, + "balance_loss_mlp": 1.00042176, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.7792611965281935, + "language_loss": 0.50400782, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52672887, + "num_input_tokens_seen": 68359665, + "step": 3167, + "time_per_iteration": 3.2050087451934814 + }, + { + "auxiliary_loss_clip": 0.01160499, + "auxiliary_loss_mlp": 0.01152499, + "balance_loss_clip": 1.00209308, + "balance_loss_mlp": 1.00086522, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7943177366575018, + "language_loss": 0.74389076, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76702082, + "num_input_tokens_seen": 68378950, + "step": 3168, + "time_per_iteration": 2.5508365631103516 + }, + { + "auxiliary_loss_clip": 0.0111097, + "auxiliary_loss_mlp": 0.01142764, + "balance_loss_clip": 1.00148118, + "balance_loss_mlp": 1.00009477, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8565355665267563, + "language_loss": 0.60113829, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62367564, + "num_input_tokens_seen": 68434235, + "step": 3169, + "time_per_iteration": 3.140146255493164 + }, + { + "auxiliary_loss_clip": 0.01095898, + "auxiliary_loss_mlp": 0.01153753, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.00116563, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 2.437809033301261, + "language_loss": 0.78538454, + "learning_rate": 3.735513056633436e-06, + "loss": 0.807881, + "num_input_tokens_seen": 68453830, + "step": 3170, + "time_per_iteration": 2.7629618644714355 + }, + { + "auxiliary_loss_clip": 0.01160499, + "auxiliary_loss_mlp": 0.01153298, + "balance_loss_clip": 1.00217652, + "balance_loss_mlp": 1.0012821, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.6817811327284156, + "language_loss": 0.78051734, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80365533, + "num_input_tokens_seen": 68473005, + "step": 3171, + "time_per_iteration": 2.580183267593384 + }, + { + "auxiliary_loss_clip": 0.01177151, + "auxiliary_loss_mlp": 0.01153742, + "balance_loss_clip": 1.00223994, + "balance_loss_mlp": 1.00086761, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 2.2506072159978343, + "language_loss": 0.78364229, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80695117, + "num_input_tokens_seen": 68493470, + "step": 3172, + "time_per_iteration": 2.6125102043151855 + }, + { + "auxiliary_loss_clip": 0.01161117, + "auxiliary_loss_mlp": 0.01153353, + "balance_loss_clip": 1.00215256, + "balance_loss_mlp": 1.00124192, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.5279481393081418, + "language_loss": 0.80172431, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82486898, + "num_input_tokens_seen": 68511290, + "step": 3173, + "time_per_iteration": 2.549983263015747 + }, + { + "auxiliary_loss_clip": 0.01133994, + "auxiliary_loss_mlp": 0.00748658, + "balance_loss_clip": 1.00280964, + "balance_loss_mlp": 1.00034547, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.4782862404701103, + "language_loss": 0.78858763, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80741417, + "num_input_tokens_seen": 68532575, + "step": 3174, + "time_per_iteration": 2.7080326080322266 + }, + { + "auxiliary_loss_clip": 0.01112314, + "auxiliary_loss_mlp": 0.0115315, + "balance_loss_clip": 1.00199521, + "balance_loss_mlp": 1.00084829, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 2.114305373899257, + "language_loss": 0.80638921, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82904387, + "num_input_tokens_seen": 68548760, + "step": 3175, + "time_per_iteration": 2.633446216583252 + }, + { + "auxiliary_loss_clip": 0.01097386, + "auxiliary_loss_mlp": 0.01153799, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.00111556, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.1319529492677205, + "language_loss": 0.8592869, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.88179874, + "num_input_tokens_seen": 68563100, + "step": 3176, + "time_per_iteration": 2.7026853561401367 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01153699, + "balance_loss_clip": 1.00243306, + "balance_loss_mlp": 1.00111139, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 1.6997309227485782, + "language_loss": 0.80836618, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83139819, + "num_input_tokens_seen": 68581650, + "step": 3177, + "time_per_iteration": 2.6138248443603516 + }, + { + "auxiliary_loss_clip": 0.01145822, + "auxiliary_loss_mlp": 0.01153076, + "balance_loss_clip": 1.00214565, + "balance_loss_mlp": 1.00077415, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 2.100922254609632, + "language_loss": 0.74901557, + "learning_rate": 3.73396248424356e-06, + "loss": 0.77200454, + "num_input_tokens_seen": 68600360, + "step": 3178, + "time_per_iteration": 2.599912166595459 + }, + { + "auxiliary_loss_clip": 0.01160328, + "auxiliary_loss_mlp": 0.01152818, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.00099349, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 2.2392532059390278, + "language_loss": 0.81620514, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83933663, + "num_input_tokens_seen": 68617885, + "step": 3179, + "time_per_iteration": 2.5613925457000732 + }, + { + "auxiliary_loss_clip": 0.01160697, + "auxiliary_loss_mlp": 0.01153105, + "balance_loss_clip": 1.002352, + "balance_loss_mlp": 1.00099421, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 3.7135542543645266, + "language_loss": 0.79491365, + "learning_rate": 3.733574183478691e-06, + "loss": 0.8180517, + "num_input_tokens_seen": 68634550, + "step": 3180, + "time_per_iteration": 2.5269148349761963 + }, + { + "auxiliary_loss_clip": 0.01149096, + "auxiliary_loss_mlp": 0.01153004, + "balance_loss_clip": 1.00247169, + "balance_loss_mlp": 1.00098801, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.1879222681416866, + "language_loss": 0.79134536, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81436634, + "num_input_tokens_seen": 68651895, + "step": 3181, + "time_per_iteration": 2.5630714893341064 + }, + { + "auxiliary_loss_clip": 0.01160382, + "auxiliary_loss_mlp": 0.01153474, + "balance_loss_clip": 1.00216699, + "balance_loss_mlp": 1.00126767, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.765761246807188, + "language_loss": 0.74017179, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76331031, + "num_input_tokens_seen": 68671500, + "step": 3182, + "time_per_iteration": 2.5599253177642822 + }, + { + "auxiliary_loss_clip": 0.01128914, + "auxiliary_loss_mlp": 0.01152763, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00093806, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.6613228266458906, + "language_loss": 0.65084255, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67365932, + "num_input_tokens_seen": 68690570, + "step": 3183, + "time_per_iteration": 2.6090006828308105 + }, + { + "auxiliary_loss_clip": 0.01144811, + "auxiliary_loss_mlp": 0.01153751, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00125861, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.6610655095069613, + "language_loss": 0.73444217, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.75742781, + "num_input_tokens_seen": 68709735, + "step": 3184, + "time_per_iteration": 2.6518945693969727 + }, + { + "auxiliary_loss_clip": 0.01145089, + "auxiliary_loss_mlp": 0.01153476, + "balance_loss_clip": 1.00215352, + "balance_loss_mlp": 1.00088787, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 1.78349806153633, + "language_loss": 0.87890291, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90188861, + "num_input_tokens_seen": 68727565, + "step": 3185, + "time_per_iteration": 2.6095943450927734 + }, + { + "auxiliary_loss_clip": 0.01176929, + "auxiliary_loss_mlp": 0.01152814, + "balance_loss_clip": 1.00219584, + "balance_loss_mlp": 1.00089407, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.0205063786795856, + "language_loss": 0.72927356, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.75257105, + "num_input_tokens_seen": 68748110, + "step": 3186, + "time_per_iteration": 2.5440566539764404 + }, + { + "auxiliary_loss_clip": 0.0114417, + "auxiliary_loss_mlp": 0.01153399, + "balance_loss_clip": 1.00216937, + "balance_loss_mlp": 1.00090623, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 1.8666841055565104, + "language_loss": 0.8352735, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.85824919, + "num_input_tokens_seen": 68769765, + "step": 3187, + "time_per_iteration": 2.6851706504821777 + }, + { + "auxiliary_loss_clip": 0.01158356, + "auxiliary_loss_mlp": 0.01141887, + "balance_loss_clip": 1.00176764, + "balance_loss_mlp": 0.99997979, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8575925490697925, + "language_loss": 0.55743843, + "learning_rate": 3.732018351516544e-06, + "loss": 0.58044082, + "num_input_tokens_seen": 68826815, + "step": 3188, + "time_per_iteration": 3.213536262512207 + }, + { + "auxiliary_loss_clip": 0.01160645, + "auxiliary_loss_mlp": 0.01153477, + "balance_loss_clip": 1.00211501, + "balance_loss_mlp": 1.00117481, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.5597910851110315, + "language_loss": 0.70232177, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72546297, + "num_input_tokens_seen": 68847585, + "step": 3189, + "time_per_iteration": 2.643603801727295 + }, + { + "auxiliary_loss_clip": 0.01143652, + "auxiliary_loss_mlp": 0.01153092, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00088584, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 1.7197395524260732, + "language_loss": 0.74004346, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76301092, + "num_input_tokens_seen": 68866620, + "step": 3190, + "time_per_iteration": 2.6237800121307373 + }, + { + "auxiliary_loss_clip": 0.01129101, + "auxiliary_loss_mlp": 0.01153376, + "balance_loss_clip": 1.00206792, + "balance_loss_mlp": 1.00126457, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 1.9823172628859989, + "language_loss": 0.84227967, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86510444, + "num_input_tokens_seen": 68885515, + "step": 3191, + "time_per_iteration": 2.6232094764709473 + }, + { + "auxiliary_loss_clip": 0.01143593, + "auxiliary_loss_mlp": 0.01152646, + "balance_loss_clip": 1.00202179, + "balance_loss_mlp": 1.00072539, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.7475494533852742, + "language_loss": 0.89716363, + "learning_rate": 3.73123885901997e-06, + "loss": 0.92012596, + "num_input_tokens_seen": 68903225, + "step": 3192, + "time_per_iteration": 2.5986104011535645 + }, + { + "auxiliary_loss_clip": 0.0112823, + "auxiliary_loss_mlp": 0.01153235, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00093317, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 1.6718439524487352, + "language_loss": 0.75276625, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77558088, + "num_input_tokens_seen": 68922860, + "step": 3193, + "time_per_iteration": 2.6760778427124023 + }, + { + "auxiliary_loss_clip": 0.01144949, + "auxiliary_loss_mlp": 0.00748793, + "balance_loss_clip": 1.00209713, + "balance_loss_mlp": 1.00059259, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.5641411924965591, + "language_loss": 0.75019306, + "learning_rate": 3.730848718849612e-06, + "loss": 0.76913047, + "num_input_tokens_seen": 68943000, + "step": 3194, + "time_per_iteration": 2.6382100582122803 + }, + { + "auxiliary_loss_clip": 0.01156904, + "auxiliary_loss_mlp": 0.01142035, + "balance_loss_clip": 1.00169683, + "balance_loss_mlp": 1.00012863, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7836186850400041, + "language_loss": 0.6851474, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.7081368, + "num_input_tokens_seen": 69000255, + "step": 3195, + "time_per_iteration": 4.5121073722839355 + }, + { + "auxiliary_loss_clip": 0.01128825, + "auxiliary_loss_mlp": 0.01153137, + "balance_loss_clip": 1.00207293, + "balance_loss_mlp": 1.00083566, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 2.269144409482099, + "language_loss": 0.72809684, + "learning_rate": 3.730458316143429e-06, + "loss": 0.75091642, + "num_input_tokens_seen": 69019665, + "step": 3196, + "time_per_iteration": 4.03899073600769 + }, + { + "auxiliary_loss_clip": 0.01143875, + "auxiliary_loss_mlp": 0.01153181, + "balance_loss_clip": 1.00226498, + "balance_loss_mlp": 1.00097466, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.9268774210295507, + "language_loss": 0.83880585, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.86177641, + "num_input_tokens_seen": 69039055, + "step": 3197, + "time_per_iteration": 3.996534585952759 + }, + { + "auxiliary_loss_clip": 0.0109693, + "auxiliary_loss_mlp": 0.01153102, + "balance_loss_clip": 1.00200367, + "balance_loss_mlp": 1.00089526, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.8052770376756793, + "language_loss": 0.80213493, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82463527, + "num_input_tokens_seen": 69056370, + "step": 3198, + "time_per_iteration": 4.21234917640686 + }, + { + "auxiliary_loss_clip": 0.01143432, + "auxiliary_loss_mlp": 0.01153506, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00101364, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 2.2148128181205387, + "language_loss": 0.78732401, + "learning_rate": 3.729872219959029e-06, + "loss": 0.81029338, + "num_input_tokens_seen": 69075915, + "step": 3199, + "time_per_iteration": 2.6233785152435303 + }, + { + "auxiliary_loss_clip": 0.01134386, + "auxiliary_loss_mlp": 0.0115299, + "balance_loss_clip": 1.00286841, + "balance_loss_mlp": 1.00106943, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 1.951783034130456, + "language_loss": 0.83569419, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85856789, + "num_input_tokens_seen": 69094145, + "step": 3200, + "time_per_iteration": 2.6380057334899902 + }, + { + "auxiliary_loss_clip": 0.01177107, + "auxiliary_loss_mlp": 0.01153554, + "balance_loss_clip": 1.00241065, + "balance_loss_mlp": 1.00125182, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 2.4891569729105316, + "language_loss": 0.79325444, + "learning_rate": 3.729481161172443e-06, + "loss": 0.81656098, + "num_input_tokens_seen": 69111110, + "step": 3201, + "time_per_iteration": 2.505082368850708 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01152715, + "balance_loss_clip": 1.0019176, + "balance_loss_mlp": 1.00089025, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.2004986862894356, + "language_loss": 0.69674486, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71939397, + "num_input_tokens_seen": 69130280, + "step": 3202, + "time_per_iteration": 2.65956449508667 + }, + { + "auxiliary_loss_clip": 0.01144252, + "auxiliary_loss_mlp": 0.01151897, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00064397, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7322851881593553, + "language_loss": 0.90616786, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.9291293, + "num_input_tokens_seen": 69149570, + "step": 3203, + "time_per_iteration": 2.5938310623168945 + }, + { + "auxiliary_loss_clip": 0.01161544, + "auxiliary_loss_mlp": 0.01153388, + "balance_loss_clip": 1.00225854, + "balance_loss_mlp": 1.0008955, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.597083226358577, + "language_loss": 0.81853652, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.84168589, + "num_input_tokens_seen": 69168190, + "step": 3204, + "time_per_iteration": 2.5332772731781006 + }, + { + "auxiliary_loss_clip": 0.01127632, + "auxiliary_loss_mlp": 0.0115284, + "balance_loss_clip": 1.00207901, + "balance_loss_mlp": 1.00082421, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 1.8221280791491856, + "language_loss": 0.75458819, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77739292, + "num_input_tokens_seen": 69186950, + "step": 3205, + "time_per_iteration": 2.640683889389038 + }, + { + "auxiliary_loss_clip": 0.01144223, + "auxiliary_loss_mlp": 0.01153681, + "balance_loss_clip": 1.00216496, + "balance_loss_mlp": 1.00128376, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.5091501529495335, + "language_loss": 0.83166671, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85464573, + "num_input_tokens_seen": 69204850, + "step": 3206, + "time_per_iteration": 2.6221489906311035 + }, + { + "auxiliary_loss_clip": 0.01146314, + "auxiliary_loss_mlp": 0.01141255, + "balance_loss_clip": 1.00220728, + "balance_loss_mlp": 1.00011098, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8305173238846613, + "language_loss": 0.60678053, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62965626, + "num_input_tokens_seen": 69259200, + "step": 3207, + "time_per_iteration": 3.0075631141662598 + }, + { + "auxiliary_loss_clip": 0.01127739, + "auxiliary_loss_mlp": 0.01152915, + "balance_loss_clip": 1.00201249, + "balance_loss_mlp": 1.00128114, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.485060005239978, + "language_loss": 0.74975824, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77256471, + "num_input_tokens_seen": 69275835, + "step": 3208, + "time_per_iteration": 2.7090203762054443 + }, + { + "auxiliary_loss_clip": 0.01160555, + "auxiliary_loss_mlp": 0.00748751, + "balance_loss_clip": 1.00220346, + "balance_loss_mlp": 1.00051236, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.0083103410312457, + "language_loss": 0.60949761, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62859058, + "num_input_tokens_seen": 69294810, + "step": 3209, + "time_per_iteration": 2.5985896587371826 + }, + { + "auxiliary_loss_clip": 0.01177058, + "auxiliary_loss_mlp": 0.01153309, + "balance_loss_clip": 1.00229967, + "balance_loss_mlp": 1.00110316, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.8016314348126061, + "language_loss": 0.80098212, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82428586, + "num_input_tokens_seen": 69316065, + "step": 3210, + "time_per_iteration": 2.7230732440948486 + }, + { + "auxiliary_loss_clip": 0.01128831, + "auxiliary_loss_mlp": 0.01152834, + "balance_loss_clip": 1.00199032, + "balance_loss_mlp": 1.00100946, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.1533380219692164, + "language_loss": 0.82541615, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84823275, + "num_input_tokens_seen": 69332900, + "step": 3211, + "time_per_iteration": 2.671041965484619 + }, + { + "auxiliary_loss_clip": 0.01173778, + "auxiliary_loss_mlp": 0.01141466, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00032258, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9612705233697759, + "language_loss": 0.63667631, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65982878, + "num_input_tokens_seen": 69382535, + "step": 3212, + "time_per_iteration": 2.948801040649414 + }, + { + "auxiliary_loss_clip": 0.01144502, + "auxiliary_loss_mlp": 0.01152678, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00094843, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.5352669924664604, + "language_loss": 0.7627238, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78569561, + "num_input_tokens_seen": 69400600, + "step": 3213, + "time_per_iteration": 2.6157047748565674 + }, + { + "auxiliary_loss_clip": 0.0112823, + "auxiliary_loss_mlp": 0.01153027, + "balance_loss_clip": 1.00201845, + "balance_loss_mlp": 1.00101149, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 1.8972152452353126, + "language_loss": 0.71122897, + "learning_rate": 3.726932887459503e-06, + "loss": 0.73404157, + "num_input_tokens_seen": 69417350, + "step": 3214, + "time_per_iteration": 2.6496822834014893 + }, + { + "auxiliary_loss_clip": 0.01176997, + "auxiliary_loss_mlp": 0.01153047, + "balance_loss_clip": 1.00227368, + "balance_loss_mlp": 1.00103116, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.5026590912320636, + "language_loss": 0.75090528, + "learning_rate": 3.72673640779803e-06, + "loss": 0.7742058, + "num_input_tokens_seen": 69431845, + "step": 3215, + "time_per_iteration": 2.4877965450286865 + }, + { + "auxiliary_loss_clip": 0.01126516, + "auxiliary_loss_mlp": 0.01152751, + "balance_loss_clip": 1.00193799, + "balance_loss_mlp": 1.00111711, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 1.768132099280861, + "language_loss": 0.88572395, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90851659, + "num_input_tokens_seen": 69453275, + "step": 3216, + "time_per_iteration": 2.6627795696258545 + }, + { + "auxiliary_loss_clip": 0.0117701, + "auxiliary_loss_mlp": 0.0115311, + "balance_loss_clip": 1.00236499, + "balance_loss_mlp": 1.00128555, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 2.1551508562857116, + "language_loss": 0.79907835, + "learning_rate": 3.726343252048485e-06, + "loss": 0.82237959, + "num_input_tokens_seen": 69471830, + "step": 3217, + "time_per_iteration": 2.51993465423584 + }, + { + "auxiliary_loss_clip": 0.01145154, + "auxiliary_loss_mlp": 0.01153091, + "balance_loss_clip": 1.00215518, + "balance_loss_mlp": 1.00098026, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.22139586935219, + "language_loss": 0.61563885, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63862127, + "num_input_tokens_seen": 69489320, + "step": 3218, + "time_per_iteration": 2.5725231170654297 + }, + { + "auxiliary_loss_clip": 0.01177122, + "auxiliary_loss_mlp": 0.01152641, + "balance_loss_clip": 1.00243008, + "balance_loss_mlp": 1.00091147, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.560444486650763, + "language_loss": 0.80454791, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82784551, + "num_input_tokens_seen": 69506665, + "step": 3219, + "time_per_iteration": 2.5100717544555664 + }, + { + "auxiliary_loss_clip": 0.01112741, + "auxiliary_loss_mlp": 0.01152507, + "balance_loss_clip": 1.0019834, + "balance_loss_mlp": 1.00106394, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.0604640250126662, + "language_loss": 0.85832202, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.88097447, + "num_input_tokens_seen": 69523835, + "step": 3220, + "time_per_iteration": 2.647045373916626 + }, + { + "auxiliary_loss_clip": 0.01176642, + "auxiliary_loss_mlp": 0.01151815, + "balance_loss_clip": 1.00227976, + "balance_loss_mlp": 1.00084853, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.0746849465665926, + "language_loss": 0.83816922, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86145377, + "num_input_tokens_seen": 69542620, + "step": 3221, + "time_per_iteration": 2.539860486984253 + }, + { + "auxiliary_loss_clip": 0.01160929, + "auxiliary_loss_mlp": 0.01152599, + "balance_loss_clip": 1.0022831, + "balance_loss_mlp": 1.00115538, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.692514722186529, + "language_loss": 0.86479723, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88793254, + "num_input_tokens_seen": 69561130, + "step": 3222, + "time_per_iteration": 2.5404696464538574 + }, + { + "auxiliary_loss_clip": 0.01080379, + "auxiliary_loss_mlp": 0.01152463, + "balance_loss_clip": 1.00180924, + "balance_loss_mlp": 1.00073338, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 2.5954760568092405, + "language_loss": 0.78542447, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80775285, + "num_input_tokens_seen": 69580425, + "step": 3223, + "time_per_iteration": 2.8011279106140137 + }, + { + "auxiliary_loss_clip": 0.01161015, + "auxiliary_loss_mlp": 0.01152581, + "balance_loss_clip": 1.00232387, + "balance_loss_mlp": 1.00113702, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.9049595180865961, + "language_loss": 0.75519383, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77832979, + "num_input_tokens_seen": 69597085, + "step": 3224, + "time_per_iteration": 2.5411319732666016 + }, + { + "auxiliary_loss_clip": 0.01129315, + "auxiliary_loss_mlp": 0.01152705, + "balance_loss_clip": 1.00216949, + "balance_loss_mlp": 1.00097513, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.2357674783957635, + "language_loss": 0.70976567, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73258591, + "num_input_tokens_seen": 69618885, + "step": 3225, + "time_per_iteration": 2.841733455657959 + }, + { + "auxiliary_loss_clip": 0.01133262, + "auxiliary_loss_mlp": 0.01152207, + "balance_loss_clip": 1.00232852, + "balance_loss_mlp": 1.00085902, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.838063713706713, + "language_loss": 0.68992651, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71278119, + "num_input_tokens_seen": 69638200, + "step": 3226, + "time_per_iteration": 2.6774137020111084 + }, + { + "auxiliary_loss_clip": 0.0112867, + "auxiliary_loss_mlp": 0.01152786, + "balance_loss_clip": 1.00216293, + "balance_loss_mlp": 1.00105643, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.5930352433356811, + "language_loss": 0.76095402, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78376853, + "num_input_tokens_seen": 69657550, + "step": 3227, + "time_per_iteration": 2.665515422821045 + }, + { + "auxiliary_loss_clip": 0.0112802, + "auxiliary_loss_mlp": 0.01152555, + "balance_loss_clip": 1.00200307, + "balance_loss_mlp": 1.00101662, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.7684260916504355, + "language_loss": 0.69244462, + "learning_rate": 3.724176216414662e-06, + "loss": 0.71525037, + "num_input_tokens_seen": 69675005, + "step": 3228, + "time_per_iteration": 2.6127848625183105 + }, + { + "auxiliary_loss_clip": 0.01160918, + "auxiliary_loss_mlp": 0.01152278, + "balance_loss_clip": 1.0022738, + "balance_loss_mlp": 1.00112021, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 2.78178886575785, + "language_loss": 0.74153447, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76466644, + "num_input_tokens_seen": 69696455, + "step": 3229, + "time_per_iteration": 2.620704412460327 + }, + { + "auxiliary_loss_clip": 0.0112782, + "auxiliary_loss_mlp": 0.01152471, + "balance_loss_clip": 1.00206685, + "balance_loss_mlp": 1.00093198, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.7973449956184488, + "language_loss": 0.65499312, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67779601, + "num_input_tokens_seen": 69714245, + "step": 3230, + "time_per_iteration": 2.618875503540039 + }, + { + "auxiliary_loss_clip": 0.01143445, + "auxiliary_loss_mlp": 0.00748653, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.00049615, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.7553973126252231, + "language_loss": 0.81681406, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83573508, + "num_input_tokens_seen": 69731515, + "step": 3231, + "time_per_iteration": 2.5746638774871826 + }, + { + "auxiliary_loss_clip": 0.01144847, + "auxiliary_loss_mlp": 0.01152579, + "balance_loss_clip": 1.00221133, + "balance_loss_mlp": 1.00084901, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.6201310152483175, + "language_loss": 0.86776531, + "learning_rate": 3.72338624150555e-06, + "loss": 0.89073956, + "num_input_tokens_seen": 69748885, + "step": 3232, + "time_per_iteration": 4.075512647628784 + }, + { + "auxiliary_loss_clip": 0.01117718, + "auxiliary_loss_mlp": 0.0115282, + "balance_loss_clip": 1.00250745, + "balance_loss_mlp": 1.00109076, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 4.5803682499874085, + "language_loss": 0.85349137, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87619674, + "num_input_tokens_seen": 69767540, + "step": 3233, + "time_per_iteration": 2.681837558746338 + }, + { + "auxiliary_loss_clip": 0.01160559, + "auxiliary_loss_mlp": 0.01152901, + "balance_loss_clip": 1.00220942, + "balance_loss_mlp": 1.00098109, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 2.2866920036031173, + "language_loss": 0.89154983, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91468441, + "num_input_tokens_seen": 69789340, + "step": 3234, + "time_per_iteration": 4.0237181186676025 + }, + { + "auxiliary_loss_clip": 0.01161638, + "auxiliary_loss_mlp": 0.01152738, + "balance_loss_clip": 1.00229752, + "balance_loss_mlp": 1.00100875, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 3.447298903978057, + "language_loss": 0.78211415, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80525792, + "num_input_tokens_seen": 69806470, + "step": 3235, + "time_per_iteration": 4.0250139236450195 + }, + { + "auxiliary_loss_clip": 0.01143943, + "auxiliary_loss_mlp": 0.01152324, + "balance_loss_clip": 1.00218379, + "balance_loss_mlp": 1.00097549, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 1.8933312026051938, + "language_loss": 0.79269469, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81565738, + "num_input_tokens_seen": 69822655, + "step": 3236, + "time_per_iteration": 3.999934434890747 + }, + { + "auxiliary_loss_clip": 0.01176818, + "auxiliary_loss_mlp": 0.01152667, + "balance_loss_clip": 1.00232244, + "balance_loss_mlp": 1.0009377, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 1.6454908458354236, + "language_loss": 0.75696599, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78026086, + "num_input_tokens_seen": 69841895, + "step": 3237, + "time_per_iteration": 2.5381839275360107 + }, + { + "auxiliary_loss_clip": 0.0117687, + "auxiliary_loss_mlp": 0.01152837, + "balance_loss_clip": 1.00229096, + "balance_loss_mlp": 1.00110698, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 1.5177822086219175, + "language_loss": 0.75105757, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77435458, + "num_input_tokens_seen": 69862220, + "step": 3238, + "time_per_iteration": 2.5467376708984375 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01152099, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00103688, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 2.210626177391577, + "language_loss": 0.73782003, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.76046121, + "num_input_tokens_seen": 69881830, + "step": 3239, + "time_per_iteration": 2.6529295444488525 + }, + { + "auxiliary_loss_clip": 0.01144628, + "auxiliary_loss_mlp": 0.01151687, + "balance_loss_clip": 1.00221443, + "balance_loss_mlp": 1.00081515, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.5920259550347877, + "language_loss": 0.7366913, + "learning_rate": 3.721803155320412e-06, + "loss": 0.7596544, + "num_input_tokens_seen": 69900515, + "step": 3240, + "time_per_iteration": 2.5988874435424805 + }, + { + "auxiliary_loss_clip": 0.01143211, + "auxiliary_loss_mlp": 0.01152164, + "balance_loss_clip": 1.00204813, + "balance_loss_mlp": 1.00081575, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 2.5813944094389654, + "language_loss": 0.66478401, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68773776, + "num_input_tokens_seen": 69920060, + "step": 3241, + "time_per_iteration": 2.5869498252868652 + }, + { + "auxiliary_loss_clip": 0.01150208, + "auxiliary_loss_mlp": 0.01152623, + "balance_loss_clip": 1.0028379, + "balance_loss_mlp": 1.00108409, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4142789037164676, + "language_loss": 0.82990849, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85293686, + "num_input_tokens_seen": 69939820, + "step": 3242, + "time_per_iteration": 2.612431049346924 + }, + { + "auxiliary_loss_clip": 0.011737, + "auxiliary_loss_mlp": 0.01141522, + "balance_loss_clip": 1.00167131, + "balance_loss_mlp": 1.00037789, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.821474514961056, + "language_loss": 0.57489485, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59804708, + "num_input_tokens_seen": 70002145, + "step": 3243, + "time_per_iteration": 3.094306230545044 + }, + { + "auxiliary_loss_clip": 0.01161106, + "auxiliary_loss_mlp": 0.01152428, + "balance_loss_clip": 1.00214708, + "balance_loss_mlp": 1.00098479, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 1.8622853444189265, + "language_loss": 0.83492517, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85806048, + "num_input_tokens_seen": 70020510, + "step": 3244, + "time_per_iteration": 2.5565879344940186 + }, + { + "auxiliary_loss_clip": 0.01160354, + "auxiliary_loss_mlp": 0.01152397, + "balance_loss_clip": 1.00234151, + "balance_loss_mlp": 1.00123954, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 2.1481856093543756, + "language_loss": 0.77221489, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79534245, + "num_input_tokens_seen": 70040760, + "step": 3245, + "time_per_iteration": 2.55434513092041 + }, + { + "auxiliary_loss_clip": 0.0116013, + "auxiliary_loss_mlp": 0.01152457, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00101304, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.059241716281113, + "language_loss": 0.8417154, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86484128, + "num_input_tokens_seen": 70058720, + "step": 3246, + "time_per_iteration": 2.601813554763794 + }, + { + "auxiliary_loss_clip": 0.01160308, + "auxiliary_loss_mlp": 0.0074881, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00053215, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 1.8553937733521926, + "language_loss": 0.7618981, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78098929, + "num_input_tokens_seen": 70076470, + "step": 3247, + "time_per_iteration": 2.559303045272827 + }, + { + "auxiliary_loss_clip": 0.01128077, + "auxiliary_loss_mlp": 0.01152783, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00124419, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.6117393299560225, + "language_loss": 0.75909358, + "learning_rate": 3.720215890515421e-06, + "loss": 0.78190219, + "num_input_tokens_seen": 70096220, + "step": 3248, + "time_per_iteration": 2.7008347511291504 + }, + { + "auxiliary_loss_clip": 0.01176734, + "auxiliary_loss_mlp": 0.01152279, + "balance_loss_clip": 1.00220811, + "balance_loss_mlp": 1.00093079, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 3.0479339064487685, + "language_loss": 0.78345543, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80674559, + "num_input_tokens_seen": 70114800, + "step": 3249, + "time_per_iteration": 2.5494203567504883 + }, + { + "auxiliary_loss_clip": 0.01160885, + "auxiliary_loss_mlp": 0.01152174, + "balance_loss_clip": 1.00223804, + "balance_loss_mlp": 1.00092173, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.4322190645123876, + "language_loss": 0.73149884, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75462937, + "num_input_tokens_seen": 70134930, + "step": 3250, + "time_per_iteration": 2.5773112773895264 + }, + { + "auxiliary_loss_clip": 0.01117575, + "auxiliary_loss_mlp": 0.01151577, + "balance_loss_clip": 1.00251865, + "balance_loss_mlp": 1.00089669, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9525755257521091, + "language_loss": 0.79290783, + "learning_rate": 3.719619589699017e-06, + "loss": 0.81559932, + "num_input_tokens_seen": 70152045, + "step": 3251, + "time_per_iteration": 2.65492582321167 + }, + { + "auxiliary_loss_clip": 0.01176749, + "auxiliary_loss_mlp": 0.01152396, + "balance_loss_clip": 1.00216973, + "balance_loss_mlp": 1.00104833, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 2.051552297454547, + "language_loss": 0.8347888, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85808027, + "num_input_tokens_seen": 70169240, + "step": 3252, + "time_per_iteration": 2.4945850372314453 + }, + { + "auxiliary_loss_clip": 0.01161169, + "auxiliary_loss_mlp": 0.01152658, + "balance_loss_clip": 1.00207794, + "balance_loss_mlp": 1.00102389, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 2.0336978589488437, + "language_loss": 0.73307884, + "learning_rate": 3.719221729768117e-06, + "loss": 0.75621712, + "num_input_tokens_seen": 70192690, + "step": 3253, + "time_per_iteration": 2.645883798599243 + }, + { + "auxiliary_loss_clip": 0.01112721, + "auxiliary_loss_mlp": 0.01152286, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00103378, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.7791952956728652, + "language_loss": 0.7658056, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78845572, + "num_input_tokens_seen": 70209685, + "step": 3254, + "time_per_iteration": 2.701200008392334 + }, + { + "auxiliary_loss_clip": 0.01109079, + "auxiliary_loss_mlp": 0.01141217, + "balance_loss_clip": 1.00126934, + "balance_loss_mlp": 1.00007331, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7486700861323046, + "language_loss": 0.553352, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57585496, + "num_input_tokens_seen": 70265050, + "step": 3255, + "time_per_iteration": 3.2225852012634277 + }, + { + "auxiliary_loss_clip": 0.01143569, + "auxiliary_loss_mlp": 0.01152693, + "balance_loss_clip": 1.00218129, + "balance_loss_mlp": 1.00096381, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.225259909458568, + "language_loss": 0.70065504, + "learning_rate": 3.718624450942688e-06, + "loss": 0.72361761, + "num_input_tokens_seen": 70281830, + "step": 3256, + "time_per_iteration": 2.602548837661743 + }, + { + "auxiliary_loss_clip": 0.01176667, + "auxiliary_loss_mlp": 0.01152031, + "balance_loss_clip": 1.0022018, + "balance_loss_mlp": 1.00106478, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.1415454940904155, + "language_loss": 0.80473208, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82801902, + "num_input_tokens_seen": 70297420, + "step": 3257, + "time_per_iteration": 2.5124967098236084 + }, + { + "auxiliary_loss_clip": 0.01110433, + "auxiliary_loss_mlp": 0.01152102, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00094521, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 2.0195614486900286, + "language_loss": 0.75304562, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77567101, + "num_input_tokens_seen": 70319210, + "step": 3258, + "time_per_iteration": 2.7218129634857178 + }, + { + "auxiliary_loss_clip": 0.01096229, + "auxiliary_loss_mlp": 0.01152258, + "balance_loss_clip": 1.00185466, + "balance_loss_mlp": 1.00090981, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.7673981251961244, + "language_loss": 0.74193192, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.76441675, + "num_input_tokens_seen": 70339045, + "step": 3259, + "time_per_iteration": 2.7419474124908447 + }, + { + "auxiliary_loss_clip": 0.01143434, + "auxiliary_loss_mlp": 0.01152491, + "balance_loss_clip": 1.00203836, + "balance_loss_mlp": 1.00085711, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.7935962925751547, + "language_loss": 0.76783711, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.7907964, + "num_input_tokens_seen": 70356505, + "step": 3260, + "time_per_iteration": 2.563713788986206 + }, + { + "auxiliary_loss_clip": 0.01161412, + "auxiliary_loss_mlp": 0.01152172, + "balance_loss_clip": 1.00217104, + "balance_loss_mlp": 1.00101519, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.419625609057435, + "language_loss": 0.81676674, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.83990252, + "num_input_tokens_seen": 70375410, + "step": 3261, + "time_per_iteration": 2.565399408340454 + }, + { + "auxiliary_loss_clip": 0.01128109, + "auxiliary_loss_mlp": 0.0115268, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.0010457, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 1.8129982068531079, + "language_loss": 0.76641148, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78921938, + "num_input_tokens_seen": 70396315, + "step": 3262, + "time_per_iteration": 2.690892457962036 + }, + { + "auxiliary_loss_clip": 0.01160297, + "auxiliary_loss_mlp": 0.01152291, + "balance_loss_clip": 1.00216734, + "balance_loss_mlp": 1.0011338, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.5721840886708105, + "language_loss": 0.8642652, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88739109, + "num_input_tokens_seen": 70417945, + "step": 3263, + "time_per_iteration": 2.6079819202423096 + }, + { + "auxiliary_loss_clip": 0.01143446, + "auxiliary_loss_mlp": 0.01152197, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00094366, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.6194908668085888, + "language_loss": 0.74220896, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76516533, + "num_input_tokens_seen": 70438690, + "step": 3264, + "time_per_iteration": 2.6340172290802 + }, + { + "auxiliary_loss_clip": 0.01160196, + "auxiliary_loss_mlp": 0.01152104, + "balance_loss_clip": 1.00230002, + "balance_loss_mlp": 1.00104213, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 1.8090032185998908, + "language_loss": 0.78262043, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.80574346, + "num_input_tokens_seen": 70455385, + "step": 3265, + "time_per_iteration": 2.5564169883728027 + }, + { + "auxiliary_loss_clip": 0.01107755, + "auxiliary_loss_mlp": 0.01140537, + "balance_loss_clip": 1.00139046, + "balance_loss_mlp": 1.00015664, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.8127723858334414, + "language_loss": 0.53393137, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55641431, + "num_input_tokens_seen": 70514280, + "step": 3266, + "time_per_iteration": 3.2565078735351562 + }, + { + "auxiliary_loss_clip": 0.01144089, + "auxiliary_loss_mlp": 0.00748739, + "balance_loss_clip": 1.00217664, + "balance_loss_mlp": 1.00060558, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.8440788532012677, + "language_loss": 0.80055737, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.81948566, + "num_input_tokens_seen": 70531800, + "step": 3267, + "time_per_iteration": 2.603440999984741 + }, + { + "auxiliary_loss_clip": 0.01144813, + "auxiliary_loss_mlp": 0.01151874, + "balance_loss_clip": 1.00207388, + "balance_loss_mlp": 1.00090766, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 1.8273963142660812, + "language_loss": 0.86253405, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.88550091, + "num_input_tokens_seen": 70550615, + "step": 3268, + "time_per_iteration": 2.5981485843658447 + }, + { + "auxiliary_loss_clip": 0.01111881, + "auxiliary_loss_mlp": 0.01152132, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00097466, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.1715005275772024, + "language_loss": 0.69693363, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.7195738, + "num_input_tokens_seen": 70568690, + "step": 3269, + "time_per_iteration": 2.6719818115234375 + }, + { + "auxiliary_loss_clip": 0.0112916, + "auxiliary_loss_mlp": 0.01152582, + "balance_loss_clip": 1.00206339, + "balance_loss_mlp": 1.00123358, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 1.6724586788547762, + "language_loss": 0.80855501, + "learning_rate": 3.715829397778135e-06, + "loss": 0.83137238, + "num_input_tokens_seen": 70588665, + "step": 3270, + "time_per_iteration": 4.0727698802948 + }, + { + "auxiliary_loss_clip": 0.01159973, + "auxiliary_loss_mlp": 0.01151786, + "balance_loss_clip": 1.00205898, + "balance_loss_mlp": 1.00101042, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.8525227737428605, + "language_loss": 0.83660638, + "learning_rate": 3.715629262894028e-06, + "loss": 0.85972404, + "num_input_tokens_seen": 70606900, + "step": 3271, + "time_per_iteration": 2.5690650939941406 + }, + { + "auxiliary_loss_clip": 0.01159525, + "auxiliary_loss_mlp": 0.01152204, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00114155, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.379645064430558, + "language_loss": 0.80030859, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82342583, + "num_input_tokens_seen": 70625955, + "step": 3272, + "time_per_iteration": 5.405210256576538 + }, + { + "auxiliary_loss_clip": 0.01145562, + "auxiliary_loss_mlp": 0.01152509, + "balance_loss_clip": 1.00214076, + "balance_loss_mlp": 1.00096965, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.9538573456725232, + "language_loss": 0.80740881, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.83038956, + "num_input_tokens_seen": 70646090, + "step": 3273, + "time_per_iteration": 4.026473522186279 + }, + { + "auxiliary_loss_clip": 0.0115995, + "auxiliary_loss_mlp": 0.01152506, + "balance_loss_clip": 1.00206017, + "balance_loss_mlp": 1.00115824, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 1.6809981207495837, + "language_loss": 0.77635938, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.79948401, + "num_input_tokens_seen": 70666065, + "step": 3274, + "time_per_iteration": 2.592679500579834 + }, + { + "auxiliary_loss_clip": 0.01160192, + "auxiliary_loss_mlp": 0.01152681, + "balance_loss_clip": 1.00220323, + "balance_loss_mlp": 1.00104702, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.239239246201106, + "language_loss": 0.8108952, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83402395, + "num_input_tokens_seen": 70681580, + "step": 3275, + "time_per_iteration": 2.568537712097168 + }, + { + "auxiliary_loss_clip": 0.01128035, + "auxiliary_loss_mlp": 0.01151655, + "balance_loss_clip": 1.0020175, + "balance_loss_mlp": 1.00087905, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 1.8698636918130367, + "language_loss": 0.81068116, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83347809, + "num_input_tokens_seen": 70697745, + "step": 3276, + "time_per_iteration": 2.631453275680542 + }, + { + "auxiliary_loss_clip": 0.01159994, + "auxiliary_loss_mlp": 0.01151753, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00069118, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.127172298983972, + "language_loss": 0.89406681, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91718423, + "num_input_tokens_seen": 70715110, + "step": 3277, + "time_per_iteration": 2.604276180267334 + }, + { + "auxiliary_loss_clip": 0.01110131, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_clip": 1.0017426, + "balance_loss_mlp": 1.0009079, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.005836895963247, + "language_loss": 0.62428266, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64690751, + "num_input_tokens_seen": 70734715, + "step": 3278, + "time_per_iteration": 2.7990002632141113 + }, + { + "auxiliary_loss_clip": 0.01127962, + "auxiliary_loss_mlp": 0.01152117, + "balance_loss_clip": 1.00202119, + "balance_loss_mlp": 1.00115001, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 2.01135727568614, + "language_loss": 0.73609269, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75889349, + "num_input_tokens_seen": 70752650, + "step": 3279, + "time_per_iteration": 2.662637948989868 + }, + { + "auxiliary_loss_clip": 0.01161133, + "auxiliary_loss_mlp": 0.01151676, + "balance_loss_clip": 1.00208747, + "balance_loss_mlp": 1.00089967, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.5310169604208297, + "language_loss": 0.82961643, + "learning_rate": 3.713825122291061e-06, + "loss": 0.85274458, + "num_input_tokens_seen": 70772365, + "step": 3280, + "time_per_iteration": 2.6149401664733887 + }, + { + "auxiliary_loss_clip": 0.01127582, + "auxiliary_loss_mlp": 0.011521, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00094247, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.6463313540475708, + "language_loss": 0.77732772, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80012459, + "num_input_tokens_seen": 70790340, + "step": 3281, + "time_per_iteration": 2.6055028438568115 + }, + { + "auxiliary_loss_clip": 0.01144591, + "auxiliary_loss_mlp": 0.0115189, + "balance_loss_clip": 1.00220501, + "balance_loss_mlp": 1.00111449, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.6621576685039823, + "language_loss": 0.79675055, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81971538, + "num_input_tokens_seen": 70809295, + "step": 3282, + "time_per_iteration": 2.589832305908203 + }, + { + "auxiliary_loss_clip": 0.0111224, + "auxiliary_loss_mlp": 0.01152007, + "balance_loss_clip": 1.00211465, + "balance_loss_mlp": 1.00084925, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.1417073379771803, + "language_loss": 0.72050983, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.74315226, + "num_input_tokens_seen": 70828765, + "step": 3283, + "time_per_iteration": 2.697695016860962 + }, + { + "auxiliary_loss_clip": 0.01160631, + "auxiliary_loss_mlp": 0.01152249, + "balance_loss_clip": 1.00223279, + "balance_loss_mlp": 1.00090098, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.5928103055210838, + "language_loss": 0.78764617, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81077498, + "num_input_tokens_seen": 70846805, + "step": 3284, + "time_per_iteration": 2.538341522216797 + }, + { + "auxiliary_loss_clip": 0.01144555, + "auxiliary_loss_mlp": 0.00748872, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.00094891, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.136293394892777, + "language_loss": 0.86349362, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88242793, + "num_input_tokens_seen": 70863805, + "step": 3285, + "time_per_iteration": 2.6155316829681396 + }, + { + "auxiliary_loss_clip": 0.01134409, + "auxiliary_loss_mlp": 0.01151936, + "balance_loss_clip": 1.003263, + "balance_loss_mlp": 1.00115967, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.0245566583813055, + "language_loss": 0.87882978, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90169322, + "num_input_tokens_seen": 70882660, + "step": 3286, + "time_per_iteration": 2.6389753818511963 + }, + { + "auxiliary_loss_clip": 0.01127942, + "auxiliary_loss_mlp": 0.01151837, + "balance_loss_clip": 1.00209904, + "balance_loss_mlp": 1.00106072, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.487574105984847, + "language_loss": 0.77813071, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80092847, + "num_input_tokens_seen": 70898765, + "step": 3287, + "time_per_iteration": 2.658543109893799 + }, + { + "auxiliary_loss_clip": 0.01144123, + "auxiliary_loss_mlp": 0.01152722, + "balance_loss_clip": 1.00226831, + "balance_loss_mlp": 1.00089669, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.2281387395615253, + "language_loss": 0.81054604, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83351445, + "num_input_tokens_seen": 70916370, + "step": 3288, + "time_per_iteration": 2.590688705444336 + }, + { + "auxiliary_loss_clip": 0.01161023, + "auxiliary_loss_mlp": 0.01152156, + "balance_loss_clip": 1.00221157, + "balance_loss_mlp": 1.00118983, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.7642126894184265, + "language_loss": 0.72783279, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75096452, + "num_input_tokens_seen": 70934870, + "step": 3289, + "time_per_iteration": 2.5619661808013916 + }, + { + "auxiliary_loss_clip": 0.01143586, + "auxiliary_loss_mlp": 0.01152179, + "balance_loss_clip": 1.00202036, + "balance_loss_mlp": 1.00102198, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 1.8857173310758168, + "language_loss": 0.79273069, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81568825, + "num_input_tokens_seen": 70955140, + "step": 3290, + "time_per_iteration": 2.6470186710357666 + }, + { + "auxiliary_loss_clip": 0.01140368, + "auxiliary_loss_mlp": 0.01140638, + "balance_loss_clip": 1.00146914, + "balance_loss_mlp": 1.00025737, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.8983723836762083, + "language_loss": 0.6038574, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62666738, + "num_input_tokens_seen": 71012005, + "step": 3291, + "time_per_iteration": 3.1920578479766846 + }, + { + "auxiliary_loss_clip": 0.01176717, + "auxiliary_loss_mlp": 0.01152408, + "balance_loss_clip": 1.00217962, + "balance_loss_mlp": 1.00115514, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 1.7829013151157251, + "language_loss": 0.81794655, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.84123784, + "num_input_tokens_seen": 71031140, + "step": 3292, + "time_per_iteration": 2.5706844329833984 + }, + { + "auxiliary_loss_clip": 0.01144383, + "auxiliary_loss_mlp": 0.00748908, + "balance_loss_clip": 1.00212693, + "balance_loss_mlp": 1.00099814, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 1.8256008423132446, + "language_loss": 0.81873757, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.83767045, + "num_input_tokens_seen": 71050250, + "step": 3293, + "time_per_iteration": 2.6028032302856445 + }, + { + "auxiliary_loss_clip": 0.01145556, + "auxiliary_loss_mlp": 0.0115268, + "balance_loss_clip": 1.00219822, + "balance_loss_mlp": 1.00104594, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 1.662447340279467, + "language_loss": 0.61304235, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63602471, + "num_input_tokens_seen": 71068665, + "step": 3294, + "time_per_iteration": 2.593238592147827 + }, + { + "auxiliary_loss_clip": 0.01143598, + "auxiliary_loss_mlp": 0.01151698, + "balance_loss_clip": 1.0020473, + "balance_loss_mlp": 1.00092173, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.0607306822994347, + "language_loss": 0.8677367, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89068961, + "num_input_tokens_seen": 71085320, + "step": 3295, + "time_per_iteration": 2.5839855670928955 + }, + { + "auxiliary_loss_clip": 0.01129191, + "auxiliary_loss_mlp": 0.01151786, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00110579, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 4.252146335202724, + "language_loss": 0.80478406, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.8275938, + "num_input_tokens_seen": 71102020, + "step": 3296, + "time_per_iteration": 2.6077208518981934 + }, + { + "auxiliary_loss_clip": 0.01127911, + "auxiliary_loss_mlp": 0.01151884, + "balance_loss_clip": 1.00196052, + "balance_loss_mlp": 1.00072694, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 1.7701705173966376, + "language_loss": 0.68157458, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70437253, + "num_input_tokens_seen": 71123390, + "step": 3297, + "time_per_iteration": 2.7016799449920654 + }, + { + "auxiliary_loss_clip": 0.01176639, + "auxiliary_loss_mlp": 0.0115158, + "balance_loss_clip": 1.00226736, + "balance_loss_mlp": 1.00109053, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.5831940690125998, + "language_loss": 0.80926192, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83254409, + "num_input_tokens_seen": 71141800, + "step": 3298, + "time_per_iteration": 2.5257863998413086 + }, + { + "auxiliary_loss_clip": 0.01144802, + "auxiliary_loss_mlp": 0.01152075, + "balance_loss_clip": 1.00197911, + "balance_loss_mlp": 1.00082254, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.0480032427729165, + "language_loss": 0.853374, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87634277, + "num_input_tokens_seen": 71159505, + "step": 3299, + "time_per_iteration": 2.5796446800231934 + }, + { + "auxiliary_loss_clip": 0.01125994, + "auxiliary_loss_mlp": 0.01140666, + "balance_loss_clip": 1.00145197, + "balance_loss_mlp": 1.00028467, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7718855838125578, + "language_loss": 0.5323053, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55497187, + "num_input_tokens_seen": 71223265, + "step": 3300, + "time_per_iteration": 3.1577999591827393 + }, + { + "auxiliary_loss_clip": 0.0113447, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_clip": 1.00231934, + "balance_loss_mlp": 1.0011586, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.5297918168186593, + "language_loss": 0.73279506, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75566292, + "num_input_tokens_seen": 71242385, + "step": 3301, + "time_per_iteration": 2.646481513977051 + }, + { + "auxiliary_loss_clip": 0.01126693, + "auxiliary_loss_mlp": 0.01151969, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00090706, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.0972603732362707, + "language_loss": 0.87443256, + "learning_rate": 3.709392851040235e-06, + "loss": 0.89721924, + "num_input_tokens_seen": 71258990, + "step": 3302, + "time_per_iteration": 2.593940496444702 + }, + { + "auxiliary_loss_clip": 0.01127951, + "auxiliary_loss_mlp": 0.01151701, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00073433, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.7027802728501047, + "language_loss": 0.73629266, + "learning_rate": 3.709190638115111e-06, + "loss": 0.75908917, + "num_input_tokens_seen": 71282770, + "step": 3303, + "time_per_iteration": 2.826936721801758 + }, + { + "auxiliary_loss_clip": 0.01159761, + "auxiliary_loss_mlp": 0.01152, + "balance_loss_clip": 1.00211906, + "balance_loss_mlp": 1.00103343, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 2.0448927473954215, + "language_loss": 0.74599189, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.76910949, + "num_input_tokens_seen": 71301410, + "step": 3304, + "time_per_iteration": 2.6586952209472656 + }, + { + "auxiliary_loss_clip": 0.01144364, + "auxiliary_loss_mlp": 0.01151336, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00094199, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 1.6219343304931306, + "language_loss": 0.85802948, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88098645, + "num_input_tokens_seen": 71319670, + "step": 3305, + "time_per_iteration": 2.583268404006958 + }, + { + "auxiliary_loss_clip": 0.01144526, + "auxiliary_loss_mlp": 0.01151612, + "balance_loss_clip": 1.00202096, + "balance_loss_mlp": 1.0009315, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.6496509190003728, + "language_loss": 0.6847443, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70770562, + "num_input_tokens_seen": 71339850, + "step": 3306, + "time_per_iteration": 2.6182587146759033 + }, + { + "auxiliary_loss_clip": 0.01144764, + "auxiliary_loss_mlp": 0.01151788, + "balance_loss_clip": 1.0020926, + "balance_loss_mlp": 1.00091672, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.5838663603112382, + "language_loss": 0.76308274, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78604823, + "num_input_tokens_seen": 71359795, + "step": 3307, + "time_per_iteration": 4.0715491771698 + }, + { + "auxiliary_loss_clip": 0.0117662, + "auxiliary_loss_mlp": 0.01151414, + "balance_loss_clip": 1.0022614, + "balance_loss_mlp": 1.00121069, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 3.546836164684197, + "language_loss": 0.75948495, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78276527, + "num_input_tokens_seen": 71378885, + "step": 3308, + "time_per_iteration": 2.552823305130005 + }, + { + "auxiliary_loss_clip": 0.01117126, + "auxiliary_loss_mlp": 0.01152013, + "balance_loss_clip": 1.00228, + "balance_loss_mlp": 1.00085604, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.5579685803208647, + "language_loss": 0.75523102, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.77792245, + "num_input_tokens_seen": 71397285, + "step": 3309, + "time_per_iteration": 4.07866907119751 + }, + { + "auxiliary_loss_clip": 0.01159805, + "auxiliary_loss_mlp": 0.0115108, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.00106716, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.4736757551358777, + "language_loss": 0.87909442, + "learning_rate": 3.707773333313917e-06, + "loss": 0.90220326, + "num_input_tokens_seen": 71415775, + "step": 3310, + "time_per_iteration": 5.464327812194824 + }, + { + "auxiliary_loss_clip": 0.0117635, + "auxiliary_loss_mlp": 0.0115136, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.0007751, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 2.2867821735790907, + "language_loss": 0.64818966, + "learning_rate": 3.70757060210226e-06, + "loss": 0.67146683, + "num_input_tokens_seen": 71437315, + "step": 3311, + "time_per_iteration": 2.6194822788238525 + }, + { + "auxiliary_loss_clip": 0.01128627, + "auxiliary_loss_mlp": 0.011515, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.00081921, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.1605636334219995, + "language_loss": 0.74186367, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76466489, + "num_input_tokens_seen": 71456320, + "step": 3312, + "time_per_iteration": 2.655693531036377 + }, + { + "auxiliary_loss_clip": 0.01159581, + "auxiliary_loss_mlp": 0.01152032, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.000875, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.552052290476132, + "language_loss": 0.83499455, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.85811067, + "num_input_tokens_seen": 71475360, + "step": 3313, + "time_per_iteration": 2.547308921813965 + }, + { + "auxiliary_loss_clip": 0.01159928, + "auxiliary_loss_mlp": 0.01151669, + "balance_loss_clip": 1.00211918, + "balance_loss_mlp": 1.00098825, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 2.338122450464561, + "language_loss": 0.80474108, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.82785708, + "num_input_tokens_seen": 71496155, + "step": 3314, + "time_per_iteration": 2.628197431564331 + }, + { + "auxiliary_loss_clip": 0.01145524, + "auxiliary_loss_mlp": 0.0115077, + "balance_loss_clip": 1.00210285, + "balance_loss_mlp": 1.00094724, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.4647451000494214, + "language_loss": 0.87417549, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89713848, + "num_input_tokens_seen": 71517295, + "step": 3315, + "time_per_iteration": 2.6369283199310303 + }, + { + "auxiliary_loss_clip": 0.01128981, + "auxiliary_loss_mlp": 0.00748936, + "balance_loss_clip": 1.0021311, + "balance_loss_mlp": 1.00100994, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5272857481615016, + "language_loss": 0.70941663, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.72819579, + "num_input_tokens_seen": 71540000, + "step": 3316, + "time_per_iteration": 2.7281110286712646 + }, + { + "auxiliary_loss_clip": 0.01092458, + "auxiliary_loss_mlp": 0.01140571, + "balance_loss_clip": 1.00136614, + "balance_loss_mlp": 1.00019026, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8277639296927372, + "language_loss": 0.66342735, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68575764, + "num_input_tokens_seen": 71607880, + "step": 3317, + "time_per_iteration": 3.425846815109253 + }, + { + "auxiliary_loss_clip": 0.01160022, + "auxiliary_loss_mlp": 0.01151934, + "balance_loss_clip": 1.00211751, + "balance_loss_mlp": 1.00115776, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.064955385374543, + "language_loss": 0.74451458, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76763415, + "num_input_tokens_seen": 71625695, + "step": 3318, + "time_per_iteration": 2.85703182220459 + }, + { + "auxiliary_loss_clip": 0.01128355, + "auxiliary_loss_mlp": 0.01151437, + "balance_loss_clip": 1.00200617, + "balance_loss_mlp": 1.00104308, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.9222206928827363, + "language_loss": 0.78698295, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.80978084, + "num_input_tokens_seen": 71648520, + "step": 3319, + "time_per_iteration": 2.7842791080474854 + }, + { + "auxiliary_loss_clip": 0.01144042, + "auxiliary_loss_mlp": 0.01151478, + "balance_loss_clip": 1.00205588, + "balance_loss_mlp": 1.00079751, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.0488016814427574, + "language_loss": 0.75806701, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.78102219, + "num_input_tokens_seen": 71672185, + "step": 3320, + "time_per_iteration": 2.8542561531066895 + }, + { + "auxiliary_loss_clip": 0.01127962, + "auxiliary_loss_mlp": 0.01151362, + "balance_loss_clip": 1.00187314, + "balance_loss_mlp": 1.00096703, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 1.4381660831791252, + "language_loss": 0.80417103, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82696426, + "num_input_tokens_seen": 71692890, + "step": 3321, + "time_per_iteration": 2.6684718132019043 + }, + { + "auxiliary_loss_clip": 0.01126429, + "auxiliary_loss_mlp": 0.01139747, + "balance_loss_clip": 1.00150967, + "balance_loss_mlp": 1.00012875, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.8831032167668542, + "language_loss": 0.65163022, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67429197, + "num_input_tokens_seen": 71745815, + "step": 3322, + "time_per_iteration": 2.991543769836426 + }, + { + "auxiliary_loss_clip": 0.01141487, + "auxiliary_loss_mlp": 0.01140445, + "balance_loss_clip": 1.00222743, + "balance_loss_mlp": 1.00006413, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7812061867646066, + "language_loss": 0.56994426, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59276354, + "num_input_tokens_seen": 71806915, + "step": 3323, + "time_per_iteration": 3.273507833480835 + }, + { + "auxiliary_loss_clip": 0.01144472, + "auxiliary_loss_mlp": 0.00748846, + "balance_loss_clip": 1.00212109, + "balance_loss_mlp": 1.00092602, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 2.8748500688435663, + "language_loss": 0.80764103, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82657421, + "num_input_tokens_seen": 71824645, + "step": 3324, + "time_per_iteration": 2.7623291015625 + }, + { + "auxiliary_loss_clip": 0.01160892, + "auxiliary_loss_mlp": 0.01151983, + "balance_loss_clip": 1.00215816, + "balance_loss_mlp": 1.00092137, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.5057146470081264, + "language_loss": 0.5348959, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55802464, + "num_input_tokens_seen": 71845125, + "step": 3325, + "time_per_iteration": 2.626774311065674 + }, + { + "auxiliary_loss_clip": 0.01143501, + "auxiliary_loss_mlp": 0.01151868, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.00090182, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 1.8907986840251005, + "language_loss": 0.86176854, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88472223, + "num_input_tokens_seen": 71863500, + "step": 3326, + "time_per_iteration": 2.6055760383605957 + }, + { + "auxiliary_loss_clip": 0.01176565, + "auxiliary_loss_mlp": 0.01151428, + "balance_loss_clip": 1.00227642, + "balance_loss_mlp": 1.00103331, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.7566470146183568, + "language_loss": 0.72161543, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74489534, + "num_input_tokens_seen": 71881845, + "step": 3327, + "time_per_iteration": 2.524277448654175 + }, + { + "auxiliary_loss_clip": 0.0114335, + "auxiliary_loss_mlp": 0.01151536, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00095129, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.6848386467799505, + "language_loss": 0.76425403, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78720289, + "num_input_tokens_seen": 71900940, + "step": 3328, + "time_per_iteration": 2.6069414615631104 + }, + { + "auxiliary_loss_clip": 0.01145431, + "auxiliary_loss_mlp": 0.01150613, + "balance_loss_clip": 1.00212836, + "balance_loss_mlp": 1.00098133, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 1.6992318537619848, + "language_loss": 0.69524348, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.7182039, + "num_input_tokens_seen": 71921925, + "step": 3329, + "time_per_iteration": 2.6829869747161865 + }, + { + "auxiliary_loss_clip": 0.01134371, + "auxiliary_loss_mlp": 0.01151502, + "balance_loss_clip": 1.00253248, + "balance_loss_mlp": 1.00091672, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.6341537395507746, + "language_loss": 0.8122052, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83506393, + "num_input_tokens_seen": 71941855, + "step": 3330, + "time_per_iteration": 2.6697683334350586 + }, + { + "auxiliary_loss_clip": 0.01159987, + "auxiliary_loss_mlp": 0.01151164, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00076938, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.0317099736865125, + "language_loss": 0.76138461, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78449619, + "num_input_tokens_seen": 71960915, + "step": 3331, + "time_per_iteration": 2.556509494781494 + }, + { + "auxiliary_loss_clip": 0.01113357, + "auxiliary_loss_mlp": 0.01151557, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.00097203, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 2.3272284165260753, + "language_loss": 0.79462051, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81726968, + "num_input_tokens_seen": 71979220, + "step": 3332, + "time_per_iteration": 2.6448898315429688 + }, + { + "auxiliary_loss_clip": 0.01156677, + "auxiliary_loss_mlp": 0.01139828, + "balance_loss_clip": 1.00129879, + "balance_loss_mlp": 1.00021005, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9270863047436362, + "language_loss": 0.6198324, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64279747, + "num_input_tokens_seen": 72033950, + "step": 3333, + "time_per_iteration": 3.0126194953918457 + }, + { + "auxiliary_loss_clip": 0.01127291, + "auxiliary_loss_mlp": 0.00748909, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00087309, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 1.8406223651755238, + "language_loss": 0.80833071, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.82709277, + "num_input_tokens_seen": 72051395, + "step": 3334, + "time_per_iteration": 2.696101665496826 + }, + { + "auxiliary_loss_clip": 0.0109408, + "auxiliary_loss_mlp": 0.01151685, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.00100398, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 1.7246244057419065, + "language_loss": 0.73888183, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76133955, + "num_input_tokens_seen": 72071305, + "step": 3335, + "time_per_iteration": 2.774789571762085 + }, + { + "auxiliary_loss_clip": 0.01160615, + "auxiliary_loss_mlp": 0.01152382, + "balance_loss_clip": 1.00215173, + "balance_loss_mlp": 1.00151038, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.6676162546232554, + "language_loss": 0.79978931, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82291931, + "num_input_tokens_seen": 72090165, + "step": 3336, + "time_per_iteration": 2.581312656402588 + }, + { + "auxiliary_loss_clip": 0.01128474, + "auxiliary_loss_mlp": 0.01152145, + "balance_loss_clip": 1.00206828, + "balance_loss_mlp": 1.00098765, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 1.9723631091803147, + "language_loss": 0.77768266, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.80048883, + "num_input_tokens_seen": 72107210, + "step": 3337, + "time_per_iteration": 2.6406633853912354 + }, + { + "auxiliary_loss_clip": 0.01176645, + "auxiliary_loss_mlp": 0.01151904, + "balance_loss_clip": 1.00224519, + "balance_loss_mlp": 1.00093734, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 2.364154939613728, + "language_loss": 0.68664408, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.70992959, + "num_input_tokens_seen": 72126315, + "step": 3338, + "time_per_iteration": 2.5538599491119385 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.01152024, + "balance_loss_clip": 1.0026474, + "balance_loss_mlp": 1.001248, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 1.8077342924613244, + "language_loss": 0.68576783, + "learning_rate": 3.701867867326735e-06, + "loss": 0.70862377, + "num_input_tokens_seen": 72146470, + "step": 3339, + "time_per_iteration": 2.653907537460327 + }, + { + "auxiliary_loss_clip": 0.01113011, + "auxiliary_loss_mlp": 0.01151573, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.00089288, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.891351968321215, + "language_loss": 0.6630621, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68570793, + "num_input_tokens_seen": 72166600, + "step": 3340, + "time_per_iteration": 2.850391149520874 + }, + { + "auxiliary_loss_clip": 0.01159947, + "auxiliary_loss_mlp": 0.01151432, + "balance_loss_clip": 1.00213385, + "balance_loss_mlp": 1.00084662, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 1.9697528136140139, + "language_loss": 0.74265301, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76576674, + "num_input_tokens_seen": 72185160, + "step": 3341, + "time_per_iteration": 2.5563011169433594 + }, + { + "auxiliary_loss_clip": 0.01128503, + "auxiliary_loss_mlp": 0.0115105, + "balance_loss_clip": 1.00202835, + "balance_loss_mlp": 1.00103688, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 1.7503294384067252, + "language_loss": 0.71406651, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73686206, + "num_input_tokens_seen": 72205160, + "step": 3342, + "time_per_iteration": 2.6915485858917236 + }, + { + "auxiliary_loss_clip": 0.01117522, + "auxiliary_loss_mlp": 0.01151765, + "balance_loss_clip": 1.00227273, + "balance_loss_mlp": 1.00108457, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 1.8511152950918492, + "language_loss": 0.72610486, + "learning_rate": 3.701049056727384e-06, + "loss": 0.74879777, + "num_input_tokens_seen": 72223555, + "step": 3343, + "time_per_iteration": 2.682410717010498 + }, + { + "auxiliary_loss_clip": 0.01128761, + "auxiliary_loss_mlp": 0.01151928, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00115252, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 2.2145406325992125, + "language_loss": 0.80962312, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83243001, + "num_input_tokens_seen": 72242465, + "step": 3344, + "time_per_iteration": 4.056424140930176 + }, + { + "auxiliary_loss_clip": 0.01176546, + "auxiliary_loss_mlp": 0.01151239, + "balance_loss_clip": 1.00222766, + "balance_loss_mlp": 1.00074887, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.130609564434657, + "language_loss": 0.83412361, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85740137, + "num_input_tokens_seen": 72260655, + "step": 3345, + "time_per_iteration": 3.997098445892334 + }, + { + "auxiliary_loss_clip": 0.01112346, + "auxiliary_loss_mlp": 0.0115076, + "balance_loss_clip": 1.00196528, + "balance_loss_mlp": 1.00112855, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.5829248274264145, + "language_loss": 0.67872989, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.701361, + "num_input_tokens_seen": 72279055, + "step": 3346, + "time_per_iteration": 2.6809580326080322 + }, + { + "auxiliary_loss_clip": 0.01133048, + "auxiliary_loss_mlp": 0.01151472, + "balance_loss_clip": 1.00213027, + "balance_loss_mlp": 1.00107765, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.48711527548543, + "language_loss": 0.73668969, + "learning_rate": 3.70022921406487e-06, + "loss": 0.7595349, + "num_input_tokens_seen": 72297895, + "step": 3347, + "time_per_iteration": 5.512611150741577 + }, + { + "auxiliary_loss_clip": 0.01159508, + "auxiliary_loss_mlp": 0.01151701, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00102079, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.522769686164278, + "language_loss": 0.86750698, + "learning_rate": 3.70002409219765e-06, + "loss": 0.8906191, + "num_input_tokens_seen": 72318385, + "step": 3348, + "time_per_iteration": 2.555593967437744 + }, + { + "auxiliary_loss_clip": 0.01111018, + "auxiliary_loss_mlp": 0.01150878, + "balance_loss_clip": 1.00186861, + "balance_loss_mlp": 1.0008651, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.6146519592040935, + "language_loss": 0.71109962, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73371851, + "num_input_tokens_seen": 72338235, + "step": 3349, + "time_per_iteration": 2.6920278072357178 + }, + { + "auxiliary_loss_clip": 0.01127688, + "auxiliary_loss_mlp": 0.01151063, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00095487, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 3.333652453856202, + "language_loss": 0.70950282, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73229027, + "num_input_tokens_seen": 72357825, + "step": 3350, + "time_per_iteration": 2.623145580291748 + }, + { + "auxiliary_loss_clip": 0.01144926, + "auxiliary_loss_mlp": 0.01150976, + "balance_loss_clip": 1.00205839, + "balance_loss_mlp": 1.00067663, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 2.097091863013344, + "language_loss": 0.76030886, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78326786, + "num_input_tokens_seen": 72376335, + "step": 3351, + "time_per_iteration": 2.600390911102295 + }, + { + "auxiliary_loss_clip": 0.01144613, + "auxiliary_loss_mlp": 0.01151896, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00092947, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.5097959641380363, + "language_loss": 0.80574805, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82871306, + "num_input_tokens_seen": 72395440, + "step": 3352, + "time_per_iteration": 2.622530937194824 + }, + { + "auxiliary_loss_clip": 0.01160963, + "auxiliary_loss_mlp": 0.01151863, + "balance_loss_clip": 1.00216293, + "balance_loss_mlp": 1.00108683, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.7869093363639732, + "language_loss": 0.80482745, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82795572, + "num_input_tokens_seen": 72414670, + "step": 3353, + "time_per_iteration": 2.608760118484497 + }, + { + "auxiliary_loss_clip": 0.01142959, + "auxiliary_loss_mlp": 0.01151188, + "balance_loss_clip": 1.00207424, + "balance_loss_mlp": 1.00079346, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.729177804881854, + "language_loss": 0.89668262, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.91962409, + "num_input_tokens_seen": 72432210, + "step": 3354, + "time_per_iteration": 2.572842836380005 + }, + { + "auxiliary_loss_clip": 0.01139971, + "auxiliary_loss_mlp": 0.0074844, + "balance_loss_clip": 1.0012176, + "balance_loss_mlp": 1.00050485, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8403353110551698, + "language_loss": 0.55924773, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57813179, + "num_input_tokens_seen": 72489225, + "step": 3355, + "time_per_iteration": 3.1123344898223877 + }, + { + "auxiliary_loss_clip": 0.01144142, + "auxiliary_loss_mlp": 0.00748918, + "balance_loss_clip": 1.00202096, + "balance_loss_mlp": 1.00097334, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.5236298058666216, + "language_loss": 0.842888, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86181867, + "num_input_tokens_seen": 72508715, + "step": 3356, + "time_per_iteration": 2.609902858734131 + }, + { + "auxiliary_loss_clip": 0.01145475, + "auxiliary_loss_mlp": 0.0115265, + "balance_loss_clip": 1.00217175, + "balance_loss_mlp": 1.00092006, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.523848706576521, + "language_loss": 0.69344205, + "learning_rate": 3.698175095398085e-06, + "loss": 0.71642327, + "num_input_tokens_seen": 72525135, + "step": 3357, + "time_per_iteration": 2.5904831886291504 + }, + { + "auxiliary_loss_clip": 0.01148531, + "auxiliary_loss_mlp": 0.01151039, + "balance_loss_clip": 1.00219369, + "balance_loss_mlp": 1.00083542, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.8469821366389192, + "language_loss": 0.71788061, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.74087632, + "num_input_tokens_seen": 72543690, + "step": 3358, + "time_per_iteration": 2.5610883235931396 + }, + { + "auxiliary_loss_clip": 0.01160873, + "auxiliary_loss_mlp": 0.01151409, + "balance_loss_clip": 1.00214362, + "balance_loss_mlp": 1.00139618, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 1.704017937123952, + "language_loss": 0.83593374, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85905659, + "num_input_tokens_seen": 72560725, + "step": 3359, + "time_per_iteration": 2.5255117416381836 + }, + { + "auxiliary_loss_clip": 0.01157504, + "auxiliary_loss_mlp": 0.01139829, + "balance_loss_clip": 1.00127709, + "balance_loss_mlp": 1.00021136, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.7740628793216475, + "language_loss": 0.58955944, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61253273, + "num_input_tokens_seen": 72621940, + "step": 3360, + "time_per_iteration": 3.063840866088867 + }, + { + "auxiliary_loss_clip": 0.01098591, + "auxiliary_loss_mlp": 0.01151455, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00096488, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 1.9239560925809325, + "language_loss": 0.61792505, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64042544, + "num_input_tokens_seen": 72639135, + "step": 3361, + "time_per_iteration": 2.706505298614502 + }, + { + "auxiliary_loss_clip": 0.01144822, + "auxiliary_loss_mlp": 0.01152167, + "balance_loss_clip": 1.00231814, + "balance_loss_mlp": 1.00120068, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 1.866944716945426, + "language_loss": 0.75660759, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77957749, + "num_input_tokens_seen": 72658525, + "step": 3362, + "time_per_iteration": 2.6337685585021973 + }, + { + "auxiliary_loss_clip": 0.01160972, + "auxiliary_loss_mlp": 0.00749008, + "balance_loss_clip": 1.002231, + "balance_loss_mlp": 1.00113857, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.6997235340659314, + "language_loss": 0.76358181, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78268164, + "num_input_tokens_seen": 72678085, + "step": 3363, + "time_per_iteration": 2.55328106880188 + }, + { + "auxiliary_loss_clip": 0.01159806, + "auxiliary_loss_mlp": 0.01151946, + "balance_loss_clip": 1.00214994, + "balance_loss_mlp": 1.00116992, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 1.5508755004344519, + "language_loss": 0.74994612, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77306366, + "num_input_tokens_seen": 72698695, + "step": 3364, + "time_per_iteration": 2.6529054641723633 + }, + { + "auxiliary_loss_clip": 0.01111526, + "auxiliary_loss_mlp": 0.01152172, + "balance_loss_clip": 1.00211036, + "balance_loss_mlp": 1.00101447, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 4.419630709797404, + "language_loss": 0.71814775, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.74078476, + "num_input_tokens_seen": 72717880, + "step": 3365, + "time_per_iteration": 2.6866559982299805 + }, + { + "auxiliary_loss_clip": 0.01127904, + "auxiliary_loss_mlp": 0.01152053, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00099111, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 1.8434253938826257, + "language_loss": 0.85996568, + "learning_rate": 3.696320882607286e-06, + "loss": 0.88276529, + "num_input_tokens_seen": 72736410, + "step": 3366, + "time_per_iteration": 2.7103776931762695 + }, + { + "auxiliary_loss_clip": 0.011275, + "auxiliary_loss_mlp": 0.01151153, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.00085366, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 2.480153509506235, + "language_loss": 0.69426042, + "learning_rate": 3.696114537236335e-06, + "loss": 0.71704698, + "num_input_tokens_seen": 72758295, + "step": 3367, + "time_per_iteration": 2.739346504211426 + }, + { + "auxiliary_loss_clip": 0.01160954, + "auxiliary_loss_mlp": 0.01151815, + "balance_loss_clip": 1.00204825, + "balance_loss_mlp": 1.00094378, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 1.681157047972289, + "language_loss": 0.6855318, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70865953, + "num_input_tokens_seen": 72782495, + "step": 3368, + "time_per_iteration": 2.67954421043396 + }, + { + "auxiliary_loss_clip": 0.01127165, + "auxiliary_loss_mlp": 0.01151236, + "balance_loss_clip": 1.00200295, + "balance_loss_mlp": 1.00093699, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 1.7815484433133, + "language_loss": 0.76989603, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79268003, + "num_input_tokens_seen": 72801885, + "step": 3369, + "time_per_iteration": 2.6473021507263184 + }, + { + "auxiliary_loss_clip": 0.0114969, + "auxiliary_loss_mlp": 0.01152006, + "balance_loss_clip": 1.00236392, + "balance_loss_mlp": 1.00132561, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 2.5797007255577706, + "language_loss": 0.65350968, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67652667, + "num_input_tokens_seen": 72816990, + "step": 3370, + "time_per_iteration": 2.5709972381591797 + }, + { + "auxiliary_loss_clip": 0.0115601, + "auxiliary_loss_mlp": 0.01139806, + "balance_loss_clip": 1.00120282, + "balance_loss_mlp": 1.00018823, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6755420933779256, + "language_loss": 0.5818404, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60479856, + "num_input_tokens_seen": 72879240, + "step": 3371, + "time_per_iteration": 3.216069221496582 + }, + { + "auxiliary_loss_clip": 0.01132107, + "auxiliary_loss_mlp": 0.01151224, + "balance_loss_clip": 1.0022583, + "balance_loss_mlp": 1.00092506, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.5040568515718873, + "language_loss": 0.91594291, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.93877625, + "num_input_tokens_seen": 72899030, + "step": 3372, + "time_per_iteration": 2.680393934249878 + }, + { + "auxiliary_loss_clip": 0.01160727, + "auxiliary_loss_mlp": 0.01151656, + "balance_loss_clip": 1.00210428, + "balance_loss_mlp": 1.00107038, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.5540275389973348, + "language_loss": 0.78608239, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80920625, + "num_input_tokens_seen": 72919190, + "step": 3373, + "time_per_iteration": 2.596902370452881 + }, + { + "auxiliary_loss_clip": 0.01111947, + "auxiliary_loss_mlp": 0.01150633, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00100195, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 2.056258034316925, + "language_loss": 0.71289247, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73551828, + "num_input_tokens_seen": 72939720, + "step": 3374, + "time_per_iteration": 2.7840640544891357 + }, + { + "auxiliary_loss_clip": 0.01146353, + "auxiliary_loss_mlp": 0.01139737, + "balance_loss_clip": 1.00126362, + "balance_loss_mlp": 1.00011909, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9950523304356648, + "language_loss": 0.62489742, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64775831, + "num_input_tokens_seen": 73000015, + "step": 3375, + "time_per_iteration": 3.1349573135375977 + }, + { + "auxiliary_loss_clip": 0.01176198, + "auxiliary_loss_mlp": 0.01151312, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00120342, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.5389845652313294, + "language_loss": 0.82480246, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84807754, + "num_input_tokens_seen": 73017675, + "step": 3376, + "time_per_iteration": 2.5122692584991455 + }, + { + "auxiliary_loss_clip": 0.01160922, + "auxiliary_loss_mlp": 0.01151448, + "balance_loss_clip": 1.0020833, + "balance_loss_mlp": 1.00086236, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.13377319057908, + "language_loss": 0.81698275, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.84010643, + "num_input_tokens_seen": 73036135, + "step": 3377, + "time_per_iteration": 2.6252665519714355 + }, + { + "auxiliary_loss_clip": 0.01143077, + "auxiliary_loss_mlp": 0.01151567, + "balance_loss_clip": 1.00208998, + "balance_loss_mlp": 1.00117254, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 1.7934789497861763, + "language_loss": 0.76590759, + "learning_rate": 3.69384049496805e-06, + "loss": 0.788854, + "num_input_tokens_seen": 73054075, + "step": 3378, + "time_per_iteration": 2.6153674125671387 + }, + { + "auxiliary_loss_clip": 0.01100436, + "auxiliary_loss_mlp": 0.01151371, + "balance_loss_clip": 1.00216532, + "balance_loss_mlp": 1.0008812, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 2.94290271813784, + "language_loss": 0.79983407, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.82235217, + "num_input_tokens_seen": 73073530, + "step": 3379, + "time_per_iteration": 2.719417095184326 + }, + { + "auxiliary_loss_clip": 0.01159354, + "auxiliary_loss_mlp": 0.01150948, + "balance_loss_clip": 1.00206101, + "balance_loss_mlp": 1.0009352, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.705759948030176, + "language_loss": 0.86395788, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.88706088, + "num_input_tokens_seen": 73092820, + "step": 3380, + "time_per_iteration": 2.5827908515930176 + }, + { + "auxiliary_loss_clip": 0.01176403, + "auxiliary_loss_mlp": 0.01151049, + "balance_loss_clip": 1.00234067, + "balance_loss_mlp": 1.00103569, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 2.939034377984344, + "language_loss": 0.74545085, + "learning_rate": 3.693218952340186e-06, + "loss": 0.7687254, + "num_input_tokens_seen": 73113385, + "step": 3381, + "time_per_iteration": 2.552224636077881 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01151646, + "balance_loss_clip": 1.00219941, + "balance_loss_mlp": 1.00134742, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.6083233395066598, + "language_loss": 0.79397678, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.8169508, + "num_input_tokens_seen": 73131195, + "step": 3382, + "time_per_iteration": 3.9600021839141846 + }, + { + "auxiliary_loss_clip": 0.01127385, + "auxiliary_loss_mlp": 0.00749021, + "balance_loss_clip": 1.00199926, + "balance_loss_mlp": 1.0010705, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 2.0394356608537527, + "language_loss": 0.80037189, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.8191359, + "num_input_tokens_seen": 73148850, + "step": 3383, + "time_per_iteration": 4.018149137496948 + }, + { + "auxiliary_loss_clip": 0.01128748, + "auxiliary_loss_mlp": 0.01150911, + "balance_loss_clip": 1.00209379, + "balance_loss_mlp": 1.00080299, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 3.7274298184001196, + "language_loss": 0.7441678, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76696444, + "num_input_tokens_seen": 73166775, + "step": 3384, + "time_per_iteration": 2.650752067565918 + }, + { + "auxiliary_loss_clip": 0.01160072, + "auxiliary_loss_mlp": 0.01151823, + "balance_loss_clip": 1.0022372, + "balance_loss_mlp": 1.00095189, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.2042210065508354, + "language_loss": 0.76063037, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78374928, + "num_input_tokens_seen": 73183215, + "step": 3385, + "time_per_iteration": 5.366784334182739 + }, + { + "auxiliary_loss_clip": 0.01111215, + "auxiliary_loss_mlp": 0.01151987, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00130653, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.4600893010119786, + "language_loss": 0.68335855, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70599055, + "num_input_tokens_seen": 73203290, + "step": 3386, + "time_per_iteration": 2.676884889602661 + }, + { + "auxiliary_loss_clip": 0.01111285, + "auxiliary_loss_mlp": 0.01151478, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00136936, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.2992476223512175, + "language_loss": 0.80852735, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83115494, + "num_input_tokens_seen": 73226185, + "step": 3387, + "time_per_iteration": 2.734251022338867 + }, + { + "auxiliary_loss_clip": 0.01144354, + "auxiliary_loss_mlp": 0.01151101, + "balance_loss_clip": 1.00224376, + "balance_loss_mlp": 1.00089753, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.073730654873968, + "language_loss": 0.80063748, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82359207, + "num_input_tokens_seen": 73243300, + "step": 3388, + "time_per_iteration": 2.6315741539001465 + }, + { + "auxiliary_loss_clip": 0.01176367, + "auxiliary_loss_mlp": 0.01151728, + "balance_loss_clip": 1.00221324, + "balance_loss_mlp": 1.00104785, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.753161180969614, + "language_loss": 0.72465086, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.74793184, + "num_input_tokens_seen": 73261490, + "step": 3389, + "time_per_iteration": 2.5473363399505615 + }, + { + "auxiliary_loss_clip": 0.01159811, + "auxiliary_loss_mlp": 0.01151295, + "balance_loss_clip": 1.00228274, + "balance_loss_mlp": 1.00099552, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.9762522622629417, + "language_loss": 0.87377208, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89688313, + "num_input_tokens_seen": 73280180, + "step": 3390, + "time_per_iteration": 2.642986297607422 + }, + { + "auxiliary_loss_clip": 0.01143938, + "auxiliary_loss_mlp": 0.01150928, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00091505, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 1.7999893615974756, + "language_loss": 0.70828742, + "learning_rate": 3.691142971316662e-06, + "loss": 0.7312361, + "num_input_tokens_seen": 73300680, + "step": 3391, + "time_per_iteration": 2.642817497253418 + }, + { + "auxiliary_loss_clip": 0.01127188, + "auxiliary_loss_mlp": 0.0115129, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.00118184, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.1728497371622577, + "language_loss": 0.86651468, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88929945, + "num_input_tokens_seen": 73316760, + "step": 3392, + "time_per_iteration": 2.623993396759033 + }, + { + "auxiliary_loss_clip": 0.01160852, + "auxiliary_loss_mlp": 0.0115184, + "balance_loss_clip": 1.00220799, + "balance_loss_mlp": 1.00144601, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.425307807023169, + "language_loss": 0.80905694, + "learning_rate": 3.69072700532013e-06, + "loss": 0.83218384, + "num_input_tokens_seen": 73339385, + "step": 3393, + "time_per_iteration": 2.602368116378784 + }, + { + "auxiliary_loss_clip": 0.01145074, + "auxiliary_loss_mlp": 0.01150879, + "balance_loss_clip": 1.00209546, + "balance_loss_mlp": 1.00096107, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.8353376628173943, + "language_loss": 0.8593843, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88234377, + "num_input_tokens_seen": 73357235, + "step": 3394, + "time_per_iteration": 2.601459264755249 + }, + { + "auxiliary_loss_clip": 0.01159857, + "auxiliary_loss_mlp": 0.01150857, + "balance_loss_clip": 1.00209069, + "balance_loss_mlp": 1.00103498, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 1.982964152377711, + "language_loss": 0.83849114, + "learning_rate": 3.69031078287345e-06, + "loss": 0.86159831, + "num_input_tokens_seen": 73374435, + "step": 3395, + "time_per_iteration": 2.5272696018218994 + }, + { + "auxiliary_loss_clip": 0.0115948, + "auxiliary_loss_mlp": 0.01151036, + "balance_loss_clip": 1.00209188, + "balance_loss_mlp": 1.00083268, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 1.7733349300910026, + "language_loss": 0.83443463, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85753977, + "num_input_tokens_seen": 73391025, + "step": 3396, + "time_per_iteration": 2.5331218242645264 + }, + { + "auxiliary_loss_clip": 0.01128746, + "auxiliary_loss_mlp": 0.01150444, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00090837, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.718994452418961, + "language_loss": 0.77003294, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79282486, + "num_input_tokens_seen": 73409270, + "step": 3397, + "time_per_iteration": 2.699772834777832 + }, + { + "auxiliary_loss_clip": 0.01143007, + "auxiliary_loss_mlp": 0.01151471, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00098133, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.348058384673271, + "language_loss": 0.87662303, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89956784, + "num_input_tokens_seen": 73425225, + "step": 3398, + "time_per_iteration": 2.5791494846343994 + }, + { + "auxiliary_loss_clip": 0.01144447, + "auxiliary_loss_mlp": 0.01150947, + "balance_loss_clip": 1.00218439, + "balance_loss_mlp": 1.00093436, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 3.278960130569561, + "language_loss": 0.78116208, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.80411601, + "num_input_tokens_seen": 73440940, + "step": 3399, + "time_per_iteration": 2.568254232406616 + }, + { + "auxiliary_loss_clip": 0.01159629, + "auxiliary_loss_mlp": 0.01150964, + "balance_loss_clip": 1.00209069, + "balance_loss_mlp": 1.00095141, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 1.8480553061025375, + "language_loss": 0.76549214, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78859806, + "num_input_tokens_seen": 73458805, + "step": 3400, + "time_per_iteration": 2.646183967590332 + }, + { + "auxiliary_loss_clip": 0.01126677, + "auxiliary_loss_mlp": 0.0074886, + "balance_loss_clip": 1.0019542, + "balance_loss_mlp": 1.00097799, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.6086025664611603, + "language_loss": 0.79153121, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81028658, + "num_input_tokens_seen": 73479380, + "step": 3401, + "time_per_iteration": 2.715358018875122 + }, + { + "auxiliary_loss_clip": 0.01143664, + "auxiliary_loss_mlp": 0.01150314, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00087345, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.5614197939283714, + "language_loss": 0.69596624, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71890604, + "num_input_tokens_seen": 73505105, + "step": 3402, + "time_per_iteration": 2.743241310119629 + }, + { + "auxiliary_loss_clip": 0.01128501, + "auxiliary_loss_mlp": 0.01150682, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00105035, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.215305969691296, + "language_loss": 0.80964506, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83243686, + "num_input_tokens_seen": 73523700, + "step": 3403, + "time_per_iteration": 2.635089159011841 + }, + { + "auxiliary_loss_clip": 0.01160858, + "auxiliary_loss_mlp": 0.01150593, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.00105667, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 1.833561945349397, + "language_loss": 0.8314991, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.8546136, + "num_input_tokens_seen": 73542625, + "step": 3404, + "time_per_iteration": 2.580209732055664 + }, + { + "auxiliary_loss_clip": 0.01160678, + "auxiliary_loss_mlp": 0.01150833, + "balance_loss_clip": 1.00210261, + "balance_loss_mlp": 1.00120139, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.8158209194709847, + "language_loss": 0.8621251, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.8852402, + "num_input_tokens_seen": 73561450, + "step": 3405, + "time_per_iteration": 2.5684187412261963 + }, + { + "auxiliary_loss_clip": 0.01126886, + "auxiliary_loss_mlp": 0.0115026, + "balance_loss_clip": 1.00188327, + "balance_loss_mlp": 1.00081921, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.2673477459129634, + "language_loss": 0.84537971, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86815119, + "num_input_tokens_seen": 73577155, + "step": 3406, + "time_per_iteration": 2.6372084617614746 + }, + { + "auxiliary_loss_clip": 0.01176356, + "auxiliary_loss_mlp": 0.01150175, + "balance_loss_clip": 1.00230932, + "balance_loss_mlp": 1.00101995, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.0730773848651025, + "language_loss": 0.6787591, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.70202434, + "num_input_tokens_seen": 73594900, + "step": 3407, + "time_per_iteration": 2.4980409145355225 + }, + { + "auxiliary_loss_clip": 0.01176119, + "auxiliary_loss_mlp": 0.01150702, + "balance_loss_clip": 1.00213122, + "balance_loss_mlp": 1.00107038, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.093860889782487, + "language_loss": 0.84642816, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86969638, + "num_input_tokens_seen": 73613810, + "step": 3408, + "time_per_iteration": 2.5026650428771973 + }, + { + "auxiliary_loss_clip": 0.0117639, + "auxiliary_loss_mlp": 0.01150978, + "balance_loss_clip": 1.00231349, + "balance_loss_mlp": 1.00106025, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.380703196770869, + "language_loss": 0.64293545, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.6662091, + "num_input_tokens_seen": 73631495, + "step": 3409, + "time_per_iteration": 2.4859201908111572 + }, + { + "auxiliary_loss_clip": 0.01159407, + "auxiliary_loss_mlp": 0.01150747, + "balance_loss_clip": 1.00206363, + "balance_loss_mlp": 1.00101995, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.250356237667398, + "language_loss": 0.80563891, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82874048, + "num_input_tokens_seen": 73652840, + "step": 3410, + "time_per_iteration": 2.5614073276519775 + }, + { + "auxiliary_loss_clip": 0.01094502, + "auxiliary_loss_mlp": 0.01150843, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00102115, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.3929522693929335, + "language_loss": 0.76222384, + "learning_rate": 3.686971778678803e-06, + "loss": 0.78467727, + "num_input_tokens_seen": 73672150, + "step": 3411, + "time_per_iteration": 2.7295408248901367 + }, + { + "auxiliary_loss_clip": 0.01161062, + "auxiliary_loss_mlp": 0.01150237, + "balance_loss_clip": 1.00236893, + "balance_loss_mlp": 1.00098729, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 1.9629364004315228, + "language_loss": 0.7343539, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75746691, + "num_input_tokens_seen": 73691940, + "step": 3412, + "time_per_iteration": 2.56854248046875 + }, + { + "auxiliary_loss_clip": 0.01144071, + "auxiliary_loss_mlp": 0.01150757, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.00103045, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.658096617358213, + "language_loss": 0.77753848, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.80048674, + "num_input_tokens_seen": 73709080, + "step": 3413, + "time_per_iteration": 2.581254720687866 + }, + { + "auxiliary_loss_clip": 0.01128468, + "auxiliary_loss_mlp": 0.0115056, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.00102401, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 1.9481506677279226, + "language_loss": 0.84809989, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.87089014, + "num_input_tokens_seen": 73727670, + "step": 3414, + "time_per_iteration": 2.6241676807403564 + }, + { + "auxiliary_loss_clip": 0.01159452, + "auxiliary_loss_mlp": 0.01150681, + "balance_loss_clip": 1.00198877, + "balance_loss_mlp": 1.00095451, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 2.0705955828453133, + "language_loss": 0.80660653, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82970786, + "num_input_tokens_seen": 73747170, + "step": 3415, + "time_per_iteration": 2.582042932510376 + }, + { + "auxiliary_loss_clip": 0.01095465, + "auxiliary_loss_mlp": 0.01150763, + "balance_loss_clip": 1.00197995, + "balance_loss_mlp": 1.00103629, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.7564033949733309, + "language_loss": 0.72909391, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75155622, + "num_input_tokens_seen": 73767690, + "step": 3416, + "time_per_iteration": 2.7679519653320312 + }, + { + "auxiliary_loss_clip": 0.01159454, + "auxiliary_loss_mlp": 0.01150142, + "balance_loss_clip": 1.00206435, + "balance_loss_mlp": 1.00098729, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 1.7871401614890152, + "language_loss": 0.7865662, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.8096621, + "num_input_tokens_seen": 73786900, + "step": 3417, + "time_per_iteration": 2.6005425453186035 + }, + { + "auxiliary_loss_clip": 0.01160303, + "auxiliary_loss_mlp": 0.01150548, + "balance_loss_clip": 1.00213492, + "balance_loss_mlp": 1.00101209, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.164037014123626, + "language_loss": 0.87204456, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89515305, + "num_input_tokens_seen": 73804515, + "step": 3418, + "time_per_iteration": 2.5907790660858154 + }, + { + "auxiliary_loss_clip": 0.0114493, + "auxiliary_loss_mlp": 0.01150523, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.0009867, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.1253281246836977, + "language_loss": 0.6217984, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64475292, + "num_input_tokens_seen": 73822910, + "step": 3419, + "time_per_iteration": 2.625418186187744 + }, + { + "auxiliary_loss_clip": 0.0114383, + "auxiliary_loss_mlp": 0.01151534, + "balance_loss_clip": 1.00210524, + "balance_loss_mlp": 1.00123477, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 1.9615753420105344, + "language_loss": 0.86245501, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88540864, + "num_input_tokens_seen": 73841160, + "step": 3420, + "time_per_iteration": 4.028972148895264 + }, + { + "auxiliary_loss_clip": 0.0112836, + "auxiliary_loss_mlp": 0.00748982, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00095832, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.2746672679149613, + "language_loss": 0.71676326, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73553663, + "num_input_tokens_seen": 73862795, + "step": 3421, + "time_per_iteration": 2.7437610626220703 + }, + { + "auxiliary_loss_clip": 0.01176133, + "auxiliary_loss_mlp": 0.01150108, + "balance_loss_clip": 1.00222397, + "balance_loss_mlp": 1.0009532, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 2.0903829886789658, + "language_loss": 0.7048614, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.72812378, + "num_input_tokens_seen": 73881525, + "step": 3422, + "time_per_iteration": 3.9824323654174805 + }, + { + "auxiliary_loss_clip": 0.01157537, + "auxiliary_loss_mlp": 0.01139263, + "balance_loss_clip": 1.00139475, + "balance_loss_mlp": 1.00040841, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7422944409315237, + "language_loss": 0.55556834, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57853627, + "num_input_tokens_seen": 73937775, + "step": 3423, + "time_per_iteration": 4.628396987915039 + }, + { + "auxiliary_loss_clip": 0.01109796, + "auxiliary_loss_mlp": 0.01150713, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00108123, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.8356428396279363, + "language_loss": 0.71463645, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73724151, + "num_input_tokens_seen": 73958250, + "step": 3424, + "time_per_iteration": 2.753420352935791 + }, + { + "auxiliary_loss_clip": 0.01132269, + "auxiliary_loss_mlp": 0.00748972, + "balance_loss_clip": 1.00278032, + "balance_loss_mlp": 1.00115776, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.4004730810462884, + "language_loss": 0.75089484, + "learning_rate": 3.684036715178351e-06, + "loss": 0.76970726, + "num_input_tokens_seen": 73977775, + "step": 3425, + "time_per_iteration": 2.658771276473999 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01150493, + "balance_loss_clip": 1.00199819, + "balance_loss_mlp": 1.00133848, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.7303504403502996, + "language_loss": 0.87993258, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90271342, + "num_input_tokens_seen": 73996590, + "step": 3426, + "time_per_iteration": 2.6777307987213135 + }, + { + "auxiliary_loss_clip": 0.01159773, + "auxiliary_loss_mlp": 0.0115073, + "balance_loss_clip": 1.00224996, + "balance_loss_mlp": 1.00090826, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.8764373424705385, + "language_loss": 0.76380086, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78690588, + "num_input_tokens_seen": 74015935, + "step": 3427, + "time_per_iteration": 2.5868115425109863 + }, + { + "auxiliary_loss_clip": 0.01176079, + "auxiliary_loss_mlp": 0.01150541, + "balance_loss_clip": 1.00216651, + "balance_loss_mlp": 1.00100482, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.5191537098328936, + "language_loss": 0.73760724, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76087344, + "num_input_tokens_seen": 74036575, + "step": 3428, + "time_per_iteration": 2.546100378036499 + }, + { + "auxiliary_loss_clip": 0.01143353, + "auxiliary_loss_mlp": 0.01151291, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00099242, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 1.7528977413423552, + "language_loss": 0.7389164, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.76186287, + "num_input_tokens_seen": 74055365, + "step": 3429, + "time_per_iteration": 2.607761859893799 + }, + { + "auxiliary_loss_clip": 0.01161223, + "auxiliary_loss_mlp": 0.01151133, + "balance_loss_clip": 1.00248384, + "balance_loss_mlp": 1.00112045, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.9502236069079857, + "language_loss": 0.84924829, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87237185, + "num_input_tokens_seen": 74074875, + "step": 3430, + "time_per_iteration": 2.548218011856079 + }, + { + "auxiliary_loss_clip": 0.01095839, + "auxiliary_loss_mlp": 0.01151203, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00109506, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.4125243650272197, + "language_loss": 0.68819642, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.71066684, + "num_input_tokens_seen": 74094505, + "step": 3431, + "time_per_iteration": 2.7196884155273438 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01139179, + "balance_loss_clip": 1.00125563, + "balance_loss_mlp": 1.00032449, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8056498120096817, + "language_loss": 0.60272914, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62519872, + "num_input_tokens_seen": 74158500, + "step": 3432, + "time_per_iteration": 3.3802082538604736 + }, + { + "auxiliary_loss_clip": 0.01159644, + "auxiliary_loss_mlp": 0.01150822, + "balance_loss_clip": 1.00221324, + "balance_loss_mlp": 1.00109529, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.9703950931594507, + "language_loss": 0.72388512, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74698979, + "num_input_tokens_seen": 74176685, + "step": 3433, + "time_per_iteration": 2.593872547149658 + }, + { + "auxiliary_loss_clip": 0.01113259, + "auxiliary_loss_mlp": 0.01150959, + "balance_loss_clip": 1.00195098, + "balance_loss_mlp": 1.00094581, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.6309189774101496, + "language_loss": 0.8681584, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.8908006, + "num_input_tokens_seen": 74194935, + "step": 3434, + "time_per_iteration": 2.6697046756744385 + }, + { + "auxiliary_loss_clip": 0.01160787, + "auxiliary_loss_mlp": 0.01150811, + "balance_loss_clip": 1.00216937, + "balance_loss_mlp": 1.00079823, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.785929476790235, + "language_loss": 0.69530392, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71841991, + "num_input_tokens_seen": 74215400, + "step": 3435, + "time_per_iteration": 2.628255844116211 + }, + { + "auxiliary_loss_clip": 0.01144505, + "auxiliary_loss_mlp": 0.01150578, + "balance_loss_clip": 1.00220251, + "balance_loss_mlp": 1.00085104, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.7149096100257162, + "language_loss": 0.89267623, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91562712, + "num_input_tokens_seen": 74234090, + "step": 3436, + "time_per_iteration": 2.6476898193359375 + }, + { + "auxiliary_loss_clip": 0.01128796, + "auxiliary_loss_mlp": 0.01150584, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00085735, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.5742790020061288, + "language_loss": 0.7651149, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78790879, + "num_input_tokens_seen": 74253345, + "step": 3437, + "time_per_iteration": 2.6861321926116943 + }, + { + "auxiliary_loss_clip": 0.01165221, + "auxiliary_loss_mlp": 0.01150572, + "balance_loss_clip": 1.00294709, + "balance_loss_mlp": 1.00113142, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 2.0188571647441975, + "language_loss": 0.77800512, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.80116308, + "num_input_tokens_seen": 74271615, + "step": 3438, + "time_per_iteration": 2.6685147285461426 + }, + { + "auxiliary_loss_clip": 0.01156418, + "auxiliary_loss_mlp": 0.01139253, + "balance_loss_clip": 1.00147569, + "balance_loss_mlp": 1.00039852, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8741540842127521, + "language_loss": 0.67194319, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69489992, + "num_input_tokens_seen": 74331390, + "step": 3439, + "time_per_iteration": 3.0952963829040527 + }, + { + "auxiliary_loss_clip": 0.01159592, + "auxiliary_loss_mlp": 0.01150627, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00099504, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 1.9987320895311462, + "language_loss": 0.83833963, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86144185, + "num_input_tokens_seen": 74347335, + "step": 3440, + "time_per_iteration": 2.549522638320923 + }, + { + "auxiliary_loss_clip": 0.01160496, + "auxiliary_loss_mlp": 0.01151119, + "balance_loss_clip": 1.0022552, + "balance_loss_mlp": 1.00110638, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 1.8053085996258111, + "language_loss": 0.84745216, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87056834, + "num_input_tokens_seen": 74366310, + "step": 3441, + "time_per_iteration": 2.5353665351867676 + }, + { + "auxiliary_loss_clip": 0.01115504, + "auxiliary_loss_mlp": 0.01151033, + "balance_loss_clip": 1.00244272, + "balance_loss_mlp": 1.00092506, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.817791360892144, + "language_loss": 0.86097968, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88364506, + "num_input_tokens_seen": 74387100, + "step": 3442, + "time_per_iteration": 2.74064302444458 + }, + { + "auxiliary_loss_clip": 0.01083854, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_clip": 1.00243902, + "balance_loss_mlp": 1.00097799, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 2.010635473834951, + "language_loss": 0.73051828, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75286961, + "num_input_tokens_seen": 74404460, + "step": 3443, + "time_per_iteration": 2.747992515563965 + }, + { + "auxiliary_loss_clip": 0.01142903, + "auxiliary_loss_mlp": 0.00749029, + "balance_loss_clip": 1.00196576, + "balance_loss_mlp": 1.00126803, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 1.7028289431331964, + "language_loss": 0.85468483, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87360418, + "num_input_tokens_seen": 74423790, + "step": 3444, + "time_per_iteration": 2.622105121612549 + }, + { + "auxiliary_loss_clip": 0.0110701, + "auxiliary_loss_mlp": 0.01138318, + "balance_loss_clip": 1.00123119, + "balance_loss_mlp": 1.00022626, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6863277258410257, + "language_loss": 0.57138336, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59383667, + "num_input_tokens_seen": 74488130, + "step": 3445, + "time_per_iteration": 3.200878620147705 + }, + { + "auxiliary_loss_clip": 0.01176145, + "auxiliary_loss_mlp": 0.00749003, + "balance_loss_clip": 1.00227129, + "balance_loss_mlp": 1.00117302, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.4613913788230122, + "language_loss": 0.78319746, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80244899, + "num_input_tokens_seen": 74506720, + "step": 3446, + "time_per_iteration": 2.513740062713623 + }, + { + "auxiliary_loss_clip": 0.01160932, + "auxiliary_loss_mlp": 0.01151631, + "balance_loss_clip": 1.00215447, + "balance_loss_mlp": 1.00123668, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.095458337277812, + "language_loss": 0.62626654, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64939213, + "num_input_tokens_seen": 74525330, + "step": 3447, + "time_per_iteration": 2.632455348968506 + }, + { + "auxiliary_loss_clip": 0.01114262, + "auxiliary_loss_mlp": 0.01150808, + "balance_loss_clip": 1.00207043, + "balance_loss_mlp": 1.00108075, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.6027277220196943, + "language_loss": 0.86455363, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88720429, + "num_input_tokens_seen": 74544535, + "step": 3448, + "time_per_iteration": 2.685880661010742 + }, + { + "auxiliary_loss_clip": 0.01143749, + "auxiliary_loss_mlp": 0.01150254, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00081277, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 1.8373962496205187, + "language_loss": 0.74787563, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77081573, + "num_input_tokens_seen": 74562300, + "step": 3449, + "time_per_iteration": 2.5840675830841064 + }, + { + "auxiliary_loss_clip": 0.01144967, + "auxiliary_loss_mlp": 0.01150864, + "balance_loss_clip": 1.00212622, + "balance_loss_mlp": 1.00123215, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.7564895629664017, + "language_loss": 0.76601255, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78897083, + "num_input_tokens_seen": 74580080, + "step": 3450, + "time_per_iteration": 2.5666840076446533 + }, + { + "auxiliary_loss_clip": 0.01127556, + "auxiliary_loss_mlp": 0.01150726, + "balance_loss_clip": 1.00187814, + "balance_loss_mlp": 1.00109482, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.497751280112664, + "language_loss": 0.82060361, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84338641, + "num_input_tokens_seen": 74598980, + "step": 3451, + "time_per_iteration": 2.625192642211914 + }, + { + "auxiliary_loss_clip": 0.01172591, + "auxiliary_loss_mlp": 0.01139106, + "balance_loss_clip": 1.00144219, + "balance_loss_mlp": 1.0002507, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7878572161332821, + "language_loss": 0.56576973, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58888668, + "num_input_tokens_seen": 74655275, + "step": 3452, + "time_per_iteration": 2.983509063720703 + }, + { + "auxiliary_loss_clip": 0.01127489, + "auxiliary_loss_mlp": 0.00748997, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00108814, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.8893255224226846, + "language_loss": 0.88192976, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90069461, + "num_input_tokens_seen": 74674560, + "step": 3453, + "time_per_iteration": 2.621619701385498 + }, + { + "auxiliary_loss_clip": 0.01165408, + "auxiliary_loss_mlp": 0.01150962, + "balance_loss_clip": 1.00314426, + "balance_loss_mlp": 1.00094938, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.7308760351372285, + "language_loss": 0.80195981, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82512349, + "num_input_tokens_seen": 74694500, + "step": 3454, + "time_per_iteration": 2.5631728172302246 + }, + { + "auxiliary_loss_clip": 0.0112979, + "auxiliary_loss_mlp": 0.00749001, + "balance_loss_clip": 1.00220537, + "balance_loss_mlp": 1.00120175, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 2.945249919928943, + "language_loss": 0.77360308, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.792391, + "num_input_tokens_seen": 74710485, + "step": 3455, + "time_per_iteration": 2.609924793243408 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01150356, + "balance_loss_clip": 1.00277448, + "balance_loss_mlp": 1.00110638, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 1.8248686132946204, + "language_loss": 0.80938113, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.8322171, + "num_input_tokens_seen": 74727450, + "step": 3456, + "time_per_iteration": 2.6597652435302734 + }, + { + "auxiliary_loss_clip": 0.0111301, + "auxiliary_loss_mlp": 0.00749042, + "balance_loss_clip": 1.00198805, + "balance_loss_mlp": 1.00126147, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.5228630204537978, + "language_loss": 0.78491747, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80353796, + "num_input_tokens_seen": 74746725, + "step": 3457, + "time_per_iteration": 4.170856237411499 + }, + { + "auxiliary_loss_clip": 0.01097221, + "auxiliary_loss_mlp": 0.01150297, + "balance_loss_clip": 1.00199831, + "balance_loss_mlp": 1.00095153, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 1.931905121532509, + "language_loss": 0.83246624, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85494143, + "num_input_tokens_seen": 74765255, + "step": 3458, + "time_per_iteration": 4.080624580383301 + }, + { + "auxiliary_loss_clip": 0.01160581, + "auxiliary_loss_mlp": 0.00748931, + "balance_loss_clip": 1.0022049, + "balance_loss_mlp": 1.00126171, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.994597922804272, + "language_loss": 0.76187444, + "learning_rate": 3.676856638489272e-06, + "loss": 0.78096956, + "num_input_tokens_seen": 74785710, + "step": 3459, + "time_per_iteration": 2.6135284900665283 + }, + { + "auxiliary_loss_clip": 0.01094989, + "auxiliary_loss_mlp": 0.01149671, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00070691, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 1.8369681755447305, + "language_loss": 0.77356684, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79601347, + "num_input_tokens_seen": 74804490, + "step": 3460, + "time_per_iteration": 4.1163530349731445 + }, + { + "auxiliary_loss_clip": 0.01096337, + "auxiliary_loss_mlp": 0.01150138, + "balance_loss_clip": 1.00166798, + "balance_loss_mlp": 1.00088835, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 1.7254070373960302, + "language_loss": 0.75921756, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.78168225, + "num_input_tokens_seen": 74826340, + "step": 3461, + "time_per_iteration": 2.7661848068237305 + }, + { + "auxiliary_loss_clip": 0.01143997, + "auxiliary_loss_mlp": 0.01150805, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00079226, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.8247705196524608, + "language_loss": 0.88416117, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.9071092, + "num_input_tokens_seen": 74844960, + "step": 3462, + "time_per_iteration": 2.6394948959350586 + }, + { + "auxiliary_loss_clip": 0.01109171, + "auxiliary_loss_mlp": 0.00748368, + "balance_loss_clip": 1.00139785, + "balance_loss_mlp": 1.00054419, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7864379689671656, + "language_loss": 0.59009147, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.6086669, + "num_input_tokens_seen": 74909075, + "step": 3463, + "time_per_iteration": 3.3686912059783936 + }, + { + "auxiliary_loss_clip": 0.01143955, + "auxiliary_loss_mlp": 0.01150729, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00090671, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 3.6820811938893097, + "language_loss": 0.65923905, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68218589, + "num_input_tokens_seen": 74928125, + "step": 3464, + "time_per_iteration": 2.6210269927978516 + }, + { + "auxiliary_loss_clip": 0.01127696, + "auxiliary_loss_mlp": 0.01151014, + "balance_loss_clip": 1.00198984, + "balance_loss_mlp": 1.00100136, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 1.9237269582219372, + "language_loss": 0.84052074, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.86330789, + "num_input_tokens_seen": 74945090, + "step": 3465, + "time_per_iteration": 2.6038060188293457 + }, + { + "auxiliary_loss_clip": 0.0111162, + "auxiliary_loss_mlp": 0.01150706, + "balance_loss_clip": 1.00194716, + "balance_loss_mlp": 1.00078869, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 2.03270570971851, + "language_loss": 0.81885195, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84147519, + "num_input_tokens_seen": 74963630, + "step": 3466, + "time_per_iteration": 2.7045695781707764 + }, + { + "auxiliary_loss_clip": 0.01159348, + "auxiliary_loss_mlp": 0.01150521, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00088978, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 2.3658888722395504, + "language_loss": 0.82178789, + "learning_rate": 3.675156514448716e-06, + "loss": 0.8448866, + "num_input_tokens_seen": 74981875, + "step": 3467, + "time_per_iteration": 2.568566083908081 + }, + { + "auxiliary_loss_clip": 0.01175952, + "auxiliary_loss_mlp": 0.01149596, + "balance_loss_clip": 1.00215101, + "balance_loss_mlp": 1.00101399, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 1.7425473241092202, + "language_loss": 0.81775439, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84100986, + "num_input_tokens_seen": 74999155, + "step": 3468, + "time_per_iteration": 2.481548309326172 + }, + { + "auxiliary_loss_clip": 0.01159606, + "auxiliary_loss_mlp": 0.01150528, + "balance_loss_clip": 1.00210416, + "balance_loss_mlp": 1.00089622, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 2.0503364370293653, + "language_loss": 0.90129948, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92440081, + "num_input_tokens_seen": 75017850, + "step": 3469, + "time_per_iteration": 2.5682153701782227 + }, + { + "auxiliary_loss_clip": 0.01143744, + "auxiliary_loss_mlp": 0.0115105, + "balance_loss_clip": 1.00220132, + "balance_loss_mlp": 1.00094199, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.549746208795237, + "language_loss": 0.76659858, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78954655, + "num_input_tokens_seen": 75039270, + "step": 3470, + "time_per_iteration": 2.7190403938293457 + }, + { + "auxiliary_loss_clip": 0.01142902, + "auxiliary_loss_mlp": 0.01150366, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00102043, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 2.361316292094441, + "language_loss": 0.75593698, + "learning_rate": 3.674304927640011e-06, + "loss": 0.77886963, + "num_input_tokens_seen": 75059350, + "step": 3471, + "time_per_iteration": 2.631491184234619 + }, + { + "auxiliary_loss_clip": 0.01126926, + "auxiliary_loss_mlp": 0.01151122, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00101376, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.768266679116553, + "language_loss": 0.75472641, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77750695, + "num_input_tokens_seen": 75080150, + "step": 3472, + "time_per_iteration": 2.6860127449035645 + }, + { + "auxiliary_loss_clip": 0.01165082, + "auxiliary_loss_mlp": 0.01150232, + "balance_loss_clip": 1.00265884, + "balance_loss_mlp": 1.00069618, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 2.147209345579025, + "language_loss": 0.84062016, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86377335, + "num_input_tokens_seen": 75097920, + "step": 3473, + "time_per_iteration": 2.5218505859375 + }, + { + "auxiliary_loss_clip": 0.01126643, + "auxiliary_loss_mlp": 0.01138889, + "balance_loss_clip": 1.0025599, + "balance_loss_mlp": 1.00003386, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8833788495545253, + "language_loss": 0.63693005, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65958536, + "num_input_tokens_seen": 75152410, + "step": 3474, + "time_per_iteration": 3.112375497817993 + }, + { + "auxiliary_loss_clip": 0.01144116, + "auxiliary_loss_mlp": 0.01150646, + "balance_loss_clip": 1.00220728, + "balance_loss_mlp": 1.00091934, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 1.8295515764594152, + "language_loss": 0.70241499, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72536266, + "num_input_tokens_seen": 75173265, + "step": 3475, + "time_per_iteration": 2.705960273742676 + }, + { + "auxiliary_loss_clip": 0.01176199, + "auxiliary_loss_mlp": 0.01150866, + "balance_loss_clip": 1.0022279, + "balance_loss_mlp": 1.0011394, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.514266441115129, + "language_loss": 0.699229, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72249961, + "num_input_tokens_seen": 75193640, + "step": 3476, + "time_per_iteration": 2.524592638015747 + }, + { + "auxiliary_loss_clip": 0.01143118, + "auxiliary_loss_mlp": 0.01150083, + "balance_loss_clip": 1.00211537, + "balance_loss_mlp": 1.00102353, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 1.8663953579553456, + "language_loss": 0.89138448, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91431653, + "num_input_tokens_seen": 75212545, + "step": 3477, + "time_per_iteration": 2.602233409881592 + }, + { + "auxiliary_loss_clip": 0.01111635, + "auxiliary_loss_mlp": 0.01149984, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.00092447, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 2.091209450993755, + "language_loss": 0.67913938, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70175558, + "num_input_tokens_seen": 75230865, + "step": 3478, + "time_per_iteration": 2.770325183868408 + }, + { + "auxiliary_loss_clip": 0.01129987, + "auxiliary_loss_mlp": 0.01150429, + "balance_loss_clip": 1.00219333, + "balance_loss_mlp": 1.00098777, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.265813361811636, + "language_loss": 0.84112763, + "learning_rate": 3.672598707029127e-06, + "loss": 0.86393178, + "num_input_tokens_seen": 75248285, + "step": 3479, + "time_per_iteration": 2.620889902114868 + }, + { + "auxiliary_loss_clip": 0.01132552, + "auxiliary_loss_mlp": 0.01150133, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.00097871, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.402890551551436, + "language_loss": 0.73629737, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.75912416, + "num_input_tokens_seen": 75266310, + "step": 3480, + "time_per_iteration": 2.6416704654693604 + }, + { + "auxiliary_loss_clip": 0.011272, + "auxiliary_loss_mlp": 0.01149714, + "balance_loss_clip": 1.00199008, + "balance_loss_mlp": 1.00103652, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 1.9516086280240372, + "language_loss": 0.75212348, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77489263, + "num_input_tokens_seen": 75284175, + "step": 3481, + "time_per_iteration": 2.649484395980835 + }, + { + "auxiliary_loss_clip": 0.01112482, + "auxiliary_loss_mlp": 0.01150173, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00130451, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 1.8114831214135603, + "language_loss": 0.85268557, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87531221, + "num_input_tokens_seen": 75303465, + "step": 3482, + "time_per_iteration": 2.6903603076934814 + }, + { + "auxiliary_loss_clip": 0.01094156, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_clip": 1.00180876, + "balance_loss_mlp": 1.00094128, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 1.824750525232724, + "language_loss": 0.71028924, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.73272884, + "num_input_tokens_seen": 75325290, + "step": 3483, + "time_per_iteration": 2.7902984619140625 + }, + { + "auxiliary_loss_clip": 0.01143049, + "auxiliary_loss_mlp": 0.01150386, + "balance_loss_clip": 1.00202394, + "balance_loss_mlp": 1.00113618, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.5514533476069763, + "language_loss": 0.74788201, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77081633, + "num_input_tokens_seen": 75343895, + "step": 3484, + "time_per_iteration": 2.5938920974731445 + }, + { + "auxiliary_loss_clip": 0.0114481, + "auxiliary_loss_mlp": 0.01150621, + "balance_loss_clip": 1.00229728, + "balance_loss_mlp": 1.00098944, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.8778261100932099, + "language_loss": 0.70779645, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.73075068, + "num_input_tokens_seen": 75367100, + "step": 3485, + "time_per_iteration": 2.749873161315918 + }, + { + "auxiliary_loss_clip": 0.01112176, + "auxiliary_loss_mlp": 0.00749006, + "balance_loss_clip": 1.00203133, + "balance_loss_mlp": 1.00127256, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 2.0503608119392442, + "language_loss": 0.82556725, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.84417909, + "num_input_tokens_seen": 75389925, + "step": 3486, + "time_per_iteration": 2.7262208461761475 + }, + { + "auxiliary_loss_clip": 0.01159589, + "auxiliary_loss_mlp": 0.01150368, + "balance_loss_clip": 1.00212932, + "balance_loss_mlp": 1.00130892, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 1.7589659040643824, + "language_loss": 0.86994433, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89304388, + "num_input_tokens_seen": 75408575, + "step": 3487, + "time_per_iteration": 2.6927223205566406 + }, + { + "auxiliary_loss_clip": 0.01128105, + "auxiliary_loss_mlp": 0.01149703, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00093019, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 2.032787998941922, + "language_loss": 0.72834641, + "learning_rate": 3.670674357028504e-06, + "loss": 0.7511245, + "num_input_tokens_seen": 75427155, + "step": 3488, + "time_per_iteration": 2.6529273986816406 + }, + { + "auxiliary_loss_clip": 0.01143516, + "auxiliary_loss_mlp": 0.01149925, + "balance_loss_clip": 1.00221992, + "balance_loss_mlp": 1.00086594, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.3460281557282743, + "language_loss": 0.80496848, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.82790291, + "num_input_tokens_seen": 75444450, + "step": 3489, + "time_per_iteration": 2.609699010848999 + }, + { + "auxiliary_loss_clip": 0.0117623, + "auxiliary_loss_mlp": 0.01149912, + "balance_loss_clip": 1.00221217, + "balance_loss_mlp": 1.00085282, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.7307608251732234, + "language_loss": 0.72707933, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75034076, + "num_input_tokens_seen": 75462625, + "step": 3490, + "time_per_iteration": 2.522599697113037 + }, + { + "auxiliary_loss_clip": 0.01142763, + "auxiliary_loss_mlp": 0.01149793, + "balance_loss_clip": 1.00212288, + "balance_loss_mlp": 1.00140166, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 4.960335741567193, + "language_loss": 0.70533228, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72825789, + "num_input_tokens_seen": 75480640, + "step": 3491, + "time_per_iteration": 2.555724859237671 + }, + { + "auxiliary_loss_clip": 0.01159334, + "auxiliary_loss_mlp": 0.00748944, + "balance_loss_clip": 1.00199485, + "balance_loss_mlp": 1.00112975, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 2.2906616359529797, + "language_loss": 0.79251558, + "learning_rate": 3.669817442854444e-06, + "loss": 0.8115983, + "num_input_tokens_seen": 75494900, + "step": 3492, + "time_per_iteration": 2.537946939468384 + }, + { + "auxiliary_loss_clip": 0.01159348, + "auxiliary_loss_mlp": 0.00748963, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00117254, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.820824382456268, + "language_loss": 0.86892623, + "learning_rate": 3.669603055991502e-06, + "loss": 0.88800931, + "num_input_tokens_seen": 75513370, + "step": 3493, + "time_per_iteration": 2.5480825901031494 + }, + { + "auxiliary_loss_clip": 0.01145256, + "auxiliary_loss_mlp": 0.01149385, + "balance_loss_clip": 1.002177, + "balance_loss_mlp": 1.00099349, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 2.0494776605057177, + "language_loss": 0.6936233, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.71656978, + "num_input_tokens_seen": 75532480, + "step": 3494, + "time_per_iteration": 3.9963860511779785 + }, + { + "auxiliary_loss_clip": 0.01160374, + "auxiliary_loss_mlp": 0.01150265, + "balance_loss_clip": 1.00226879, + "balance_loss_mlp": 1.00091958, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7089392672191297, + "language_loss": 0.78603506, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.80914152, + "num_input_tokens_seen": 75552745, + "step": 3495, + "time_per_iteration": 4.000063180923462 + }, + { + "auxiliary_loss_clip": 0.01143786, + "auxiliary_loss_mlp": 0.01150147, + "balance_loss_clip": 1.00208843, + "balance_loss_mlp": 1.00099194, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.5523300838850507, + "language_loss": 0.77437598, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79731536, + "num_input_tokens_seen": 75574355, + "step": 3496, + "time_per_iteration": 2.607111930847168 + }, + { + "auxiliary_loss_clip": 0.01144795, + "auxiliary_loss_mlp": 0.01150326, + "balance_loss_clip": 1.00207305, + "balance_loss_mlp": 1.00107598, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 1.9637093855679384, + "language_loss": 0.82397634, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84692758, + "num_input_tokens_seen": 75592215, + "step": 3497, + "time_per_iteration": 4.000343561172485 + }, + { + "auxiliary_loss_clip": 0.01159285, + "auxiliary_loss_mlp": 0.01150486, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00114024, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 1.7180889406349775, + "language_loss": 0.67414534, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69724298, + "num_input_tokens_seen": 75610740, + "step": 3498, + "time_per_iteration": 4.0389931201934814 + }, + { + "auxiliary_loss_clip": 0.01131697, + "auxiliary_loss_mlp": 0.01150215, + "balance_loss_clip": 1.0023663, + "balance_loss_mlp": 1.00106049, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 1.9357140024218757, + "language_loss": 0.80357444, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.8263936, + "num_input_tokens_seen": 75631005, + "step": 3499, + "time_per_iteration": 2.6477081775665283 + }, + { + "auxiliary_loss_clip": 0.01159467, + "auxiliary_loss_mlp": 0.0115023, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00107539, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.8992088829752942, + "language_loss": 0.78547394, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80857092, + "num_input_tokens_seen": 75650655, + "step": 3500, + "time_per_iteration": 2.596104383468628 + }, + { + "auxiliary_loss_clip": 0.01142808, + "auxiliary_loss_mlp": 0.01150433, + "balance_loss_clip": 1.00204134, + "balance_loss_mlp": 1.00108743, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.6256835976935844, + "language_loss": 0.74177152, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76470387, + "num_input_tokens_seen": 75669895, + "step": 3501, + "time_per_iteration": 2.63152813911438 + }, + { + "auxiliary_loss_clip": 0.01159505, + "auxiliary_loss_mlp": 0.0114926, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.00086796, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.460987418651334, + "language_loss": 0.75447536, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77756298, + "num_input_tokens_seen": 75689535, + "step": 3502, + "time_per_iteration": 2.571213722229004 + }, + { + "auxiliary_loss_clip": 0.0111293, + "auxiliary_loss_mlp": 0.01149374, + "balance_loss_clip": 1.00204313, + "balance_loss_mlp": 1.00088704, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 2.517719858599163, + "language_loss": 0.77186865, + "learning_rate": 3.667455706571316e-06, + "loss": 0.79449171, + "num_input_tokens_seen": 75709265, + "step": 3503, + "time_per_iteration": 2.7245535850524902 + }, + { + "auxiliary_loss_clip": 0.01095698, + "auxiliary_loss_mlp": 0.01150317, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00097132, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 3.095912150191278, + "language_loss": 0.78444898, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.8069092, + "num_input_tokens_seen": 75727050, + "step": 3504, + "time_per_iteration": 2.696840286254883 + }, + { + "auxiliary_loss_clip": 0.01127093, + "auxiliary_loss_mlp": 0.01150388, + "balance_loss_clip": 1.00180423, + "balance_loss_mlp": 1.00104213, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.4812051066112712, + "language_loss": 0.76562643, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.78840125, + "num_input_tokens_seen": 75747175, + "step": 3505, + "time_per_iteration": 2.661172866821289 + }, + { + "auxiliary_loss_clip": 0.01144184, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_clip": 1.00212228, + "balance_loss_mlp": 1.00108528, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.7445028883066152, + "language_loss": 0.63762176, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.66056228, + "num_input_tokens_seen": 75767690, + "step": 3506, + "time_per_iteration": 2.660978317260742 + }, + { + "auxiliary_loss_clip": 0.01159259, + "auxiliary_loss_mlp": 0.01150117, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00115287, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.5542720135958452, + "language_loss": 0.81805921, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84115303, + "num_input_tokens_seen": 75787255, + "step": 3507, + "time_per_iteration": 2.5987164974212646 + }, + { + "auxiliary_loss_clip": 0.01159291, + "auxiliary_loss_mlp": 0.0114983, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.00105679, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 1.606839248372013, + "language_loss": 0.75409073, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77718192, + "num_input_tokens_seen": 75805890, + "step": 3508, + "time_per_iteration": 2.585244655609131 + }, + { + "auxiliary_loss_clip": 0.01175918, + "auxiliary_loss_mlp": 0.01149791, + "balance_loss_clip": 1.00210035, + "balance_loss_mlp": 1.00082707, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.727188887210843, + "language_loss": 0.8499651, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87322223, + "num_input_tokens_seen": 75821620, + "step": 3509, + "time_per_iteration": 2.521043539047241 + }, + { + "auxiliary_loss_clip": 0.01126275, + "auxiliary_loss_mlp": 0.01149374, + "balance_loss_clip": 1.00186396, + "balance_loss_mlp": 1.00088739, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.778784847258075, + "language_loss": 0.67971742, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.70247388, + "num_input_tokens_seen": 75842490, + "step": 3510, + "time_per_iteration": 2.708513021469116 + }, + { + "auxiliary_loss_clip": 0.01175927, + "auxiliary_loss_mlp": 0.01150011, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00085676, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.6705872419100716, + "language_loss": 0.72323215, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74649149, + "num_input_tokens_seen": 75865985, + "step": 3511, + "time_per_iteration": 2.5966734886169434 + }, + { + "auxiliary_loss_clip": 0.01061659, + "auxiliary_loss_mlp": 0.01150181, + "balance_loss_clip": 1.00141287, + "balance_loss_mlp": 1.00093126, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.088202381035326, + "language_loss": 0.69356054, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71567893, + "num_input_tokens_seen": 75882745, + "step": 3512, + "time_per_iteration": 2.870490312576294 + }, + { + "auxiliary_loss_clip": 0.01159285, + "auxiliary_loss_mlp": 0.01149535, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00095248, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 3.2248738770416283, + "language_loss": 0.73120081, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.75428903, + "num_input_tokens_seen": 75904305, + "step": 3513, + "time_per_iteration": 2.823167324066162 + }, + { + "auxiliary_loss_clip": 0.01143852, + "auxiliary_loss_mlp": 0.01149151, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00094986, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.756080993642726, + "language_loss": 0.7437849, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76671493, + "num_input_tokens_seen": 75923710, + "step": 3514, + "time_per_iteration": 2.607595205307007 + }, + { + "auxiliary_loss_clip": 0.01143406, + "auxiliary_loss_mlp": 0.01149497, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00072408, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.7932265378185501, + "language_loss": 0.76108283, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78401184, + "num_input_tokens_seen": 75942625, + "step": 3515, + "time_per_iteration": 2.5577898025512695 + }, + { + "auxiliary_loss_clip": 0.01143774, + "auxiliary_loss_mlp": 0.01149444, + "balance_loss_clip": 1.00199032, + "balance_loss_mlp": 1.0010519, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 2.2355302755116, + "language_loss": 0.67972237, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70265448, + "num_input_tokens_seen": 75959930, + "step": 3516, + "time_per_iteration": 2.5465474128723145 + }, + { + "auxiliary_loss_clip": 0.01126055, + "auxiliary_loss_mlp": 0.01150033, + "balance_loss_clip": 1.0018096, + "balance_loss_mlp": 1.00106931, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.7458307864110942, + "language_loss": 0.84964544, + "learning_rate": 3.664438796560225e-06, + "loss": 0.8724063, + "num_input_tokens_seen": 75980335, + "step": 3517, + "time_per_iteration": 2.6476285457611084 + }, + { + "auxiliary_loss_clip": 0.01143372, + "auxiliary_loss_mlp": 0.01149062, + "balance_loss_clip": 1.00190246, + "balance_loss_mlp": 1.00095642, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 1.9069387987298376, + "language_loss": 0.62582731, + "learning_rate": 3.664222829354512e-06, + "loss": 0.64875174, + "num_input_tokens_seen": 76002095, + "step": 3518, + "time_per_iteration": 2.722672462463379 + }, + { + "auxiliary_loss_clip": 0.01093797, + "auxiliary_loss_mlp": 0.01149781, + "balance_loss_clip": 1.00173748, + "balance_loss_mlp": 1.00119829, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 1.9235343463827124, + "language_loss": 0.88900703, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91144282, + "num_input_tokens_seen": 76020425, + "step": 3519, + "time_per_iteration": 2.735858201980591 + }, + { + "auxiliary_loss_clip": 0.01143807, + "auxiliary_loss_mlp": 0.01149876, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.0012939, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.8297461755828623, + "language_loss": 0.81406188, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.8369987, + "num_input_tokens_seen": 76041210, + "step": 3520, + "time_per_iteration": 2.6124250888824463 + }, + { + "auxiliary_loss_clip": 0.01142441, + "auxiliary_loss_mlp": 0.01148966, + "balance_loss_clip": 1.00188625, + "balance_loss_mlp": 1.00105095, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.6035310911914429, + "language_loss": 0.76056385, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.7834779, + "num_input_tokens_seen": 76062685, + "step": 3521, + "time_per_iteration": 2.6133527755737305 + }, + { + "auxiliary_loss_clip": 0.01099159, + "auxiliary_loss_mlp": 0.01148943, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00083756, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 8.201044073986377, + "language_loss": 0.75585675, + "learning_rate": 3.663358329538626e-06, + "loss": 0.77833784, + "num_input_tokens_seen": 76082300, + "step": 3522, + "time_per_iteration": 2.7452216148376465 + }, + { + "auxiliary_loss_clip": 0.01175836, + "auxiliary_loss_mlp": 0.01149684, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00119686, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 1.7757074005421023, + "language_loss": 0.70156717, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72482234, + "num_input_tokens_seen": 76101135, + "step": 3523, + "time_per_iteration": 2.5438284873962402 + }, + { + "auxiliary_loss_clip": 0.01160698, + "auxiliary_loss_mlp": 0.01149672, + "balance_loss_clip": 1.00223935, + "balance_loss_mlp": 1.00128055, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.538151477741912, + "language_loss": 0.77302462, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.79612827, + "num_input_tokens_seen": 76119320, + "step": 3524, + "time_per_iteration": 2.529123306274414 + }, + { + "auxiliary_loss_clip": 0.01142765, + "auxiliary_loss_mlp": 0.01149592, + "balance_loss_clip": 1.00189078, + "balance_loss_mlp": 1.00091434, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 1.8278688952880924, + "language_loss": 0.81351483, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83643836, + "num_input_tokens_seen": 76137445, + "step": 3525, + "time_per_iteration": 2.5956664085388184 + }, + { + "auxiliary_loss_clip": 0.01096416, + "auxiliary_loss_mlp": 0.0114924, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00084805, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 4.2640899374041155, + "language_loss": 0.75093997, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77339649, + "num_input_tokens_seen": 76159500, + "step": 3526, + "time_per_iteration": 2.7609012126922607 + }, + { + "auxiliary_loss_clip": 0.01175909, + "auxiliary_loss_mlp": 0.01148888, + "balance_loss_clip": 1.00214458, + "balance_loss_mlp": 1.00097322, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.7776803298046726, + "language_loss": 0.76930547, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79255342, + "num_input_tokens_seen": 76177990, + "step": 3527, + "time_per_iteration": 2.524660348892212 + }, + { + "auxiliary_loss_clip": 0.01175889, + "auxiliary_loss_mlp": 0.01149395, + "balance_loss_clip": 1.00214148, + "balance_loss_mlp": 1.00109839, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 1.7307063067202546, + "language_loss": 0.77727664, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80052948, + "num_input_tokens_seen": 76197125, + "step": 3528, + "time_per_iteration": 2.525773286819458 + }, + { + "auxiliary_loss_clip": 0.01159368, + "auxiliary_loss_mlp": 0.01148937, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.00092661, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.9565753118263551, + "language_loss": 0.81964839, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.84273148, + "num_input_tokens_seen": 76216215, + "step": 3529, + "time_per_iteration": 2.548147678375244 + }, + { + "auxiliary_loss_clip": 0.01143529, + "auxiliary_loss_mlp": 0.00748938, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00114036, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.4573662263719744, + "language_loss": 0.76354343, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78246808, + "num_input_tokens_seen": 76237010, + "step": 3530, + "time_per_iteration": 2.6738181114196777 + }, + { + "auxiliary_loss_clip": 0.01175958, + "auxiliary_loss_mlp": 0.01149161, + "balance_loss_clip": 1.0022465, + "balance_loss_mlp": 1.0010556, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.03873978767957, + "language_loss": 0.83083826, + "learning_rate": 3.661409515882308e-06, + "loss": 0.8540895, + "num_input_tokens_seen": 76255965, + "step": 3531, + "time_per_iteration": 2.5172719955444336 + }, + { + "auxiliary_loss_clip": 0.01144121, + "auxiliary_loss_mlp": 0.01148816, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.0009011, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.4314120079835835, + "language_loss": 0.73478317, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75771248, + "num_input_tokens_seen": 76272150, + "step": 3532, + "time_per_iteration": 3.992037773132324 + }, + { + "auxiliary_loss_clip": 0.01127882, + "auxiliary_loss_mlp": 0.01149043, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00084162, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.6831813968442242, + "language_loss": 0.73793685, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76070619, + "num_input_tokens_seen": 76291425, + "step": 3533, + "time_per_iteration": 4.046247720718384 + }, + { + "auxiliary_loss_clip": 0.01159983, + "auxiliary_loss_mlp": 0.01149797, + "balance_loss_clip": 1.0020746, + "balance_loss_mlp": 1.00092828, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 2.0773462628168264, + "language_loss": 0.71288806, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73598588, + "num_input_tokens_seen": 76313975, + "step": 3534, + "time_per_iteration": 2.6772162914276123 + }, + { + "auxiliary_loss_clip": 0.01142852, + "auxiliary_loss_mlp": 0.01149659, + "balance_loss_clip": 1.00205433, + "balance_loss_mlp": 1.0008862, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 2.097828638248539, + "language_loss": 0.719257, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.74218214, + "num_input_tokens_seen": 76330955, + "step": 3535, + "time_per_iteration": 5.387702465057373 + }, + { + "auxiliary_loss_clip": 0.01159111, + "auxiliary_loss_mlp": 0.01149337, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.0011363, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 2.671465202606407, + "language_loss": 0.70335102, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72643548, + "num_input_tokens_seen": 76352680, + "step": 3536, + "time_per_iteration": 2.585942506790161 + }, + { + "auxiliary_loss_clip": 0.01175914, + "auxiliary_loss_mlp": 0.01149274, + "balance_loss_clip": 1.00211728, + "balance_loss_mlp": 1.00097716, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.7924770489569057, + "language_loss": 0.87837505, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90162694, + "num_input_tokens_seen": 76370750, + "step": 3537, + "time_per_iteration": 2.4894704818725586 + }, + { + "auxiliary_loss_clip": 0.01159284, + "auxiliary_loss_mlp": 0.00748841, + "balance_loss_clip": 1.00204551, + "balance_loss_mlp": 1.00105762, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.7345557393301234, + "language_loss": 0.8008734, + "learning_rate": 3.659890243575524e-06, + "loss": 0.81995463, + "num_input_tokens_seen": 76390610, + "step": 3538, + "time_per_iteration": 2.5493576526641846 + }, + { + "auxiliary_loss_clip": 0.01095612, + "auxiliary_loss_mlp": 0.01148574, + "balance_loss_clip": 1.00170267, + "balance_loss_mlp": 1.00094569, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 2.2042848047313393, + "language_loss": 0.87234223, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89478409, + "num_input_tokens_seen": 76408860, + "step": 3539, + "time_per_iteration": 2.717654228210449 + }, + { + "auxiliary_loss_clip": 0.01143429, + "auxiliary_loss_mlp": 0.01148915, + "balance_loss_clip": 1.0019604, + "balance_loss_mlp": 1.00099981, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 2.2951284746501597, + "language_loss": 0.58217758, + "learning_rate": 3.659455599161237e-06, + "loss": 0.60510099, + "num_input_tokens_seen": 76424980, + "step": 3540, + "time_per_iteration": 2.5530381202697754 + }, + { + "auxiliary_loss_clip": 0.011758, + "auxiliary_loss_mlp": 0.01149136, + "balance_loss_clip": 1.00212765, + "balance_loss_mlp": 1.0006485, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 1.9821727062498091, + "language_loss": 0.75534332, + "learning_rate": 3.659238182559888e-06, + "loss": 0.77859265, + "num_input_tokens_seen": 76443135, + "step": 3541, + "time_per_iteration": 2.485869884490967 + }, + { + "auxiliary_loss_clip": 0.01128745, + "auxiliary_loss_mlp": 0.0114923, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00102925, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 2.280804057388372, + "language_loss": 0.6949079, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71768773, + "num_input_tokens_seen": 76462470, + "step": 3542, + "time_per_iteration": 2.6482584476470947 + }, + { + "auxiliary_loss_clip": 0.01175814, + "auxiliary_loss_mlp": 0.01148699, + "balance_loss_clip": 1.00216639, + "balance_loss_mlp": 1.0007838, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 2.045257294084894, + "language_loss": 0.75227273, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77551782, + "num_input_tokens_seen": 76481995, + "step": 3543, + "time_per_iteration": 2.5363402366638184 + }, + { + "auxiliary_loss_clip": 0.01142641, + "auxiliary_loss_mlp": 0.01149181, + "balance_loss_clip": 1.00201249, + "balance_loss_mlp": 1.00097966, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 1.823000067302018, + "language_loss": 0.66647053, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68938875, + "num_input_tokens_seen": 76500245, + "step": 3544, + "time_per_iteration": 2.5976803302764893 + }, + { + "auxiliary_loss_clip": 0.01144354, + "auxiliary_loss_mlp": 0.0114942, + "balance_loss_clip": 1.00212431, + "balance_loss_mlp": 1.00112343, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 2.078724026790191, + "language_loss": 0.71013594, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73307365, + "num_input_tokens_seen": 76519535, + "step": 3545, + "time_per_iteration": 2.586855411529541 + }, + { + "auxiliary_loss_clip": 0.01142796, + "auxiliary_loss_mlp": 0.01149823, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00114536, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.6903669781155732, + "language_loss": 0.72224724, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74517345, + "num_input_tokens_seen": 76542065, + "step": 3546, + "time_per_iteration": 2.658910036087036 + }, + { + "auxiliary_loss_clip": 0.01111415, + "auxiliary_loss_mlp": 0.01149603, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00102091, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 1.6743249589209162, + "language_loss": 0.7990154, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82162559, + "num_input_tokens_seen": 76560540, + "step": 3547, + "time_per_iteration": 2.674095869064331 + }, + { + "auxiliary_loss_clip": 0.01175816, + "auxiliary_loss_mlp": 0.01149245, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00085318, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 2.2763775771310564, + "language_loss": 0.74736452, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.77061516, + "num_input_tokens_seen": 76581760, + "step": 3548, + "time_per_iteration": 2.563089370727539 + }, + { + "auxiliary_loss_clip": 0.01127694, + "auxiliary_loss_mlp": 0.01149747, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00106919, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 1.950207456880087, + "language_loss": 0.7407434, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76351786, + "num_input_tokens_seen": 76599940, + "step": 3549, + "time_per_iteration": 2.6064162254333496 + }, + { + "auxiliary_loss_clip": 0.01127161, + "auxiliary_loss_mlp": 0.01149414, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00102258, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6927125417941595, + "language_loss": 0.8053571, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82812285, + "num_input_tokens_seen": 76619580, + "step": 3550, + "time_per_iteration": 2.7280426025390625 + }, + { + "auxiliary_loss_clip": 0.01175751, + "auxiliary_loss_mlp": 0.01148852, + "balance_loss_clip": 1.00215125, + "balance_loss_mlp": 1.00103223, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 2.293704225600299, + "language_loss": 0.87932706, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90257311, + "num_input_tokens_seen": 76638195, + "step": 3551, + "time_per_iteration": 2.4997079372406006 + }, + { + "auxiliary_loss_clip": 0.01175762, + "auxiliary_loss_mlp": 0.01149204, + "balance_loss_clip": 1.00204146, + "balance_loss_mlp": 1.00100338, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 1.936476468212998, + "language_loss": 0.83757073, + "learning_rate": 3.656842449140983e-06, + "loss": 0.86082035, + "num_input_tokens_seen": 76656695, + "step": 3552, + "time_per_iteration": 2.500570297241211 + }, + { + "auxiliary_loss_clip": 0.01160391, + "auxiliary_loss_mlp": 0.01149368, + "balance_loss_clip": 1.00204062, + "balance_loss_mlp": 1.00116694, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.6106014668559598, + "language_loss": 0.76643848, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78953612, + "num_input_tokens_seen": 76677430, + "step": 3553, + "time_per_iteration": 2.566985845565796 + }, + { + "auxiliary_loss_clip": 0.01159184, + "auxiliary_loss_mlp": 0.01148809, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00108492, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.5138479492430061, + "language_loss": 0.72534406, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.74842399, + "num_input_tokens_seen": 76697615, + "step": 3554, + "time_per_iteration": 2.5612637996673584 + }, + { + "auxiliary_loss_clip": 0.01111809, + "auxiliary_loss_mlp": 0.00748736, + "balance_loss_clip": 1.00201488, + "balance_loss_mlp": 1.00105608, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.9008835804757034, + "language_loss": 0.67861134, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69721675, + "num_input_tokens_seen": 76715685, + "step": 3555, + "time_per_iteration": 2.6756951808929443 + }, + { + "auxiliary_loss_clip": 0.0112689, + "auxiliary_loss_mlp": 0.01149984, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.0008297, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 1.8951882523122388, + "language_loss": 0.64829689, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.67106569, + "num_input_tokens_seen": 76735405, + "step": 3556, + "time_per_iteration": 2.7029690742492676 + }, + { + "auxiliary_loss_clip": 0.01159368, + "auxiliary_loss_mlp": 0.01149587, + "balance_loss_clip": 1.00207317, + "balance_loss_mlp": 1.00119519, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.7052156596165706, + "language_loss": 0.72722483, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.75031441, + "num_input_tokens_seen": 76754395, + "step": 3557, + "time_per_iteration": 2.589468479156494 + }, + { + "auxiliary_loss_clip": 0.01144741, + "auxiliary_loss_mlp": 0.00748882, + "balance_loss_clip": 1.00209498, + "balance_loss_mlp": 1.00114369, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.6916448338315966, + "language_loss": 0.66953272, + "learning_rate": 3.655532480546528e-06, + "loss": 0.68846899, + "num_input_tokens_seen": 76777210, + "step": 3558, + "time_per_iteration": 2.661226987838745 + }, + { + "auxiliary_loss_clip": 0.01175965, + "auxiliary_loss_mlp": 0.0114938, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00089335, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.7114052413482321, + "language_loss": 0.79512835, + "learning_rate": 3.655313932676286e-06, + "loss": 0.81838179, + "num_input_tokens_seen": 76795830, + "step": 3559, + "time_per_iteration": 2.4923267364501953 + }, + { + "auxiliary_loss_clip": 0.01175856, + "auxiliary_loss_mlp": 0.01148976, + "balance_loss_clip": 1.00213325, + "balance_loss_mlp": 1.00096536, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.563561132481444, + "language_loss": 0.6772272, + "learning_rate": 3.655095322036373e-06, + "loss": 0.70047545, + "num_input_tokens_seen": 76814700, + "step": 3560, + "time_per_iteration": 2.5337634086608887 + }, + { + "auxiliary_loss_clip": 0.01159508, + "auxiliary_loss_mlp": 0.01149466, + "balance_loss_clip": 1.00211811, + "balance_loss_mlp": 1.00107396, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.8269118136680016, + "language_loss": 0.73253214, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75562191, + "num_input_tokens_seen": 76833400, + "step": 3561, + "time_per_iteration": 2.5415842533111572 + }, + { + "auxiliary_loss_clip": 0.01145163, + "auxiliary_loss_mlp": 0.01149589, + "balance_loss_clip": 1.0020982, + "balance_loss_mlp": 1.00100613, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.216513376006369, + "language_loss": 0.77442527, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79737276, + "num_input_tokens_seen": 76850645, + "step": 3562, + "time_per_iteration": 2.5751264095306396 + }, + { + "auxiliary_loss_clip": 0.01175885, + "auxiliary_loss_mlp": 0.01148993, + "balance_loss_clip": 1.00218415, + "balance_loss_mlp": 1.00088739, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.498893535737226, + "language_loss": 0.84373468, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.86698353, + "num_input_tokens_seen": 76870135, + "step": 3563, + "time_per_iteration": 2.5217576026916504 + }, + { + "auxiliary_loss_clip": 0.01175936, + "auxiliary_loss_mlp": 0.01149027, + "balance_loss_clip": 1.00224304, + "balance_loss_mlp": 1.00111175, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 2.435473437694306, + "language_loss": 0.76421916, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78746879, + "num_input_tokens_seen": 76893905, + "step": 3564, + "time_per_iteration": 2.6109819412231445 + }, + { + "auxiliary_loss_clip": 0.01142702, + "auxiliary_loss_mlp": 0.0114926, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00096416, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.6369829462327739, + "language_loss": 0.88442671, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90734625, + "num_input_tokens_seen": 76914205, + "step": 3565, + "time_per_iteration": 2.6193673610687256 + }, + { + "auxiliary_loss_clip": 0.01140824, + "auxiliary_loss_mlp": 0.01136708, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00014234, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8485417880394566, + "language_loss": 0.52262956, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54540485, + "num_input_tokens_seen": 76975650, + "step": 3566, + "time_per_iteration": 3.1190643310546875 + }, + { + "auxiliary_loss_clip": 0.01159112, + "auxiliary_loss_mlp": 0.01148159, + "balance_loss_clip": 1.0021019, + "balance_loss_mlp": 1.00072098, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.843089684290939, + "language_loss": 0.67360699, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69667971, + "num_input_tokens_seen": 76992615, + "step": 3567, + "time_per_iteration": 2.537759304046631 + }, + { + "auxiliary_loss_clip": 0.01144403, + "auxiliary_loss_mlp": 0.01148825, + "balance_loss_clip": 1.00214219, + "balance_loss_mlp": 1.00110054, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.5668309824245075, + "language_loss": 0.74283046, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.76576269, + "num_input_tokens_seen": 77017005, + "step": 3568, + "time_per_iteration": 2.723902940750122 + }, + { + "auxiliary_loss_clip": 0.01159127, + "auxiliary_loss_mlp": 0.01149281, + "balance_loss_clip": 1.00208402, + "balance_loss_mlp": 1.00108004, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 3.0655783161084877, + "language_loss": 0.77455693, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.79764104, + "num_input_tokens_seen": 77034990, + "step": 3569, + "time_per_iteration": 3.9518110752105713 + }, + { + "auxiliary_loss_clip": 0.01160599, + "auxiliary_loss_mlp": 0.01149617, + "balance_loss_clip": 1.00227332, + "balance_loss_mlp": 1.00093889, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.372181755564203, + "language_loss": 0.71037281, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.73347497, + "num_input_tokens_seen": 77052610, + "step": 3570, + "time_per_iteration": 2.5455799102783203 + }, + { + "auxiliary_loss_clip": 0.01175999, + "auxiliary_loss_mlp": 0.01149383, + "balance_loss_clip": 1.00217104, + "balance_loss_mlp": 1.00108647, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.2301956758797084, + "language_loss": 0.78593743, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80919123, + "num_input_tokens_seen": 77072475, + "step": 3571, + "time_per_iteration": 3.958292007446289 + }, + { + "auxiliary_loss_clip": 0.01160286, + "auxiliary_loss_mlp": 0.0114971, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.00093722, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.041493054647931, + "language_loss": 0.82774448, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85084444, + "num_input_tokens_seen": 77089930, + "step": 3572, + "time_per_iteration": 2.511777877807617 + }, + { + "auxiliary_loss_clip": 0.01142987, + "auxiliary_loss_mlp": 0.01149356, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00077331, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.4028268661029357, + "language_loss": 0.64879847, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67172182, + "num_input_tokens_seen": 77108970, + "step": 3573, + "time_per_iteration": 3.991732597351074 + }, + { + "auxiliary_loss_clip": 0.01175764, + "auxiliary_loss_mlp": 0.01148872, + "balance_loss_clip": 1.00213289, + "balance_loss_mlp": 1.00105202, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 1.8879549152355601, + "language_loss": 0.75284064, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77608693, + "num_input_tokens_seen": 77126045, + "step": 3574, + "time_per_iteration": 2.499575614929199 + }, + { + "auxiliary_loss_clip": 0.01159078, + "auxiliary_loss_mlp": 0.0114895, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00084496, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.8264668888905693, + "language_loss": 0.72376895, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74684918, + "num_input_tokens_seen": 77144600, + "step": 3575, + "time_per_iteration": 2.5361320972442627 + }, + { + "auxiliary_loss_clip": 0.01143355, + "auxiliary_loss_mlp": 0.01148445, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00110209, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.6763792966180084, + "language_loss": 0.68352306, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.7064411, + "num_input_tokens_seen": 77162965, + "step": 3576, + "time_per_iteration": 2.5731515884399414 + }, + { + "auxiliary_loss_clip": 0.01160321, + "auxiliary_loss_mlp": 0.01149034, + "balance_loss_clip": 1.00221992, + "balance_loss_mlp": 1.00073743, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 1.9153512483948965, + "language_loss": 0.88377076, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90686429, + "num_input_tokens_seen": 77179960, + "step": 3577, + "time_per_iteration": 2.5051636695861816 + }, + { + "auxiliary_loss_clip": 0.01142505, + "auxiliary_loss_mlp": 0.01136791, + "balance_loss_clip": 1.00203633, + "balance_loss_mlp": 1.00022507, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8091686874122495, + "language_loss": 0.56175315, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58454609, + "num_input_tokens_seen": 77239500, + "step": 3578, + "time_per_iteration": 3.1168243885040283 + }, + { + "auxiliary_loss_clip": 0.01160422, + "auxiliary_loss_mlp": 0.00748658, + "balance_loss_clip": 1.00215292, + "balance_loss_mlp": 1.00092769, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.6625516434002212, + "language_loss": 0.88727391, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90636468, + "num_input_tokens_seen": 77254680, + "step": 3579, + "time_per_iteration": 2.568002700805664 + }, + { + "auxiliary_loss_clip": 0.01160397, + "auxiliary_loss_mlp": 0.01148784, + "balance_loss_clip": 1.00215459, + "balance_loss_mlp": 1.00086951, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.6471629156754706, + "language_loss": 0.77825332, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80134511, + "num_input_tokens_seen": 77274060, + "step": 3580, + "time_per_iteration": 2.581239938735962 + }, + { + "auxiliary_loss_clip": 0.01159921, + "auxiliary_loss_mlp": 0.01148288, + "balance_loss_clip": 1.00213993, + "balance_loss_mlp": 1.00104094, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.8087784647506862, + "language_loss": 0.72784394, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75092602, + "num_input_tokens_seen": 77293255, + "step": 3581, + "time_per_iteration": 2.5764212608337402 + }, + { + "auxiliary_loss_clip": 0.01159006, + "auxiliary_loss_mlp": 0.01148626, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.00090158, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 1.9637499075656446, + "language_loss": 0.7052815, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.72835785, + "num_input_tokens_seen": 77312390, + "step": 3582, + "time_per_iteration": 2.5883026123046875 + }, + { + "auxiliary_loss_clip": 0.01175792, + "auxiliary_loss_mlp": 0.01148434, + "balance_loss_clip": 1.00213385, + "balance_loss_mlp": 1.00090075, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.190250722121569, + "language_loss": 0.8425473, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86578953, + "num_input_tokens_seen": 77330985, + "step": 3583, + "time_per_iteration": 2.495013475418091 + }, + { + "auxiliary_loss_clip": 0.01143941, + "auxiliary_loss_mlp": 0.01149412, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00130689, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 2.2324617576671497, + "language_loss": 0.82860559, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85153913, + "num_input_tokens_seen": 77350770, + "step": 3584, + "time_per_iteration": 2.613283157348633 + }, + { + "auxiliary_loss_clip": 0.01127313, + "auxiliary_loss_mlp": 0.00748754, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00114799, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 1.94690201990647, + "language_loss": 0.90089405, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.91965473, + "num_input_tokens_seen": 77370510, + "step": 3585, + "time_per_iteration": 2.6385462284088135 + }, + { + "auxiliary_loss_clip": 0.01158795, + "auxiliary_loss_mlp": 0.01148127, + "balance_loss_clip": 1.0020293, + "balance_loss_mlp": 1.00097525, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 2.0562106032548133, + "language_loss": 0.74295902, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76602829, + "num_input_tokens_seen": 77390645, + "step": 3586, + "time_per_iteration": 2.5405805110931396 + }, + { + "auxiliary_loss_clip": 0.01112556, + "auxiliary_loss_mlp": 0.01149178, + "balance_loss_clip": 1.00195754, + "balance_loss_mlp": 1.00107265, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 1.711658840017594, + "language_loss": 0.83127362, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85389096, + "num_input_tokens_seen": 77409655, + "step": 3587, + "time_per_iteration": 2.6887505054473877 + }, + { + "auxiliary_loss_clip": 0.01111061, + "auxiliary_loss_mlp": 0.00748856, + "balance_loss_clip": 1.00185013, + "balance_loss_mlp": 1.00116527, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.9038693862214409, + "language_loss": 0.75811493, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77671409, + "num_input_tokens_seen": 77430560, + "step": 3588, + "time_per_iteration": 2.7435789108276367 + }, + { + "auxiliary_loss_clip": 0.01160402, + "auxiliary_loss_mlp": 0.01148658, + "balance_loss_clip": 1.0021652, + "balance_loss_mlp": 1.00093412, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 2.548177895144292, + "language_loss": 0.80664253, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.82973313, + "num_input_tokens_seen": 77455000, + "step": 3589, + "time_per_iteration": 2.637143611907959 + }, + { + "auxiliary_loss_clip": 0.01175842, + "auxiliary_loss_mlp": 0.01148424, + "balance_loss_clip": 1.00224948, + "balance_loss_mlp": 1.00089085, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 1.9428822820633282, + "language_loss": 0.72517776, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74842036, + "num_input_tokens_seen": 77475075, + "step": 3590, + "time_per_iteration": 2.5756304264068604 + }, + { + "auxiliary_loss_clip": 0.01143426, + "auxiliary_loss_mlp": 0.01149014, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00090814, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.71869838516325, + "language_loss": 0.83866346, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86158788, + "num_input_tokens_seen": 77495945, + "step": 3591, + "time_per_iteration": 2.6087141036987305 + }, + { + "auxiliary_loss_clip": 0.01126205, + "auxiliary_loss_mlp": 0.01149512, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.00111985, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 1.566808576697416, + "language_loss": 0.69134235, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71409953, + "num_input_tokens_seen": 77517140, + "step": 3592, + "time_per_iteration": 2.7000858783721924 + }, + { + "auxiliary_loss_clip": 0.01127272, + "auxiliary_loss_mlp": 0.01148773, + "balance_loss_clip": 1.00195777, + "balance_loss_mlp": 1.00114417, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.6656056691711996, + "language_loss": 0.8372122, + "learning_rate": 3.647846011515108e-06, + "loss": 0.85997272, + "num_input_tokens_seen": 77536085, + "step": 3593, + "time_per_iteration": 2.6077687740325928 + }, + { + "auxiliary_loss_clip": 0.01127053, + "auxiliary_loss_mlp": 0.01149105, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00099945, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 4.069601871871483, + "language_loss": 0.75575012, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.7785117, + "num_input_tokens_seen": 77553675, + "step": 3594, + "time_per_iteration": 2.61423659324646 + }, + { + "auxiliary_loss_clip": 0.01158937, + "auxiliary_loss_mlp": 0.0114829, + "balance_loss_clip": 1.00205958, + "balance_loss_mlp": 1.00094712, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 1.4909621707019023, + "language_loss": 0.80516148, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82823378, + "num_input_tokens_seen": 77573360, + "step": 3595, + "time_per_iteration": 2.565725803375244 + }, + { + "auxiliary_loss_clip": 0.01125922, + "auxiliary_loss_mlp": 0.01148852, + "balance_loss_clip": 1.00195491, + "balance_loss_mlp": 1.00084186, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.2771987451711673, + "language_loss": 0.78742546, + "learning_rate": 3.647183604506897e-06, + "loss": 0.81017315, + "num_input_tokens_seen": 77591865, + "step": 3596, + "time_per_iteration": 2.6169896125793457 + }, + { + "auxiliary_loss_clip": 0.01093523, + "auxiliary_loss_mlp": 0.01148078, + "balance_loss_clip": 1.00180602, + "balance_loss_mlp": 1.00102127, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.4741923804745953, + "language_loss": 0.82846475, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85088068, + "num_input_tokens_seen": 77611600, + "step": 3597, + "time_per_iteration": 2.6922271251678467 + }, + { + "auxiliary_loss_clip": 0.01143543, + "auxiliary_loss_mlp": 0.00748885, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00098383, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.5540832702134435, + "language_loss": 0.80652094, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82544523, + "num_input_tokens_seen": 77630665, + "step": 3598, + "time_per_iteration": 2.5697906017303467 + }, + { + "auxiliary_loss_clip": 0.0114388, + "auxiliary_loss_mlp": 0.01148914, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00099874, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 2.125689496935353, + "language_loss": 0.81876171, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.84168965, + "num_input_tokens_seen": 77650835, + "step": 3599, + "time_per_iteration": 2.621713876724243 + }, + { + "auxiliary_loss_clip": 0.01127076, + "auxiliary_loss_mlp": 0.0074892, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00113583, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 4.160999083082297, + "language_loss": 0.7651931, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78395307, + "num_input_tokens_seen": 77669000, + "step": 3600, + "time_per_iteration": 2.620728015899658 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01149067, + "balance_loss_clip": 1.0021193, + "balance_loss_mlp": 1.00115252, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.6494901506863513, + "language_loss": 0.79847193, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82124877, + "num_input_tokens_seen": 77688745, + "step": 3601, + "time_per_iteration": 2.6738123893737793 + }, + { + "auxiliary_loss_clip": 0.01175743, + "auxiliary_loss_mlp": 0.01148546, + "balance_loss_clip": 1.00208879, + "balance_loss_mlp": 1.00110793, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.9127797089704799, + "language_loss": 0.83330929, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85655224, + "num_input_tokens_seen": 77708445, + "step": 3602, + "time_per_iteration": 2.5221335887908936 + }, + { + "auxiliary_loss_clip": 0.01175771, + "auxiliary_loss_mlp": 0.01149055, + "balance_loss_clip": 1.00213885, + "balance_loss_mlp": 1.00104475, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.9038076985712202, + "language_loss": 0.74545944, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76870775, + "num_input_tokens_seen": 77728465, + "step": 3603, + "time_per_iteration": 2.501279830932617 + }, + { + "auxiliary_loss_clip": 0.01144684, + "auxiliary_loss_mlp": 0.01148965, + "balance_loss_clip": 1.00212991, + "balance_loss_mlp": 1.00095451, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.851718129229352, + "language_loss": 0.74023414, + "learning_rate": 3.645414438132855e-06, + "loss": 0.7631706, + "num_input_tokens_seen": 77746735, + "step": 3604, + "time_per_iteration": 2.582855701446533 + }, + { + "auxiliary_loss_clip": 0.0115925, + "auxiliary_loss_mlp": 0.01149081, + "balance_loss_clip": 1.00209606, + "balance_loss_mlp": 1.00107121, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 1.7419937666453436, + "language_loss": 0.80057245, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82365572, + "num_input_tokens_seen": 77768105, + "step": 3605, + "time_per_iteration": 2.600228786468506 + }, + { + "auxiliary_loss_clip": 0.01172771, + "auxiliary_loss_mlp": 0.01136881, + "balance_loss_clip": 1.00166059, + "balance_loss_mlp": 1.00031471, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.6936509453584452, + "language_loss": 0.58369607, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60679257, + "num_input_tokens_seen": 77833750, + "step": 3606, + "time_per_iteration": 3.1862916946411133 + }, + { + "auxiliary_loss_clip": 0.01175949, + "auxiliary_loss_mlp": 0.01149177, + "balance_loss_clip": 1.00221634, + "balance_loss_mlp": 1.00107181, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 1.9482997373559958, + "language_loss": 0.72773659, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75098783, + "num_input_tokens_seen": 77853780, + "step": 3607, + "time_per_iteration": 3.9079573154449463 + }, + { + "auxiliary_loss_clip": 0.01142778, + "auxiliary_loss_mlp": 0.01148893, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00097799, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 1.9313421947812401, + "language_loss": 0.76554477, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78846145, + "num_input_tokens_seen": 77872575, + "step": 3608, + "time_per_iteration": 3.9961774349212646 + }, + { + "auxiliary_loss_clip": 0.0109393, + "auxiliary_loss_mlp": 0.01148826, + "balance_loss_clip": 1.00161481, + "balance_loss_mlp": 1.00100636, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.6653350352109992, + "language_loss": 0.74414259, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76657015, + "num_input_tokens_seen": 77892700, + "step": 3609, + "time_per_iteration": 2.7340235710144043 + }, + { + "auxiliary_loss_clip": 0.0114792, + "auxiliary_loss_mlp": 0.01148924, + "balance_loss_clip": 1.00203907, + "balance_loss_mlp": 1.00119972, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.8666355313793623, + "language_loss": 0.88614142, + "learning_rate": 3.6440849425579e-06, + "loss": 0.90910983, + "num_input_tokens_seen": 77911060, + "step": 3610, + "time_per_iteration": 2.5514371395111084 + }, + { + "auxiliary_loss_clip": 0.01175723, + "auxiliary_loss_mlp": 0.01148837, + "balance_loss_clip": 1.00213385, + "balance_loss_mlp": 1.00092244, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 1.5296556063790858, + "language_loss": 0.7719937, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79523933, + "num_input_tokens_seen": 77929930, + "step": 3611, + "time_per_iteration": 5.364413738250732 + }, + { + "auxiliary_loss_clip": 0.01095929, + "auxiliary_loss_mlp": 0.01148419, + "balance_loss_clip": 1.00181818, + "balance_loss_mlp": 1.00117135, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 1.9017743126035251, + "language_loss": 0.63074899, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65319246, + "num_input_tokens_seen": 77949060, + "step": 3612, + "time_per_iteration": 2.704228162765503 + }, + { + "auxiliary_loss_clip": 0.01112243, + "auxiliary_loss_mlp": 0.01148581, + "balance_loss_clip": 1.00176525, + "balance_loss_mlp": 1.000857, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.689978796287111, + "language_loss": 0.7550981, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77770633, + "num_input_tokens_seen": 77967920, + "step": 3613, + "time_per_iteration": 2.653812885284424 + }, + { + "auxiliary_loss_clip": 0.01127121, + "auxiliary_loss_mlp": 0.0114868, + "balance_loss_clip": 1.00189483, + "balance_loss_mlp": 1.00105155, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 2.1448724350352624, + "language_loss": 0.71094447, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73370248, + "num_input_tokens_seen": 77985330, + "step": 3614, + "time_per_iteration": 2.651897430419922 + }, + { + "auxiliary_loss_clip": 0.01159, + "auxiliary_loss_mlp": 0.01148758, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.00122452, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.6613897561684863, + "language_loss": 0.73704898, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.76012653, + "num_input_tokens_seen": 78003105, + "step": 3615, + "time_per_iteration": 2.5315070152282715 + }, + { + "auxiliary_loss_clip": 0.0116048, + "auxiliary_loss_mlp": 0.0114952, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00103283, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.0928554548260694, + "language_loss": 0.89727402, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92037404, + "num_input_tokens_seen": 78019655, + "step": 3616, + "time_per_iteration": 2.5514793395996094 + }, + { + "auxiliary_loss_clip": 0.01111719, + "auxiliary_loss_mlp": 0.01149202, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00090587, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.112467293437978, + "language_loss": 0.81028277, + "learning_rate": 3.642531027869148e-06, + "loss": 0.832892, + "num_input_tokens_seen": 78036025, + "step": 3617, + "time_per_iteration": 2.633237600326538 + }, + { + "auxiliary_loss_clip": 0.01142467, + "auxiliary_loss_mlp": 0.01148656, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.00102675, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 2.137964428119706, + "language_loss": 0.75344318, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77635443, + "num_input_tokens_seen": 78055645, + "step": 3618, + "time_per_iteration": 2.611325740814209 + }, + { + "auxiliary_loss_clip": 0.01160447, + "auxiliary_loss_mlp": 0.01149271, + "balance_loss_clip": 1.00207067, + "balance_loss_mlp": 1.00106966, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 1.718011195854095, + "language_loss": 0.68899983, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71209699, + "num_input_tokens_seen": 78071660, + "step": 3619, + "time_per_iteration": 2.5115678310394287 + }, + { + "auxiliary_loss_clip": 0.01158854, + "auxiliary_loss_mlp": 0.01149189, + "balance_loss_clip": 1.0019356, + "balance_loss_mlp": 1.0011785, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.641441428561215, + "language_loss": 0.78202772, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80510819, + "num_input_tokens_seen": 78091265, + "step": 3620, + "time_per_iteration": 2.557938814163208 + }, + { + "auxiliary_loss_clip": 0.01175617, + "auxiliary_loss_mlp": 0.01148197, + "balance_loss_clip": 1.00208187, + "balance_loss_mlp": 1.00114071, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 2.438436071576929, + "language_loss": 0.79996085, + "learning_rate": 3.641641706164509e-06, + "loss": 0.82319903, + "num_input_tokens_seen": 78110095, + "step": 3621, + "time_per_iteration": 2.5080201625823975 + }, + { + "auxiliary_loss_clip": 0.01160243, + "auxiliary_loss_mlp": 0.01148237, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00079942, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.608474850173913, + "language_loss": 0.87631989, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89940459, + "num_input_tokens_seen": 78129475, + "step": 3622, + "time_per_iteration": 2.5730624198913574 + }, + { + "auxiliary_loss_clip": 0.01160392, + "auxiliary_loss_mlp": 0.01149116, + "balance_loss_clip": 1.0020237, + "balance_loss_mlp": 1.00081968, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 1.8168431110337258, + "language_loss": 0.77173138, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79482639, + "num_input_tokens_seen": 78146880, + "step": 3623, + "time_per_iteration": 2.531092405319214 + }, + { + "auxiliary_loss_clip": 0.01131089, + "auxiliary_loss_mlp": 0.01149086, + "balance_loss_clip": 1.00203478, + "balance_loss_mlp": 1.00107527, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 3.304973343019973, + "language_loss": 0.8460933, + "learning_rate": 3.640974061218741e-06, + "loss": 0.86889505, + "num_input_tokens_seen": 78165065, + "step": 3624, + "time_per_iteration": 2.591301918029785 + }, + { + "auxiliary_loss_clip": 0.01160446, + "auxiliary_loss_mlp": 0.01149702, + "balance_loss_clip": 1.00221455, + "balance_loss_mlp": 1.00150108, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.313539052986113, + "language_loss": 0.77876568, + "learning_rate": 3.640751388440429e-06, + "loss": 0.80186713, + "num_input_tokens_seen": 78180005, + "step": 3625, + "time_per_iteration": 2.506833791732788 + }, + { + "auxiliary_loss_clip": 0.01157648, + "auxiliary_loss_mlp": 0.01136854, + "balance_loss_clip": 1.00172806, + "balance_loss_mlp": 1.00028753, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.7919162952293926, + "language_loss": 0.60724026, + "learning_rate": 3.64052865344466e-06, + "loss": 0.63018525, + "num_input_tokens_seen": 78245350, + "step": 3626, + "time_per_iteration": 3.208979845046997 + }, + { + "auxiliary_loss_clip": 0.01143653, + "auxiliary_loss_mlp": 0.00748926, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.0011375, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.800570943706753, + "language_loss": 0.90333802, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92226386, + "num_input_tokens_seen": 78264165, + "step": 3627, + "time_per_iteration": 2.73987078666687 + }, + { + "auxiliary_loss_clip": 0.01117291, + "auxiliary_loss_mlp": 0.0114879, + "balance_loss_clip": 1.00225115, + "balance_loss_mlp": 1.00087535, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.8763284133780047, + "language_loss": 0.73463857, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75729942, + "num_input_tokens_seen": 78283745, + "step": 3628, + "time_per_iteration": 2.6551501750946045 + }, + { + "auxiliary_loss_clip": 0.01175649, + "auxiliary_loss_mlp": 0.01148525, + "balance_loss_clip": 1.00209808, + "balance_loss_mlp": 1.0009917, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 1.7069707281426258, + "language_loss": 0.7739501, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79719186, + "num_input_tokens_seen": 78302900, + "step": 3629, + "time_per_iteration": 2.5809519290924072 + }, + { + "auxiliary_loss_clip": 0.01159246, + "auxiliary_loss_mlp": 0.01148872, + "balance_loss_clip": 1.0021354, + "balance_loss_mlp": 1.00114799, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.7350516683814208, + "language_loss": 0.71052974, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73361093, + "num_input_tokens_seen": 78326470, + "step": 3630, + "time_per_iteration": 2.621098756790161 + }, + { + "auxiliary_loss_clip": 0.01095778, + "auxiliary_loss_mlp": 0.01148099, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00094748, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.6276185609936815, + "language_loss": 0.76528549, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78772432, + "num_input_tokens_seen": 78345810, + "step": 3631, + "time_per_iteration": 2.7506067752838135 + }, + { + "auxiliary_loss_clip": 0.01175734, + "auxiliary_loss_mlp": 0.01148852, + "balance_loss_clip": 1.00211072, + "balance_loss_mlp": 1.00112796, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 1.9225890537300492, + "language_loss": 0.75438154, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77762747, + "num_input_tokens_seen": 78364085, + "step": 3632, + "time_per_iteration": 2.4992966651916504 + }, + { + "auxiliary_loss_clip": 0.01175538, + "auxiliary_loss_mlp": 0.01147847, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00098157, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 1.9065878986509526, + "language_loss": 0.83301097, + "learning_rate": 3.638967767095249e-06, + "loss": 0.8562448, + "num_input_tokens_seen": 78381385, + "step": 3633, + "time_per_iteration": 2.5453290939331055 + }, + { + "auxiliary_loss_clip": 0.01132119, + "auxiliary_loss_mlp": 0.01149004, + "balance_loss_clip": 1.00210178, + "balance_loss_mlp": 1.00118423, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 2.0905680503758703, + "language_loss": 0.8147676, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83757883, + "num_input_tokens_seen": 78400500, + "step": 3634, + "time_per_iteration": 2.632272720336914 + }, + { + "auxiliary_loss_clip": 0.01158833, + "auxiliary_loss_mlp": 0.01148698, + "balance_loss_clip": 1.00198495, + "balance_loss_mlp": 1.00087845, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 1.9845006840143837, + "language_loss": 0.7512185, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77429384, + "num_input_tokens_seen": 78418340, + "step": 3635, + "time_per_iteration": 2.5796310901641846 + }, + { + "auxiliary_loss_clip": 0.01143858, + "auxiliary_loss_mlp": 0.01147758, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00098729, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 1.871586158418905, + "language_loss": 0.88356632, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90648246, + "num_input_tokens_seen": 78434375, + "step": 3636, + "time_per_iteration": 2.552576780319214 + }, + { + "auxiliary_loss_clip": 0.01126935, + "auxiliary_loss_mlp": 0.00748938, + "balance_loss_clip": 1.00191879, + "balance_loss_mlp": 1.00117135, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 2.014157296349044, + "language_loss": 0.75653189, + "learning_rate": 3.638074464556311e-06, + "loss": 0.77529061, + "num_input_tokens_seen": 78451735, + "step": 3637, + "time_per_iteration": 2.6235249042510986 + }, + { + "auxiliary_loss_clip": 0.01142775, + "auxiliary_loss_mlp": 0.01149011, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00080967, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 2.4763577221446127, + "language_loss": 0.90427303, + "learning_rate": 3.63785098361053e-06, + "loss": 0.9271909, + "num_input_tokens_seen": 78462730, + "step": 3638, + "time_per_iteration": 2.514850378036499 + }, + { + "auxiliary_loss_clip": 0.0115909, + "auxiliary_loss_mlp": 0.0114845, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00120234, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.415475316654035, + "language_loss": 0.89550561, + "learning_rate": 3.637627440557275e-06, + "loss": 0.91858101, + "num_input_tokens_seen": 78476300, + "step": 3639, + "time_per_iteration": 2.485523223876953 + }, + { + "auxiliary_loss_clip": 0.01143607, + "auxiliary_loss_mlp": 0.00748877, + "balance_loss_clip": 1.00204885, + "balance_loss_mlp": 1.00098014, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.7150574461118657, + "language_loss": 0.79609311, + "learning_rate": 3.637403835405024e-06, + "loss": 0.815018, + "num_input_tokens_seen": 78496135, + "step": 3640, + "time_per_iteration": 2.639185905456543 + }, + { + "auxiliary_loss_clip": 0.01160377, + "auxiliary_loss_mlp": 0.01149068, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.00124896, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.9972200113547722, + "language_loss": 0.71930617, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74240059, + "num_input_tokens_seen": 78513855, + "step": 3641, + "time_per_iteration": 2.5135343074798584 + }, + { + "auxiliary_loss_clip": 0.01143595, + "auxiliary_loss_mlp": 0.01148596, + "balance_loss_clip": 1.00214958, + "balance_loss_mlp": 1.00087237, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 1.8757451009351946, + "language_loss": 0.81160307, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83452499, + "num_input_tokens_seen": 78531740, + "step": 3642, + "time_per_iteration": 2.5532970428466797 + }, + { + "auxiliary_loss_clip": 0.01160353, + "auxiliary_loss_mlp": 0.01148468, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00103021, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 1.6130625630531255, + "language_loss": 0.71704221, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.74013048, + "num_input_tokens_seen": 78549600, + "step": 3643, + "time_per_iteration": 2.545543670654297 + }, + { + "auxiliary_loss_clip": 0.01175777, + "auxiliary_loss_mlp": 0.01149231, + "balance_loss_clip": 1.00221539, + "balance_loss_mlp": 1.00112581, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 1.8344135221673552, + "language_loss": 0.68140769, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70465779, + "num_input_tokens_seen": 78573350, + "step": 3644, + "time_per_iteration": 2.7429609298706055 + }, + { + "auxiliary_loss_clip": 0.01175782, + "auxiliary_loss_mlp": 0.01149363, + "balance_loss_clip": 1.00214326, + "balance_loss_mlp": 1.00097167, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.0899232107983416, + "language_loss": 0.77694404, + "learning_rate": 3.636284878455669e-06, + "loss": 0.80019546, + "num_input_tokens_seen": 78591005, + "step": 3645, + "time_per_iteration": 3.9067931175231934 + }, + { + "auxiliary_loss_clip": 0.01159125, + "auxiliary_loss_mlp": 0.01149007, + "balance_loss_clip": 1.0022459, + "balance_loss_mlp": 1.00137806, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 2.3191746130240345, + "language_loss": 0.81996071, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84304202, + "num_input_tokens_seen": 78610645, + "step": 3646, + "time_per_iteration": 2.6488304138183594 + }, + { + "auxiliary_loss_clip": 0.01160185, + "auxiliary_loss_mlp": 0.01148255, + "balance_loss_clip": 1.00223708, + "balance_loss_mlp": 1.00091243, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.9857502498426427, + "language_loss": 0.82782674, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85091114, + "num_input_tokens_seen": 78628340, + "step": 3647, + "time_per_iteration": 3.9538230895996094 + }, + { + "auxiliary_loss_clip": 0.01175657, + "auxiliary_loss_mlp": 0.01148376, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.00103354, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.9450147837251859, + "language_loss": 0.7285347, + "learning_rate": 3.635612759641123e-06, + "loss": 0.75177503, + "num_input_tokens_seen": 78649355, + "step": 3648, + "time_per_iteration": 2.574890375137329 + }, + { + "auxiliary_loss_clip": 0.01127376, + "auxiliary_loss_mlp": 0.0114885, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00093472, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.25175549808283, + "language_loss": 0.74467474, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76743698, + "num_input_tokens_seen": 78664915, + "step": 3649, + "time_per_iteration": 3.993350028991699 + }, + { + "auxiliary_loss_clip": 0.01159069, + "auxiliary_loss_mlp": 0.01148053, + "balance_loss_clip": 1.00221336, + "balance_loss_mlp": 1.00118756, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 1.7938978160512868, + "language_loss": 0.86521745, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88828874, + "num_input_tokens_seen": 78681475, + "step": 3650, + "time_per_iteration": 3.921325445175171 + }, + { + "auxiliary_loss_clip": 0.01143207, + "auxiliary_loss_mlp": 0.01148519, + "balance_loss_clip": 1.00199354, + "balance_loss_mlp": 1.00117683, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 1.9524822151385473, + "language_loss": 0.83748639, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.86040366, + "num_input_tokens_seen": 78702300, + "step": 3651, + "time_per_iteration": 2.612251043319702 + }, + { + "auxiliary_loss_clip": 0.01160246, + "auxiliary_loss_mlp": 0.01148235, + "balance_loss_clip": 1.00208497, + "balance_loss_mlp": 1.00089288, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 2.3460132092112818, + "language_loss": 0.74683124, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76991606, + "num_input_tokens_seen": 78720230, + "step": 3652, + "time_per_iteration": 2.5257811546325684 + }, + { + "auxiliary_loss_clip": 0.01108853, + "auxiliary_loss_mlp": 0.0113778, + "balance_loss_clip": 1.00162315, + "balance_loss_mlp": 1.00045133, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.8195418986919275, + "language_loss": 0.51587355, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53833997, + "num_input_tokens_seen": 78780200, + "step": 3653, + "time_per_iteration": 3.2323009967803955 + }, + { + "auxiliary_loss_clip": 0.01143851, + "auxiliary_loss_mlp": 0.01149043, + "balance_loss_clip": 1.00228882, + "balance_loss_mlp": 1.00122356, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.879534737430245, + "language_loss": 0.74809229, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77102125, + "num_input_tokens_seen": 78800575, + "step": 3654, + "time_per_iteration": 2.602893352508545 + }, + { + "auxiliary_loss_clip": 0.01164808, + "auxiliary_loss_mlp": 0.01149174, + "balance_loss_clip": 1.00249147, + "balance_loss_mlp": 1.00106859, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 3.0400720524660163, + "language_loss": 0.72849739, + "learning_rate": 3.634042312013064e-06, + "loss": 0.75163716, + "num_input_tokens_seen": 78819585, + "step": 3655, + "time_per_iteration": 2.522754669189453 + }, + { + "auxiliary_loss_clip": 0.01144312, + "auxiliary_loss_mlp": 0.01147639, + "balance_loss_clip": 1.00211811, + "balance_loss_mlp": 1.00105917, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.482978028874051, + "language_loss": 0.80511236, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.8280319, + "num_input_tokens_seen": 78837330, + "step": 3656, + "time_per_iteration": 2.620889663696289 + }, + { + "auxiliary_loss_clip": 0.01126966, + "auxiliary_loss_mlp": 0.00748811, + "balance_loss_clip": 1.00218022, + "balance_loss_mlp": 1.00111234, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 1.9983429347786916, + "language_loss": 0.84926981, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86802757, + "num_input_tokens_seen": 78854955, + "step": 3657, + "time_per_iteration": 2.604132652282715 + }, + { + "auxiliary_loss_clip": 0.01159432, + "auxiliary_loss_mlp": 0.01148617, + "balance_loss_clip": 1.00209117, + "balance_loss_mlp": 1.0008924, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.554515328006171, + "language_loss": 0.80592835, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82900888, + "num_input_tokens_seen": 78874965, + "step": 3658, + "time_per_iteration": 2.5688023567199707 + }, + { + "auxiliary_loss_clip": 0.01125053, + "auxiliary_loss_mlp": 0.01136831, + "balance_loss_clip": 1.00152135, + "balance_loss_mlp": 1.00026441, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.7765828221723693, + "language_loss": 0.58245784, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60507673, + "num_input_tokens_seen": 78937740, + "step": 3659, + "time_per_iteration": 3.2877798080444336 + }, + { + "auxiliary_loss_clip": 0.01128284, + "auxiliary_loss_mlp": 0.01148321, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.00097799, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.763495131421254, + "language_loss": 0.73741996, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76018596, + "num_input_tokens_seen": 78955055, + "step": 3660, + "time_per_iteration": 2.6138882637023926 + }, + { + "auxiliary_loss_clip": 0.01158972, + "auxiliary_loss_mlp": 0.01148537, + "balance_loss_clip": 1.00205624, + "balance_loss_mlp": 1.00081313, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 1.6886973454222076, + "language_loss": 0.80985343, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83292854, + "num_input_tokens_seen": 78974895, + "step": 3661, + "time_per_iteration": 2.548978805541992 + }, + { + "auxiliary_loss_clip": 0.01144528, + "auxiliary_loss_mlp": 0.01148285, + "balance_loss_clip": 1.00209963, + "balance_loss_mlp": 1.00094259, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.7314467392822668, + "language_loss": 0.73254251, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75547063, + "num_input_tokens_seen": 78994990, + "step": 3662, + "time_per_iteration": 2.6283278465270996 + }, + { + "auxiliary_loss_clip": 0.0114315, + "auxiliary_loss_mlp": 0.01148717, + "balance_loss_clip": 1.00209093, + "balance_loss_mlp": 1.00137472, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.5404008417904183, + "language_loss": 0.78519315, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80811179, + "num_input_tokens_seen": 79014405, + "step": 3663, + "time_per_iteration": 2.5978152751922607 + }, + { + "auxiliary_loss_clip": 0.01160229, + "auxiliary_loss_mlp": 0.01148993, + "balance_loss_clip": 1.00226915, + "balance_loss_mlp": 1.00117373, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 1.985671859168501, + "language_loss": 0.80557626, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82866853, + "num_input_tokens_seen": 79032375, + "step": 3664, + "time_per_iteration": 2.5455288887023926 + }, + { + "auxiliary_loss_clip": 0.01141912, + "auxiliary_loss_mlp": 0.01148724, + "balance_loss_clip": 1.00185144, + "balance_loss_mlp": 1.00090408, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 7.567886715532415, + "language_loss": 0.76754463, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.79045093, + "num_input_tokens_seen": 79049635, + "step": 3665, + "time_per_iteration": 2.5466065406799316 + }, + { + "auxiliary_loss_clip": 0.01142821, + "auxiliary_loss_mlp": 0.01149163, + "balance_loss_clip": 1.00215614, + "balance_loss_mlp": 1.00124836, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.3747682797012146, + "language_loss": 0.97760475, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00052464, + "num_input_tokens_seen": 79062890, + "step": 3666, + "time_per_iteration": 2.545927047729492 + }, + { + "auxiliary_loss_clip": 0.01159007, + "auxiliary_loss_mlp": 0.00748939, + "balance_loss_clip": 1.00197732, + "balance_loss_mlp": 1.0011282, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 1.5382910222942856, + "language_loss": 0.80391651, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82299602, + "num_input_tokens_seen": 79085495, + "step": 3667, + "time_per_iteration": 2.711524486541748 + }, + { + "auxiliary_loss_clip": 0.0115894, + "auxiliary_loss_mlp": 0.01148986, + "balance_loss_clip": 1.00203443, + "balance_loss_mlp": 1.00097561, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.7066806158366923, + "language_loss": 0.77177018, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79484946, + "num_input_tokens_seen": 79101820, + "step": 3668, + "time_per_iteration": 2.524568557739258 + }, + { + "auxiliary_loss_clip": 0.01159501, + "auxiliary_loss_mlp": 0.01147705, + "balance_loss_clip": 1.00228429, + "balance_loss_mlp": 1.0009346, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.5668372573860252, + "language_loss": 0.71329445, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73636651, + "num_input_tokens_seen": 79123320, + "step": 3669, + "time_per_iteration": 2.618659257888794 + }, + { + "auxiliary_loss_clip": 0.01175512, + "auxiliary_loss_mlp": 0.01147681, + "balance_loss_clip": 1.00208271, + "balance_loss_mlp": 1.00081551, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 1.5665626355471596, + "language_loss": 0.85149777, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87472969, + "num_input_tokens_seen": 79141615, + "step": 3670, + "time_per_iteration": 2.522183895111084 + }, + { + "auxiliary_loss_clip": 0.01144458, + "auxiliary_loss_mlp": 0.01148478, + "balance_loss_clip": 1.00218463, + "balance_loss_mlp": 1.00094438, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.820301854143178, + "language_loss": 0.77024007, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.7931695, + "num_input_tokens_seen": 79164910, + "step": 3671, + "time_per_iteration": 2.725428581237793 + }, + { + "auxiliary_loss_clip": 0.01142832, + "auxiliary_loss_mlp": 0.0114826, + "balance_loss_clip": 1.00219059, + "balance_loss_mlp": 1.00082231, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.0313633363745716, + "language_loss": 0.81001842, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.83292937, + "num_input_tokens_seen": 79179685, + "step": 3672, + "time_per_iteration": 2.5997347831726074 + }, + { + "auxiliary_loss_clip": 0.01159734, + "auxiliary_loss_mlp": 0.01148437, + "balance_loss_clip": 1.00215483, + "balance_loss_mlp": 1.00118947, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.8355100072498254, + "language_loss": 0.73407757, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75715929, + "num_input_tokens_seen": 79196285, + "step": 3673, + "time_per_iteration": 2.5451674461364746 + }, + { + "auxiliary_loss_clip": 0.01128772, + "auxiliary_loss_mlp": 0.01148551, + "balance_loss_clip": 1.00213218, + "balance_loss_mlp": 1.00073099, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9540071462784807, + "language_loss": 0.76392388, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78669703, + "num_input_tokens_seen": 79216060, + "step": 3674, + "time_per_iteration": 2.75406813621521 + }, + { + "auxiliary_loss_clip": 0.01175591, + "auxiliary_loss_mlp": 0.01148111, + "balance_loss_clip": 1.00221395, + "balance_loss_mlp": 1.00105488, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.1906037455375102, + "language_loss": 0.74367803, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76691502, + "num_input_tokens_seen": 79235145, + "step": 3675, + "time_per_iteration": 2.5299439430236816 + }, + { + "auxiliary_loss_clip": 0.01175715, + "auxiliary_loss_mlp": 0.01148997, + "balance_loss_clip": 1.00222731, + "balance_loss_mlp": 1.00089097, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.5965353654154413, + "language_loss": 0.80086416, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82411134, + "num_input_tokens_seen": 79256960, + "step": 3676, + "time_per_iteration": 2.554025650024414 + }, + { + "auxiliary_loss_clip": 0.01142261, + "auxiliary_loss_mlp": 0.01148304, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00105679, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 1.8180504351438063, + "language_loss": 0.7513206, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77422631, + "num_input_tokens_seen": 79274860, + "step": 3677, + "time_per_iteration": 2.5805325508117676 + }, + { + "auxiliary_loss_clip": 0.01127206, + "auxiliary_loss_mlp": 0.01148548, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.00120533, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 1.6412540671006872, + "language_loss": 0.82775342, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85051095, + "num_input_tokens_seen": 79294005, + "step": 3678, + "time_per_iteration": 2.621603488922119 + }, + { + "auxiliary_loss_clip": 0.01111639, + "auxiliary_loss_mlp": 0.0114807, + "balance_loss_clip": 1.00211287, + "balance_loss_mlp": 1.00101388, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.6348371122207288, + "language_loss": 0.8900528, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91264987, + "num_input_tokens_seen": 79314005, + "step": 3679, + "time_per_iteration": 2.685516595840454 + }, + { + "auxiliary_loss_clip": 0.01159825, + "auxiliary_loss_mlp": 0.01149655, + "balance_loss_clip": 1.0021888, + "balance_loss_mlp": 1.00116765, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.090082608124157, + "language_loss": 0.86357391, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88666868, + "num_input_tokens_seen": 79331030, + "step": 3680, + "time_per_iteration": 2.505521535873413 + }, + { + "auxiliary_loss_clip": 0.01125959, + "auxiliary_loss_mlp": 0.011483, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00133836, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 1.850359255231559, + "language_loss": 0.81517565, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.83791822, + "num_input_tokens_seen": 79348560, + "step": 3681, + "time_per_iteration": 2.611769676208496 + }, + { + "auxiliary_loss_clip": 0.01175447, + "auxiliary_loss_mlp": 0.00748946, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.00126231, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.3422094982933173, + "language_loss": 0.80110353, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82034755, + "num_input_tokens_seen": 79367175, + "step": 3682, + "time_per_iteration": 3.907386541366577 + }, + { + "auxiliary_loss_clip": 0.01144968, + "auxiliary_loss_mlp": 0.01148311, + "balance_loss_clip": 1.0021224, + "balance_loss_mlp": 1.00087333, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 1.5041789089909654, + "language_loss": 0.77723354, + "learning_rate": 3.627730188876638e-06, + "loss": 0.80016637, + "num_input_tokens_seen": 79388435, + "step": 3683, + "time_per_iteration": 2.6072216033935547 + }, + { + "auxiliary_loss_clip": 0.01143696, + "auxiliary_loss_mlp": 0.01148484, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00104618, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 1.95035857423128, + "language_loss": 0.72387427, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74679613, + "num_input_tokens_seen": 79407910, + "step": 3684, + "time_per_iteration": 4.052606582641602 + }, + { + "auxiliary_loss_clip": 0.01095167, + "auxiliary_loss_mlp": 0.01148215, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00106299, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 2.172417950789652, + "language_loss": 0.79829776, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82073158, + "num_input_tokens_seen": 79424020, + "step": 3685, + "time_per_iteration": 2.681363105773926 + }, + { + "auxiliary_loss_clip": 0.01175463, + "auxiliary_loss_mlp": 0.01147753, + "balance_loss_clip": 1.00214756, + "balance_loss_mlp": 1.00117326, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.356360165345551, + "language_loss": 0.87294483, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89617705, + "num_input_tokens_seen": 79445605, + "step": 3686, + "time_per_iteration": 2.538849115371704 + }, + { + "auxiliary_loss_clip": 0.01158742, + "auxiliary_loss_mlp": 0.01148007, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00094998, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.8723696001167833, + "language_loss": 0.77711821, + "learning_rate": 3.626824502298707e-06, + "loss": 0.80018562, + "num_input_tokens_seen": 79463850, + "step": 3687, + "time_per_iteration": 3.940661907196045 + }, + { + "auxiliary_loss_clip": 0.01144677, + "auxiliary_loss_mlp": 0.01148847, + "balance_loss_clip": 1.00215244, + "balance_loss_mlp": 1.00121844, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.7455680095121369, + "language_loss": 0.84720576, + "learning_rate": 3.626597926409383e-06, + "loss": 0.87014103, + "num_input_tokens_seen": 79482845, + "step": 3688, + "time_per_iteration": 4.016922473907471 + }, + { + "auxiliary_loss_clip": 0.01111752, + "auxiliary_loss_mlp": 0.01148404, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00096607, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 1.71683607536902, + "language_loss": 0.8085314, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83113301, + "num_input_tokens_seen": 79501550, + "step": 3689, + "time_per_iteration": 2.6719703674316406 + }, + { + "auxiliary_loss_clip": 0.01142126, + "auxiliary_loss_mlp": 0.0114829, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00104332, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 1.7352360718863624, + "language_loss": 0.700966, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72387016, + "num_input_tokens_seen": 79519680, + "step": 3690, + "time_per_iteration": 2.595357656478882 + }, + { + "auxiliary_loss_clip": 0.01159635, + "auxiliary_loss_mlp": 0.00749056, + "balance_loss_clip": 1.0020293, + "balance_loss_mlp": 1.00131547, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.563275496105679, + "language_loss": 0.72027695, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.73936391, + "num_input_tokens_seen": 79539000, + "step": 3691, + "time_per_iteration": 2.5633862018585205 + }, + { + "auxiliary_loss_clip": 0.01159112, + "auxiliary_loss_mlp": 0.01148541, + "balance_loss_clip": 1.00220037, + "balance_loss_mlp": 1.00110269, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 2.7025464584487704, + "language_loss": 0.71396375, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73704028, + "num_input_tokens_seen": 79559695, + "step": 3692, + "time_per_iteration": 2.5714685916900635 + }, + { + "auxiliary_loss_clip": 0.01164562, + "auxiliary_loss_mlp": 0.01148004, + "balance_loss_clip": 1.00234032, + "balance_loss_mlp": 1.00104237, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 1.5369622387435182, + "language_loss": 0.87602854, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89915419, + "num_input_tokens_seen": 79579095, + "step": 3693, + "time_per_iteration": 2.567808151245117 + }, + { + "auxiliary_loss_clip": 0.01159537, + "auxiliary_loss_mlp": 0.01147223, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00092924, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 2.157136196402419, + "language_loss": 0.85239196, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87545955, + "num_input_tokens_seen": 79596430, + "step": 3694, + "time_per_iteration": 2.538569688796997 + }, + { + "auxiliary_loss_clip": 0.01128213, + "auxiliary_loss_mlp": 0.01148712, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00098753, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.6734850203415155, + "language_loss": 0.69185936, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71462864, + "num_input_tokens_seen": 79615825, + "step": 3695, + "time_per_iteration": 2.6146445274353027 + }, + { + "auxiliary_loss_clip": 0.01143447, + "auxiliary_loss_mlp": 0.01147576, + "balance_loss_clip": 1.00200295, + "balance_loss_mlp": 1.00099635, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.397346476960059, + "language_loss": 0.71512985, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73804003, + "num_input_tokens_seen": 79637875, + "step": 3696, + "time_per_iteration": 2.6485939025878906 + }, + { + "auxiliary_loss_clip": 0.01158924, + "auxiliary_loss_mlp": 0.01147819, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.0009532, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.75198378018584, + "language_loss": 0.87391937, + "learning_rate": 3.624555968803217e-06, + "loss": 0.89698672, + "num_input_tokens_seen": 79656970, + "step": 3697, + "time_per_iteration": 2.5937647819519043 + }, + { + "auxiliary_loss_clip": 0.01143399, + "auxiliary_loss_mlp": 0.01146802, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00088978, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.6588468072008782, + "language_loss": 0.66097778, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68387979, + "num_input_tokens_seen": 79680275, + "step": 3698, + "time_per_iteration": 2.742452621459961 + }, + { + "auxiliary_loss_clip": 0.01158834, + "auxiliary_loss_mlp": 0.01148299, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.00095654, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.6305388034976478, + "language_loss": 0.82380992, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84688127, + "num_input_tokens_seen": 79701255, + "step": 3699, + "time_per_iteration": 2.67177152633667 + }, + { + "auxiliary_loss_clip": 0.01144716, + "auxiliary_loss_mlp": 0.01147652, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.00107241, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.4557928748610125, + "language_loss": 0.79778832, + "learning_rate": 3.62387420709809e-06, + "loss": 0.82071203, + "num_input_tokens_seen": 79721315, + "step": 3700, + "time_per_iteration": 2.595796823501587 + }, + { + "auxiliary_loss_clip": 0.01116818, + "auxiliary_loss_mlp": 0.01148381, + "balance_loss_clip": 1.00221097, + "balance_loss_mlp": 1.00075173, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 2.0918823817052754, + "language_loss": 0.72238791, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74503988, + "num_input_tokens_seen": 79742705, + "step": 3701, + "time_per_iteration": 2.90248441696167 + }, + { + "auxiliary_loss_clip": 0.01164427, + "auxiliary_loss_mlp": 0.01148008, + "balance_loss_clip": 1.00230932, + "balance_loss_mlp": 1.00095153, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.6584714496984945, + "language_loss": 0.80149484, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.82461917, + "num_input_tokens_seen": 79763000, + "step": 3702, + "time_per_iteration": 2.5570900440216064 + }, + { + "auxiliary_loss_clip": 0.01158515, + "auxiliary_loss_mlp": 0.01147641, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00087094, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 1.9633713895624236, + "language_loss": 0.78155053, + "learning_rate": 3.623191891195723e-06, + "loss": 0.8046121, + "num_input_tokens_seen": 79781335, + "step": 3703, + "time_per_iteration": 2.540567636489868 + }, + { + "auxiliary_loss_clip": 0.01159119, + "auxiliary_loss_mlp": 0.01147527, + "balance_loss_clip": 1.00204146, + "balance_loss_mlp": 1.00075662, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 3.1933111788955895, + "language_loss": 0.74109387, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.76416028, + "num_input_tokens_seen": 79800150, + "step": 3704, + "time_per_iteration": 2.536869525909424 + }, + { + "auxiliary_loss_clip": 0.01127166, + "auxiliary_loss_mlp": 0.01147028, + "balance_loss_clip": 1.0021131, + "balance_loss_mlp": 1.00121093, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.7497648736365876, + "language_loss": 0.64276218, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.6655041, + "num_input_tokens_seen": 79822390, + "step": 3705, + "time_per_iteration": 2.856774091720581 + }, + { + "auxiliary_loss_clip": 0.0112319, + "auxiliary_loss_mlp": 0.01135026, + "balance_loss_clip": 1.00251722, + "balance_loss_mlp": 0.99998552, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.1926823177993686, + "language_loss": 0.65204644, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67462862, + "num_input_tokens_seen": 79873350, + "step": 3706, + "time_per_iteration": 3.076089382171631 + }, + { + "auxiliary_loss_clip": 0.01143471, + "auxiliary_loss_mlp": 0.01147784, + "balance_loss_clip": 1.00200582, + "balance_loss_mlp": 1.00091767, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 1.7111706503308568, + "language_loss": 0.80270207, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82561469, + "num_input_tokens_seen": 79891715, + "step": 3707, + "time_per_iteration": 2.611074209213257 + }, + { + "auxiliary_loss_clip": 0.01175235, + "auxiliary_loss_mlp": 0.01147399, + "balance_loss_clip": 1.00210285, + "balance_loss_mlp": 1.00091481, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 1.8402414558460316, + "language_loss": 0.78172261, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80494893, + "num_input_tokens_seen": 79911175, + "step": 3708, + "time_per_iteration": 2.562655448913574 + }, + { + "auxiliary_loss_clip": 0.01144039, + "auxiliary_loss_mlp": 0.01147608, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.00083756, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.9038143068639186, + "language_loss": 0.80274236, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.8256588, + "num_input_tokens_seen": 79931875, + "step": 3709, + "time_per_iteration": 2.697542190551758 + }, + { + "auxiliary_loss_clip": 0.01158651, + "auxiliary_loss_mlp": 0.00748892, + "balance_loss_clip": 1.00197756, + "balance_loss_mlp": 1.00107265, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 1.6983349332954796, + "language_loss": 0.68584412, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70491958, + "num_input_tokens_seen": 79952445, + "step": 3710, + "time_per_iteration": 2.603637933731079 + }, + { + "auxiliary_loss_clip": 0.01128681, + "auxiliary_loss_mlp": 0.01147831, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00106049, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.243844594031844, + "language_loss": 0.90480065, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92756575, + "num_input_tokens_seen": 79971030, + "step": 3711, + "time_per_iteration": 2.62164306640625 + }, + { + "auxiliary_loss_clip": 0.01142181, + "auxiliary_loss_mlp": 0.01147715, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00113523, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 3.7872148645991537, + "language_loss": 0.88877684, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91167581, + "num_input_tokens_seen": 79982085, + "step": 3712, + "time_per_iteration": 2.5089128017425537 + }, + { + "auxiliary_loss_clip": 0.01175255, + "auxiliary_loss_mlp": 0.01147295, + "balance_loss_clip": 1.00212002, + "balance_loss_mlp": 1.00119162, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 2.5523211696434633, + "language_loss": 0.74580073, + "learning_rate": 3.620913505310117e-06, + "loss": 0.76902628, + "num_input_tokens_seen": 79997460, + "step": 3713, + "time_per_iteration": 2.4909374713897705 + }, + { + "auxiliary_loss_clip": 0.01100733, + "auxiliary_loss_mlp": 0.01147319, + "balance_loss_clip": 1.00220764, + "balance_loss_mlp": 1.00092947, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.797895046511924, + "language_loss": 0.62810999, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.65059048, + "num_input_tokens_seen": 80022450, + "step": 3714, + "time_per_iteration": 2.8830811977386475 + }, + { + "auxiliary_loss_clip": 0.01127442, + "auxiliary_loss_mlp": 0.01147371, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.00079179, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.7998163714641977, + "language_loss": 0.79231107, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81505919, + "num_input_tokens_seen": 80042100, + "step": 3715, + "time_per_iteration": 2.677985906600952 + }, + { + "auxiliary_loss_clip": 0.01125658, + "auxiliary_loss_mlp": 0.01148099, + "balance_loss_clip": 1.00195396, + "balance_loss_mlp": 1.00132871, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.8535962039523874, + "language_loss": 0.76600736, + "learning_rate": 3.620228790579645e-06, + "loss": 0.78874493, + "num_input_tokens_seen": 80059690, + "step": 3716, + "time_per_iteration": 2.6247119903564453 + }, + { + "auxiliary_loss_clip": 0.01143399, + "auxiliary_loss_mlp": 0.01147706, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00093544, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 3.401210949030849, + "language_loss": 0.797068, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81997907, + "num_input_tokens_seen": 80076060, + "step": 3717, + "time_per_iteration": 2.5527472496032715 + }, + { + "auxiliary_loss_clip": 0.01094873, + "auxiliary_loss_mlp": 0.01148094, + "balance_loss_clip": 1.00185096, + "balance_loss_mlp": 1.00094175, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.319326890300204, + "language_loss": 0.68368554, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70611525, + "num_input_tokens_seen": 80094760, + "step": 3718, + "time_per_iteration": 2.714169979095459 + }, + { + "auxiliary_loss_clip": 0.01144071, + "auxiliary_loss_mlp": 0.0114717, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00087678, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.5372775512323285, + "language_loss": 0.80553257, + "learning_rate": 3.619543522896045e-06, + "loss": 0.82844508, + "num_input_tokens_seen": 80114475, + "step": 3719, + "time_per_iteration": 2.6599810123443604 + }, + { + "auxiliary_loss_clip": 0.01142472, + "auxiliary_loss_mlp": 0.01148408, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.00116062, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 1.964831799536542, + "language_loss": 0.8727448, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.89565361, + "num_input_tokens_seen": 80132920, + "step": 3720, + "time_per_iteration": 2.5641286373138428 + }, + { + "auxiliary_loss_clip": 0.01143121, + "auxiliary_loss_mlp": 0.0114741, + "balance_loss_clip": 1.00209367, + "balance_loss_mlp": 1.00083005, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.7190797892423455, + "language_loss": 0.74520338, + "learning_rate": 3.619086370692945e-06, + "loss": 0.76810867, + "num_input_tokens_seen": 80152845, + "step": 3721, + "time_per_iteration": 4.0486297607421875 + }, + { + "auxiliary_loss_clip": 0.01175518, + "auxiliary_loss_mlp": 0.01147853, + "balance_loss_clip": 1.00217104, + "balance_loss_mlp": 1.00098765, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.5394136033146877, + "language_loss": 0.79423809, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.8174718, + "num_input_tokens_seen": 80170680, + "step": 3722, + "time_per_iteration": 3.8952460289001465 + }, + { + "auxiliary_loss_clip": 0.01125402, + "auxiliary_loss_mlp": 0.01147786, + "balance_loss_clip": 1.0018841, + "balance_loss_mlp": 1.00101566, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 1.9228940924573923, + "language_loss": 0.82246721, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84519905, + "num_input_tokens_seen": 80189030, + "step": 3723, + "time_per_iteration": 2.61044979095459 + }, + { + "auxiliary_loss_clip": 0.01175396, + "auxiliary_loss_mlp": 0.01147768, + "balance_loss_clip": 1.00209522, + "balance_loss_mlp": 1.00109279, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 1.633366949462437, + "language_loss": 0.84221888, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.8654505, + "num_input_tokens_seen": 80208365, + "step": 3724, + "time_per_iteration": 3.963634490966797 + }, + { + "auxiliary_loss_clip": 0.01149111, + "auxiliary_loss_mlp": 0.01147464, + "balance_loss_clip": 1.00283718, + "balance_loss_mlp": 1.00107515, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 1.8137276605618868, + "language_loss": 0.78823912, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81120491, + "num_input_tokens_seen": 80228685, + "step": 3725, + "time_per_iteration": 2.6427876949310303 + }, + { + "auxiliary_loss_clip": 0.01092121, + "auxiliary_loss_mlp": 0.0114729, + "balance_loss_clip": 1.00166166, + "balance_loss_mlp": 1.00090122, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.9755762895697548, + "language_loss": 0.77303207, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79542625, + "num_input_tokens_seen": 80247635, + "step": 3726, + "time_per_iteration": 4.1148459911346436 + }, + { + "auxiliary_loss_clip": 0.0116028, + "auxiliary_loss_mlp": 0.01149013, + "balance_loss_clip": 1.00213349, + "balance_loss_mlp": 1.00100231, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 4.154867318413261, + "language_loss": 0.72474289, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74783587, + "num_input_tokens_seen": 80260045, + "step": 3727, + "time_per_iteration": 2.5117828845977783 + }, + { + "auxiliary_loss_clip": 0.01175332, + "auxiliary_loss_mlp": 0.01147361, + "balance_loss_clip": 1.00209069, + "balance_loss_mlp": 1.00097156, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.0100858735139613, + "language_loss": 0.86848623, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.89171314, + "num_input_tokens_seen": 80277680, + "step": 3728, + "time_per_iteration": 2.5073304176330566 + }, + { + "auxiliary_loss_clip": 0.01143296, + "auxiliary_loss_mlp": 0.01147404, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00082433, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.050968413362204, + "language_loss": 0.80123562, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82414263, + "num_input_tokens_seen": 80294795, + "step": 3729, + "time_per_iteration": 2.59852933883667 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.0114721, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.00110698, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 1.9847796880611686, + "language_loss": 0.8696872, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.89257979, + "num_input_tokens_seen": 80315425, + "step": 3730, + "time_per_iteration": 2.6439766883850098 + }, + { + "auxiliary_loss_clip": 0.01147288, + "auxiliary_loss_mlp": 0.00748956, + "balance_loss_clip": 1.00213945, + "balance_loss_mlp": 1.00124073, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.6641424434027203, + "language_loss": 0.73189449, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75085688, + "num_input_tokens_seen": 80333905, + "step": 3731, + "time_per_iteration": 2.5788214206695557 + }, + { + "auxiliary_loss_clip": 0.01127308, + "auxiliary_loss_mlp": 0.01147319, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00083506, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 1.69995656980425, + "language_loss": 0.75542992, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77817619, + "num_input_tokens_seen": 80352165, + "step": 3732, + "time_per_iteration": 2.6228127479553223 + }, + { + "auxiliary_loss_clip": 0.01175385, + "auxiliary_loss_mlp": 0.01147219, + "balance_loss_clip": 1.00215983, + "balance_loss_mlp": 1.00111604, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.658572812698727, + "language_loss": 0.88290262, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90612864, + "num_input_tokens_seen": 80371305, + "step": 3733, + "time_per_iteration": 2.535024404525757 + }, + { + "auxiliary_loss_clip": 0.0111575, + "auxiliary_loss_mlp": 0.01147717, + "balance_loss_clip": 1.00191939, + "balance_loss_mlp": 1.00104225, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.492635621754797, + "language_loss": 0.84395832, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.866593, + "num_input_tokens_seen": 80391020, + "step": 3734, + "time_per_iteration": 2.7235937118530273 + }, + { + "auxiliary_loss_clip": 0.01144553, + "auxiliary_loss_mlp": 0.01146683, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00105667, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.5415901443839473, + "language_loss": 0.76531976, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.78823209, + "num_input_tokens_seen": 80411365, + "step": 3735, + "time_per_iteration": 2.6818795204162598 + }, + { + "auxiliary_loss_clip": 0.01158564, + "auxiliary_loss_mlp": 0.01146723, + "balance_loss_clip": 1.00209069, + "balance_loss_mlp": 1.0008111, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 1.6398010653022004, + "language_loss": 0.84704822, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.87010109, + "num_input_tokens_seen": 80431075, + "step": 3736, + "time_per_iteration": 2.6096341609954834 + }, + { + "auxiliary_loss_clip": 0.01125511, + "auxiliary_loss_mlp": 0.01148277, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00102925, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 1.5554027084092932, + "language_loss": 0.86874461, + "learning_rate": 3.615420317888586e-06, + "loss": 0.89148253, + "num_input_tokens_seen": 80449240, + "step": 3737, + "time_per_iteration": 2.6162190437316895 + }, + { + "auxiliary_loss_clip": 0.01175405, + "auxiliary_loss_mlp": 0.01147493, + "balance_loss_clip": 1.00213611, + "balance_loss_mlp": 1.00100827, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 1.7668565129423661, + "language_loss": 0.79480845, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81803739, + "num_input_tokens_seen": 80467900, + "step": 3738, + "time_per_iteration": 2.5643606185913086 + }, + { + "auxiliary_loss_clip": 0.01143422, + "auxiliary_loss_mlp": 0.01147285, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00118256, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 1.6052230240553027, + "language_loss": 0.75773764, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78064471, + "num_input_tokens_seen": 80487100, + "step": 3739, + "time_per_iteration": 2.6022801399230957 + }, + { + "auxiliary_loss_clip": 0.01128203, + "auxiliary_loss_mlp": 0.01146764, + "balance_loss_clip": 1.00192976, + "balance_loss_mlp": 1.00094688, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.6310093969613928, + "language_loss": 0.74584675, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76859635, + "num_input_tokens_seen": 80508625, + "step": 3740, + "time_per_iteration": 2.6570417881011963 + }, + { + "auxiliary_loss_clip": 0.01175196, + "auxiliary_loss_mlp": 0.01146679, + "balance_loss_clip": 1.0020057, + "balance_loss_mlp": 1.00105309, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 1.8929344733265518, + "language_loss": 0.75477648, + "learning_rate": 3.614501353019939e-06, + "loss": 0.77799529, + "num_input_tokens_seen": 80527345, + "step": 3741, + "time_per_iteration": 2.50197172164917 + }, + { + "auxiliary_loss_clip": 0.01147769, + "auxiliary_loss_mlp": 0.0114744, + "balance_loss_clip": 1.00206435, + "balance_loss_mlp": 1.00085998, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.6277566295336405, + "language_loss": 0.87036258, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89331472, + "num_input_tokens_seen": 80545545, + "step": 3742, + "time_per_iteration": 2.548234224319458 + }, + { + "auxiliary_loss_clip": 0.01116638, + "auxiliary_loss_mlp": 0.0114674, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00111413, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 1.712693092230775, + "language_loss": 0.81449425, + "learning_rate": 3.614041503218444e-06, + "loss": 0.83712804, + "num_input_tokens_seen": 80565040, + "step": 3743, + "time_per_iteration": 2.6885242462158203 + }, + { + "auxiliary_loss_clip": 0.01159596, + "auxiliary_loss_mlp": 0.01147145, + "balance_loss_clip": 1.00212765, + "balance_loss_mlp": 1.00075579, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 1.8388134291950815, + "language_loss": 0.63837427, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.66144168, + "num_input_tokens_seen": 80582815, + "step": 3744, + "time_per_iteration": 2.523815155029297 + }, + { + "auxiliary_loss_clip": 0.0115848, + "auxiliary_loss_mlp": 0.01146636, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00081933, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 2.587609762933733, + "language_loss": 0.75680304, + "learning_rate": 3.613581408598489e-06, + "loss": 0.77985418, + "num_input_tokens_seen": 80600865, + "step": 3745, + "time_per_iteration": 2.527191162109375 + }, + { + "auxiliary_loss_clip": 0.01143546, + "auxiliary_loss_mlp": 0.01146889, + "balance_loss_clip": 1.00214934, + "balance_loss_mlp": 1.00097728, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.7499341205090073, + "language_loss": 0.80748725, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83039159, + "num_input_tokens_seen": 80617455, + "step": 3746, + "time_per_iteration": 2.559591770172119 + }, + { + "auxiliary_loss_clip": 0.01158589, + "auxiliary_loss_mlp": 0.01147338, + "balance_loss_clip": 1.00196409, + "balance_loss_mlp": 1.00104403, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.372427115588737, + "language_loss": 0.85917675, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88223606, + "num_input_tokens_seen": 80635125, + "step": 3747, + "time_per_iteration": 2.582977533340454 + }, + { + "auxiliary_loss_clip": 0.01158548, + "auxiliary_loss_mlp": 0.00748895, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00117767, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.6594643052353435, + "language_loss": 0.76581621, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78489065, + "num_input_tokens_seen": 80656370, + "step": 3748, + "time_per_iteration": 2.6048529148101807 + }, + { + "auxiliary_loss_clip": 0.01175209, + "auxiliary_loss_mlp": 0.01146863, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00095129, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.7241820024124266, + "language_loss": 0.79634339, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.8195641, + "num_input_tokens_seen": 80676495, + "step": 3749, + "time_per_iteration": 2.526364326477051 + }, + { + "auxiliary_loss_clip": 0.01142168, + "auxiliary_loss_mlp": 0.01146322, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00098181, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.5096323999264911, + "language_loss": 0.79570961, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.81859452, + "num_input_tokens_seen": 80694755, + "step": 3750, + "time_per_iteration": 2.5908501148223877 + }, + { + "auxiliary_loss_clip": 0.01111524, + "auxiliary_loss_mlp": 0.01147367, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.00097823, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 3.4130306046979824, + "language_loss": 0.82013607, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.84272492, + "num_input_tokens_seen": 80713670, + "step": 3751, + "time_per_iteration": 2.695308208465576 + }, + { + "auxiliary_loss_clip": 0.01143382, + "auxiliary_loss_mlp": 0.01146816, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.00090373, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.8718299787506187, + "language_loss": 0.83779949, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86070144, + "num_input_tokens_seen": 80731450, + "step": 3752, + "time_per_iteration": 2.567891836166382 + }, + { + "auxiliary_loss_clip": 0.01175204, + "auxiliary_loss_mlp": 0.01146362, + "balance_loss_clip": 1.00206769, + "balance_loss_mlp": 1.00073552, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.6582651538814275, + "language_loss": 0.78133833, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80455399, + "num_input_tokens_seen": 80748415, + "step": 3753, + "time_per_iteration": 2.490377426147461 + }, + { + "auxiliary_loss_clip": 0.01141961, + "auxiliary_loss_mlp": 0.01146825, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00091302, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.6845054273140185, + "language_loss": 0.78439951, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80728734, + "num_input_tokens_seen": 80770835, + "step": 3754, + "time_per_iteration": 2.702726125717163 + }, + { + "auxiliary_loss_clip": 0.01142474, + "auxiliary_loss_mlp": 0.01147207, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00110364, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.6304092876294418, + "language_loss": 0.70404983, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72694659, + "num_input_tokens_seen": 80787840, + "step": 3755, + "time_per_iteration": 2.582185745239258 + }, + { + "auxiliary_loss_clip": 0.01126753, + "auxiliary_loss_mlp": 0.01147077, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.00116515, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 1.973386161885297, + "language_loss": 0.77119201, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79393029, + "num_input_tokens_seen": 80806335, + "step": 3756, + "time_per_iteration": 2.670245885848999 + }, + { + "auxiliary_loss_clip": 0.01144017, + "auxiliary_loss_mlp": 0.01147364, + "balance_loss_clip": 1.00216818, + "balance_loss_mlp": 1.00097466, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.752460770611826, + "language_loss": 0.82302874, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.8459425, + "num_input_tokens_seen": 80825355, + "step": 3757, + "time_per_iteration": 2.5888633728027344 + }, + { + "auxiliary_loss_clip": 0.01158428, + "auxiliary_loss_mlp": 0.01147038, + "balance_loss_clip": 1.00194919, + "balance_loss_mlp": 1.00084031, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.9359113074599266, + "language_loss": 0.73068619, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.75374091, + "num_input_tokens_seen": 80842570, + "step": 3758, + "time_per_iteration": 3.967008113861084 + }, + { + "auxiliary_loss_clip": 0.01142961, + "auxiliary_loss_mlp": 0.01146469, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00112939, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.5350076698213906, + "language_loss": 0.77431583, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.7972101, + "num_input_tokens_seen": 80858745, + "step": 3759, + "time_per_iteration": 2.6080102920532227 + }, + { + "auxiliary_loss_clip": 0.01109663, + "auxiliary_loss_mlp": 0.01147402, + "balance_loss_clip": 1.00165594, + "balance_loss_mlp": 1.00110817, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.7420089892251642, + "language_loss": 0.78469598, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80726659, + "num_input_tokens_seen": 80880085, + "step": 3760, + "time_per_iteration": 4.187054872512817 + }, + { + "auxiliary_loss_clip": 0.01125398, + "auxiliary_loss_mlp": 0.01135351, + "balance_loss_clip": 1.0018295, + "balance_loss_mlp": 1.00031042, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9535053584729318, + "language_loss": 0.60036314, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62297058, + "num_input_tokens_seen": 80937660, + "step": 3761, + "time_per_iteration": 4.5500807762146 + }, + { + "auxiliary_loss_clip": 0.01124806, + "auxiliary_loss_mlp": 0.01147122, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00082886, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.1211375293100554, + "language_loss": 0.77111173, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79383099, + "num_input_tokens_seen": 80956265, + "step": 3762, + "time_per_iteration": 2.6268694400787354 + }, + { + "auxiliary_loss_clip": 0.01144034, + "auxiliary_loss_mlp": 0.01147407, + "balance_loss_clip": 1.00214875, + "balance_loss_mlp": 1.00073218, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 1.940654463961023, + "language_loss": 0.78950483, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.81241924, + "num_input_tokens_seen": 80975185, + "step": 3763, + "time_per_iteration": 3.992344379425049 + }, + { + "auxiliary_loss_clip": 0.01159295, + "auxiliary_loss_mlp": 0.01147968, + "balance_loss_clip": 1.00213385, + "balance_loss_mlp": 1.00110209, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.6069263318724127, + "language_loss": 0.91099507, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93406773, + "num_input_tokens_seen": 80992830, + "step": 3764, + "time_per_iteration": 2.545987129211426 + }, + { + "auxiliary_loss_clip": 0.01160049, + "auxiliary_loss_mlp": 0.01146273, + "balance_loss_clip": 1.00215507, + "balance_loss_mlp": 1.00112367, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.619789083877025, + "language_loss": 0.75084299, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77390623, + "num_input_tokens_seen": 81013675, + "step": 3765, + "time_per_iteration": 2.6034984588623047 + }, + { + "auxiliary_loss_clip": 0.011588, + "auxiliary_loss_mlp": 0.01146954, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00104237, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 1.9867142672330536, + "language_loss": 0.8946954, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91775292, + "num_input_tokens_seen": 81030345, + "step": 3766, + "time_per_iteration": 2.517808675765991 + }, + { + "auxiliary_loss_clip": 0.01142056, + "auxiliary_loss_mlp": 0.01146944, + "balance_loss_clip": 1.00190485, + "balance_loss_mlp": 1.00093639, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.492324576353432, + "language_loss": 0.74458754, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76747751, + "num_input_tokens_seen": 81051000, + "step": 3767, + "time_per_iteration": 2.6351020336151123 + }, + { + "auxiliary_loss_clip": 0.01158341, + "auxiliary_loss_mlp": 0.0114726, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00096655, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.388280122300739, + "language_loss": 0.71497053, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.7380265, + "num_input_tokens_seen": 81071205, + "step": 3768, + "time_per_iteration": 2.5955629348754883 + }, + { + "auxiliary_loss_clip": 0.01158833, + "auxiliary_loss_mlp": 0.01147202, + "balance_loss_clip": 1.00229967, + "balance_loss_mlp": 1.00128984, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.844907900181758, + "language_loss": 0.78419828, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80725867, + "num_input_tokens_seen": 81091880, + "step": 3769, + "time_per_iteration": 2.684713840484619 + }, + { + "auxiliary_loss_clip": 0.01144534, + "auxiliary_loss_mlp": 0.01147021, + "balance_loss_clip": 1.00208509, + "balance_loss_mlp": 1.00091791, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.8718049797330216, + "language_loss": 0.68920487, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.71212041, + "num_input_tokens_seen": 81113290, + "step": 3770, + "time_per_iteration": 2.6592934131622314 + }, + { + "auxiliary_loss_clip": 0.01175408, + "auxiliary_loss_mlp": 0.01147514, + "balance_loss_clip": 1.00213504, + "balance_loss_mlp": 1.00102997, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.5492034649874515, + "language_loss": 0.80292553, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82615471, + "num_input_tokens_seen": 81133535, + "step": 3771, + "time_per_iteration": 2.593992233276367 + }, + { + "auxiliary_loss_clip": 0.01130898, + "auxiliary_loss_mlp": 0.01146059, + "balance_loss_clip": 1.00237632, + "balance_loss_mlp": 1.00100565, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.6246718051905806, + "language_loss": 0.7872448, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81001437, + "num_input_tokens_seen": 81154650, + "step": 3772, + "time_per_iteration": 2.6654136180877686 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01134227, + "balance_loss_clip": 1.00156832, + "balance_loss_mlp": 0.99994987, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6576778419496979, + "language_loss": 0.54374945, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56617236, + "num_input_tokens_seen": 81221240, + "step": 3773, + "time_per_iteration": 3.332775354385376 + }, + { + "auxiliary_loss_clip": 0.01131495, + "auxiliary_loss_mlp": 0.01146058, + "balance_loss_clip": 1.00246358, + "balance_loss_mlp": 1.00090885, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.6276856074604036, + "language_loss": 0.70535088, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72812641, + "num_input_tokens_seen": 81241520, + "step": 3774, + "time_per_iteration": 2.6554436683654785 + }, + { + "auxiliary_loss_clip": 0.01141982, + "auxiliary_loss_mlp": 0.01146816, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00109482, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 1.9200751906459617, + "language_loss": 0.74747908, + "learning_rate": 3.606650658627658e-06, + "loss": 0.77036709, + "num_input_tokens_seen": 81256825, + "step": 3775, + "time_per_iteration": 2.5871925354003906 + }, + { + "auxiliary_loss_clip": 0.01175127, + "auxiliary_loss_mlp": 0.01146153, + "balance_loss_clip": 1.00204098, + "balance_loss_mlp": 1.0009079, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 4.0439954952752135, + "language_loss": 0.82364887, + "learning_rate": 3.606418687985928e-06, + "loss": 0.8468616, + "num_input_tokens_seen": 81275695, + "step": 3776, + "time_per_iteration": 2.5629169940948486 + }, + { + "auxiliary_loss_clip": 0.01142981, + "auxiliary_loss_mlp": 0.01146308, + "balance_loss_clip": 1.00196326, + "balance_loss_mlp": 1.00106359, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 1.7621658488469056, + "language_loss": 0.82731158, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85020441, + "num_input_tokens_seen": 81294920, + "step": 3777, + "time_per_iteration": 2.6026151180267334 + }, + { + "auxiliary_loss_clip": 0.0114266, + "auxiliary_loss_mlp": 0.01147595, + "balance_loss_clip": 1.00213289, + "balance_loss_mlp": 1.00111032, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 1.7691135472580484, + "language_loss": 0.71940506, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.7423076, + "num_input_tokens_seen": 81314275, + "step": 3778, + "time_per_iteration": 2.618638753890991 + }, + { + "auxiliary_loss_clip": 0.01128129, + "auxiliary_loss_mlp": 0.01146565, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00084376, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 44.0345412158452, + "language_loss": 0.64568651, + "learning_rate": 3.605722410602591e-06, + "loss": 0.66843343, + "num_input_tokens_seen": 81333890, + "step": 3779, + "time_per_iteration": 2.6905500888824463 + }, + { + "auxiliary_loss_clip": 0.01160066, + "auxiliary_loss_mlp": 0.01146636, + "balance_loss_clip": 1.0021342, + "balance_loss_mlp": 1.00120091, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.9487886840471265, + "language_loss": 0.71011746, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.73318452, + "num_input_tokens_seen": 81353640, + "step": 3780, + "time_per_iteration": 2.560439109802246 + }, + { + "auxiliary_loss_clip": 0.01158855, + "auxiliary_loss_mlp": 0.01146695, + "balance_loss_clip": 1.00211525, + "balance_loss_mlp": 1.0009737, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 1.6158636983509913, + "language_loss": 0.89205992, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91511548, + "num_input_tokens_seen": 81371595, + "step": 3781, + "time_per_iteration": 2.5683817863464355 + }, + { + "auxiliary_loss_clip": 0.01175273, + "auxiliary_loss_mlp": 0.01146882, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.00106537, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 1.9639560050226206, + "language_loss": 0.74383223, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76705384, + "num_input_tokens_seen": 81388435, + "step": 3782, + "time_per_iteration": 2.4885451793670654 + }, + { + "auxiliary_loss_clip": 0.01143116, + "auxiliary_loss_mlp": 0.01146545, + "balance_loss_clip": 1.00187075, + "balance_loss_mlp": 1.00101423, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.4020424905392002, + "language_loss": 0.82746124, + "learning_rate": 3.604793188351095e-06, + "loss": 0.85035783, + "num_input_tokens_seen": 81410195, + "step": 3783, + "time_per_iteration": 2.6297731399536133 + }, + { + "auxiliary_loss_clip": 0.01142092, + "auxiliary_loss_mlp": 0.0114669, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.00087357, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 1.9080281932299872, + "language_loss": 0.76192355, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78481132, + "num_input_tokens_seen": 81430060, + "step": 3784, + "time_per_iteration": 2.616309642791748 + }, + { + "auxiliary_loss_clip": 0.01175173, + "auxiliary_loss_mlp": 0.01146314, + "balance_loss_clip": 1.00206184, + "balance_loss_mlp": 1.00087929, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.7959713903531982, + "language_loss": 0.71459383, + "learning_rate": 3.604328212066594e-06, + "loss": 0.7378087, + "num_input_tokens_seen": 81447375, + "step": 3785, + "time_per_iteration": 2.5467374324798584 + }, + { + "auxiliary_loss_clip": 0.01124734, + "auxiliary_loss_mlp": 0.01135314, + "balance_loss_clip": 1.00200498, + "balance_loss_mlp": 1.00027335, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8206891433879976, + "language_loss": 0.61906099, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.64166147, + "num_input_tokens_seen": 81505235, + "step": 3786, + "time_per_iteration": 3.213641881942749 + }, + { + "auxiliary_loss_clip": 0.0114322, + "auxiliary_loss_mlp": 0.01147324, + "balance_loss_clip": 1.00215316, + "balance_loss_mlp": 1.00093484, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.3927615587448954, + "language_loss": 0.86386549, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88677096, + "num_input_tokens_seen": 81518685, + "step": 3787, + "time_per_iteration": 2.5868563652038574 + }, + { + "auxiliary_loss_clip": 0.01143391, + "auxiliary_loss_mlp": 0.01146284, + "balance_loss_clip": 1.00202811, + "balance_loss_mlp": 1.00084889, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.3061811921275157, + "language_loss": 0.73019868, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.75309545, + "num_input_tokens_seen": 81538940, + "step": 3788, + "time_per_iteration": 2.657285213470459 + }, + { + "auxiliary_loss_clip": 0.01142974, + "auxiliary_loss_mlp": 0.01146154, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.00071919, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.1209510897335635, + "language_loss": 0.67384338, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69673467, + "num_input_tokens_seen": 81555525, + "step": 3789, + "time_per_iteration": 2.57326340675354 + }, + { + "auxiliary_loss_clip": 0.01144639, + "auxiliary_loss_mlp": 0.01146661, + "balance_loss_clip": 1.00206959, + "balance_loss_mlp": 1.00084388, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.0072603427468017, + "language_loss": 0.76164407, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.7845571, + "num_input_tokens_seen": 81576305, + "step": 3790, + "time_per_iteration": 2.6277055740356445 + }, + { + "auxiliary_loss_clip": 0.01109789, + "auxiliary_loss_mlp": 0.0114565, + "balance_loss_clip": 1.0017308, + "balance_loss_mlp": 1.00097752, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.3304921199455344, + "language_loss": 0.91281152, + "learning_rate": 3.602931823424522e-06, + "loss": 0.93536592, + "num_input_tokens_seen": 81594115, + "step": 3791, + "time_per_iteration": 2.695892333984375 + }, + { + "auxiliary_loss_clip": 0.01159814, + "auxiliary_loss_mlp": 0.01145583, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.00072038, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.5944963631208438, + "language_loss": 0.82538337, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84843731, + "num_input_tokens_seen": 81615355, + "step": 3792, + "time_per_iteration": 2.652010679244995 + }, + { + "auxiliary_loss_clip": 0.01172206, + "auxiliary_loss_mlp": 0.01134304, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00002658, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.147480681941964, + "language_loss": 0.65738529, + "learning_rate": 3.602465874182981e-06, + "loss": 0.68045038, + "num_input_tokens_seen": 81662075, + "step": 3793, + "time_per_iteration": 2.854971408843994 + }, + { + "auxiliary_loss_clip": 0.01175308, + "auxiliary_loss_mlp": 0.01146701, + "balance_loss_clip": 1.00210416, + "balance_loss_mlp": 1.00098002, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 1.894710047109037, + "language_loss": 0.77076054, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79398072, + "num_input_tokens_seen": 81681625, + "step": 3794, + "time_per_iteration": 2.5771145820617676 + }, + { + "auxiliary_loss_clip": 0.01127623, + "auxiliary_loss_mlp": 0.01146224, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00088358, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.401875085203937, + "language_loss": 0.80706596, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82980442, + "num_input_tokens_seen": 81701170, + "step": 3795, + "time_per_iteration": 2.7310478687286377 + }, + { + "auxiliary_loss_clip": 0.01158493, + "auxiliary_loss_mlp": 0.01145713, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00094557, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.5580789187799713, + "language_loss": 0.77430665, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79734874, + "num_input_tokens_seen": 81721265, + "step": 3796, + "time_per_iteration": 4.020950555801392 + }, + { + "auxiliary_loss_clip": 0.01126573, + "auxiliary_loss_mlp": 0.007487, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.0007298, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.690975619822112, + "language_loss": 0.9625116, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98126429, + "num_input_tokens_seen": 81736565, + "step": 3797, + "time_per_iteration": 2.6763880252838135 + }, + { + "auxiliary_loss_clip": 0.01159342, + "auxiliary_loss_mlp": 0.00748651, + "balance_loss_clip": 1.00207067, + "balance_loss_mlp": 1.00077271, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 2.0431649407106005, + "language_loss": 0.8145591, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83363903, + "num_input_tokens_seen": 81756240, + "step": 3798, + "time_per_iteration": 4.034044981002808 + }, + { + "auxiliary_loss_clip": 0.01127636, + "auxiliary_loss_mlp": 0.01146367, + "balance_loss_clip": 1.001791, + "balance_loss_mlp": 1.00074089, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.1792378119491436, + "language_loss": 0.79229867, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.81503868, + "num_input_tokens_seen": 81775720, + "step": 3799, + "time_per_iteration": 4.041197061538696 + }, + { + "auxiliary_loss_clip": 0.01143088, + "auxiliary_loss_mlp": 0.01145875, + "balance_loss_clip": 1.0020287, + "balance_loss_mlp": 1.00091672, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.4709514720069, + "language_loss": 0.75420415, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77709377, + "num_input_tokens_seen": 81795830, + "step": 3800, + "time_per_iteration": 2.616525650024414 + }, + { + "auxiliary_loss_clip": 0.0114316, + "auxiliary_loss_mlp": 0.01145598, + "balance_loss_clip": 1.00201559, + "balance_loss_mlp": 1.00092602, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.5941081116928928, + "language_loss": 0.63764608, + "learning_rate": 3.600599647297484e-06, + "loss": 0.66053367, + "num_input_tokens_seen": 81815745, + "step": 3801, + "time_per_iteration": 4.061419248580933 + }, + { + "auxiliary_loss_clip": 0.01143165, + "auxiliary_loss_mlp": 0.01145037, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00084138, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.6000053110586967, + "language_loss": 0.81520307, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83808506, + "num_input_tokens_seen": 81835155, + "step": 3802, + "time_per_iteration": 2.6447606086730957 + }, + { + "auxiliary_loss_clip": 0.01143895, + "auxiliary_loss_mlp": 0.01146435, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00100017, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.6701800679347403, + "language_loss": 0.78481877, + "learning_rate": 3.600132483450114e-06, + "loss": 0.80772209, + "num_input_tokens_seen": 81855655, + "step": 3803, + "time_per_iteration": 2.6681740283966064 + }, + { + "auxiliary_loss_clip": 0.01126589, + "auxiliary_loss_mlp": 0.01145855, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00089693, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.5557935464877757, + "language_loss": 0.85249341, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87521785, + "num_input_tokens_seen": 81876385, + "step": 3804, + "time_per_iteration": 2.6491384506225586 + }, + { + "auxiliary_loss_clip": 0.01158649, + "auxiliary_loss_mlp": 0.01146371, + "balance_loss_clip": 1.00208151, + "balance_loss_mlp": 1.00093532, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 1.7755135090036036, + "language_loss": 0.76395988, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78701007, + "num_input_tokens_seen": 81893225, + "step": 3805, + "time_per_iteration": 2.5279223918914795 + }, + { + "auxiliary_loss_clip": 0.0114238, + "auxiliary_loss_mlp": 0.00748613, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.00055027, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.90469992196759, + "language_loss": 0.7925247, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81143463, + "num_input_tokens_seen": 81911350, + "step": 3806, + "time_per_iteration": 2.58786940574646 + }, + { + "auxiliary_loss_clip": 0.01143211, + "auxiliary_loss_mlp": 0.01146656, + "balance_loss_clip": 1.0020076, + "balance_loss_mlp": 1.00103021, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 1.82433458117265, + "language_loss": 0.69883668, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.7217353, + "num_input_tokens_seen": 81935420, + "step": 3807, + "time_per_iteration": 2.7320995330810547 + }, + { + "auxiliary_loss_clip": 0.01158677, + "auxiliary_loss_mlp": 0.0114706, + "balance_loss_clip": 1.00210285, + "balance_loss_mlp": 1.00124371, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 1.9814698694334167, + "language_loss": 0.65406299, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67712039, + "num_input_tokens_seen": 81953845, + "step": 3808, + "time_per_iteration": 2.587672233581543 + }, + { + "auxiliary_loss_clip": 0.01110101, + "auxiliary_loss_mlp": 0.01147193, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00109029, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 2.0566911346003005, + "language_loss": 0.75033271, + "learning_rate": 3.598729535939222e-06, + "loss": 0.77290565, + "num_input_tokens_seen": 81972100, + "step": 3809, + "time_per_iteration": 2.680347442626953 + }, + { + "auxiliary_loss_clip": 0.01142798, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_clip": 1.0020591, + "balance_loss_mlp": 1.00092721, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.4660985939269393, + "language_loss": 0.81667042, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83955151, + "num_input_tokens_seen": 81992760, + "step": 3810, + "time_per_iteration": 2.6395046710968018 + }, + { + "auxiliary_loss_clip": 0.01158999, + "auxiliary_loss_mlp": 0.01145612, + "balance_loss_clip": 1.00218248, + "balance_loss_mlp": 1.00093949, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 1.747965245758589, + "language_loss": 0.78432512, + "learning_rate": 3.598261401682441e-06, + "loss": 0.80737126, + "num_input_tokens_seen": 82009080, + "step": 3811, + "time_per_iteration": 2.5804905891418457 + }, + { + "auxiliary_loss_clip": 0.01148018, + "auxiliary_loss_mlp": 0.00748516, + "balance_loss_clip": 1.00306022, + "balance_loss_mlp": 1.00047421, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.9392899202152751, + "language_loss": 0.82681066, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84577602, + "num_input_tokens_seen": 82026705, + "step": 3812, + "time_per_iteration": 2.6400015354156494 + }, + { + "auxiliary_loss_clip": 0.01111761, + "auxiliary_loss_mlp": 0.01146675, + "balance_loss_clip": 1.00209761, + "balance_loss_mlp": 1.00124002, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 2.4501232363036576, + "language_loss": 0.82425714, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84684145, + "num_input_tokens_seen": 82043245, + "step": 3813, + "time_per_iteration": 2.660551071166992 + }, + { + "auxiliary_loss_clip": 0.01159715, + "auxiliary_loss_mlp": 0.01145914, + "balance_loss_clip": 1.00212955, + "balance_loss_mlp": 1.00095546, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 1.7597707534652933, + "language_loss": 0.70250285, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72555912, + "num_input_tokens_seen": 82066870, + "step": 3814, + "time_per_iteration": 2.659945011138916 + }, + { + "auxiliary_loss_clip": 0.01159727, + "auxiliary_loss_mlp": 0.01145465, + "balance_loss_clip": 1.00200522, + "balance_loss_mlp": 1.00088811, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.299198458781198, + "language_loss": 0.67064893, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69370091, + "num_input_tokens_seen": 82083180, + "step": 3815, + "time_per_iteration": 2.5935873985290527 + }, + { + "auxiliary_loss_clip": 0.01164342, + "auxiliary_loss_mlp": 0.01146416, + "balance_loss_clip": 1.0026648, + "balance_loss_mlp": 1.00107563, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.735768321978167, + "language_loss": 0.83456755, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85767508, + "num_input_tokens_seen": 82102950, + "step": 3816, + "time_per_iteration": 2.6827688217163086 + }, + { + "auxiliary_loss_clip": 0.0115814, + "auxiliary_loss_mlp": 0.01145895, + "balance_loss_clip": 1.00193715, + "balance_loss_mlp": 1.00103235, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.154095377698227, + "language_loss": 0.87232947, + "learning_rate": 3.596855544646742e-06, + "loss": 0.89536989, + "num_input_tokens_seen": 82119510, + "step": 3817, + "time_per_iteration": 2.5469014644622803 + }, + { + "auxiliary_loss_clip": 0.01142972, + "auxiliary_loss_mlp": 0.01146296, + "balance_loss_clip": 1.00195146, + "balance_loss_mlp": 1.00105155, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 2.42359661114749, + "language_loss": 0.74693274, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.76982546, + "num_input_tokens_seen": 82140095, + "step": 3818, + "time_per_iteration": 2.6676697731018066 + }, + { + "auxiliary_loss_clip": 0.01158292, + "auxiliary_loss_mlp": 0.01145563, + "balance_loss_clip": 1.00200546, + "balance_loss_mlp": 1.00089097, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6364230107130464, + "language_loss": 0.74781209, + "learning_rate": 3.596386441116659e-06, + "loss": 0.77085066, + "num_input_tokens_seen": 82159510, + "step": 3819, + "time_per_iteration": 2.556741237640381 + }, + { + "auxiliary_loss_clip": 0.01158473, + "auxiliary_loss_mlp": 0.01145753, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00098526, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.7679812450757284, + "language_loss": 0.81371272, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83675504, + "num_input_tokens_seen": 82179580, + "step": 3820, + "time_per_iteration": 2.624769926071167 + }, + { + "auxiliary_loss_clip": 0.01142201, + "auxiliary_loss_mlp": 0.0114665, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00102353, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.1876027708435153, + "language_loss": 0.68811917, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71100771, + "num_input_tokens_seen": 82195585, + "step": 3821, + "time_per_iteration": 2.5824246406555176 + }, + { + "auxiliary_loss_clip": 0.01107979, + "auxiliary_loss_mlp": 0.01145789, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00083101, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.4895161293005623, + "language_loss": 0.82975858, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85229623, + "num_input_tokens_seen": 82217530, + "step": 3822, + "time_per_iteration": 2.714752435684204 + }, + { + "auxiliary_loss_clip": 0.01175292, + "auxiliary_loss_mlp": 0.01146272, + "balance_loss_clip": 1.00224257, + "balance_loss_mlp": 1.00083673, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.4478714159005228, + "language_loss": 0.66305321, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68626881, + "num_input_tokens_seen": 82237980, + "step": 3823, + "time_per_iteration": 2.5387885570526123 + }, + { + "auxiliary_loss_clip": 0.01155342, + "auxiliary_loss_mlp": 0.01133519, + "balance_loss_clip": 1.00144351, + "balance_loss_mlp": 1.00000417, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.7939203392619383, + "language_loss": 0.5680393, + "learning_rate": 3.595212623082357e-06, + "loss": 0.59092784, + "num_input_tokens_seen": 82301785, + "step": 3824, + "time_per_iteration": 3.223198413848877 + }, + { + "auxiliary_loss_clip": 0.01141728, + "auxiliary_loss_mlp": 0.01144935, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.00093031, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 1.9965160904116734, + "language_loss": 0.73086417, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75373077, + "num_input_tokens_seen": 82317355, + "step": 3825, + "time_per_iteration": 2.566439151763916 + }, + { + "auxiliary_loss_clip": 0.01158669, + "auxiliary_loss_mlp": 0.01146128, + "balance_loss_clip": 1.00223255, + "balance_loss_mlp": 1.00097871, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8463214948921511, + "language_loss": 0.8779822, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.90103018, + "num_input_tokens_seen": 82336645, + "step": 3826, + "time_per_iteration": 2.5919277667999268 + }, + { + "auxiliary_loss_clip": 0.01141913, + "auxiliary_loss_mlp": 0.01146044, + "balance_loss_clip": 1.00193799, + "balance_loss_mlp": 1.00099027, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.388922406998855, + "language_loss": 0.81513679, + "learning_rate": 3.594507606303083e-06, + "loss": 0.83801639, + "num_input_tokens_seen": 82354225, + "step": 3827, + "time_per_iteration": 2.5709619522094727 + }, + { + "auxiliary_loss_clip": 0.01093391, + "auxiliary_loss_mlp": 0.01145675, + "balance_loss_clip": 1.00166893, + "balance_loss_mlp": 1.0008117, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.665230650227329, + "language_loss": 0.86402857, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88641924, + "num_input_tokens_seen": 82370240, + "step": 3828, + "time_per_iteration": 2.686645746231079 + }, + { + "auxiliary_loss_clip": 0.01141837, + "auxiliary_loss_mlp": 0.01145626, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.00095344, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 1.8162001022647694, + "language_loss": 0.70449764, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72737229, + "num_input_tokens_seen": 82389145, + "step": 3829, + "time_per_iteration": 2.6153693199157715 + }, + { + "auxiliary_loss_clip": 0.01093082, + "auxiliary_loss_mlp": 0.01145026, + "balance_loss_clip": 1.00182378, + "balance_loss_mlp": 1.00083077, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.561027050949072, + "language_loss": 0.84435028, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86673141, + "num_input_tokens_seen": 82409185, + "step": 3830, + "time_per_iteration": 2.799724578857422 + }, + { + "auxiliary_loss_clip": 0.01158978, + "auxiliary_loss_mlp": 0.01145482, + "balance_loss_clip": 1.00222147, + "balance_loss_mlp": 1.00100076, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.5909104128198845, + "language_loss": 0.67105871, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69410336, + "num_input_tokens_seen": 82432070, + "step": 3831, + "time_per_iteration": 2.798929452896118 + }, + { + "auxiliary_loss_clip": 0.01128057, + "auxiliary_loss_mlp": 0.01146326, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00108123, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.3504383484694022, + "language_loss": 0.75136101, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77410483, + "num_input_tokens_seen": 82450625, + "step": 3832, + "time_per_iteration": 2.679063558578491 + }, + { + "auxiliary_loss_clip": 0.01108223, + "auxiliary_loss_mlp": 0.01145609, + "balance_loss_clip": 1.00178683, + "balance_loss_mlp": 1.00093627, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.8056429871800015, + "language_loss": 0.87445545, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89699382, + "num_input_tokens_seen": 82468575, + "step": 3833, + "time_per_iteration": 2.6924679279327393 + }, + { + "auxiliary_loss_clip": 0.01126441, + "auxiliary_loss_mlp": 0.0114561, + "balance_loss_clip": 1.00186372, + "balance_loss_mlp": 1.00074649, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 1.8725890329447683, + "language_loss": 0.74863386, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77135444, + "num_input_tokens_seen": 82488655, + "step": 3834, + "time_per_iteration": 4.121044635772705 + }, + { + "auxiliary_loss_clip": 0.01128316, + "auxiliary_loss_mlp": 0.01145993, + "balance_loss_clip": 1.00206244, + "balance_loss_mlp": 1.00112963, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.9463361749606856, + "language_loss": 0.85924232, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88198543, + "num_input_tokens_seen": 82507220, + "step": 3835, + "time_per_iteration": 2.645273447036743 + }, + { + "auxiliary_loss_clip": 0.01127874, + "auxiliary_loss_mlp": 0.01146332, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00099206, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 1.8970079771207267, + "language_loss": 0.81899226, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84173441, + "num_input_tokens_seen": 82527920, + "step": 3836, + "time_per_iteration": 4.111414194107056 + }, + { + "auxiliary_loss_clip": 0.01158804, + "auxiliary_loss_mlp": 0.01145901, + "balance_loss_clip": 1.00214124, + "balance_loss_mlp": 1.00094259, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.5274734222093156, + "language_loss": 0.79314101, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.8161881, + "num_input_tokens_seen": 82549040, + "step": 3837, + "time_per_iteration": 3.9780220985412598 + }, + { + "auxiliary_loss_clip": 0.01124511, + "auxiliary_loss_mlp": 0.0113353, + "balance_loss_clip": 1.001948, + "balance_loss_mlp": 1.00001585, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9455260043059661, + "language_loss": 0.6538434, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67642379, + "num_input_tokens_seen": 82604070, + "step": 3838, + "time_per_iteration": 4.588951349258423 + }, + { + "auxiliary_loss_clip": 0.01158245, + "auxiliary_loss_mlp": 0.0114537, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.00107932, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 1.8542855373496379, + "language_loss": 0.75581408, + "learning_rate": 3.591682099845058e-06, + "loss": 0.7788502, + "num_input_tokens_seen": 82619665, + "step": 3839, + "time_per_iteration": 2.5194520950317383 + }, + { + "auxiliary_loss_clip": 0.01143354, + "auxiliary_loss_mlp": 0.0114574, + "balance_loss_clip": 1.00214899, + "balance_loss_mlp": 1.00087655, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 1.7830662578020937, + "language_loss": 0.68374413, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70663512, + "num_input_tokens_seen": 82637530, + "step": 3840, + "time_per_iteration": 2.5750107765197754 + }, + { + "auxiliary_loss_clip": 0.01175115, + "auxiliary_loss_mlp": 0.01145723, + "balance_loss_clip": 1.00225496, + "balance_loss_mlp": 1.00085938, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 1.772858446669298, + "language_loss": 0.7965095, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81971788, + "num_input_tokens_seen": 82656130, + "step": 3841, + "time_per_iteration": 2.4964444637298584 + }, + { + "auxiliary_loss_clip": 0.01158592, + "auxiliary_loss_mlp": 0.01145446, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00115538, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 1.957465567173777, + "language_loss": 0.82582891, + "learning_rate": 3.590974364600683e-06, + "loss": 0.84886932, + "num_input_tokens_seen": 82675295, + "step": 3842, + "time_per_iteration": 2.5793519020080566 + }, + { + "auxiliary_loss_clip": 0.01159767, + "auxiliary_loss_mlp": 0.01145598, + "balance_loss_clip": 1.0021565, + "balance_loss_mlp": 1.0009253, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.5070187400786683, + "language_loss": 0.66458344, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68763709, + "num_input_tokens_seen": 82703260, + "step": 3843, + "time_per_iteration": 2.750096559524536 + }, + { + "auxiliary_loss_clip": 0.01159491, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_clip": 1.00213218, + "balance_loss_mlp": 1.00135517, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.7074709092601137, + "language_loss": 0.76922745, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79228175, + "num_input_tokens_seen": 82725060, + "step": 3844, + "time_per_iteration": 2.6389529705047607 + }, + { + "auxiliary_loss_clip": 0.01159657, + "auxiliary_loss_mlp": 0.01145834, + "balance_loss_clip": 1.00214028, + "balance_loss_mlp": 1.0009706, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.583192563203059, + "language_loss": 0.78199065, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80504555, + "num_input_tokens_seen": 82742960, + "step": 3845, + "time_per_iteration": 2.5996878147125244 + }, + { + "auxiliary_loss_clip": 0.01126944, + "auxiliary_loss_mlp": 0.01145101, + "balance_loss_clip": 1.00199306, + "balance_loss_mlp": 1.00090563, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.276968343437453, + "language_loss": 0.76416868, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78688914, + "num_input_tokens_seen": 82760205, + "step": 3846, + "time_per_iteration": 2.6453092098236084 + }, + { + "auxiliary_loss_clip": 0.0114222, + "auxiliary_loss_mlp": 0.01145622, + "balance_loss_clip": 1.00194907, + "balance_loss_mlp": 1.00104463, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 1.802920379340911, + "language_loss": 0.69480574, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71768415, + "num_input_tokens_seen": 82778590, + "step": 3847, + "time_per_iteration": 2.564180850982666 + }, + { + "auxiliary_loss_clip": 0.01156821, + "auxiliary_loss_mlp": 0.01132784, + "balance_loss_clip": 1.00293303, + "balance_loss_mlp": 1.00003266, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7932197716798904, + "language_loss": 0.61027288, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63316894, + "num_input_tokens_seen": 82833925, + "step": 3848, + "time_per_iteration": 3.066512107849121 + }, + { + "auxiliary_loss_clip": 0.0115809, + "auxiliary_loss_mlp": 0.01145384, + "balance_loss_clip": 1.0020473, + "balance_loss_mlp": 1.00109315, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 1.8968965001749736, + "language_loss": 0.78186262, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80489737, + "num_input_tokens_seen": 82850625, + "step": 3849, + "time_per_iteration": 2.543806552886963 + }, + { + "auxiliary_loss_clip": 0.01158169, + "auxiliary_loss_mlp": 0.01146071, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00082636, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.6862311011558235, + "language_loss": 0.70943028, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73247266, + "num_input_tokens_seen": 82872105, + "step": 3850, + "time_per_iteration": 2.687692642211914 + }, + { + "auxiliary_loss_clip": 0.0114273, + "auxiliary_loss_mlp": 0.00748493, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.00043762, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 1.7698957561079895, + "language_loss": 0.76082414, + "learning_rate": 3.588847902019718e-06, + "loss": 0.77973634, + "num_input_tokens_seen": 82890595, + "step": 3851, + "time_per_iteration": 2.61403226852417 + }, + { + "auxiliary_loss_clip": 0.01174946, + "auxiliary_loss_mlp": 0.01145323, + "balance_loss_clip": 1.00218618, + "balance_loss_mlp": 1.00093699, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.7974370995240518, + "language_loss": 0.69918346, + "learning_rate": 3.588611327033723e-06, + "loss": 0.72238612, + "num_input_tokens_seen": 82908910, + "step": 3852, + "time_per_iteration": 2.5374417304992676 + }, + { + "auxiliary_loss_clip": 0.01132232, + "auxiliary_loss_mlp": 0.01146087, + "balance_loss_clip": 1.00245047, + "balance_loss_mlp": 1.00103354, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.6134147279991637, + "language_loss": 0.67431355, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69709671, + "num_input_tokens_seen": 82925405, + "step": 3853, + "time_per_iteration": 2.6125993728637695 + }, + { + "auxiliary_loss_clip": 0.01158522, + "auxiliary_loss_mlp": 0.01146027, + "balance_loss_clip": 1.00219297, + "balance_loss_mlp": 1.00078213, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.8240625910530255, + "language_loss": 0.79768002, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.8207255, + "num_input_tokens_seen": 82945615, + "step": 3854, + "time_per_iteration": 2.651339530944824 + }, + { + "auxiliary_loss_clip": 0.01144408, + "auxiliary_loss_mlp": 0.01146169, + "balance_loss_clip": 1.00211036, + "balance_loss_mlp": 1.00101948, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 1.863894593569698, + "language_loss": 0.65098667, + "learning_rate": 3.587901240669831e-06, + "loss": 0.6738925, + "num_input_tokens_seen": 82967570, + "step": 3855, + "time_per_iteration": 2.6320412158966064 + }, + { + "auxiliary_loss_clip": 0.0117495, + "auxiliary_loss_mlp": 0.0114552, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00113368, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 1.9113587705835995, + "language_loss": 0.70659763, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.72980231, + "num_input_tokens_seen": 82987435, + "step": 3856, + "time_per_iteration": 2.595130443572998 + }, + { + "auxiliary_loss_clip": 0.01109982, + "auxiliary_loss_mlp": 0.01145101, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00090575, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 1.715845214962952, + "language_loss": 0.77372587, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79627669, + "num_input_tokens_seen": 83010505, + "step": 3857, + "time_per_iteration": 2.7908108234405518 + }, + { + "auxiliary_loss_clip": 0.01143651, + "auxiliary_loss_mlp": 0.00748596, + "balance_loss_clip": 1.00215948, + "balance_loss_mlp": 1.00050092, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 2.3872933301064454, + "language_loss": 0.91233921, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93126166, + "num_input_tokens_seen": 83026705, + "step": 3858, + "time_per_iteration": 2.5964303016662598 + }, + { + "auxiliary_loss_clip": 0.01110406, + "auxiliary_loss_mlp": 0.0114523, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00093925, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 1.7619481007637947, + "language_loss": 0.76381683, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78637326, + "num_input_tokens_seen": 83046500, + "step": 3859, + "time_per_iteration": 2.6864893436431885 + }, + { + "auxiliary_loss_clip": 0.01158173, + "auxiliary_loss_mlp": 0.01144809, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00070858, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.6796891914317849, + "language_loss": 0.84345508, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86648488, + "num_input_tokens_seen": 83065280, + "step": 3860, + "time_per_iteration": 2.578047513961792 + }, + { + "auxiliary_loss_clip": 0.01110211, + "auxiliary_loss_mlp": 0.01145702, + "balance_loss_clip": 1.00187433, + "balance_loss_mlp": 1.00083888, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 2.2548886833348187, + "language_loss": 0.83010572, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85266489, + "num_input_tokens_seen": 83082310, + "step": 3861, + "time_per_iteration": 2.674262046813965 + }, + { + "auxiliary_loss_clip": 0.01159899, + "auxiliary_loss_mlp": 0.00748597, + "balance_loss_clip": 1.00222659, + "balance_loss_mlp": 1.00047612, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.5840032488110654, + "language_loss": 0.85645616, + "learning_rate": 3.586242265438576e-06, + "loss": 0.87554109, + "num_input_tokens_seen": 83102065, + "step": 3862, + "time_per_iteration": 2.607198715209961 + }, + { + "auxiliary_loss_clip": 0.01130767, + "auxiliary_loss_mlp": 0.01145413, + "balance_loss_clip": 1.00240946, + "balance_loss_mlp": 1.00093162, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.62292409551667, + "language_loss": 0.74861389, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.77137566, + "num_input_tokens_seen": 83121445, + "step": 3863, + "time_per_iteration": 2.7032158374786377 + }, + { + "auxiliary_loss_clip": 0.01125144, + "auxiliary_loss_mlp": 0.01145129, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.00112462, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 1.74414801832258, + "language_loss": 0.74786741, + "learning_rate": 3.58576773102631e-06, + "loss": 0.77057016, + "num_input_tokens_seen": 83138175, + "step": 3864, + "time_per_iteration": 2.6149773597717285 + }, + { + "auxiliary_loss_clip": 0.01174994, + "auxiliary_loss_mlp": 0.01145726, + "balance_loss_clip": 1.00215769, + "balance_loss_mlp": 1.00095832, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 1.694017362113039, + "language_loss": 0.70116985, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72437704, + "num_input_tokens_seen": 83161975, + "step": 3865, + "time_per_iteration": 2.673740863800049 + }, + { + "auxiliary_loss_clip": 0.01175196, + "auxiliary_loss_mlp": 0.01146792, + "balance_loss_clip": 1.00221443, + "balance_loss_mlp": 1.00107026, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 3.948079398009851, + "language_loss": 0.94836932, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97158921, + "num_input_tokens_seen": 83180905, + "step": 3866, + "time_per_iteration": 2.543196439743042 + }, + { + "auxiliary_loss_clip": 0.01159491, + "auxiliary_loss_mlp": 0.01145982, + "balance_loss_clip": 1.0021745, + "balance_loss_mlp": 1.00121439, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 4.778601775059104, + "language_loss": 0.73040468, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.75345939, + "num_input_tokens_seen": 83196390, + "step": 3867, + "time_per_iteration": 2.5448927879333496 + }, + { + "auxiliary_loss_clip": 0.01143624, + "auxiliary_loss_mlp": 0.0114546, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.00097799, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.6775327931191957, + "language_loss": 0.82114398, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84403479, + "num_input_tokens_seen": 83216165, + "step": 3868, + "time_per_iteration": 2.6086585521698 + }, + { + "auxiliary_loss_clip": 0.01159774, + "auxiliary_loss_mlp": 0.01144567, + "balance_loss_clip": 1.00225019, + "balance_loss_mlp": 1.0009439, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.6809451284705572, + "language_loss": 0.73315597, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75619936, + "num_input_tokens_seen": 83233845, + "step": 3869, + "time_per_iteration": 2.5211758613586426 + }, + { + "auxiliary_loss_clip": 0.01158278, + "auxiliary_loss_mlp": 0.011458, + "balance_loss_clip": 1.00212502, + "balance_loss_mlp": 1.00122297, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.7384319243780053, + "language_loss": 0.79694223, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81998301, + "num_input_tokens_seen": 83254930, + "step": 3870, + "time_per_iteration": 2.6426095962524414 + }, + { + "auxiliary_loss_clip": 0.01175192, + "auxiliary_loss_mlp": 0.01146233, + "balance_loss_clip": 1.00230026, + "balance_loss_mlp": 1.00089312, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 1.8029556961978466, + "language_loss": 0.70619154, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72940588, + "num_input_tokens_seen": 83272095, + "step": 3871, + "time_per_iteration": 3.9332311153411865 + }, + { + "auxiliary_loss_clip": 0.01158561, + "auxiliary_loss_mlp": 0.01146597, + "balance_loss_clip": 1.00226963, + "balance_loss_mlp": 1.00135207, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 1.8308821356623244, + "language_loss": 0.6879108, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71096241, + "num_input_tokens_seen": 83290980, + "step": 3872, + "time_per_iteration": 2.5979762077331543 + }, + { + "auxiliary_loss_clip": 0.01159962, + "auxiliary_loss_mlp": 0.01146742, + "balance_loss_clip": 1.00225544, + "balance_loss_mlp": 1.00111628, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.514168687012235, + "language_loss": 0.77795112, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80101818, + "num_input_tokens_seen": 83315175, + "step": 3873, + "time_per_iteration": 4.142972469329834 + }, + { + "auxiliary_loss_clip": 0.01140045, + "auxiliary_loss_mlp": 0.01133797, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00028217, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.8467162315152522, + "language_loss": 0.6052165, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62795496, + "num_input_tokens_seen": 83372060, + "step": 3874, + "time_per_iteration": 3.086167812347412 + }, + { + "auxiliary_loss_clip": 0.01142992, + "auxiliary_loss_mlp": 0.0114534, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00095379, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.563622214724155, + "language_loss": 0.80988979, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83277309, + "num_input_tokens_seen": 83389795, + "step": 3875, + "time_per_iteration": 5.519067287445068 + }, + { + "auxiliary_loss_clip": 0.01175038, + "auxiliary_loss_mlp": 0.00748471, + "balance_loss_clip": 1.00224733, + "balance_loss_mlp": 1.00042081, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.5710862529998397, + "language_loss": 0.61101723, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63025224, + "num_input_tokens_seen": 83410005, + "step": 3876, + "time_per_iteration": 2.6081924438476562 + }, + { + "auxiliary_loss_clip": 0.01143093, + "auxiliary_loss_mlp": 0.01145375, + "balance_loss_clip": 1.0021733, + "balance_loss_mlp": 1.00089312, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.710216019204425, + "language_loss": 0.70316428, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72604901, + "num_input_tokens_seen": 83430250, + "step": 3877, + "time_per_iteration": 2.6512300968170166 + }, + { + "auxiliary_loss_clip": 0.01158713, + "auxiliary_loss_mlp": 0.01145793, + "balance_loss_clip": 1.00223863, + "balance_loss_mlp": 1.00131154, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.3569039292846816, + "language_loss": 0.81128579, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83433086, + "num_input_tokens_seen": 83447950, + "step": 3878, + "time_per_iteration": 2.5495080947875977 + }, + { + "auxiliary_loss_clip": 0.0109452, + "auxiliary_loss_mlp": 0.01146053, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00099909, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 1.7043739867875956, + "language_loss": 0.75031972, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.7727254, + "num_input_tokens_seen": 83467785, + "step": 3879, + "time_per_iteration": 2.8255958557128906 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.01145203, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.0010072, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.467978915072919, + "language_loss": 0.89156222, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.9141283, + "num_input_tokens_seen": 83485390, + "step": 3880, + "time_per_iteration": 2.664646625518799 + }, + { + "auxiliary_loss_clip": 0.01141975, + "auxiliary_loss_mlp": 0.01145968, + "balance_loss_clip": 1.00208938, + "balance_loss_mlp": 1.00110459, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.6640430154910912, + "language_loss": 0.71948469, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74236417, + "num_input_tokens_seen": 83504890, + "step": 3881, + "time_per_iteration": 2.6189324855804443 + }, + { + "auxiliary_loss_clip": 0.01174949, + "auxiliary_loss_mlp": 0.01145446, + "balance_loss_clip": 1.00220907, + "balance_loss_mlp": 1.00096476, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.5703733135460023, + "language_loss": 0.67955041, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70275438, + "num_input_tokens_seen": 83526475, + "step": 3882, + "time_per_iteration": 2.57417368888855 + }, + { + "auxiliary_loss_clip": 0.01126292, + "auxiliary_loss_mlp": 0.01146291, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00123715, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 2.225296353191625, + "language_loss": 0.76677513, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78950095, + "num_input_tokens_seen": 83546620, + "step": 3883, + "time_per_iteration": 2.709131956100464 + }, + { + "auxiliary_loss_clip": 0.0114025, + "auxiliary_loss_mlp": 0.01132871, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.00011909, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7824085489291779, + "language_loss": 0.59150231, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61423349, + "num_input_tokens_seen": 83616160, + "step": 3884, + "time_per_iteration": 3.326646327972412 + }, + { + "auxiliary_loss_clip": 0.01131601, + "auxiliary_loss_mlp": 0.01145177, + "balance_loss_clip": 1.00223291, + "balance_loss_mlp": 1.00079083, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 1.8761328847607623, + "language_loss": 0.80352926, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82629704, + "num_input_tokens_seen": 83636795, + "step": 3885, + "time_per_iteration": 2.7008981704711914 + }, + { + "auxiliary_loss_clip": 0.01158539, + "auxiliary_loss_mlp": 0.01145322, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00112677, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 17.341919674559808, + "language_loss": 0.8801378, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90317643, + "num_input_tokens_seen": 83654050, + "step": 3886, + "time_per_iteration": 2.5607833862304688 + }, + { + "auxiliary_loss_clip": 0.01174961, + "auxiliary_loss_mlp": 0.01145591, + "balance_loss_clip": 1.00218773, + "balance_loss_mlp": 1.00101435, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 1.867189635191675, + "language_loss": 0.73502821, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75823379, + "num_input_tokens_seen": 83673720, + "step": 3887, + "time_per_iteration": 2.6024880409240723 + }, + { + "auxiliary_loss_clip": 0.01158275, + "auxiliary_loss_mlp": 0.01145086, + "balance_loss_clip": 1.00204551, + "balance_loss_mlp": 1.00079489, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.6872675137402326, + "language_loss": 0.8386628, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86169648, + "num_input_tokens_seen": 83693470, + "step": 3888, + "time_per_iteration": 2.621819019317627 + }, + { + "auxiliary_loss_clip": 0.01143519, + "auxiliary_loss_mlp": 0.01145656, + "balance_loss_clip": 1.00211942, + "balance_loss_mlp": 1.00117397, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 2.0350378024800846, + "language_loss": 0.87294197, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89583367, + "num_input_tokens_seen": 83711620, + "step": 3889, + "time_per_iteration": 2.607116937637329 + }, + { + "auxiliary_loss_clip": 0.01158306, + "auxiliary_loss_mlp": 0.01145349, + "balance_loss_clip": 1.00213194, + "balance_loss_mlp": 1.00096273, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 2.68950317025344, + "language_loss": 0.767941, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79097748, + "num_input_tokens_seen": 83727890, + "step": 3890, + "time_per_iteration": 2.5266618728637695 + }, + { + "auxiliary_loss_clip": 0.01115397, + "auxiliary_loss_mlp": 0.00748421, + "balance_loss_clip": 1.00241482, + "balance_loss_mlp": 1.0003705, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 1.7484478885623342, + "language_loss": 0.736045, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75468314, + "num_input_tokens_seen": 83749370, + "step": 3891, + "time_per_iteration": 2.8834941387176514 + }, + { + "auxiliary_loss_clip": 0.01174826, + "auxiliary_loss_mlp": 0.01144878, + "balance_loss_clip": 1.00221848, + "balance_loss_mlp": 1.00096822, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.5509480933336501, + "language_loss": 0.83143246, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.85462952, + "num_input_tokens_seen": 83769560, + "step": 3892, + "time_per_iteration": 2.5533671379089355 + }, + { + "auxiliary_loss_clip": 0.01133355, + "auxiliary_loss_mlp": 0.01145885, + "balance_loss_clip": 1.00244665, + "balance_loss_mlp": 1.00102186, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 1.5809496173257345, + "language_loss": 0.64720041, + "learning_rate": 3.578859988977082e-06, + "loss": 0.6699928, + "num_input_tokens_seen": 83795635, + "step": 3893, + "time_per_iteration": 2.8239896297454834 + }, + { + "auxiliary_loss_clip": 0.01125822, + "auxiliary_loss_mlp": 0.01145828, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.00086975, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 1.8714965801957888, + "language_loss": 0.79368633, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81640285, + "num_input_tokens_seen": 83814090, + "step": 3894, + "time_per_iteration": 2.653247117996216 + }, + { + "auxiliary_loss_clip": 0.01159343, + "auxiliary_loss_mlp": 0.01144984, + "balance_loss_clip": 1.00212002, + "balance_loss_mlp": 1.00097895, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.4624301651126221, + "language_loss": 0.81885326, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.84189653, + "num_input_tokens_seen": 83836870, + "step": 3895, + "time_per_iteration": 2.6380364894866943 + }, + { + "auxiliary_loss_clip": 0.01158245, + "auxiliary_loss_mlp": 0.01145502, + "balance_loss_clip": 1.00217295, + "balance_loss_mlp": 1.00111556, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.407970067845886, + "language_loss": 0.80765289, + "learning_rate": 3.578142517422292e-06, + "loss": 0.83069038, + "num_input_tokens_seen": 83853275, + "step": 3896, + "time_per_iteration": 2.547116279602051 + }, + { + "auxiliary_loss_clip": 0.0114423, + "auxiliary_loss_mlp": 0.01145379, + "balance_loss_clip": 1.00206125, + "balance_loss_mlp": 1.00089765, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.5394715721892174, + "language_loss": 0.83513176, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85802788, + "num_input_tokens_seen": 83872340, + "step": 3897, + "time_per_iteration": 2.6114001274108887 + }, + { + "auxiliary_loss_clip": 0.01158782, + "auxiliary_loss_mlp": 0.01145237, + "balance_loss_clip": 1.00218558, + "balance_loss_mlp": 1.00104189, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 1.5511329469914368, + "language_loss": 0.78808689, + "learning_rate": 3.577663903820705e-06, + "loss": 0.81112707, + "num_input_tokens_seen": 83888795, + "step": 3898, + "time_per_iteration": 2.5994656085968018 + }, + { + "auxiliary_loss_clip": 0.01127151, + "auxiliary_loss_mlp": 0.0114423, + "balance_loss_clip": 1.0021832, + "balance_loss_mlp": 1.00089312, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 2.19239708852971, + "language_loss": 0.7413289, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76404274, + "num_input_tokens_seen": 83906820, + "step": 3899, + "time_per_iteration": 2.65946626663208 + }, + { + "auxiliary_loss_clip": 0.01127401, + "auxiliary_loss_mlp": 0.01144708, + "balance_loss_clip": 1.00198054, + "balance_loss_mlp": 1.00089419, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.6437720399929747, + "language_loss": 0.75445819, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77717924, + "num_input_tokens_seen": 83926370, + "step": 3900, + "time_per_iteration": 2.665311336517334 + }, + { + "auxiliary_loss_clip": 0.01114087, + "auxiliary_loss_mlp": 0.01144869, + "balance_loss_clip": 1.00201499, + "balance_loss_mlp": 1.0009594, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 1.7218301279488015, + "language_loss": 0.66908228, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69167185, + "num_input_tokens_seen": 83944600, + "step": 3901, + "time_per_iteration": 2.6569724082946777 + }, + { + "auxiliary_loss_clip": 0.01109138, + "auxiliary_loss_mlp": 0.01132856, + "balance_loss_clip": 1.00217104, + "balance_loss_mlp": 1.00010443, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7488153398044984, + "language_loss": 0.58318919, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60560912, + "num_input_tokens_seen": 84005100, + "step": 3902, + "time_per_iteration": 3.256498336791992 + }, + { + "auxiliary_loss_clip": 0.01142398, + "auxiliary_loss_mlp": 0.01144708, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00089359, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.807136414176337, + "language_loss": 0.80112791, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82399893, + "num_input_tokens_seen": 84023775, + "step": 3903, + "time_per_iteration": 2.612652063369751 + }, + { + "auxiliary_loss_clip": 0.01110627, + "auxiliary_loss_mlp": 0.01144677, + "balance_loss_clip": 1.001881, + "balance_loss_mlp": 1.00076747, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 2.361359676484582, + "language_loss": 0.82157695, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84413004, + "num_input_tokens_seen": 84042605, + "step": 3904, + "time_per_iteration": 2.716259241104126 + }, + { + "auxiliary_loss_clip": 0.01174887, + "auxiliary_loss_mlp": 0.0114511, + "balance_loss_clip": 1.0022099, + "balance_loss_mlp": 1.00120044, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 2.277485279010751, + "language_loss": 0.71711254, + "learning_rate": 3.57598687219895e-06, + "loss": 0.74031258, + "num_input_tokens_seen": 84061520, + "step": 3905, + "time_per_iteration": 2.554074764251709 + }, + { + "auxiliary_loss_clip": 0.01174772, + "auxiliary_loss_mlp": 0.01144621, + "balance_loss_clip": 1.00225055, + "balance_loss_mlp": 1.00090194, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.789276949116599, + "language_loss": 0.7111972, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73439115, + "num_input_tokens_seen": 84081800, + "step": 3906, + "time_per_iteration": 2.5613672733306885 + }, + { + "auxiliary_loss_clip": 0.01158303, + "auxiliary_loss_mlp": 0.0114491, + "balance_loss_clip": 1.00205755, + "balance_loss_mlp": 1.0007143, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.713727109561905, + "language_loss": 0.73762584, + "learning_rate": 3.575507182316473e-06, + "loss": 0.76065791, + "num_input_tokens_seen": 84102340, + "step": 3907, + "time_per_iteration": 2.616551399230957 + }, + { + "auxiliary_loss_clip": 0.01158152, + "auxiliary_loss_mlp": 0.01145233, + "balance_loss_clip": 1.00205624, + "balance_loss_mlp": 1.00113297, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.6380622003124115, + "language_loss": 0.73301417, + "learning_rate": 3.575267247755601e-06, + "loss": 0.75604808, + "num_input_tokens_seen": 84120370, + "step": 3908, + "time_per_iteration": 3.9558305740356445 + }, + { + "auxiliary_loss_clip": 0.01141311, + "auxiliary_loss_mlp": 0.01132839, + "balance_loss_clip": 1.00216484, + "balance_loss_mlp": 1.00008798, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0341313849249951, + "language_loss": 0.73403126, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75677276, + "num_input_tokens_seen": 84165515, + "step": 3909, + "time_per_iteration": 2.9695029258728027 + }, + { + "auxiliary_loss_clip": 0.01158953, + "auxiliary_loss_mlp": 0.01144277, + "balance_loss_clip": 1.00209761, + "balance_loss_mlp": 1.00103521, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.6250835674037765, + "language_loss": 0.87866795, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.90170026, + "num_input_tokens_seen": 84184540, + "step": 3910, + "time_per_iteration": 2.6237080097198486 + }, + { + "auxiliary_loss_clip": 0.01158205, + "auxiliary_loss_mlp": 0.01144708, + "balance_loss_clip": 1.00212169, + "balance_loss_mlp": 1.00079858, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.1928719760760806, + "language_loss": 0.75982147, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78285062, + "num_input_tokens_seen": 84202025, + "step": 3911, + "time_per_iteration": 4.004389524459839 + }, + { + "auxiliary_loss_clip": 0.01158056, + "auxiliary_loss_mlp": 0.01144512, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00107968, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.4949218863983806, + "language_loss": 0.81637985, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83940554, + "num_input_tokens_seen": 84221895, + "step": 3912, + "time_per_iteration": 4.059706687927246 + }, + { + "auxiliary_loss_clip": 0.01142182, + "auxiliary_loss_mlp": 0.01144758, + "balance_loss_clip": 1.00199389, + "balance_loss_mlp": 1.00123024, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.95224981538049, + "language_loss": 0.71552718, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73839658, + "num_input_tokens_seen": 84240455, + "step": 3913, + "time_per_iteration": 3.9943583011627197 + }, + { + "auxiliary_loss_clip": 0.01159079, + "auxiliary_loss_mlp": 0.00748528, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00038242, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 4.344919506850415, + "language_loss": 0.76007593, + "learning_rate": 3.57382638628884e-06, + "loss": 0.77915204, + "num_input_tokens_seen": 84261605, + "step": 3914, + "time_per_iteration": 2.60971999168396 + }, + { + "auxiliary_loss_clip": 0.01094185, + "auxiliary_loss_mlp": 0.01144513, + "balance_loss_clip": 1.00192177, + "balance_loss_mlp": 1.00079393, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.851873074172041, + "language_loss": 0.89532065, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.91770756, + "num_input_tokens_seen": 84278675, + "step": 3915, + "time_per_iteration": 2.7094664573669434 + }, + { + "auxiliary_loss_clip": 0.01144484, + "auxiliary_loss_mlp": 0.01132788, + "balance_loss_clip": 1.002249, + "balance_loss_mlp": 1.00003684, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.802034218744849, + "language_loss": 0.59384418, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6166169, + "num_input_tokens_seen": 84329765, + "step": 3916, + "time_per_iteration": 3.139240264892578 + }, + { + "auxiliary_loss_clip": 0.01106635, + "auxiliary_loss_mlp": 0.01132849, + "balance_loss_clip": 1.00140119, + "balance_loss_mlp": 1.00009716, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7642286100604988, + "language_loss": 0.49506241, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51745725, + "num_input_tokens_seen": 84393680, + "step": 3917, + "time_per_iteration": 3.2300968170166016 + }, + { + "auxiliary_loss_clip": 0.01126017, + "auxiliary_loss_mlp": 0.01145511, + "balance_loss_clip": 1.00204718, + "balance_loss_mlp": 1.00131583, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 1.785388983387029, + "language_loss": 0.76708156, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78979683, + "num_input_tokens_seen": 84412640, + "step": 3918, + "time_per_iteration": 2.6565709114074707 + }, + { + "auxiliary_loss_clip": 0.01112021, + "auxiliary_loss_mlp": 0.01144951, + "balance_loss_clip": 1.00209284, + "balance_loss_mlp": 1.00094664, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 1.881524067994086, + "language_loss": 0.6970886, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71965837, + "num_input_tokens_seen": 84431605, + "step": 3919, + "time_per_iteration": 2.6630985736846924 + }, + { + "auxiliary_loss_clip": 0.01131206, + "auxiliary_loss_mlp": 0.01144179, + "balance_loss_clip": 1.00238681, + "balance_loss_mlp": 1.00103247, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.6774426936071958, + "language_loss": 0.7059536, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72870743, + "num_input_tokens_seen": 84454210, + "step": 3920, + "time_per_iteration": 2.7641212940216064 + }, + { + "auxiliary_loss_clip": 0.01141555, + "auxiliary_loss_mlp": 0.01144785, + "balance_loss_clip": 1.00213325, + "balance_loss_mlp": 1.00106657, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.6914534218610546, + "language_loss": 0.76877785, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79164124, + "num_input_tokens_seen": 84475540, + "step": 3921, + "time_per_iteration": 2.6580262184143066 + }, + { + "auxiliary_loss_clip": 0.01125648, + "auxiliary_loss_mlp": 0.01145038, + "balance_loss_clip": 1.00194085, + "balance_loss_mlp": 1.00093746, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.1639902312940227, + "language_loss": 0.74796641, + "learning_rate": 3.571901895946612e-06, + "loss": 0.77067316, + "num_input_tokens_seen": 84494580, + "step": 3922, + "time_per_iteration": 2.6470084190368652 + }, + { + "auxiliary_loss_clip": 0.011415, + "auxiliary_loss_mlp": 0.0114404, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00079775, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 1.9685340072151187, + "language_loss": 0.79896784, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82182324, + "num_input_tokens_seen": 84513850, + "step": 3923, + "time_per_iteration": 2.6502699851989746 + }, + { + "auxiliary_loss_clip": 0.0111058, + "auxiliary_loss_mlp": 0.01145377, + "balance_loss_clip": 1.00201762, + "balance_loss_mlp": 1.00108612, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 2.4229597946602435, + "language_loss": 0.74095976, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76351929, + "num_input_tokens_seen": 84532315, + "step": 3924, + "time_per_iteration": 2.6410326957702637 + }, + { + "auxiliary_loss_clip": 0.01174895, + "auxiliary_loss_mlp": 0.01144956, + "balance_loss_clip": 1.00229573, + "balance_loss_mlp": 1.00095141, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 1.5995479024468489, + "language_loss": 0.8253181, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84851664, + "num_input_tokens_seen": 84550970, + "step": 3925, + "time_per_iteration": 2.512734889984131 + }, + { + "auxiliary_loss_clip": 0.01147201, + "auxiliary_loss_mlp": 0.01145726, + "balance_loss_clip": 1.00238419, + "balance_loss_mlp": 1.00095868, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.710458550498335, + "language_loss": 0.59295195, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.6158812, + "num_input_tokens_seen": 84571655, + "step": 3926, + "time_per_iteration": 2.603682279586792 + }, + { + "auxiliary_loss_clip": 0.01158549, + "auxiliary_loss_mlp": 0.01144209, + "balance_loss_clip": 1.00226152, + "balance_loss_mlp": 1.0009675, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.8600346433259536, + "language_loss": 0.71782869, + "learning_rate": 3.570697151969235e-06, + "loss": 0.74085629, + "num_input_tokens_seen": 84593130, + "step": 3927, + "time_per_iteration": 2.613565683364868 + }, + { + "auxiliary_loss_clip": 0.01142714, + "auxiliary_loss_mlp": 0.01144816, + "balance_loss_clip": 1.00206375, + "balance_loss_mlp": 1.00119257, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 2.1435350624570035, + "language_loss": 0.75089985, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77377516, + "num_input_tokens_seen": 84612410, + "step": 3928, + "time_per_iteration": 2.613269329071045 + }, + { + "auxiliary_loss_clip": 0.0114163, + "auxiliary_loss_mlp": 0.01145158, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.0009625, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.2780210934322636, + "language_loss": 0.81799197, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.84085989, + "num_input_tokens_seen": 84627610, + "step": 3929, + "time_per_iteration": 2.5690219402313232 + }, + { + "auxiliary_loss_clip": 0.01174987, + "auxiliary_loss_mlp": 0.01145072, + "balance_loss_clip": 1.00220966, + "balance_loss_mlp": 1.00097167, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 1.920116585893002, + "language_loss": 0.71739554, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74059612, + "num_input_tokens_seen": 84648415, + "step": 3930, + "time_per_iteration": 2.536461114883423 + }, + { + "auxiliary_loss_clip": 0.01174753, + "auxiliary_loss_mlp": 0.0114486, + "balance_loss_clip": 1.00223756, + "balance_loss_mlp": 1.00095034, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 1.8784478454932947, + "language_loss": 0.74367774, + "learning_rate": 3.569732284634665e-06, + "loss": 0.7668739, + "num_input_tokens_seen": 84670080, + "step": 3931, + "time_per_iteration": 2.694958448410034 + }, + { + "auxiliary_loss_clip": 0.01158242, + "auxiliary_loss_mlp": 0.01144989, + "balance_loss_clip": 1.00220966, + "balance_loss_mlp": 1.00079322, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 1.896576639936158, + "language_loss": 0.80215955, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82519192, + "num_input_tokens_seen": 84686465, + "step": 3932, + "time_per_iteration": 2.570917844772339 + }, + { + "auxiliary_loss_clip": 0.0112591, + "auxiliary_loss_mlp": 0.01144856, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00094628, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.4442837445851364, + "language_loss": 0.85558927, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87829697, + "num_input_tokens_seen": 84708825, + "step": 3933, + "time_per_iteration": 2.6967670917510986 + }, + { + "auxiliary_loss_clip": 0.01116137, + "auxiliary_loss_mlp": 0.01144687, + "balance_loss_clip": 1.00207829, + "balance_loss_mlp": 1.00096834, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 1.944972582271913, + "language_loss": 0.83331442, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.8559227, + "num_input_tokens_seen": 84726165, + "step": 3934, + "time_per_iteration": 2.674232244491577 + }, + { + "auxiliary_loss_clip": 0.01174865, + "auxiliary_loss_mlp": 0.01144813, + "balance_loss_clip": 1.00228643, + "balance_loss_mlp": 1.00090361, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.8838052481451488, + "language_loss": 0.78901243, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.81220925, + "num_input_tokens_seen": 84745815, + "step": 3935, + "time_per_iteration": 2.551659345626831 + }, + { + "auxiliary_loss_clip": 0.01158096, + "auxiliary_loss_mlp": 0.01143891, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00112605, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 2.067485932924775, + "language_loss": 0.79573989, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.8187598, + "num_input_tokens_seen": 84765415, + "step": 3936, + "time_per_iteration": 2.602724313735962 + }, + { + "auxiliary_loss_clip": 0.01142395, + "auxiliary_loss_mlp": 0.01144283, + "balance_loss_clip": 1.00206327, + "balance_loss_mlp": 1.00094557, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.6009706638238865, + "language_loss": 0.7907145, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81358129, + "num_input_tokens_seen": 84787080, + "step": 3937, + "time_per_iteration": 2.6496834754943848 + }, + { + "auxiliary_loss_clip": 0.01158186, + "auxiliary_loss_mlp": 0.01144478, + "balance_loss_clip": 1.00229073, + "balance_loss_mlp": 1.00104547, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 1.8638561786701442, + "language_loss": 0.8555001, + "learning_rate": 3.568041475462147e-06, + "loss": 0.87852669, + "num_input_tokens_seen": 84805395, + "step": 3938, + "time_per_iteration": 2.582911968231201 + }, + { + "auxiliary_loss_clip": 0.0117463, + "auxiliary_loss_mlp": 0.01144991, + "balance_loss_clip": 1.00221896, + "balance_loss_mlp": 1.00098658, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.2713947026602406, + "language_loss": 0.94185495, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96505117, + "num_input_tokens_seen": 84818090, + "step": 3939, + "time_per_iteration": 2.505641460418701 + }, + { + "auxiliary_loss_clip": 0.01174757, + "auxiliary_loss_mlp": 0.01144946, + "balance_loss_clip": 1.00222707, + "balance_loss_mlp": 1.00113177, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.5750003921812363, + "language_loss": 0.82246995, + "learning_rate": 3.567557851847088e-06, + "loss": 0.845667, + "num_input_tokens_seen": 84837695, + "step": 3940, + "time_per_iteration": 2.600679636001587 + }, + { + "auxiliary_loss_clip": 0.01147022, + "auxiliary_loss_mlp": 0.00748502, + "balance_loss_clip": 1.00221372, + "balance_loss_mlp": 1.0002594, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.1693559061643755, + "language_loss": 0.89018393, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.90913922, + "num_input_tokens_seen": 84854630, + "step": 3941, + "time_per_iteration": 2.61928653717041 + }, + { + "auxiliary_loss_clip": 0.01174711, + "auxiliary_loss_mlp": 0.01144666, + "balance_loss_clip": 1.00206852, + "balance_loss_mlp": 1.00085187, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.080755537519606, + "language_loss": 0.845411, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86860478, + "num_input_tokens_seen": 84871805, + "step": 3942, + "time_per_iteration": 2.5720152854919434 + }, + { + "auxiliary_loss_clip": 0.01127134, + "auxiliary_loss_mlp": 0.01145437, + "balance_loss_clip": 1.0020318, + "balance_loss_mlp": 1.00095499, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.9547152393515652, + "language_loss": 0.81000876, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83273447, + "num_input_tokens_seen": 84889815, + "step": 3943, + "time_per_iteration": 2.670671224594116 + }, + { + "auxiliary_loss_clip": 0.01143402, + "auxiliary_loss_mlp": 0.01145771, + "balance_loss_clip": 1.00214875, + "balance_loss_mlp": 1.00090742, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 3.8464601459447714, + "language_loss": 0.66847956, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69137126, + "num_input_tokens_seen": 84904380, + "step": 3944, + "time_per_iteration": 2.552687883377075 + }, + { + "auxiliary_loss_clip": 0.01142629, + "auxiliary_loss_mlp": 0.011454, + "balance_loss_clip": 1.0021261, + "balance_loss_mlp": 1.00091839, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 1.9713273314988096, + "language_loss": 0.75469893, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77757925, + "num_input_tokens_seen": 84922935, + "step": 3945, + "time_per_iteration": 2.6495471000671387 + }, + { + "auxiliary_loss_clip": 0.01159574, + "auxiliary_loss_mlp": 0.01145071, + "balance_loss_clip": 1.00224245, + "balance_loss_mlp": 1.00087512, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.6716567645123452, + "language_loss": 0.63588393, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.6589303, + "num_input_tokens_seen": 84943685, + "step": 3946, + "time_per_iteration": 4.058262348175049 + }, + { + "auxiliary_loss_clip": 0.01159292, + "auxiliary_loss_mlp": 0.01145164, + "balance_loss_clip": 1.00221586, + "balance_loss_mlp": 1.00077772, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.086323035375377, + "language_loss": 0.76933074, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79237533, + "num_input_tokens_seen": 84959505, + "step": 3947, + "time_per_iteration": 2.542456865310669 + }, + { + "auxiliary_loss_clip": 0.01158979, + "auxiliary_loss_mlp": 0.01145388, + "balance_loss_clip": 1.00226474, + "balance_loss_mlp": 1.00100124, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.5624164277136787, + "language_loss": 0.80633414, + "learning_rate": 3.565620980442944e-06, + "loss": 0.82937789, + "num_input_tokens_seen": 84982130, + "step": 3948, + "time_per_iteration": 2.652240037918091 + }, + { + "auxiliary_loss_clip": 0.0114237, + "auxiliary_loss_mlp": 0.01144814, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00099969, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.1957402002548787, + "language_loss": 0.80661935, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82949114, + "num_input_tokens_seen": 85000640, + "step": 3949, + "time_per_iteration": 4.010849714279175 + }, + { + "auxiliary_loss_clip": 0.01142215, + "auxiliary_loss_mlp": 0.01145436, + "balance_loss_clip": 1.00233603, + "balance_loss_mlp": 1.00095439, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 2.0404341004253905, + "language_loss": 0.73015392, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75303048, + "num_input_tokens_seen": 85018970, + "step": 3950, + "time_per_iteration": 4.002118825912476 + }, + { + "auxiliary_loss_clip": 0.01174856, + "auxiliary_loss_mlp": 0.01144356, + "balance_loss_clip": 1.00232279, + "balance_loss_mlp": 1.00082803, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 1.9717214599575, + "language_loss": 0.72688925, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75008142, + "num_input_tokens_seen": 85035905, + "step": 3951, + "time_per_iteration": 3.9099745750427246 + }, + { + "auxiliary_loss_clip": 0.01141798, + "auxiliary_loss_mlp": 0.01145714, + "balance_loss_clip": 1.00215304, + "balance_loss_mlp": 1.0011375, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 2.7273169158241632, + "language_loss": 0.73986906, + "learning_rate": 3.564651119602903e-06, + "loss": 0.76274419, + "num_input_tokens_seen": 85054560, + "step": 3952, + "time_per_iteration": 2.600659132003784 + }, + { + "auxiliary_loss_clip": 0.01127521, + "auxiliary_loss_mlp": 0.01145275, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.00079322, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.8935140328365352, + "language_loss": 0.70985472, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73258269, + "num_input_tokens_seen": 85074425, + "step": 3953, + "time_per_iteration": 2.6943156719207764 + }, + { + "auxiliary_loss_clip": 0.01174867, + "auxiliary_loss_mlp": 0.01145053, + "balance_loss_clip": 1.00230765, + "balance_loss_mlp": 1.0009526, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.8669606338459153, + "language_loss": 0.81866449, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.84186369, + "num_input_tokens_seen": 85092865, + "step": 3954, + "time_per_iteration": 2.6268980503082275 + }, + { + "auxiliary_loss_clip": 0.01141125, + "auxiliary_loss_mlp": 0.01145436, + "balance_loss_clip": 1.00206399, + "balance_loss_mlp": 1.00104952, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 3.1973272666848884, + "language_loss": 0.66509944, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.68796504, + "num_input_tokens_seen": 85110175, + "step": 3955, + "time_per_iteration": 2.6625781059265137 + }, + { + "auxiliary_loss_clip": 0.0117479, + "auxiliary_loss_mlp": 0.01145021, + "balance_loss_clip": 1.00229001, + "balance_loss_mlp": 1.00111139, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.4708646377573973, + "language_loss": 0.83735532, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86055338, + "num_input_tokens_seen": 85129925, + "step": 3956, + "time_per_iteration": 2.5538039207458496 + }, + { + "auxiliary_loss_clip": 0.01126277, + "auxiliary_loss_mlp": 0.01144674, + "balance_loss_clip": 1.00213516, + "balance_loss_mlp": 1.00105035, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 1.970562362887029, + "language_loss": 0.84876275, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.87147224, + "num_input_tokens_seen": 85147755, + "step": 3957, + "time_per_iteration": 2.7015786170959473 + }, + { + "auxiliary_loss_clip": 0.01092569, + "auxiliary_loss_mlp": 0.01144656, + "balance_loss_clip": 1.00173831, + "balance_loss_mlp": 1.0011276, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.0137218320280477, + "language_loss": 0.6986497, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72102195, + "num_input_tokens_seen": 85165270, + "step": 3958, + "time_per_iteration": 2.7271039485931396 + }, + { + "auxiliary_loss_clip": 0.01127255, + "auxiliary_loss_mlp": 0.01145277, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00079584, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 2.461473573252778, + "language_loss": 0.65681565, + "learning_rate": 3.562951579215745e-06, + "loss": 0.67954099, + "num_input_tokens_seen": 85181555, + "step": 3959, + "time_per_iteration": 2.6492369174957275 + }, + { + "auxiliary_loss_clip": 0.01114346, + "auxiliary_loss_mlp": 0.01145118, + "balance_loss_clip": 1.0020293, + "balance_loss_mlp": 1.00092268, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.7055150662544518, + "language_loss": 0.7195161, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74211073, + "num_input_tokens_seen": 85199455, + "step": 3960, + "time_per_iteration": 2.684722900390625 + }, + { + "auxiliary_loss_clip": 0.01060665, + "auxiliary_loss_mlp": 0.01145165, + "balance_loss_clip": 1.00162435, + "balance_loss_mlp": 1.00077868, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.6362958486241466, + "language_loss": 0.74204826, + "learning_rate": 3.562465462704307e-06, + "loss": 0.76410651, + "num_input_tokens_seen": 85219170, + "step": 3961, + "time_per_iteration": 2.9776716232299805 + }, + { + "auxiliary_loss_clip": 0.01174781, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.00111043, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 2.1924113346619913, + "language_loss": 0.653512, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.67671669, + "num_input_tokens_seen": 85238480, + "step": 3962, + "time_per_iteration": 2.933938503265381 + }, + { + "auxiliary_loss_clip": 0.01142855, + "auxiliary_loss_mlp": 0.01145591, + "balance_loss_clip": 1.00219095, + "balance_loss_mlp": 1.00130033, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.6780016693238562, + "language_loss": 0.74515831, + "learning_rate": 3.561979109197483e-06, + "loss": 0.7680428, + "num_input_tokens_seen": 85259180, + "step": 3963, + "time_per_iteration": 2.7970712184906006 + }, + { + "auxiliary_loss_clip": 0.01125352, + "auxiliary_loss_mlp": 0.01146381, + "balance_loss_clip": 1.00206733, + "balance_loss_mlp": 1.0010407, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 1.883402337354603, + "language_loss": 0.77204883, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79476619, + "num_input_tokens_seen": 85278550, + "step": 3964, + "time_per_iteration": 2.7056448459625244 + }, + { + "auxiliary_loss_clip": 0.01131702, + "auxiliary_loss_mlp": 0.011451, + "balance_loss_clip": 1.00215578, + "balance_loss_mlp": 1.00099993, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 1.878819886925768, + "language_loss": 0.71094543, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73371339, + "num_input_tokens_seen": 85297345, + "step": 3965, + "time_per_iteration": 2.6632449626922607 + }, + { + "auxiliary_loss_clip": 0.01143923, + "auxiliary_loss_mlp": 0.01145422, + "balance_loss_clip": 1.00222445, + "balance_loss_mlp": 1.00084519, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.9409687481685416, + "language_loss": 0.78265476, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80554819, + "num_input_tokens_seen": 85315105, + "step": 3966, + "time_per_iteration": 2.6062772274017334 + }, + { + "auxiliary_loss_clip": 0.01142684, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_clip": 1.00210595, + "balance_loss_mlp": 1.00103283, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 1.7504415724647537, + "language_loss": 0.69041443, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71329075, + "num_input_tokens_seen": 85334735, + "step": 3967, + "time_per_iteration": 2.60540509223938 + }, + { + "auxiliary_loss_clip": 0.01126359, + "auxiliary_loss_mlp": 0.01145853, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.00118017, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 2.042189274823735, + "language_loss": 0.67997521, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70269728, + "num_input_tokens_seen": 85352875, + "step": 3968, + "time_per_iteration": 2.613638162612915 + }, + { + "auxiliary_loss_clip": 0.01108954, + "auxiliary_loss_mlp": 0.01144258, + "balance_loss_clip": 1.0020293, + "balance_loss_mlp": 1.00092065, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 2.6348906645214543, + "language_loss": 0.76225865, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78479075, + "num_input_tokens_seen": 85372205, + "step": 3969, + "time_per_iteration": 2.765531539916992 + }, + { + "auxiliary_loss_clip": 0.01146865, + "auxiliary_loss_mlp": 0.01144951, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00085092, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.8350652142125585, + "language_loss": 0.76083064, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78374875, + "num_input_tokens_seen": 85389705, + "step": 3970, + "time_per_iteration": 2.6232683658599854 + }, + { + "auxiliary_loss_clip": 0.01126827, + "auxiliary_loss_mlp": 0.01145299, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00100851, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 2.50948577680114, + "language_loss": 0.85082006, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.8735413, + "num_input_tokens_seen": 85407855, + "step": 3971, + "time_per_iteration": 2.681126594543457 + }, + { + "auxiliary_loss_clip": 0.01155736, + "auxiliary_loss_mlp": 0.01132015, + "balance_loss_clip": 1.00174618, + "balance_loss_mlp": 1.00002694, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7295124446796964, + "language_loss": 0.62786245, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.65073997, + "num_input_tokens_seen": 85470885, + "step": 3972, + "time_per_iteration": 3.2113962173461914 + }, + { + "auxiliary_loss_clip": 0.01141182, + "auxiliary_loss_mlp": 0.01145332, + "balance_loss_clip": 1.00210047, + "balance_loss_mlp": 1.00094616, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 2.781832013426903, + "language_loss": 0.81753707, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.84040225, + "num_input_tokens_seen": 85488460, + "step": 3973, + "time_per_iteration": 2.5888705253601074 + }, + { + "auxiliary_loss_clip": 0.01142728, + "auxiliary_loss_mlp": 0.01145238, + "balance_loss_clip": 1.00211132, + "balance_loss_mlp": 1.00113797, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.5992101021549279, + "language_loss": 0.79433465, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81721425, + "num_input_tokens_seen": 85508590, + "step": 3974, + "time_per_iteration": 2.9355356693267822 + }, + { + "auxiliary_loss_clip": 0.01159172, + "auxiliary_loss_mlp": 0.01145225, + "balance_loss_clip": 1.00218523, + "balance_loss_mlp": 1.00112522, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 2.16566354051053, + "language_loss": 0.8424598, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86550373, + "num_input_tokens_seen": 85525970, + "step": 3975, + "time_per_iteration": 2.5925004482269287 + }, + { + "auxiliary_loss_clip": 0.01159524, + "auxiliary_loss_mlp": 0.01144528, + "balance_loss_clip": 1.00223875, + "balance_loss_mlp": 1.0009048, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.003680472360101, + "language_loss": 0.83762109, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.86066163, + "num_input_tokens_seen": 85543700, + "step": 3976, + "time_per_iteration": 2.5655550956726074 + }, + { + "auxiliary_loss_clip": 0.01093079, + "auxiliary_loss_mlp": 0.01144375, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00084698, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.8297469200953516, + "language_loss": 0.74493742, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76731193, + "num_input_tokens_seen": 85562765, + "step": 3977, + "time_per_iteration": 2.736895799636841 + }, + { + "auxiliary_loss_clip": 0.01174558, + "auxiliary_loss_mlp": 0.01144727, + "balance_loss_clip": 1.0022192, + "balance_loss_mlp": 1.00100863, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.6594511493632689, + "language_loss": 0.7183429, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74153578, + "num_input_tokens_seen": 85581755, + "step": 3978, + "time_per_iteration": 2.5789458751678467 + }, + { + "auxiliary_loss_clip": 0.01143959, + "auxiliary_loss_mlp": 0.01145599, + "balance_loss_clip": 1.00228226, + "balance_loss_mlp": 1.00111699, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.556022692400676, + "language_loss": 0.78612518, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80902082, + "num_input_tokens_seen": 85599455, + "step": 3979, + "time_per_iteration": 2.612316370010376 + }, + { + "auxiliary_loss_clip": 0.01142446, + "auxiliary_loss_mlp": 0.0114465, + "balance_loss_clip": 1.00206947, + "balance_loss_mlp": 1.00112176, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6597734998028244, + "language_loss": 0.82011509, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84298605, + "num_input_tokens_seen": 85619970, + "step": 3980, + "time_per_iteration": 2.655740976333618 + }, + { + "auxiliary_loss_clip": 0.011263, + "auxiliary_loss_mlp": 0.01143936, + "balance_loss_clip": 1.00205541, + "balance_loss_mlp": 1.00088453, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 2.3772615456845645, + "language_loss": 0.83945155, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86215389, + "num_input_tokens_seen": 85638850, + "step": 3981, + "time_per_iteration": 2.6859843730926514 + }, + { + "auxiliary_loss_clip": 0.01142989, + "auxiliary_loss_mlp": 0.01145071, + "balance_loss_clip": 1.00228596, + "balance_loss_mlp": 1.00106645, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 3.7193249456116115, + "language_loss": 0.77230102, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79518163, + "num_input_tokens_seen": 85656285, + "step": 3982, + "time_per_iteration": 2.6837191581726074 + }, + { + "auxiliary_loss_clip": 0.01125922, + "auxiliary_loss_mlp": 0.01143879, + "balance_loss_clip": 1.00203335, + "balance_loss_mlp": 1.00101829, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 1.6357249175843422, + "language_loss": 0.78220731, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.8049053, + "num_input_tokens_seen": 85673020, + "step": 3983, + "time_per_iteration": 4.070282697677612 + }, + { + "auxiliary_loss_clip": 0.01157762, + "auxiliary_loss_mlp": 0.00748438, + "balance_loss_clip": 1.00207281, + "balance_loss_mlp": 1.00036025, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.9227078616860984, + "language_loss": 0.7306453, + "learning_rate": 3.556858107358737e-06, + "loss": 0.74970728, + "num_input_tokens_seen": 85692565, + "step": 3984, + "time_per_iteration": 2.5817487239837646 + }, + { + "auxiliary_loss_clip": 0.01126149, + "auxiliary_loss_mlp": 0.0114449, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.00105786, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 1.9595057218930136, + "language_loss": 0.78761113, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81031752, + "num_input_tokens_seen": 85709730, + "step": 3985, + "time_per_iteration": 2.6306357383728027 + }, + { + "auxiliary_loss_clip": 0.01111783, + "auxiliary_loss_mlp": 0.01144429, + "balance_loss_clip": 1.00196719, + "balance_loss_mlp": 1.00128257, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 2.011607674230085, + "language_loss": 0.73436368, + "learning_rate": 3.556369033716254e-06, + "loss": 0.75692582, + "num_input_tokens_seen": 85730045, + "step": 3986, + "time_per_iteration": 2.738494634628296 + }, + { + "auxiliary_loss_clip": 0.0115824, + "auxiliary_loss_mlp": 0.01144554, + "balance_loss_clip": 1.00219667, + "balance_loss_mlp": 1.00112152, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 1.8921877770052764, + "language_loss": 0.88272101, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90574896, + "num_input_tokens_seen": 85747590, + "step": 3987, + "time_per_iteration": 5.422579050064087 + }, + { + "auxiliary_loss_clip": 0.0115767, + "auxiliary_loss_mlp": 0.01143907, + "balance_loss_clip": 1.0021112, + "balance_loss_mlp": 1.0010469, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 3.827705039738196, + "language_loss": 0.82920694, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85222268, + "num_input_tokens_seen": 85763460, + "step": 3988, + "time_per_iteration": 4.239508390426636 + }, + { + "auxiliary_loss_clip": 0.01157803, + "auxiliary_loss_mlp": 0.01143679, + "balance_loss_clip": 1.00205886, + "balance_loss_mlp": 1.00091374, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.626170890969306, + "language_loss": 0.85347819, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87649298, + "num_input_tokens_seen": 85782050, + "step": 3989, + "time_per_iteration": 2.584476947784424 + }, + { + "auxiliary_loss_clip": 0.01174436, + "auxiliary_loss_mlp": 0.01143383, + "balance_loss_clip": 1.00221753, + "balance_loss_mlp": 1.00080848, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.7976804049845474, + "language_loss": 0.84616667, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86934483, + "num_input_tokens_seen": 85797400, + "step": 3990, + "time_per_iteration": 2.5071709156036377 + }, + { + "auxiliary_loss_clip": 0.01159172, + "auxiliary_loss_mlp": 0.01143677, + "balance_loss_clip": 1.0021522, + "balance_loss_mlp": 1.00091147, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.4604578474156356, + "language_loss": 0.75849694, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.78152537, + "num_input_tokens_seen": 85818995, + "step": 3991, + "time_per_iteration": 2.625765800476074 + }, + { + "auxiliary_loss_clip": 0.01139159, + "auxiliary_loss_mlp": 0.01131241, + "balance_loss_clip": 1.00144339, + "balance_loss_mlp": 1.0000155, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8792540737902991, + "language_loss": 0.63789856, + "learning_rate": 3.554900396661656e-06, + "loss": 0.66060257, + "num_input_tokens_seen": 85876695, + "step": 3992, + "time_per_iteration": 3.1327338218688965 + }, + { + "auxiliary_loss_clip": 0.01154546, + "auxiliary_loss_mlp": 0.01131267, + "balance_loss_clip": 1.00160527, + "balance_loss_mlp": 1.00004101, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7612472008949678, + "language_loss": 0.62918049, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65203863, + "num_input_tokens_seen": 85940990, + "step": 3993, + "time_per_iteration": 3.2083067893981934 + }, + { + "auxiliary_loss_clip": 0.01126415, + "auxiliary_loss_mlp": 0.01144243, + "balance_loss_clip": 1.00207746, + "balance_loss_mlp": 1.00090611, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.4967524004381976, + "language_loss": 0.7649979, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.78770447, + "num_input_tokens_seen": 85961165, + "step": 3994, + "time_per_iteration": 2.7968292236328125 + }, + { + "auxiliary_loss_clip": 0.01142595, + "auxiliary_loss_mlp": 0.01144039, + "balance_loss_clip": 1.00213444, + "balance_loss_mlp": 1.00098801, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.4713087745441773, + "language_loss": 0.77959001, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80245638, + "num_input_tokens_seen": 85982710, + "step": 3995, + "time_per_iteration": 2.662945032119751 + }, + { + "auxiliary_loss_clip": 0.01122106, + "auxiliary_loss_mlp": 0.01131226, + "balance_loss_clip": 1.00165963, + "balance_loss_mlp": 1.00000095, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.8981593029289658, + "language_loss": 0.63477421, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65730751, + "num_input_tokens_seen": 86046935, + "step": 3996, + "time_per_iteration": 3.291255474090576 + }, + { + "auxiliary_loss_clip": 0.01140907, + "auxiliary_loss_mlp": 0.01144196, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00114524, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.475242435228682, + "language_loss": 0.69667494, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.71952599, + "num_input_tokens_seen": 86064355, + "step": 3997, + "time_per_iteration": 2.6428346633911133 + }, + { + "auxiliary_loss_clip": 0.01158995, + "auxiliary_loss_mlp": 0.01143423, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.00094378, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.7666611167415744, + "language_loss": 0.87145621, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89448035, + "num_input_tokens_seen": 86081340, + "step": 3998, + "time_per_iteration": 2.5972044467926025 + }, + { + "auxiliary_loss_clip": 0.01159306, + "auxiliary_loss_mlp": 0.01144028, + "balance_loss_clip": 1.00209796, + "balance_loss_mlp": 1.00078607, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.6160956116486869, + "language_loss": 0.75804073, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.78107405, + "num_input_tokens_seen": 86102260, + "step": 3999, + "time_per_iteration": 2.5990536212921143 + }, + { + "auxiliary_loss_clip": 0.01148015, + "auxiliary_loss_mlp": 0.01143656, + "balance_loss_clip": 1.00235188, + "balance_loss_mlp": 1.0009861, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.720260303598658, + "language_loss": 0.71688908, + "learning_rate": 3.552938912398679e-06, + "loss": 0.73980582, + "num_input_tokens_seen": 86123400, + "step": 4000, + "time_per_iteration": 2.6712803840637207 + }, + { + "auxiliary_loss_clip": 0.01158003, + "auxiliary_loss_mlp": 0.01143532, + "balance_loss_clip": 1.00219738, + "balance_loss_mlp": 1.00095725, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 2.033687013809259, + "language_loss": 0.66344678, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68646204, + "num_input_tokens_seen": 86144060, + "step": 4001, + "time_per_iteration": 2.644623041152954 + }, + { + "auxiliary_loss_clip": 0.01174488, + "auxiliary_loss_mlp": 0.01144057, + "balance_loss_clip": 1.00222969, + "balance_loss_mlp": 1.00100589, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 1.6692561894056346, + "language_loss": 0.82839894, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.85158437, + "num_input_tokens_seen": 86163005, + "step": 4002, + "time_per_iteration": 2.564326524734497 + }, + { + "auxiliary_loss_clip": 0.0112606, + "auxiliary_loss_mlp": 0.01143406, + "balance_loss_clip": 1.00201559, + "balance_loss_mlp": 1.00092673, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.7966617912159724, + "language_loss": 0.82621616, + "learning_rate": 3.552202383898897e-06, + "loss": 0.84891081, + "num_input_tokens_seen": 86182580, + "step": 4003, + "time_per_iteration": 2.740300178527832 + }, + { + "auxiliary_loss_clip": 0.01142689, + "auxiliary_loss_mlp": 0.01144794, + "balance_loss_clip": 1.00210202, + "balance_loss_mlp": 1.00097978, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 3.5525327228396475, + "language_loss": 0.86914349, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89201832, + "num_input_tokens_seen": 86200665, + "step": 4004, + "time_per_iteration": 2.587306261062622 + }, + { + "auxiliary_loss_clip": 0.01142645, + "auxiliary_loss_mlp": 0.01144176, + "balance_loss_clip": 1.00207829, + "balance_loss_mlp": 1.00112438, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 2.992746869734619, + "language_loss": 0.77922493, + "learning_rate": 3.551711070585177e-06, + "loss": 0.80209309, + "num_input_tokens_seen": 86221640, + "step": 4005, + "time_per_iteration": 2.6967198848724365 + }, + { + "auxiliary_loss_clip": 0.01110249, + "auxiliary_loss_mlp": 0.01143596, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00092578, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.5466363154358824, + "language_loss": 0.78953147, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81206989, + "num_input_tokens_seen": 86240795, + "step": 4006, + "time_per_iteration": 2.6927056312561035 + }, + { + "auxiliary_loss_clip": 0.01143405, + "auxiliary_loss_mlp": 0.00748435, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00031424, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 1.7387867963855612, + "language_loss": 0.71675384, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73567224, + "num_input_tokens_seen": 86262000, + "step": 4007, + "time_per_iteration": 2.6820666790008545 + }, + { + "auxiliary_loss_clip": 0.01126803, + "auxiliary_loss_mlp": 0.01143723, + "balance_loss_clip": 1.00211656, + "balance_loss_mlp": 1.00124383, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.586627285090603, + "language_loss": 0.75856233, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78126752, + "num_input_tokens_seen": 86279680, + "step": 4008, + "time_per_iteration": 2.64038348197937 + }, + { + "auxiliary_loss_clip": 0.01157985, + "auxiliary_loss_mlp": 0.01143803, + "balance_loss_clip": 1.00213814, + "balance_loss_mlp": 1.0007515, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.4804995232628344, + "language_loss": 0.74629152, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.7693094, + "num_input_tokens_seen": 86297180, + "step": 4009, + "time_per_iteration": 2.6146702766418457 + }, + { + "auxiliary_loss_clip": 0.01157964, + "auxiliary_loss_mlp": 0.01143188, + "balance_loss_clip": 1.00224853, + "balance_loss_mlp": 1.00089955, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.6832046862017493, + "language_loss": 0.80105191, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82406342, + "num_input_tokens_seen": 86317660, + "step": 4010, + "time_per_iteration": 2.6195459365844727 + }, + { + "auxiliary_loss_clip": 0.0114165, + "auxiliary_loss_mlp": 0.01144366, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00112367, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 1.8223889695115336, + "language_loss": 0.70549214, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72835231, + "num_input_tokens_seen": 86338325, + "step": 4011, + "time_per_iteration": 2.7153501510620117 + }, + { + "auxiliary_loss_clip": 0.01077651, + "auxiliary_loss_mlp": 0.01143569, + "balance_loss_clip": 1.00175834, + "balance_loss_mlp": 1.00089955, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.7271973312427793, + "language_loss": 0.6920321, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71424437, + "num_input_tokens_seen": 86357615, + "step": 4012, + "time_per_iteration": 2.7959957122802734 + }, + { + "auxiliary_loss_clip": 0.01158209, + "auxiliary_loss_mlp": 0.0114407, + "balance_loss_clip": 1.00222492, + "balance_loss_mlp": 1.00092387, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 1.7413371447133046, + "language_loss": 0.73266673, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75568956, + "num_input_tokens_seen": 86380355, + "step": 4013, + "time_per_iteration": 2.774212598800659 + }, + { + "auxiliary_loss_clip": 0.01174613, + "auxiliary_loss_mlp": 0.01143768, + "balance_loss_clip": 1.00235784, + "balance_loss_mlp": 1.00090766, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.7704456137677227, + "language_loss": 0.8793655, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90254933, + "num_input_tokens_seen": 86399125, + "step": 4014, + "time_per_iteration": 2.606293201446533 + }, + { + "auxiliary_loss_clip": 0.01144081, + "auxiliary_loss_mlp": 0.01143849, + "balance_loss_clip": 1.0021348, + "balance_loss_mlp": 1.000893, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.9062863174664204, + "language_loss": 0.94753504, + "learning_rate": 3.549250975045952e-06, + "loss": 0.97041428, + "num_input_tokens_seen": 86418625, + "step": 4015, + "time_per_iteration": 2.6955089569091797 + }, + { + "auxiliary_loss_clip": 0.01142649, + "auxiliary_loss_mlp": 0.0114415, + "balance_loss_clip": 1.00212336, + "balance_loss_mlp": 1.00090861, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.7931897230855522, + "language_loss": 0.82640004, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84926808, + "num_input_tokens_seen": 86438375, + "step": 4016, + "time_per_iteration": 2.6812334060668945 + }, + { + "auxiliary_loss_clip": 0.01125833, + "auxiliary_loss_mlp": 0.01143642, + "balance_loss_clip": 1.00192225, + "balance_loss_mlp": 1.00116324, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 1.6724678337698364, + "language_loss": 0.68708956, + "learning_rate": 3.54875825066639e-06, + "loss": 0.70978427, + "num_input_tokens_seen": 86463230, + "step": 4017, + "time_per_iteration": 2.8421390056610107 + }, + { + "auxiliary_loss_clip": 0.01159237, + "auxiliary_loss_mlp": 0.01144861, + "balance_loss_clip": 1.00221002, + "balance_loss_mlp": 1.00123799, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 1.732999342292716, + "language_loss": 0.84783626, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87087721, + "num_input_tokens_seen": 86481230, + "step": 4018, + "time_per_iteration": 2.5946707725524902 + }, + { + "auxiliary_loss_clip": 0.01154644, + "auxiliary_loss_mlp": 0.01130532, + "balance_loss_clip": 1.00191414, + "balance_loss_mlp": 1.0000689, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8276728139815583, + "language_loss": 0.60731304, + "learning_rate": 3.548265291370558e-06, + "loss": 0.6301648, + "num_input_tokens_seen": 86541260, + "step": 4019, + "time_per_iteration": 3.2030348777770996 + }, + { + "auxiliary_loss_clip": 0.01142248, + "auxiliary_loss_mlp": 0.01143246, + "balance_loss_clip": 1.00205374, + "balance_loss_mlp": 1.00105298, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.870427776016757, + "language_loss": 0.73483932, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75769424, + "num_input_tokens_seen": 86559580, + "step": 4020, + "time_per_iteration": 2.6486313343048096 + }, + { + "auxiliary_loss_clip": 0.01124336, + "auxiliary_loss_mlp": 0.01144229, + "balance_loss_clip": 1.00202537, + "balance_loss_mlp": 1.0008918, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 2.5072494021777865, + "language_loss": 0.81710947, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.83979511, + "num_input_tokens_seen": 86577560, + "step": 4021, + "time_per_iteration": 4.038138389587402 + }, + { + "auxiliary_loss_clip": 0.01174648, + "auxiliary_loss_mlp": 0.01144345, + "balance_loss_clip": 1.00230849, + "balance_loss_mlp": 1.00081706, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 2.474001644112372, + "language_loss": 0.76468861, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78787851, + "num_input_tokens_seen": 86595350, + "step": 4022, + "time_per_iteration": 2.556715726852417 + }, + { + "auxiliary_loss_clip": 0.01127159, + "auxiliary_loss_mlp": 0.01144546, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00092208, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.705939260146579, + "language_loss": 0.75183547, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77455246, + "num_input_tokens_seen": 86614805, + "step": 4023, + "time_per_iteration": 2.696701765060425 + }, + { + "auxiliary_loss_clip": 0.01142263, + "auxiliary_loss_mlp": 0.0114428, + "balance_loss_clip": 1.00213361, + "balance_loss_mlp": 1.00103819, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.7958872750615265, + "language_loss": 0.82606101, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.84892642, + "num_input_tokens_seen": 86633700, + "step": 4024, + "time_per_iteration": 4.015101432800293 + }, + { + "auxiliary_loss_clip": 0.01157732, + "auxiliary_loss_mlp": 0.01143793, + "balance_loss_clip": 1.00219417, + "balance_loss_mlp": 1.00093269, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 2.08180198046922, + "language_loss": 0.85921609, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88223135, + "num_input_tokens_seen": 86650905, + "step": 4025, + "time_per_iteration": 5.377407550811768 + }, + { + "auxiliary_loss_clip": 0.0112775, + "auxiliary_loss_mlp": 0.01144646, + "balance_loss_clip": 1.0022583, + "balance_loss_mlp": 1.00102246, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 2.5443824764024705, + "language_loss": 0.71662492, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73934883, + "num_input_tokens_seen": 86669185, + "step": 4026, + "time_per_iteration": 2.6251142024993896 + }, + { + "auxiliary_loss_clip": 0.01159164, + "auxiliary_loss_mlp": 0.01143519, + "balance_loss_clip": 1.00223529, + "balance_loss_mlp": 1.00094473, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 2.150829620438158, + "language_loss": 0.64105129, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66407824, + "num_input_tokens_seen": 86686805, + "step": 4027, + "time_per_iteration": 2.543410539627075 + }, + { + "auxiliary_loss_clip": 0.01157744, + "auxiliary_loss_mlp": 0.00748385, + "balance_loss_clip": 1.00215936, + "balance_loss_mlp": 1.00024629, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.205159069589345, + "language_loss": 0.70572799, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72478926, + "num_input_tokens_seen": 86705520, + "step": 4028, + "time_per_iteration": 2.56660532951355 + }, + { + "auxiliary_loss_clip": 0.01155715, + "auxiliary_loss_mlp": 0.01130518, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00005519, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8491963878165409, + "language_loss": 0.55318385, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57604617, + "num_input_tokens_seen": 86767320, + "step": 4029, + "time_per_iteration": 3.1533894538879395 + }, + { + "auxiliary_loss_clip": 0.01159173, + "auxiliary_loss_mlp": 0.01143976, + "balance_loss_clip": 1.00219059, + "balance_loss_mlp": 1.00092494, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 2.2102588797994036, + "language_loss": 0.73664653, + "learning_rate": 3.54554981945833e-06, + "loss": 0.75967801, + "num_input_tokens_seen": 86788110, + "step": 4030, + "time_per_iteration": 2.6467108726501465 + }, + { + "auxiliary_loss_clip": 0.01174578, + "auxiliary_loss_mlp": 0.01144163, + "balance_loss_clip": 1.002303, + "balance_loss_mlp": 1.00120735, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 1.7997327778378764, + "language_loss": 0.76695979, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.79014719, + "num_input_tokens_seen": 86807640, + "step": 4031, + "time_per_iteration": 2.5556132793426514 + }, + { + "auxiliary_loss_clip": 0.0114281, + "auxiliary_loss_mlp": 0.00748475, + "balance_loss_clip": 1.00204301, + "balance_loss_mlp": 1.00032425, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 2.0419765445300344, + "language_loss": 0.65331066, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.67222351, + "num_input_tokens_seen": 86826795, + "step": 4032, + "time_per_iteration": 2.634585380554199 + }, + { + "auxiliary_loss_clip": 0.01157662, + "auxiliary_loss_mlp": 0.01143902, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00094593, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 2.1299171543482216, + "language_loss": 0.81668484, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83970046, + "num_input_tokens_seen": 86843175, + "step": 4033, + "time_per_iteration": 2.562676191329956 + }, + { + "auxiliary_loss_clip": 0.01132831, + "auxiliary_loss_mlp": 0.01143486, + "balance_loss_clip": 1.00232506, + "balance_loss_mlp": 1.00081658, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 1.9365831733197736, + "language_loss": 0.69206834, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71483147, + "num_input_tokens_seen": 86863185, + "step": 4034, + "time_per_iteration": 2.7223446369171143 + }, + { + "auxiliary_loss_clip": 0.01142153, + "auxiliary_loss_mlp": 0.01143778, + "balance_loss_clip": 1.00213897, + "balance_loss_mlp": 1.0007267, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 1.988606336491334, + "language_loss": 0.96098948, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98384881, + "num_input_tokens_seen": 86880040, + "step": 4035, + "time_per_iteration": 2.6303610801696777 + }, + { + "auxiliary_loss_clip": 0.01140948, + "auxiliary_loss_mlp": 0.0114342, + "balance_loss_clip": 1.00200164, + "balance_loss_mlp": 1.00122726, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 1.5158676213717899, + "language_loss": 0.78144604, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.8042897, + "num_input_tokens_seen": 86900610, + "step": 4036, + "time_per_iteration": 2.633450508117676 + }, + { + "auxiliary_loss_clip": 0.01158135, + "auxiliary_loss_mlp": 0.01143972, + "balance_loss_clip": 1.00216937, + "balance_loss_mlp": 1.00101638, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.6506640499381096, + "language_loss": 0.74725151, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.77027261, + "num_input_tokens_seen": 86919385, + "step": 4037, + "time_per_iteration": 2.618553638458252 + }, + { + "auxiliary_loss_clip": 0.01133027, + "auxiliary_loss_mlp": 0.01143489, + "balance_loss_clip": 1.00229025, + "balance_loss_mlp": 1.00091505, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.0522247241089002, + "language_loss": 0.76550925, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78827441, + "num_input_tokens_seen": 86938885, + "step": 4038, + "time_per_iteration": 2.673452138900757 + }, + { + "auxiliary_loss_clip": 0.01158307, + "auxiliary_loss_mlp": 0.01143815, + "balance_loss_clip": 1.00223303, + "balance_loss_mlp": 1.00104976, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 1.8860096658999541, + "language_loss": 0.72069311, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74371433, + "num_input_tokens_seen": 86957705, + "step": 4039, + "time_per_iteration": 2.6482648849487305 + }, + { + "auxiliary_loss_clip": 0.01159007, + "auxiliary_loss_mlp": 0.01143751, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00089049, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.6984740888341219, + "language_loss": 0.78052372, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80355132, + "num_input_tokens_seen": 86975845, + "step": 4040, + "time_per_iteration": 2.5602316856384277 + }, + { + "auxiliary_loss_clip": 0.01107755, + "auxiliary_loss_mlp": 0.01143128, + "balance_loss_clip": 1.00184357, + "balance_loss_mlp": 1.00074458, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.9903358474297626, + "language_loss": 0.80515206, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82766092, + "num_input_tokens_seen": 86994800, + "step": 4041, + "time_per_iteration": 2.709803819656372 + }, + { + "auxiliary_loss_clip": 0.01126724, + "auxiliary_loss_mlp": 0.01143591, + "balance_loss_clip": 1.00202239, + "balance_loss_mlp": 1.00111246, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 2.6026774115835467, + "language_loss": 0.76890993, + "learning_rate": 3.542579399075957e-06, + "loss": 0.7916131, + "num_input_tokens_seen": 87016845, + "step": 4042, + "time_per_iteration": 2.707141160964966 + }, + { + "auxiliary_loss_clip": 0.01066486, + "auxiliary_loss_mlp": 0.01143515, + "balance_loss_clip": 1.00180233, + "balance_loss_mlp": 1.00084543, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.7913048505658864, + "language_loss": 0.81470275, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83680272, + "num_input_tokens_seen": 87036270, + "step": 4043, + "time_per_iteration": 3.017160177230835 + }, + { + "auxiliary_loss_clip": 0.01142473, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00081623, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.631764991707825, + "language_loss": 0.72823602, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75109935, + "num_input_tokens_seen": 87049920, + "step": 4044, + "time_per_iteration": 2.9325385093688965 + }, + { + "auxiliary_loss_clip": 0.01158069, + "auxiliary_loss_mlp": 0.01143852, + "balance_loss_clip": 1.0023222, + "balance_loss_mlp": 1.00089669, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.9283551215832824, + "language_loss": 0.833794, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85681325, + "num_input_tokens_seen": 87068230, + "step": 4045, + "time_per_iteration": 2.643003225326538 + }, + { + "auxiliary_loss_clip": 0.01075364, + "auxiliary_loss_mlp": 0.01143751, + "balance_loss_clip": 1.00165105, + "balance_loss_mlp": 1.00108123, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 1.6172866923128835, + "language_loss": 0.8666541, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88884521, + "num_input_tokens_seen": 87086435, + "step": 4046, + "time_per_iteration": 2.785907030105591 + }, + { + "auxiliary_loss_clip": 0.01141766, + "auxiliary_loss_mlp": 0.01143336, + "balance_loss_clip": 1.00192988, + "balance_loss_mlp": 1.00095236, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 2.126158452631852, + "language_loss": 0.72792101, + "learning_rate": 3.5413392369578e-06, + "loss": 0.750772, + "num_input_tokens_seen": 87105340, + "step": 4047, + "time_per_iteration": 2.659789562225342 + }, + { + "auxiliary_loss_clip": 0.01159241, + "auxiliary_loss_mlp": 0.01143687, + "balance_loss_clip": 1.00216234, + "balance_loss_mlp": 1.00082624, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 3.4333108679070325, + "language_loss": 0.7275824, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75061166, + "num_input_tokens_seen": 87125780, + "step": 4048, + "time_per_iteration": 2.704315423965454 + }, + { + "auxiliary_loss_clip": 0.011244, + "auxiliary_loss_mlp": 0.01143606, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.00103199, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 1.861911187228932, + "language_loss": 0.73245966, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75513977, + "num_input_tokens_seen": 87144470, + "step": 4049, + "time_per_iteration": 2.6419410705566406 + }, + { + "auxiliary_loss_clip": 0.01125618, + "auxiliary_loss_mlp": 0.01143437, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.00105309, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 1.6343909929855738, + "language_loss": 0.73729169, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.75998229, + "num_input_tokens_seen": 87162830, + "step": 4050, + "time_per_iteration": 2.646338939666748 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.01143181, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.0009886, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 2.1630558524674526, + "language_loss": 0.75125247, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77409446, + "num_input_tokens_seen": 87180905, + "step": 4051, + "time_per_iteration": 2.582620859146118 + }, + { + "auxiliary_loss_clip": 0.01109092, + "auxiliary_loss_mlp": 0.01144105, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00086355, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.640842398333112, + "language_loss": 0.7056644, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72819638, + "num_input_tokens_seen": 87202290, + "step": 4052, + "time_per_iteration": 2.7258639335632324 + }, + { + "auxiliary_loss_clip": 0.01141166, + "auxiliary_loss_mlp": 0.01143789, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00102437, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.6474109326753354, + "language_loss": 0.81365305, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83650261, + "num_input_tokens_seen": 87221650, + "step": 4053, + "time_per_iteration": 2.6671621799468994 + }, + { + "auxiliary_loss_clip": 0.011744, + "auxiliary_loss_mlp": 0.01142746, + "balance_loss_clip": 1.00210333, + "balance_loss_mlp": 1.00064886, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.6789825846414972, + "language_loss": 0.77790487, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80107629, + "num_input_tokens_seen": 87238515, + "step": 4054, + "time_per_iteration": 2.9451165199279785 + }, + { + "auxiliary_loss_clip": 0.01127352, + "auxiliary_loss_mlp": 0.01143421, + "balance_loss_clip": 1.0021683, + "balance_loss_mlp": 1.00132394, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.67722091772392, + "language_loss": 0.84168231, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.86439002, + "num_input_tokens_seen": 87256290, + "step": 4055, + "time_per_iteration": 2.680783987045288 + }, + { + "auxiliary_loss_clip": 0.0112611, + "auxiliary_loss_mlp": 0.01143582, + "balance_loss_clip": 1.00193369, + "balance_loss_mlp": 1.0008173, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 2.3399931497164403, + "language_loss": 0.55179495, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57449186, + "num_input_tokens_seen": 87277085, + "step": 4056, + "time_per_iteration": 2.7305452823638916 + }, + { + "auxiliary_loss_clip": 0.01158852, + "auxiliary_loss_mlp": 0.01143869, + "balance_loss_clip": 1.00226903, + "balance_loss_mlp": 1.00110364, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.4629072195331387, + "language_loss": 0.8032189, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82624608, + "num_input_tokens_seen": 87293020, + "step": 4057, + "time_per_iteration": 2.6327972412109375 + }, + { + "auxiliary_loss_clip": 0.0115802, + "auxiliary_loss_mlp": 0.01143169, + "balance_loss_clip": 1.00209105, + "balance_loss_mlp": 1.00088048, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.682879004917693, + "language_loss": 0.79098713, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81399894, + "num_input_tokens_seen": 87311445, + "step": 4058, + "time_per_iteration": 2.55416202545166 + }, + { + "auxiliary_loss_clip": 0.01174469, + "auxiliary_loss_mlp": 0.01143521, + "balance_loss_clip": 1.0021733, + "balance_loss_mlp": 1.00094712, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.6365924137306782, + "language_loss": 0.85799634, + "learning_rate": 3.538356888446756e-06, + "loss": 0.88117617, + "num_input_tokens_seen": 87332055, + "step": 4059, + "time_per_iteration": 4.39821457862854 + }, + { + "auxiliary_loss_clip": 0.01158147, + "auxiliary_loss_mlp": 0.01142933, + "balance_loss_clip": 1.00219595, + "balance_loss_mlp": 1.00083578, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.9440018525144358, + "language_loss": 0.74024701, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.7632578, + "num_input_tokens_seen": 87351295, + "step": 4060, + "time_per_iteration": 2.6403000354766846 + }, + { + "auxiliary_loss_clip": 0.01124773, + "auxiliary_loss_mlp": 0.01144082, + "balance_loss_clip": 1.00195658, + "balance_loss_mlp": 1.00112653, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.8721117153467197, + "language_loss": 0.73639333, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75908184, + "num_input_tokens_seen": 87370650, + "step": 4061, + "time_per_iteration": 2.6902871131896973 + }, + { + "auxiliary_loss_clip": 0.01174199, + "auxiliary_loss_mlp": 0.01143357, + "balance_loss_clip": 1.00211537, + "balance_loss_mlp": 1.00106835, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.6238774813614838, + "language_loss": 0.75655735, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.77973294, + "num_input_tokens_seen": 87389020, + "step": 4062, + "time_per_iteration": 5.311549425125122 + }, + { + "auxiliary_loss_clip": 0.01126795, + "auxiliary_loss_mlp": 0.01143426, + "balance_loss_clip": 1.00200164, + "balance_loss_mlp": 1.00094652, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 1.6109547729377849, + "language_loss": 0.84989828, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87260044, + "num_input_tokens_seen": 87409695, + "step": 4063, + "time_per_iteration": 4.06933856010437 + }, + { + "auxiliary_loss_clip": 0.01142171, + "auxiliary_loss_mlp": 0.01143802, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00084639, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 2.4026760295423566, + "language_loss": 0.68289202, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70575178, + "num_input_tokens_seen": 87428250, + "step": 4064, + "time_per_iteration": 2.643519639968872 + }, + { + "auxiliary_loss_clip": 0.01158737, + "auxiliary_loss_mlp": 0.01143413, + "balance_loss_clip": 1.00216901, + "balance_loss_mlp": 1.00093412, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.8464377850698914, + "language_loss": 0.70109719, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72411871, + "num_input_tokens_seen": 87449380, + "step": 4065, + "time_per_iteration": 2.5926077365875244 + }, + { + "auxiliary_loss_clip": 0.01174414, + "auxiliary_loss_mlp": 0.01143688, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.0009234, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 2.3326735694521754, + "language_loss": 0.83974242, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.8629235, + "num_input_tokens_seen": 87465365, + "step": 4066, + "time_per_iteration": 2.536473274230957 + }, + { + "auxiliary_loss_clip": 0.01171155, + "auxiliary_loss_mlp": 0.01129868, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00016844, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7261330709869223, + "language_loss": 0.5228883, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54589856, + "num_input_tokens_seen": 87522525, + "step": 4067, + "time_per_iteration": 3.002112627029419 + }, + { + "auxiliary_loss_clip": 0.01141398, + "auxiliary_loss_mlp": 0.01143577, + "balance_loss_clip": 1.00212824, + "balance_loss_mlp": 1.00090778, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 3.56268160304369, + "language_loss": 0.72418147, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74703121, + "num_input_tokens_seen": 87539170, + "step": 4068, + "time_per_iteration": 2.6051502227783203 + }, + { + "auxiliary_loss_clip": 0.01110215, + "auxiliary_loss_mlp": 0.01142978, + "balance_loss_clip": 1.00186551, + "balance_loss_mlp": 1.00097561, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.68295217324046, + "language_loss": 0.77526295, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79779488, + "num_input_tokens_seen": 87558875, + "step": 4069, + "time_per_iteration": 2.7340621948242188 + }, + { + "auxiliary_loss_clip": 0.01140981, + "auxiliary_loss_mlp": 0.01143062, + "balance_loss_clip": 1.00212622, + "balance_loss_mlp": 1.00106001, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 1.7774880612031798, + "language_loss": 0.80056512, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82340556, + "num_input_tokens_seen": 87576485, + "step": 4070, + "time_per_iteration": 2.6007392406463623 + }, + { + "auxiliary_loss_clip": 0.01159174, + "auxiliary_loss_mlp": 0.01143174, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00088573, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 2.271555225463394, + "language_loss": 0.84357625, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86659968, + "num_input_tokens_seen": 87598620, + "step": 4071, + "time_per_iteration": 2.620105266571045 + }, + { + "auxiliary_loss_clip": 0.01142102, + "auxiliary_loss_mlp": 0.01143811, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00114119, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.793412174477122, + "language_loss": 0.79889917, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82175827, + "num_input_tokens_seen": 87616595, + "step": 4072, + "time_per_iteration": 2.6212728023529053 + }, + { + "auxiliary_loss_clip": 0.01157655, + "auxiliary_loss_mlp": 0.0114311, + "balance_loss_clip": 1.00212932, + "balance_loss_mlp": 1.00110817, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.465000784633758, + "language_loss": 0.70080322, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.72381079, + "num_input_tokens_seen": 87635755, + "step": 4073, + "time_per_iteration": 2.6050734519958496 + }, + { + "auxiliary_loss_clip": 0.01142662, + "auxiliary_loss_mlp": 0.01143316, + "balance_loss_clip": 1.00233507, + "balance_loss_mlp": 1.00102818, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.1676684012099985, + "language_loss": 0.6763742, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69923401, + "num_input_tokens_seen": 87652885, + "step": 4074, + "time_per_iteration": 2.6696784496307373 + }, + { + "auxiliary_loss_clip": 0.01171116, + "auxiliary_loss_mlp": 0.01129773, + "balance_loss_clip": 1.00177181, + "balance_loss_mlp": 1.00007343, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.9032590885881522, + "language_loss": 0.68648154, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70949042, + "num_input_tokens_seen": 87713220, + "step": 4075, + "time_per_iteration": 3.2058827877044678 + }, + { + "auxiliary_loss_clip": 0.01174185, + "auxiliary_loss_mlp": 0.01143026, + "balance_loss_clip": 1.00217795, + "balance_loss_mlp": 1.0012145, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 1.7792691260660682, + "language_loss": 0.79622597, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81939805, + "num_input_tokens_seen": 87732680, + "step": 4076, + "time_per_iteration": 2.582044839859009 + }, + { + "auxiliary_loss_clip": 0.01142247, + "auxiliary_loss_mlp": 0.00748415, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00034761, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 1.8320645695485906, + "language_loss": 0.81895214, + "learning_rate": 3.533867620434151e-06, + "loss": 0.8378588, + "num_input_tokens_seen": 87751880, + "step": 4077, + "time_per_iteration": 2.6414554119110107 + }, + { + "auxiliary_loss_clip": 0.01174391, + "auxiliary_loss_mlp": 0.01143324, + "balance_loss_clip": 1.00221455, + "balance_loss_mlp": 1.00113082, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 1.8424891407317716, + "language_loss": 0.62072885, + "learning_rate": 3.533617663584082e-06, + "loss": 0.643906, + "num_input_tokens_seen": 87771795, + "step": 4078, + "time_per_iteration": 2.5862960815429688 + }, + { + "auxiliary_loss_clip": 0.01141319, + "auxiliary_loss_mlp": 0.01143164, + "balance_loss_clip": 1.00205767, + "balance_loss_mlp": 1.00087535, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.4005445876737521, + "language_loss": 0.75324392, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77608871, + "num_input_tokens_seen": 87793640, + "step": 4079, + "time_per_iteration": 2.659564733505249 + }, + { + "auxiliary_loss_clip": 0.01174256, + "auxiliary_loss_mlp": 0.01142393, + "balance_loss_clip": 1.00222516, + "balance_loss_mlp": 1.0008682, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.8112842119741213, + "language_loss": 0.75095171, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77411819, + "num_input_tokens_seen": 87812390, + "step": 4080, + "time_per_iteration": 2.5058774948120117 + }, + { + "auxiliary_loss_clip": 0.01140603, + "auxiliary_loss_mlp": 0.01142379, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00085366, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 1.8420596524688704, + "language_loss": 0.82493931, + "learning_rate": 3.532867444142186e-06, + "loss": 0.84776914, + "num_input_tokens_seen": 87830640, + "step": 4081, + "time_per_iteration": 2.602224349975586 + }, + { + "auxiliary_loss_clip": 0.01140868, + "auxiliary_loss_mlp": 0.01142976, + "balance_loss_clip": 1.00196648, + "balance_loss_mlp": 1.00087833, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 1.9651560732859161, + "language_loss": 0.73233789, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75517631, + "num_input_tokens_seen": 87850450, + "step": 4082, + "time_per_iteration": 2.706726312637329 + }, + { + "auxiliary_loss_clip": 0.01125561, + "auxiliary_loss_mlp": 0.01142532, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00110197, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.6190465357257031, + "language_loss": 0.71968079, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.74236178, + "num_input_tokens_seen": 87868810, + "step": 4083, + "time_per_iteration": 2.65217661857605 + }, + { + "auxiliary_loss_clip": 0.01142485, + "auxiliary_loss_mlp": 0.01143631, + "balance_loss_clip": 1.00221038, + "balance_loss_mlp": 1.00105715, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 1.8761093260802808, + "language_loss": 0.74437892, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76724005, + "num_input_tokens_seen": 87885685, + "step": 4084, + "time_per_iteration": 2.586364984512329 + }, + { + "auxiliary_loss_clip": 0.01158822, + "auxiliary_loss_mlp": 0.01142495, + "balance_loss_clip": 1.00212908, + "balance_loss_mlp": 1.00096989, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 1.8026330742470487, + "language_loss": 0.85355079, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87656397, + "num_input_tokens_seen": 87903715, + "step": 4085, + "time_per_iteration": 2.557995319366455 + }, + { + "auxiliary_loss_clip": 0.01141397, + "auxiliary_loss_mlp": 0.01143336, + "balance_loss_clip": 1.00209534, + "balance_loss_mlp": 1.00104749, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 2.184702664725024, + "language_loss": 0.7900728, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.81292021, + "num_input_tokens_seen": 87923375, + "step": 4086, + "time_per_iteration": 2.661241054534912 + }, + { + "auxiliary_loss_clip": 0.01110098, + "auxiliary_loss_mlp": 0.01142254, + "balance_loss_clip": 1.00197625, + "balance_loss_mlp": 1.00101519, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.8951774377616815, + "language_loss": 0.75344288, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77596641, + "num_input_tokens_seen": 87943115, + "step": 4087, + "time_per_iteration": 2.737201452255249 + }, + { + "auxiliary_loss_clip": 0.01111045, + "auxiliary_loss_mlp": 0.0114346, + "balance_loss_clip": 1.00221729, + "balance_loss_mlp": 1.00098133, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.792462174947315, + "language_loss": 0.79799473, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.82053971, + "num_input_tokens_seen": 87959505, + "step": 4088, + "time_per_iteration": 2.694600820541382 + }, + { + "auxiliary_loss_clip": 0.01125554, + "auxiliary_loss_mlp": 0.01141868, + "balance_loss_clip": 1.00182474, + "balance_loss_mlp": 1.00081944, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.557729169561945, + "language_loss": 0.77147186, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79414606, + "num_input_tokens_seen": 87979725, + "step": 4089, + "time_per_iteration": 2.697495460510254 + }, + { + "auxiliary_loss_clip": 0.01157954, + "auxiliary_loss_mlp": 0.01142825, + "balance_loss_clip": 1.00207639, + "balance_loss_mlp": 1.00091791, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 2.0513064590469563, + "language_loss": 0.81283998, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83584774, + "num_input_tokens_seen": 87998270, + "step": 4090, + "time_per_iteration": 2.7669713497161865 + }, + { + "auxiliary_loss_clip": 0.01143046, + "auxiliary_loss_mlp": 0.0114255, + "balance_loss_clip": 1.00206506, + "balance_loss_mlp": 1.00102496, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.940961071849894, + "language_loss": 0.73432088, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75717682, + "num_input_tokens_seen": 88016760, + "step": 4091, + "time_per_iteration": 2.625980854034424 + }, + { + "auxiliary_loss_clip": 0.01126185, + "auxiliary_loss_mlp": 0.01142333, + "balance_loss_clip": 1.00207675, + "balance_loss_mlp": 1.00090313, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.8349288531570898, + "language_loss": 0.76871055, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.79139578, + "num_input_tokens_seen": 88036465, + "step": 4092, + "time_per_iteration": 2.683774471282959 + }, + { + "auxiliary_loss_clip": 0.01143604, + "auxiliary_loss_mlp": 0.01143128, + "balance_loss_clip": 1.00216842, + "balance_loss_mlp": 1.00064945, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 3.858564383390531, + "language_loss": 0.81405556, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83692288, + "num_input_tokens_seen": 88053270, + "step": 4093, + "time_per_iteration": 2.643195629119873 + }, + { + "auxiliary_loss_clip": 0.01157597, + "auxiliary_loss_mlp": 0.01143335, + "balance_loss_clip": 1.00206733, + "balance_loss_mlp": 1.00095189, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 5.583294256363778, + "language_loss": 0.8708331, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89384246, + "num_input_tokens_seen": 88072305, + "step": 4094, + "time_per_iteration": 2.5897083282470703 + }, + { + "auxiliary_loss_clip": 0.01096576, + "auxiliary_loss_mlp": 0.01128989, + "balance_loss_clip": 1.00292397, + "balance_loss_mlp": 1.00005174, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7686087275896296, + "language_loss": 0.57490188, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59715754, + "num_input_tokens_seen": 88137995, + "step": 4095, + "time_per_iteration": 3.5208017826080322 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.01129781, + "balance_loss_clip": 1.00173426, + "balance_loss_mlp": 1.0000813, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.644388669925753, + "language_loss": 0.56210917, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58479768, + "num_input_tokens_seen": 88208490, + "step": 4096, + "time_per_iteration": 3.7772247791290283 + }, + { + "auxiliary_loss_clip": 0.011462, + "auxiliary_loss_mlp": 0.01142835, + "balance_loss_clip": 1.00209546, + "balance_loss_mlp": 1.0008328, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 1.8772405582450842, + "language_loss": 0.77234912, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79523945, + "num_input_tokens_seen": 88228050, + "step": 4097, + "time_per_iteration": 4.132148742675781 + }, + { + "auxiliary_loss_clip": 0.01126705, + "auxiliary_loss_mlp": 0.01143386, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00090754, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 2.099010579591484, + "language_loss": 0.76443428, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78713518, + "num_input_tokens_seen": 88248090, + "step": 4098, + "time_per_iteration": 2.7426905632019043 + }, + { + "auxiliary_loss_clip": 0.01141721, + "auxiliary_loss_mlp": 0.01142425, + "balance_loss_clip": 1.00209916, + "balance_loss_mlp": 1.00099528, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.0896316918169067, + "language_loss": 0.68003786, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70287931, + "num_input_tokens_seen": 88267545, + "step": 4099, + "time_per_iteration": 2.656738042831421 + }, + { + "auxiliary_loss_clip": 0.01157701, + "auxiliary_loss_mlp": 0.01142416, + "balance_loss_clip": 1.00208807, + "balance_loss_mlp": 1.00079525, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.2485613753343157, + "language_loss": 0.66007531, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.6830765, + "num_input_tokens_seen": 88289785, + "step": 4100, + "time_per_iteration": 5.645984649658203 + }, + { + "auxiliary_loss_clip": 0.0113918, + "auxiliary_loss_mlp": 0.01129065, + "balance_loss_clip": 1.00167942, + "balance_loss_mlp": 1.00012875, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7139582210677131, + "language_loss": 0.6153248, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63800728, + "num_input_tokens_seen": 88357320, + "step": 4101, + "time_per_iteration": 3.2482874393463135 + }, + { + "auxiliary_loss_clip": 0.01174088, + "auxiliary_loss_mlp": 0.01142625, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00090933, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 3.1889717494347365, + "language_loss": 0.73363149, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75679862, + "num_input_tokens_seen": 88377040, + "step": 4102, + "time_per_iteration": 2.5225279331207275 + }, + { + "auxiliary_loss_clip": 0.01146414, + "auxiliary_loss_mlp": 0.011428, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00108361, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 1.8560001845054221, + "language_loss": 0.75669813, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.77959037, + "num_input_tokens_seen": 88395085, + "step": 4103, + "time_per_iteration": 2.6526222229003906 + }, + { + "auxiliary_loss_clip": 0.01157534, + "auxiliary_loss_mlp": 0.01142759, + "balance_loss_clip": 1.00205731, + "balance_loss_mlp": 1.00075746, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 3.7452246950929515, + "language_loss": 0.78107584, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.8040787, + "num_input_tokens_seen": 88413205, + "step": 4104, + "time_per_iteration": 2.5489683151245117 + }, + { + "auxiliary_loss_clip": 0.0115736, + "auxiliary_loss_mlp": 0.01142159, + "balance_loss_clip": 1.00208402, + "balance_loss_mlp": 1.00101542, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.6862508505031797, + "language_loss": 0.83574247, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85873771, + "num_input_tokens_seen": 88431525, + "step": 4105, + "time_per_iteration": 2.5748698711395264 + }, + { + "auxiliary_loss_clip": 0.01173974, + "auxiliary_loss_mlp": 0.01142455, + "balance_loss_clip": 1.00209093, + "balance_loss_mlp": 1.0010252, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.7594409370806725, + "language_loss": 0.7621361, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78530031, + "num_input_tokens_seen": 88451210, + "step": 4106, + "time_per_iteration": 2.623443841934204 + }, + { + "auxiliary_loss_clip": 0.0114339, + "auxiliary_loss_mlp": 0.01142941, + "balance_loss_clip": 1.00206554, + "balance_loss_mlp": 1.00103426, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.561440112801411, + "language_loss": 0.72884613, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.75170946, + "num_input_tokens_seen": 88467790, + "step": 4107, + "time_per_iteration": 2.5986552238464355 + }, + { + "auxiliary_loss_clip": 0.01174248, + "auxiliary_loss_mlp": 0.01143172, + "balance_loss_clip": 1.00227356, + "balance_loss_mlp": 1.0008837, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 2.137970676197148, + "language_loss": 0.65689957, + "learning_rate": 3.526091958721587e-06, + "loss": 0.6800738, + "num_input_tokens_seen": 88490330, + "step": 4108, + "time_per_iteration": 2.6225619316101074 + }, + { + "auxiliary_loss_clip": 0.01109927, + "auxiliary_loss_mlp": 0.01142698, + "balance_loss_clip": 1.00182557, + "balance_loss_mlp": 1.00098169, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 1.6669530898333513, + "language_loss": 0.72905469, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.75158095, + "num_input_tokens_seen": 88512435, + "step": 4109, + "time_per_iteration": 2.8504579067230225 + }, + { + "auxiliary_loss_clip": 0.01125068, + "auxiliary_loss_mlp": 0.011422, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00105572, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.648812105298561, + "language_loss": 0.79028898, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.8129617, + "num_input_tokens_seen": 88529780, + "step": 4110, + "time_per_iteration": 2.6993510723114014 + }, + { + "auxiliary_loss_clip": 0.01142334, + "auxiliary_loss_mlp": 0.01143232, + "balance_loss_clip": 1.00211, + "balance_loss_mlp": 1.00094342, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.1945170744965554, + "language_loss": 0.81051606, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.8333717, + "num_input_tokens_seen": 88547200, + "step": 4111, + "time_per_iteration": 2.639669418334961 + }, + { + "auxiliary_loss_clip": 0.01174159, + "auxiliary_loss_mlp": 0.01142459, + "balance_loss_clip": 1.0021131, + "balance_loss_mlp": 1.00083828, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 1.7101614788983777, + "language_loss": 0.74895871, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77212489, + "num_input_tokens_seen": 88566415, + "step": 4112, + "time_per_iteration": 2.5558371543884277 + }, + { + "auxiliary_loss_clip": 0.01125628, + "auxiliary_loss_mlp": 0.00748389, + "balance_loss_clip": 1.0019989, + "balance_loss_mlp": 1.00025332, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 3.223125903431796, + "language_loss": 0.82811975, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84685993, + "num_input_tokens_seen": 88585225, + "step": 4113, + "time_per_iteration": 2.697134494781494 + }, + { + "auxiliary_loss_clip": 0.01173964, + "auxiliary_loss_mlp": 0.01142039, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00079989, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.0496998340486914, + "language_loss": 0.87097549, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89413553, + "num_input_tokens_seen": 88603280, + "step": 4114, + "time_per_iteration": 2.530582904815674 + }, + { + "auxiliary_loss_clip": 0.01108063, + "auxiliary_loss_mlp": 0.01142613, + "balance_loss_clip": 1.00178647, + "balance_loss_mlp": 1.00099254, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 1.9572944302921818, + "language_loss": 0.75415909, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77666581, + "num_input_tokens_seen": 88624925, + "step": 4115, + "time_per_iteration": 2.7516043186187744 + }, + { + "auxiliary_loss_clip": 0.01090851, + "auxiliary_loss_mlp": 0.01129817, + "balance_loss_clip": 1.00120401, + "balance_loss_mlp": 1.00011778, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6541544774190511, + "language_loss": 0.58251894, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.6047256, + "num_input_tokens_seen": 88691475, + "step": 4116, + "time_per_iteration": 3.4717438220977783 + }, + { + "auxiliary_loss_clip": 0.01141422, + "auxiliary_loss_mlp": 0.01142383, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.0009532, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.4294148737323082, + "language_loss": 0.83433163, + "learning_rate": 3.523824079451235e-06, + "loss": 0.85716963, + "num_input_tokens_seen": 88713425, + "step": 4117, + "time_per_iteration": 2.981064558029175 + }, + { + "auxiliary_loss_clip": 0.01138272, + "auxiliary_loss_mlp": 0.00748038, + "balance_loss_clip": 1.00162435, + "balance_loss_mlp": 1.00027323, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.8967557768345722, + "language_loss": 0.63482738, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65369046, + "num_input_tokens_seen": 88769995, + "step": 4118, + "time_per_iteration": 3.0394952297210693 + }, + { + "auxiliary_loss_clip": 0.0115866, + "auxiliary_loss_mlp": 0.01142875, + "balance_loss_clip": 1.002074, + "balance_loss_mlp": 1.00077784, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.678976920781098, + "language_loss": 0.79136366, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81437898, + "num_input_tokens_seen": 88789970, + "step": 4119, + "time_per_iteration": 2.5820112228393555 + }, + { + "auxiliary_loss_clip": 0.01157433, + "auxiliary_loss_mlp": 0.0114195, + "balance_loss_clip": 1.00204456, + "balance_loss_mlp": 1.00090146, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 2.229564294892849, + "language_loss": 0.74294996, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76594377, + "num_input_tokens_seen": 88810000, + "step": 4120, + "time_per_iteration": 2.575780153274536 + }, + { + "auxiliary_loss_clip": 0.01163372, + "auxiliary_loss_mlp": 0.01142231, + "balance_loss_clip": 1.00279331, + "balance_loss_mlp": 1.0008961, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 1.9070419068926523, + "language_loss": 0.88217294, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90522897, + "num_input_tokens_seen": 88827515, + "step": 4121, + "time_per_iteration": 2.557415723800659 + }, + { + "auxiliary_loss_clip": 0.01174102, + "auxiliary_loss_mlp": 0.01142761, + "balance_loss_clip": 1.00212538, + "balance_loss_mlp": 1.00075901, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 1.9339472368596207, + "language_loss": 0.69369769, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71686631, + "num_input_tokens_seen": 88845025, + "step": 4122, + "time_per_iteration": 2.5346171855926514 + }, + { + "auxiliary_loss_clip": 0.01174118, + "auxiliary_loss_mlp": 0.01142226, + "balance_loss_clip": 1.00210452, + "balance_loss_mlp": 1.0007956, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.0298028430897275, + "language_loss": 0.80329728, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82646072, + "num_input_tokens_seen": 88861740, + "step": 4123, + "time_per_iteration": 2.531163454055786 + }, + { + "auxiliary_loss_clip": 0.01109405, + "auxiliary_loss_mlp": 0.01142434, + "balance_loss_clip": 1.00181699, + "balance_loss_mlp": 1.00109959, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 1.7395231078298847, + "language_loss": 0.74817944, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77069783, + "num_input_tokens_seen": 88879740, + "step": 4124, + "time_per_iteration": 2.6753275394439697 + }, + { + "auxiliary_loss_clip": 0.0115867, + "auxiliary_loss_mlp": 0.01142107, + "balance_loss_clip": 1.0021143, + "balance_loss_mlp": 1.00077295, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.591371730514682, + "language_loss": 0.7382195, + "learning_rate": 3.521804257268357e-06, + "loss": 0.76122725, + "num_input_tokens_seen": 88904095, + "step": 4125, + "time_per_iteration": 2.7463502883911133 + }, + { + "auxiliary_loss_clip": 0.01125296, + "auxiliary_loss_mlp": 0.00748426, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00028706, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 2.17569836969674, + "language_loss": 0.69531, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71404725, + "num_input_tokens_seen": 88920740, + "step": 4126, + "time_per_iteration": 2.6356029510498047 + }, + { + "auxiliary_loss_clip": 0.01158583, + "auxiliary_loss_mlp": 0.0114219, + "balance_loss_clip": 1.00207353, + "balance_loss_mlp": 1.00085592, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.6716211592315298, + "language_loss": 0.8123163, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83532399, + "num_input_tokens_seen": 88938510, + "step": 4127, + "time_per_iteration": 2.575200080871582 + }, + { + "auxiliary_loss_clip": 0.01163405, + "auxiliary_loss_mlp": 0.00748417, + "balance_loss_clip": 1.00256205, + "balance_loss_mlp": 1.00025988, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 3.6567900603072805, + "language_loss": 0.83911788, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.85823607, + "num_input_tokens_seen": 88955235, + "step": 4128, + "time_per_iteration": 2.545790910720825 + }, + { + "auxiliary_loss_clip": 0.01141878, + "auxiliary_loss_mlp": 0.01142802, + "balance_loss_clip": 1.00195241, + "balance_loss_mlp": 1.00099051, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 2.1986075959096785, + "language_loss": 0.65748996, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.68033671, + "num_input_tokens_seen": 88975210, + "step": 4129, + "time_per_iteration": 2.675569534301758 + }, + { + "auxiliary_loss_clip": 0.01114758, + "auxiliary_loss_mlp": 0.01142377, + "balance_loss_clip": 1.00229096, + "balance_loss_mlp": 1.00085139, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 1.671089824528661, + "language_loss": 0.7500571, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77262843, + "num_input_tokens_seen": 88996120, + "step": 4130, + "time_per_iteration": 2.727567195892334 + }, + { + "auxiliary_loss_clip": 0.01093097, + "auxiliary_loss_mlp": 0.01142913, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00119698, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.377270748161107, + "language_loss": 0.77177352, + "learning_rate": 3.520286966670535e-06, + "loss": 0.7941336, + "num_input_tokens_seen": 89008685, + "step": 4131, + "time_per_iteration": 2.6938116550445557 + }, + { + "auxiliary_loss_clip": 0.01157293, + "auxiliary_loss_mlp": 0.01142101, + "balance_loss_clip": 1.00211024, + "balance_loss_mlp": 1.00086164, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5373349346033256, + "language_loss": 0.83805084, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86104476, + "num_input_tokens_seen": 89031160, + "step": 4132, + "time_per_iteration": 2.6648716926574707 + }, + { + "auxiliary_loss_clip": 0.01141919, + "auxiliary_loss_mlp": 0.01142747, + "balance_loss_clip": 1.00198305, + "balance_loss_mlp": 1.00084031, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 2.555449461636378, + "language_loss": 0.71075451, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73360121, + "num_input_tokens_seen": 89047235, + "step": 4133, + "time_per_iteration": 2.5969760417938232 + }, + { + "auxiliary_loss_clip": 0.01174166, + "auxiliary_loss_mlp": 0.01143589, + "balance_loss_clip": 1.0020895, + "balance_loss_mlp": 1.00063324, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.0471114636995336, + "language_loss": 0.61365646, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63683403, + "num_input_tokens_seen": 89064790, + "step": 4134, + "time_per_iteration": 4.110972166061401 + }, + { + "auxiliary_loss_clip": 0.01163226, + "auxiliary_loss_mlp": 0.01142792, + "balance_loss_clip": 1.00232744, + "balance_loss_mlp": 1.00078988, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.734659190445174, + "language_loss": 0.78584731, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.80890751, + "num_input_tokens_seen": 89083250, + "step": 4135, + "time_per_iteration": 2.574847459793091 + }, + { + "auxiliary_loss_clip": 0.01141165, + "auxiliary_loss_mlp": 0.01142427, + "balance_loss_clip": 1.00207341, + "balance_loss_mlp": 1.00071073, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.2900671776787034, + "language_loss": 0.82906556, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.85190147, + "num_input_tokens_seen": 89100905, + "step": 4136, + "time_per_iteration": 2.6148793697357178 + }, + { + "auxiliary_loss_clip": 0.01125752, + "auxiliary_loss_mlp": 0.01142579, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00076807, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7273663054540789, + "language_loss": 0.70708686, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72977012, + "num_input_tokens_seen": 89122630, + "step": 4137, + "time_per_iteration": 4.142946481704712 + }, + { + "auxiliary_loss_clip": 0.01158455, + "auxiliary_loss_mlp": 0.00748457, + "balance_loss_clip": 1.00209165, + "balance_loss_mlp": 1.00029182, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.1304429680423334, + "language_loss": 0.66897666, + "learning_rate": 3.518514171403042e-06, + "loss": 0.68804574, + "num_input_tokens_seen": 89141050, + "step": 4138, + "time_per_iteration": 5.345879554748535 + }, + { + "auxiliary_loss_clip": 0.01124793, + "auxiliary_loss_mlp": 0.01142161, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00082624, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 2.426397735043681, + "language_loss": 0.84118891, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86385846, + "num_input_tokens_seen": 89160810, + "step": 4139, + "time_per_iteration": 2.7459254264831543 + }, + { + "auxiliary_loss_clip": 0.01130915, + "auxiliary_loss_mlp": 0.01142954, + "balance_loss_clip": 1.0022037, + "balance_loss_mlp": 1.00114262, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.491731602022909, + "language_loss": 0.78758484, + "learning_rate": 3.518007140085481e-06, + "loss": 0.81032354, + "num_input_tokens_seen": 89180610, + "step": 4140, + "time_per_iteration": 2.6483733654022217 + }, + { + "auxiliary_loss_clip": 0.01155287, + "auxiliary_loss_mlp": 0.01128877, + "balance_loss_clip": 1.00242066, + "balance_loss_mlp": 0.9999401, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8151617347593313, + "language_loss": 0.61007756, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63291919, + "num_input_tokens_seen": 89241880, + "step": 4141, + "time_per_iteration": 3.1912286281585693 + }, + { + "auxiliary_loss_clip": 0.01174243, + "auxiliary_loss_mlp": 0.01143618, + "balance_loss_clip": 1.00218356, + "balance_loss_mlp": 1.00123441, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.2345884687115296, + "language_loss": 0.7258023, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.74898094, + "num_input_tokens_seen": 89263340, + "step": 4142, + "time_per_iteration": 2.6418519020080566 + }, + { + "auxiliary_loss_clip": 0.01158588, + "auxiliary_loss_mlp": 0.01142924, + "balance_loss_clip": 1.0020932, + "balance_loss_mlp": 1.00101781, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 2.151907944445746, + "language_loss": 0.80746502, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83048016, + "num_input_tokens_seen": 89282870, + "step": 4143, + "time_per_iteration": 2.638280153274536 + }, + { + "auxiliary_loss_clip": 0.01140895, + "auxiliary_loss_mlp": 0.01142065, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00092101, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 2.6866526529475983, + "language_loss": 0.58738089, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61021048, + "num_input_tokens_seen": 89303830, + "step": 4144, + "time_per_iteration": 2.639394998550415 + }, + { + "auxiliary_loss_clip": 0.01158541, + "auxiliary_loss_mlp": 0.01142571, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00104523, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.017443186485865, + "language_loss": 0.78384304, + "learning_rate": 3.516738554607708e-06, + "loss": 0.80685425, + "num_input_tokens_seen": 89324350, + "step": 4145, + "time_per_iteration": 2.6118569374084473 + }, + { + "auxiliary_loss_clip": 0.01157534, + "auxiliary_loss_mlp": 0.0074849, + "balance_loss_clip": 1.00201476, + "balance_loss_mlp": 1.00025153, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.2014574661824553, + "language_loss": 0.65843689, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.67749715, + "num_input_tokens_seen": 89342875, + "step": 4146, + "time_per_iteration": 2.565321683883667 + }, + { + "auxiliary_loss_clip": 0.01138512, + "auxiliary_loss_mlp": 0.0112985, + "balance_loss_clip": 1.00145733, + "balance_loss_mlp": 1.00015032, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9485725685435019, + "language_loss": 0.67307228, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.6957559, + "num_input_tokens_seen": 89404925, + "step": 4147, + "time_per_iteration": 3.3046741485595703 + }, + { + "auxiliary_loss_clip": 0.01142314, + "auxiliary_loss_mlp": 0.01142151, + "balance_loss_clip": 1.00203574, + "balance_loss_mlp": 1.00091195, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 1.8436995853124478, + "language_loss": 0.88872647, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91157109, + "num_input_tokens_seen": 89425090, + "step": 4148, + "time_per_iteration": 2.6759469509124756 + }, + { + "auxiliary_loss_clip": 0.01109382, + "auxiliary_loss_mlp": 0.01143946, + "balance_loss_clip": 1.00181282, + "balance_loss_mlp": 1.00108552, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 2.1211029884939703, + "language_loss": 0.68078768, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70332098, + "num_input_tokens_seen": 89442615, + "step": 4149, + "time_per_iteration": 2.7000889778137207 + }, + { + "auxiliary_loss_clip": 0.01158281, + "auxiliary_loss_mlp": 0.01143012, + "balance_loss_clip": 1.00215948, + "balance_loss_mlp": 1.00100994, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 1.8106160425460045, + "language_loss": 0.71034205, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73335493, + "num_input_tokens_seen": 89463025, + "step": 4150, + "time_per_iteration": 2.636765718460083 + }, + { + "auxiliary_loss_clip": 0.01110476, + "auxiliary_loss_mlp": 0.01142444, + "balance_loss_clip": 1.00179553, + "balance_loss_mlp": 1.00101435, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 2.0029763951568484, + "language_loss": 0.72873193, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75126117, + "num_input_tokens_seen": 89480225, + "step": 4151, + "time_per_iteration": 2.675351858139038 + }, + { + "auxiliary_loss_clip": 0.0115889, + "auxiliary_loss_mlp": 0.01143284, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00099623, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 3.1998897144192444, + "language_loss": 0.6311332, + "learning_rate": 3.514960119583781e-06, + "loss": 0.6541549, + "num_input_tokens_seen": 89496985, + "step": 4152, + "time_per_iteration": 2.578108310699463 + }, + { + "auxiliary_loss_clip": 0.01157475, + "auxiliary_loss_mlp": 0.01142714, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.00109327, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.3908589895170875, + "language_loss": 0.76934069, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79234254, + "num_input_tokens_seen": 89514420, + "step": 4153, + "time_per_iteration": 2.559344530105591 + }, + { + "auxiliary_loss_clip": 0.01158113, + "auxiliary_loss_mlp": 0.0114244, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00101054, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.3160538833133746, + "language_loss": 0.76275516, + "learning_rate": 3.514451478119711e-06, + "loss": 0.7857607, + "num_input_tokens_seen": 89532925, + "step": 4154, + "time_per_iteration": 2.6033074855804443 + }, + { + "auxiliary_loss_clip": 0.01157445, + "auxiliary_loss_mlp": 0.01143193, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00099993, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.0797858265887728, + "language_loss": 0.70506918, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72807562, + "num_input_tokens_seen": 89552855, + "step": 4155, + "time_per_iteration": 2.6269092559814453 + }, + { + "auxiliary_loss_clip": 0.01141222, + "auxiliary_loss_mlp": 0.01143475, + "balance_loss_clip": 1.00192153, + "balance_loss_mlp": 1.00118661, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.5456492232532262, + "language_loss": 0.74975824, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77260524, + "num_input_tokens_seen": 89572830, + "step": 4156, + "time_per_iteration": 2.6031224727630615 + }, + { + "auxiliary_loss_clip": 0.01157525, + "auxiliary_loss_mlp": 0.01143016, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.00110912, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 1.7440595493500715, + "language_loss": 0.76391435, + "learning_rate": 3.513688085236591e-06, + "loss": 0.78691983, + "num_input_tokens_seen": 89590345, + "step": 4157, + "time_per_iteration": 2.578888416290283 + }, + { + "auxiliary_loss_clip": 0.01093902, + "auxiliary_loss_mlp": 0.01142886, + "balance_loss_clip": 1.00167394, + "balance_loss_mlp": 1.00097895, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.5825145224577288, + "language_loss": 0.81169772, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83406562, + "num_input_tokens_seen": 89610295, + "step": 4158, + "time_per_iteration": 2.710803270339966 + }, + { + "auxiliary_loss_clip": 0.01142256, + "auxiliary_loss_mlp": 0.01142436, + "balance_loss_clip": 1.00198042, + "balance_loss_mlp": 1.00072002, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 1.844226411767323, + "language_loss": 0.75268602, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77553296, + "num_input_tokens_seen": 89627795, + "step": 4159, + "time_per_iteration": 2.6010823249816895 + }, + { + "auxiliary_loss_clip": 0.01163246, + "auxiliary_loss_mlp": 0.01142984, + "balance_loss_clip": 1.00226748, + "balance_loss_mlp": 1.00088644, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 1.6452102606087387, + "language_loss": 0.71496356, + "learning_rate": 3.512924175760649e-06, + "loss": 0.7380259, + "num_input_tokens_seen": 89648090, + "step": 4160, + "time_per_iteration": 2.590009927749634 + }, + { + "auxiliary_loss_clip": 0.01170715, + "auxiliary_loss_mlp": 0.01129161, + "balance_loss_clip": 1.00166059, + "balance_loss_mlp": 1.00022447, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.749460882859815, + "language_loss": 0.56751192, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.59051073, + "num_input_tokens_seen": 89710345, + "step": 4161, + "time_per_iteration": 3.155344009399414 + }, + { + "auxiliary_loss_clip": 0.01157491, + "auxiliary_loss_mlp": 0.01143653, + "balance_loss_clip": 1.00203061, + "balance_loss_mlp": 1.00088811, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 1.8778545555031507, + "language_loss": 0.80641949, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82943094, + "num_input_tokens_seen": 89729390, + "step": 4162, + "time_per_iteration": 2.6017508506774902 + }, + { + "auxiliary_loss_clip": 0.01141945, + "auxiliary_loss_mlp": 0.00748452, + "balance_loss_clip": 1.00192499, + "balance_loss_mlp": 1.00038934, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.7125315991826855, + "language_loss": 0.87484163, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.8937456, + "num_input_tokens_seen": 89742805, + "step": 4163, + "time_per_iteration": 2.5547683238983154 + }, + { + "auxiliary_loss_clip": 0.01157907, + "auxiliary_loss_mlp": 0.01143323, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00084388, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 2.117746141516277, + "language_loss": 0.83224142, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.8552537, + "num_input_tokens_seen": 89761145, + "step": 4164, + "time_per_iteration": 2.579136371612549 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.0114201, + "balance_loss_clip": 1.00211918, + "balance_loss_mlp": 1.00105703, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.734910693257028, + "language_loss": 0.74281484, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76581126, + "num_input_tokens_seen": 89780905, + "step": 4165, + "time_per_iteration": 2.564962863922119 + }, + { + "auxiliary_loss_clip": 0.01125095, + "auxiliary_loss_mlp": 0.01142713, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00090122, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 1.7990237922569738, + "language_loss": 0.73785549, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76053357, + "num_input_tokens_seen": 89799230, + "step": 4166, + "time_per_iteration": 2.6655654907226562 + }, + { + "auxiliary_loss_clip": 0.01124605, + "auxiliary_loss_mlp": 0.0114188, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00102234, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.675233765414938, + "language_loss": 0.81627345, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.8389383, + "num_input_tokens_seen": 89818240, + "step": 4167, + "time_per_iteration": 2.657834768295288 + }, + { + "auxiliary_loss_clip": 0.01157338, + "auxiliary_loss_mlp": 0.01142136, + "balance_loss_clip": 1.00190043, + "balance_loss_mlp": 1.00080109, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.0071412498773262, + "language_loss": 0.79472721, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81772196, + "num_input_tokens_seen": 89834485, + "step": 4168, + "time_per_iteration": 2.5605974197387695 + }, + { + "auxiliary_loss_clip": 0.01157249, + "auxiliary_loss_mlp": 0.01143416, + "balance_loss_clip": 1.0019244, + "balance_loss_mlp": 1.00093746, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 4.453200708424152, + "language_loss": 0.6980474, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72105402, + "num_input_tokens_seen": 89855645, + "step": 4169, + "time_per_iteration": 2.7345004081726074 + }, + { + "auxiliary_loss_clip": 0.01130375, + "auxiliary_loss_mlp": 0.01141938, + "balance_loss_clip": 1.001863, + "balance_loss_mlp": 1.00089002, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.6945343971417048, + "language_loss": 0.77573019, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79845333, + "num_input_tokens_seen": 89874895, + "step": 4170, + "time_per_iteration": 2.698364734649658 + }, + { + "auxiliary_loss_clip": 0.01141177, + "auxiliary_loss_mlp": 0.0114209, + "balance_loss_clip": 1.00192547, + "balance_loss_mlp": 1.00075531, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 3.983052468703075, + "language_loss": 0.76075363, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78358632, + "num_input_tokens_seen": 89891700, + "step": 4171, + "time_per_iteration": 3.949648141860962 + }, + { + "auxiliary_loss_clip": 0.01170684, + "auxiliary_loss_mlp": 0.01128983, + "balance_loss_clip": 1.00164711, + "balance_loss_mlp": 1.00004613, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8233865784355133, + "language_loss": 0.60033435, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62333101, + "num_input_tokens_seen": 89955775, + "step": 4172, + "time_per_iteration": 3.11575984954834 + }, + { + "auxiliary_loss_clip": 0.01142201, + "auxiliary_loss_mlp": 0.01142172, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00083733, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 2.348366417600149, + "language_loss": 0.78925085, + "learning_rate": 3.509607938211409e-06, + "loss": 0.81209457, + "num_input_tokens_seen": 89977150, + "step": 4173, + "time_per_iteration": 2.650733470916748 + }, + { + "auxiliary_loss_clip": 0.01174072, + "auxiliary_loss_mlp": 0.01142834, + "balance_loss_clip": 1.00216651, + "balance_loss_mlp": 1.00092745, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.2392302647294304, + "language_loss": 0.83381283, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85698187, + "num_input_tokens_seen": 89994925, + "step": 4174, + "time_per_iteration": 2.523134231567383 + }, + { + "auxiliary_loss_clip": 0.01109923, + "auxiliary_loss_mlp": 0.01142643, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.0008316, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.3605738802452807, + "language_loss": 0.70875156, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73127729, + "num_input_tokens_seen": 90013235, + "step": 4175, + "time_per_iteration": 4.096187353134155 + }, + { + "auxiliary_loss_clip": 0.01126375, + "auxiliary_loss_mlp": 0.01142357, + "balance_loss_clip": 1.00175142, + "balance_loss_mlp": 1.00073695, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.089144989254762, + "language_loss": 0.80913639, + "learning_rate": 3.50884127798111e-06, + "loss": 0.83182371, + "num_input_tokens_seen": 90032150, + "step": 4176, + "time_per_iteration": 4.1284825801849365 + }, + { + "auxiliary_loss_clip": 0.01141623, + "auxiliary_loss_mlp": 0.01142327, + "balance_loss_clip": 1.00197065, + "balance_loss_mlp": 1.00089765, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 1.9999344685165965, + "language_loss": 0.83051479, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.85335428, + "num_input_tokens_seen": 90049085, + "step": 4177, + "time_per_iteration": 2.5845818519592285 + }, + { + "auxiliary_loss_clip": 0.01125429, + "auxiliary_loss_mlp": 0.0114267, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00076318, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.0874128043618585, + "language_loss": 0.82346213, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84614313, + "num_input_tokens_seen": 90067695, + "step": 4178, + "time_per_iteration": 2.6389763355255127 + }, + { + "auxiliary_loss_clip": 0.01173794, + "auxiliary_loss_mlp": 0.00748273, + "balance_loss_clip": 1.00202489, + "balance_loss_mlp": 1.00037229, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.1452705043581264, + "language_loss": 0.75972492, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77894557, + "num_input_tokens_seen": 90083890, + "step": 4179, + "time_per_iteration": 2.5078907012939453 + }, + { + "auxiliary_loss_clip": 0.01125082, + "auxiliary_loss_mlp": 0.01143049, + "balance_loss_clip": 1.00190246, + "balance_loss_mlp": 1.00114262, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 1.9332620966381715, + "language_loss": 0.70659721, + "learning_rate": 3.507818263370206e-06, + "loss": 0.7292785, + "num_input_tokens_seen": 90100995, + "step": 4180, + "time_per_iteration": 2.603478193283081 + }, + { + "auxiliary_loss_clip": 0.01173995, + "auxiliary_loss_mlp": 0.01142905, + "balance_loss_clip": 1.0021677, + "balance_loss_mlp": 1.00109363, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 1.786443801903832, + "language_loss": 0.86107957, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.8842485, + "num_input_tokens_seen": 90120365, + "step": 4181, + "time_per_iteration": 2.5107500553131104 + }, + { + "auxiliary_loss_clip": 0.01173998, + "auxiliary_loss_mlp": 0.01142271, + "balance_loss_clip": 1.00212312, + "balance_loss_mlp": 1.00093687, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 2.032726442062672, + "language_loss": 0.68211865, + "learning_rate": 3.507306412966238e-06, + "loss": 0.70528132, + "num_input_tokens_seen": 90142610, + "step": 4182, + "time_per_iteration": 2.645979404449463 + }, + { + "auxiliary_loss_clip": 0.01139922, + "auxiliary_loss_mlp": 0.01128882, + "balance_loss_clip": 1.00160766, + "balance_loss_mlp": 0.99994487, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.841469024518536, + "language_loss": 0.70063889, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72332692, + "num_input_tokens_seen": 90200555, + "step": 4183, + "time_per_iteration": 3.2013142108917236 + }, + { + "auxiliary_loss_clip": 0.01143214, + "auxiliary_loss_mlp": 0.01142678, + "balance_loss_clip": 1.0018388, + "balance_loss_mlp": 1.00086689, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.6165629544793692, + "language_loss": 0.74212956, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76498842, + "num_input_tokens_seen": 90218120, + "step": 4184, + "time_per_iteration": 2.581597089767456 + }, + { + "auxiliary_loss_clip": 0.01157402, + "auxiliary_loss_mlp": 0.01142456, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.00102627, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.9652294505018466, + "language_loss": 0.83160746, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85460603, + "num_input_tokens_seen": 90236790, + "step": 4185, + "time_per_iteration": 2.5910708904266357 + }, + { + "auxiliary_loss_clip": 0.01104498, + "auxiliary_loss_mlp": 0.01128971, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00003469, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7855454735319894, + "language_loss": 0.6151396, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.6374743, + "num_input_tokens_seen": 90297070, + "step": 4186, + "time_per_iteration": 3.156230926513672 + }, + { + "auxiliary_loss_clip": 0.01124064, + "auxiliary_loss_mlp": 0.01142887, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.00088537, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 1.8406265812289997, + "language_loss": 0.79230928, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81497878, + "num_input_tokens_seen": 90315255, + "step": 4187, + "time_per_iteration": 2.669506311416626 + }, + { + "auxiliary_loss_clip": 0.01091695, + "auxiliary_loss_mlp": 0.01142547, + "balance_loss_clip": 1.00169873, + "balance_loss_mlp": 1.0010215, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5465390846076685, + "language_loss": 0.80173475, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82407719, + "num_input_tokens_seen": 90334990, + "step": 4188, + "time_per_iteration": 2.7392735481262207 + }, + { + "auxiliary_loss_clip": 0.01157286, + "auxiliary_loss_mlp": 0.01143327, + "balance_loss_clip": 1.00202608, + "balance_loss_mlp": 1.00113392, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.8423991602392447, + "language_loss": 0.74458802, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76759416, + "num_input_tokens_seen": 90351825, + "step": 4189, + "time_per_iteration": 2.606858491897583 + }, + { + "auxiliary_loss_clip": 0.01141904, + "auxiliary_loss_mlp": 0.01141788, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00083447, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.9769620055448853, + "language_loss": 0.8401165, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86295342, + "num_input_tokens_seen": 90369860, + "step": 4190, + "time_per_iteration": 2.6192703247070312 + }, + { + "auxiliary_loss_clip": 0.01142726, + "auxiliary_loss_mlp": 0.01142386, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00086117, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 1.7372200837793894, + "language_loss": 0.75365031, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77650148, + "num_input_tokens_seen": 90389245, + "step": 4191, + "time_per_iteration": 2.6365623474121094 + }, + { + "auxiliary_loss_clip": 0.01154008, + "auxiliary_loss_mlp": 0.01128962, + "balance_loss_clip": 1.0014981, + "balance_loss_mlp": 1.00002503, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7114999372401055, + "language_loss": 0.57173145, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.5945611, + "num_input_tokens_seen": 90456735, + "step": 4192, + "time_per_iteration": 3.2466979026794434 + }, + { + "auxiliary_loss_clip": 0.01141131, + "auxiliary_loss_mlp": 0.01142524, + "balance_loss_clip": 1.0021143, + "balance_loss_mlp": 1.00080788, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.1591422493429144, + "language_loss": 0.76012051, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78295714, + "num_input_tokens_seen": 90474165, + "step": 4193, + "time_per_iteration": 2.6396422386169434 + }, + { + "auxiliary_loss_clip": 0.01157401, + "auxiliary_loss_mlp": 0.01142847, + "balance_loss_clip": 1.00204504, + "balance_loss_mlp": 1.00113106, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.424480737577612, + "language_loss": 0.84349787, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86650038, + "num_input_tokens_seen": 90491660, + "step": 4194, + "time_per_iteration": 2.554859161376953 + }, + { + "auxiliary_loss_clip": 0.01174039, + "auxiliary_loss_mlp": 0.0114213, + "balance_loss_clip": 1.00208855, + "balance_loss_mlp": 1.00108194, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.4394476712180895, + "language_loss": 0.88144439, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.9046061, + "num_input_tokens_seen": 90514025, + "step": 4195, + "time_per_iteration": 2.601600408554077 + }, + { + "auxiliary_loss_clip": 0.01174016, + "auxiliary_loss_mlp": 0.01142784, + "balance_loss_clip": 1.00209641, + "balance_loss_mlp": 1.00078189, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.112891930259561, + "language_loss": 0.85899466, + "learning_rate": 3.503717062883053e-06, + "loss": 0.88216269, + "num_input_tokens_seen": 90533530, + "step": 4196, + "time_per_iteration": 2.553457736968994 + }, + { + "auxiliary_loss_clip": 0.01157496, + "auxiliary_loss_mlp": 0.01143061, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.00096321, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.7383283031659222, + "language_loss": 0.83690345, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85990906, + "num_input_tokens_seen": 90554025, + "step": 4197, + "time_per_iteration": 2.576833486557007 + }, + { + "auxiliary_loss_clip": 0.01157314, + "auxiliary_loss_mlp": 0.01142815, + "balance_loss_clip": 1.00202417, + "balance_loss_mlp": 1.00090802, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 1.7822233060716837, + "language_loss": 0.73160183, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.75460315, + "num_input_tokens_seen": 90576930, + "step": 4198, + "time_per_iteration": 2.6903011798858643 + }, + { + "auxiliary_loss_clip": 0.0117406, + "auxiliary_loss_mlp": 0.01142895, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00098872, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 2.0376194047633653, + "language_loss": 0.76644349, + "learning_rate": 3.50294646148888e-06, + "loss": 0.78961313, + "num_input_tokens_seen": 90595710, + "step": 4199, + "time_per_iteration": 2.503579616546631 + }, + { + "auxiliary_loss_clip": 0.01140595, + "auxiliary_loss_mlp": 0.00748271, + "balance_loss_clip": 1.00193369, + "balance_loss_mlp": 1.00020838, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.829240117150987, + "language_loss": 0.73457092, + "learning_rate": 3.502689480360739e-06, + "loss": 0.75345957, + "num_input_tokens_seen": 90617945, + "step": 4200, + "time_per_iteration": 2.7141454219818115 + }, + { + "auxiliary_loss_clip": 0.01157437, + "auxiliary_loss_mlp": 0.01142274, + "balance_loss_clip": 1.0019778, + "balance_loss_mlp": 1.0011301, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.5981493528008688, + "language_loss": 0.82074267, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84373975, + "num_input_tokens_seen": 90640855, + "step": 4201, + "time_per_iteration": 2.772739887237549 + }, + { + "auxiliary_loss_clip": 0.01107769, + "auxiliary_loss_mlp": 0.01143284, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00090003, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 2.46725197984269, + "language_loss": 0.75164783, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77415836, + "num_input_tokens_seen": 90661350, + "step": 4202, + "time_per_iteration": 2.7049901485443115 + }, + { + "auxiliary_loss_clip": 0.01157245, + "auxiliary_loss_mlp": 0.01142011, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00096226, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 2.4006525584217933, + "language_loss": 0.73319519, + "learning_rate": 3.501918195122491e-06, + "loss": 0.7561878, + "num_input_tokens_seen": 90680540, + "step": 4203, + "time_per_iteration": 2.545673370361328 + }, + { + "auxiliary_loss_clip": 0.01142018, + "auxiliary_loss_mlp": 0.0114247, + "balance_loss_clip": 1.00192225, + "balance_loss_mlp": 1.00075436, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.6763788521625305, + "language_loss": 0.77317059, + "learning_rate": 3.501660986124297e-06, + "loss": 0.7960155, + "num_input_tokens_seen": 90703460, + "step": 4204, + "time_per_iteration": 2.64800763130188 + }, + { + "auxiliary_loss_clip": 0.01124885, + "auxiliary_loss_mlp": 0.01142066, + "balance_loss_clip": 1.00168884, + "balance_loss_mlp": 1.00101781, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 1.8230593677135316, + "language_loss": 0.71917409, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74184358, + "num_input_tokens_seen": 90718815, + "step": 4205, + "time_per_iteration": 2.57974910736084 + }, + { + "auxiliary_loss_clip": 0.01158684, + "auxiliary_loss_mlp": 0.01141008, + "balance_loss_clip": 1.00213909, + "balance_loss_mlp": 1.00091362, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.3267367484500776, + "language_loss": 0.75449026, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77748728, + "num_input_tokens_seen": 90742125, + "step": 4206, + "time_per_iteration": 2.7880442142486572 + }, + { + "auxiliary_loss_clip": 0.01125332, + "auxiliary_loss_mlp": 0.01141969, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00092077, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 2.722724996946368, + "language_loss": 0.79164016, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81431317, + "num_input_tokens_seen": 90760785, + "step": 4207, + "time_per_iteration": 2.6402668952941895 + }, + { + "auxiliary_loss_clip": 0.0115703, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00096822, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.8566341089507352, + "language_loss": 0.76103556, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78402698, + "num_input_tokens_seen": 90780045, + "step": 4208, + "time_per_iteration": 2.564711570739746 + }, + { + "auxiliary_loss_clip": 0.0115697, + "auxiliary_loss_mlp": 0.01141773, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.0008204, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.892587805262022, + "language_loss": 0.69906104, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.7220484, + "num_input_tokens_seen": 90797980, + "step": 4209, + "time_per_iteration": 3.9808590412139893 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01128139, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 0.99996531, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.761040046989595, + "language_loss": 0.55112219, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57395601, + "num_input_tokens_seen": 90864865, + "step": 4210, + "time_per_iteration": 3.20766544342041 + }, + { + "auxiliary_loss_clip": 0.01124249, + "auxiliary_loss_mlp": 0.0114172, + "balance_loss_clip": 1.00179386, + "balance_loss_mlp": 1.00067103, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 1.8361340094759877, + "language_loss": 0.79979157, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82245123, + "num_input_tokens_seen": 90882885, + "step": 4211, + "time_per_iteration": 2.6613104343414307 + }, + { + "auxiliary_loss_clip": 0.01109051, + "auxiliary_loss_mlp": 0.01141724, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00077081, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.5161516528163914, + "language_loss": 0.7820307, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80453849, + "num_input_tokens_seen": 90902985, + "step": 4212, + "time_per_iteration": 2.7454490661621094 + }, + { + "auxiliary_loss_clip": 0.01158127, + "auxiliary_loss_mlp": 0.0114211, + "balance_loss_clip": 1.00199032, + "balance_loss_mlp": 1.00096643, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 1.8425990016873295, + "language_loss": 0.53719282, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.56019521, + "num_input_tokens_seen": 90923550, + "step": 4213, + "time_per_iteration": 4.092382907867432 + }, + { + "auxiliary_loss_clip": 0.01141866, + "auxiliary_loss_mlp": 0.01142294, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00086427, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.2296904138900966, + "language_loss": 0.64651048, + "learning_rate": 3.499085765880308e-06, + "loss": 0.66935211, + "num_input_tokens_seen": 90943260, + "step": 4214, + "time_per_iteration": 5.396606206893921 + }, + { + "auxiliary_loss_clip": 0.01155919, + "auxiliary_loss_mlp": 0.01128275, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.00010145, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8638723859872195, + "language_loss": 0.58063412, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60347605, + "num_input_tokens_seen": 90996295, + "step": 4215, + "time_per_iteration": 2.915281057357788 + }, + { + "auxiliary_loss_clip": 0.01141102, + "auxiliary_loss_mlp": 0.01141966, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00082219, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.557055906224618, + "language_loss": 0.83392769, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85675836, + "num_input_tokens_seen": 91017545, + "step": 4216, + "time_per_iteration": 2.7668445110321045 + }, + { + "auxiliary_loss_clip": 0.01157375, + "auxiliary_loss_mlp": 0.01141834, + "balance_loss_clip": 1.00191128, + "balance_loss_mlp": 1.00088096, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 2.029858254658826, + "language_loss": 0.79847825, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82147038, + "num_input_tokens_seen": 91037715, + "step": 4217, + "time_per_iteration": 2.5634214878082275 + }, + { + "auxiliary_loss_clip": 0.01143036, + "auxiliary_loss_mlp": 0.0114164, + "balance_loss_clip": 1.00190639, + "balance_loss_mlp": 1.00068676, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 4.731480309525545, + "language_loss": 0.75103718, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.77388394, + "num_input_tokens_seen": 91055295, + "step": 4218, + "time_per_iteration": 2.595245122909546 + }, + { + "auxiliary_loss_clip": 0.01158208, + "auxiliary_loss_mlp": 0.01141931, + "balance_loss_clip": 1.00208426, + "balance_loss_mlp": 1.00097787, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.7386919295256074, + "language_loss": 0.75049281, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.77349424, + "num_input_tokens_seen": 91075485, + "step": 4219, + "time_per_iteration": 2.5705907344818115 + }, + { + "auxiliary_loss_clip": 0.01163177, + "auxiliary_loss_mlp": 0.01142364, + "balance_loss_clip": 1.00231862, + "balance_loss_mlp": 1.00102997, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 1.790995612105261, + "language_loss": 0.81219327, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83524871, + "num_input_tokens_seen": 91093620, + "step": 4220, + "time_per_iteration": 2.548534631729126 + }, + { + "auxiliary_loss_clip": 0.01110755, + "auxiliary_loss_mlp": 0.01142448, + "balance_loss_clip": 1.00196159, + "balance_loss_mlp": 1.00082779, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.354283165220776, + "language_loss": 0.71230626, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73483837, + "num_input_tokens_seen": 91114110, + "step": 4221, + "time_per_iteration": 2.7137451171875 + }, + { + "auxiliary_loss_clip": 0.01174027, + "auxiliary_loss_mlp": 0.01141982, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.00093365, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 1.6387335764161812, + "language_loss": 0.61747527, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64063537, + "num_input_tokens_seen": 91133135, + "step": 4222, + "time_per_iteration": 2.498722791671753 + }, + { + "auxiliary_loss_clip": 0.01157416, + "auxiliary_loss_mlp": 0.01142673, + "balance_loss_clip": 1.00199866, + "balance_loss_mlp": 1.00133872, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.4956391714909185, + "language_loss": 0.74826837, + "learning_rate": 3.496763207094731e-06, + "loss": 0.77126932, + "num_input_tokens_seen": 91151805, + "step": 4223, + "time_per_iteration": 2.535022020339966 + }, + { + "auxiliary_loss_clip": 0.0110821, + "auxiliary_loss_mlp": 0.0114111, + "balance_loss_clip": 1.00194716, + "balance_loss_mlp": 1.00063372, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 2.1254865224702675, + "language_loss": 0.80272317, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82521635, + "num_input_tokens_seen": 91172270, + "step": 4224, + "time_per_iteration": 2.7061498165130615 + }, + { + "auxiliary_loss_clip": 0.0115731, + "auxiliary_loss_mlp": 0.00748233, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00016797, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.4066110060465151, + "language_loss": 0.77754188, + "learning_rate": 3.496246458337354e-06, + "loss": 0.7965973, + "num_input_tokens_seen": 91192080, + "step": 4225, + "time_per_iteration": 2.5830461978912354 + }, + { + "auxiliary_loss_clip": 0.01157054, + "auxiliary_loss_mlp": 0.01141825, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00106263, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 2.1782647398840145, + "language_loss": 0.84725285, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.87024164, + "num_input_tokens_seen": 91211450, + "step": 4226, + "time_per_iteration": 2.5751659870147705 + }, + { + "auxiliary_loss_clip": 0.01173737, + "auxiliary_loss_mlp": 0.01141495, + "balance_loss_clip": 1.0019989, + "balance_loss_mlp": 1.00101829, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 1.753101390989761, + "language_loss": 0.71319073, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73634303, + "num_input_tokens_seen": 91231835, + "step": 4227, + "time_per_iteration": 2.578383684158325 + }, + { + "auxiliary_loss_clip": 0.01170858, + "auxiliary_loss_mlp": 0.01128433, + "balance_loss_clip": 1.00190902, + "balance_loss_mlp": 1.00025868, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9875129997334863, + "language_loss": 0.61810941, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.64110231, + "num_input_tokens_seen": 91288755, + "step": 4228, + "time_per_iteration": 2.9568662643432617 + }, + { + "auxiliary_loss_clip": 0.01158725, + "auxiliary_loss_mlp": 0.01141716, + "balance_loss_clip": 1.00206447, + "balance_loss_mlp": 1.00085819, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 2.5210159708940827, + "language_loss": 0.86446655, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.8874709, + "num_input_tokens_seen": 91302485, + "step": 4229, + "time_per_iteration": 2.5307157039642334 + }, + { + "auxiliary_loss_clip": 0.01125434, + "auxiliary_loss_mlp": 0.0114212, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00078547, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 2.716219551686268, + "language_loss": 0.77573133, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79840684, + "num_input_tokens_seen": 91321120, + "step": 4230, + "time_per_iteration": 2.631765365600586 + }, + { + "auxiliary_loss_clip": 0.0115731, + "auxiliary_loss_mlp": 0.0114192, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00096726, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 2.0580855055308493, + "language_loss": 0.75231028, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77530265, + "num_input_tokens_seen": 91338575, + "step": 4231, + "time_per_iteration": 2.5367512702941895 + }, + { + "auxiliary_loss_clip": 0.01162955, + "auxiliary_loss_mlp": 0.01141719, + "balance_loss_clip": 1.00223565, + "balance_loss_mlp": 1.00076628, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 2.450847498473524, + "language_loss": 0.73925984, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76230663, + "num_input_tokens_seen": 91357355, + "step": 4232, + "time_per_iteration": 2.524136781692505 + }, + { + "auxiliary_loss_clip": 0.01173858, + "auxiliary_loss_mlp": 0.01141986, + "balance_loss_clip": 1.00212002, + "balance_loss_mlp": 1.00093794, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 2.165816392681546, + "language_loss": 0.86676097, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.8899194, + "num_input_tokens_seen": 91376515, + "step": 4233, + "time_per_iteration": 2.641862154006958 + }, + { + "auxiliary_loss_clip": 0.01094315, + "auxiliary_loss_mlp": 0.01140687, + "balance_loss_clip": 1.00172305, + "balance_loss_mlp": 1.0008781, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.6205870354706549, + "language_loss": 0.74796867, + "learning_rate": 3.493918281539737e-06, + "loss": 0.77031863, + "num_input_tokens_seen": 91397595, + "step": 4234, + "time_per_iteration": 2.804769992828369 + }, + { + "auxiliary_loss_clip": 0.01140999, + "auxiliary_loss_mlp": 0.01142013, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.0009644, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.4034685704726027, + "language_loss": 0.74476898, + "learning_rate": 3.493659311850379e-06, + "loss": 0.76759911, + "num_input_tokens_seen": 91417775, + "step": 4235, + "time_per_iteration": 2.639942169189453 + }, + { + "auxiliary_loss_clip": 0.01127699, + "auxiliary_loss_mlp": 0.0074824, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.00015354, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 2.0250032950900922, + "language_loss": 0.65215743, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6709168, + "num_input_tokens_seen": 91437665, + "step": 4236, + "time_per_iteration": 2.7323899269104004 + }, + { + "auxiliary_loss_clip": 0.01173726, + "auxiliary_loss_mlp": 0.01141736, + "balance_loss_clip": 1.00207734, + "balance_loss_mlp": 1.00078249, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.6017691277397765, + "language_loss": 0.66595644, + "learning_rate": 3.493141202562354e-06, + "loss": 0.68911111, + "num_input_tokens_seen": 91456705, + "step": 4237, + "time_per_iteration": 2.5100035667419434 + }, + { + "auxiliary_loss_clip": 0.01173786, + "auxiliary_loss_mlp": 0.01141904, + "balance_loss_clip": 1.0020442, + "balance_loss_mlp": 1.00075984, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 2.007625616058924, + "language_loss": 0.75362033, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77677715, + "num_input_tokens_seen": 91475535, + "step": 4238, + "time_per_iteration": 2.5234756469726562 + }, + { + "auxiliary_loss_clip": 0.01157058, + "auxiliary_loss_mlp": 0.01141942, + "balance_loss_clip": 1.00190175, + "balance_loss_mlp": 1.00079823, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 2.0763378385942093, + "language_loss": 0.80117416, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82416421, + "num_input_tokens_seen": 91499140, + "step": 4239, + "time_per_iteration": 2.622385263442993 + }, + { + "auxiliary_loss_clip": 0.0115746, + "auxiliary_loss_mlp": 0.01142203, + "balance_loss_clip": 1.0020144, + "balance_loss_mlp": 1.0009644, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.6809676683873604, + "language_loss": 0.7740196, + "learning_rate": 3.492363614004407e-06, + "loss": 0.79701614, + "num_input_tokens_seen": 91518335, + "step": 4240, + "time_per_iteration": 2.5267693996429443 + }, + { + "auxiliary_loss_clip": 0.01173951, + "auxiliary_loss_mlp": 0.01142392, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.00067639, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 1.732249221693215, + "language_loss": 0.8360492, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85921258, + "num_input_tokens_seen": 91537655, + "step": 4241, + "time_per_iteration": 2.5401623249053955 + }, + { + "auxiliary_loss_clip": 0.01157229, + "auxiliary_loss_mlp": 0.01141862, + "balance_loss_clip": 1.00203216, + "balance_loss_mlp": 1.00100446, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.703820347520583, + "language_loss": 0.73438329, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75737417, + "num_input_tokens_seen": 91557545, + "step": 4242, + "time_per_iteration": 2.587108850479126 + }, + { + "auxiliary_loss_clip": 0.01173824, + "auxiliary_loss_mlp": 0.00748202, + "balance_loss_clip": 1.00205255, + "balance_loss_mlp": 1.00008798, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.6308552802217027, + "language_loss": 0.7199313, + "learning_rate": 3.491585516131273e-06, + "loss": 0.7391516, + "num_input_tokens_seen": 91574405, + "step": 4243, + "time_per_iteration": 2.506603240966797 + }, + { + "auxiliary_loss_clip": 0.01156726, + "auxiliary_loss_mlp": 0.01141978, + "balance_loss_clip": 1.00190043, + "balance_loss_mlp": 1.00102496, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 2.5046370985265796, + "language_loss": 0.81789666, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84088373, + "num_input_tokens_seen": 91593755, + "step": 4244, + "time_per_iteration": 2.5503666400909424 + }, + { + "auxiliary_loss_clip": 0.01153909, + "auxiliary_loss_mlp": 0.01128289, + "balance_loss_clip": 1.00192785, + "balance_loss_mlp": 1.00011539, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.6890405405559111, + "language_loss": 0.57759678, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.60041881, + "num_input_tokens_seen": 91660335, + "step": 4245, + "time_per_iteration": 3.2362282276153564 + }, + { + "auxiliary_loss_clip": 0.01173817, + "auxiliary_loss_mlp": 0.01141916, + "balance_loss_clip": 1.00197279, + "balance_loss_mlp": 1.00105834, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 2.191691822627757, + "language_loss": 0.65068966, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67384696, + "num_input_tokens_seen": 91678500, + "step": 4246, + "time_per_iteration": 2.52129864692688 + }, + { + "auxiliary_loss_clip": 0.01158138, + "auxiliary_loss_mlp": 0.01141247, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00077033, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 2.0483890194247865, + "language_loss": 0.81310284, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83609664, + "num_input_tokens_seen": 91696430, + "step": 4247, + "time_per_iteration": 3.957453489303589 + }, + { + "auxiliary_loss_clip": 0.01157246, + "auxiliary_loss_mlp": 0.01142661, + "balance_loss_clip": 1.00193381, + "balance_loss_mlp": 1.00084996, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.1466240138493315, + "language_loss": 0.83304411, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85604322, + "num_input_tokens_seen": 91713270, + "step": 4248, + "time_per_iteration": 2.5225276947021484 + }, + { + "auxiliary_loss_clip": 0.01143094, + "auxiliary_loss_mlp": 0.01141947, + "balance_loss_clip": 1.00208151, + "balance_loss_mlp": 1.00099349, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 1.9561409186727754, + "language_loss": 0.83962864, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86247903, + "num_input_tokens_seen": 91728865, + "step": 4249, + "time_per_iteration": 2.5593459606170654 + }, + { + "auxiliary_loss_clip": 0.01093848, + "auxiliary_loss_mlp": 0.01128214, + "balance_loss_clip": 1.00149369, + "balance_loss_mlp": 1.00004053, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7641035492972665, + "language_loss": 0.56342852, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58564913, + "num_input_tokens_seen": 91787470, + "step": 4250, + "time_per_iteration": 4.72487998008728 + }, + { + "auxiliary_loss_clip": 0.01140338, + "auxiliary_loss_mlp": 0.01141735, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00059128, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.7794648084284224, + "language_loss": 0.80522829, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82804906, + "num_input_tokens_seen": 91805640, + "step": 4251, + "time_per_iteration": 4.226870059967041 + }, + { + "auxiliary_loss_clip": 0.01129247, + "auxiliary_loss_mlp": 0.01128231, + "balance_loss_clip": 1.00196111, + "balance_loss_mlp": 1.0000571, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.8093781618737511, + "language_loss": 0.66084671, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68342149, + "num_input_tokens_seen": 91869695, + "step": 4252, + "time_per_iteration": 4.85169792175293 + }, + { + "auxiliary_loss_clip": 0.01156618, + "auxiliary_loss_mlp": 0.01140774, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.0006789, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 2.098278102242397, + "language_loss": 0.73390603, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75687999, + "num_input_tokens_seen": 91889920, + "step": 4253, + "time_per_iteration": 2.5699093341827393 + }, + { + "auxiliary_loss_clip": 0.01132397, + "auxiliary_loss_mlp": 0.0114124, + "balance_loss_clip": 1.00206518, + "balance_loss_mlp": 1.00076365, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 1.9580120590296914, + "language_loss": 0.72857141, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75130779, + "num_input_tokens_seen": 91908665, + "step": 4254, + "time_per_iteration": 2.6531102657318115 + }, + { + "auxiliary_loss_clip": 0.01110564, + "auxiliary_loss_mlp": 0.00748253, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.0001471, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.598477618898845, + "language_loss": 0.80871505, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.82730317, + "num_input_tokens_seen": 91927855, + "step": 4255, + "time_per_iteration": 2.723870277404785 + }, + { + "auxiliary_loss_clip": 0.01141857, + "auxiliary_loss_mlp": 0.01141037, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00075138, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 2.0612411812037226, + "language_loss": 0.85485375, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87768275, + "num_input_tokens_seen": 91948500, + "step": 4256, + "time_per_iteration": 2.656886577606201 + }, + { + "auxiliary_loss_clip": 0.01126878, + "auxiliary_loss_mlp": 0.01142553, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.00093293, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 1.766680555227863, + "language_loss": 0.7480849, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77077925, + "num_input_tokens_seen": 91968375, + "step": 4257, + "time_per_iteration": 2.6516339778900146 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01128088, + "balance_loss_clip": 1.00267625, + "balance_loss_mlp": 0.99991435, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.802916814809769, + "language_loss": 0.65305686, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67544901, + "num_input_tokens_seen": 92028490, + "step": 4258, + "time_per_iteration": 3.3941991329193115 + }, + { + "auxiliary_loss_clip": 0.011081, + "auxiliary_loss_mlp": 0.00748145, + "balance_loss_clip": 1.00177741, + "balance_loss_mlp": 1.00019431, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.5705745439122256, + "language_loss": 0.76616967, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.7847321, + "num_input_tokens_seen": 92048060, + "step": 4259, + "time_per_iteration": 2.9818942546844482 + }, + { + "auxiliary_loss_clip": 0.01122854, + "auxiliary_loss_mlp": 0.01127372, + "balance_loss_clip": 1.00175834, + "balance_loss_mlp": 0.9999609, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.8049278281133593, + "language_loss": 0.58506501, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60756731, + "num_input_tokens_seen": 92118180, + "step": 4260, + "time_per_iteration": 3.296565055847168 + }, + { + "auxiliary_loss_clip": 0.01157689, + "auxiliary_loss_mlp": 0.01140875, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00078034, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.8592438596425553, + "language_loss": 0.76661038, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.78959602, + "num_input_tokens_seen": 92137570, + "step": 4261, + "time_per_iteration": 2.615403652191162 + }, + { + "auxiliary_loss_clip": 0.01173704, + "auxiliary_loss_mlp": 0.01141127, + "balance_loss_clip": 1.00204408, + "balance_loss_mlp": 1.00074589, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.5590088262000779, + "language_loss": 0.8308028, + "learning_rate": 3.486645752648842e-06, + "loss": 0.8539511, + "num_input_tokens_seen": 92157625, + "step": 4262, + "time_per_iteration": 2.5674667358398438 + }, + { + "auxiliary_loss_clip": 0.01157552, + "auxiliary_loss_mlp": 0.01142095, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.00085545, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 5.267642293777252, + "language_loss": 0.73902625, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76202273, + "num_input_tokens_seen": 92175350, + "step": 4263, + "time_per_iteration": 2.5337045192718506 + }, + { + "auxiliary_loss_clip": 0.01140992, + "auxiliary_loss_mlp": 0.00748138, + "balance_loss_clip": 1.00196576, + "balance_loss_mlp": 1.00017834, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 2.2866141376253117, + "language_loss": 0.82821059, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84710193, + "num_input_tokens_seen": 92196070, + "step": 4264, + "time_per_iteration": 2.65434193611145 + }, + { + "auxiliary_loss_clip": 0.01157204, + "auxiliary_loss_mlp": 0.01142291, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00095654, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.925546386366347, + "language_loss": 0.74343336, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76642829, + "num_input_tokens_seen": 92216310, + "step": 4265, + "time_per_iteration": 2.607004404067993 + }, + { + "auxiliary_loss_clip": 0.01125892, + "auxiliary_loss_mlp": 0.01141153, + "balance_loss_clip": 1.00177455, + "balance_loss_mlp": 1.00058091, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.7147930431504497, + "language_loss": 0.8180443, + "learning_rate": 3.485603206979513e-06, + "loss": 0.84071481, + "num_input_tokens_seen": 92234510, + "step": 4266, + "time_per_iteration": 2.6327860355377197 + }, + { + "auxiliary_loss_clip": 0.01108539, + "auxiliary_loss_mlp": 0.01141125, + "balance_loss_clip": 1.00173163, + "balance_loss_mlp": 1.00083923, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.6091162250083328, + "language_loss": 0.79230833, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81480491, + "num_input_tokens_seen": 92254070, + "step": 4267, + "time_per_iteration": 2.7082300186157227 + }, + { + "auxiliary_loss_clip": 0.01124977, + "auxiliary_loss_mlp": 0.01141601, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00093448, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.96349996112619, + "language_loss": 0.79042673, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81309247, + "num_input_tokens_seen": 92275060, + "step": 4268, + "time_per_iteration": 2.68442964553833 + }, + { + "auxiliary_loss_clip": 0.01125512, + "auxiliary_loss_mlp": 0.00748375, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00017834, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 1.5608062710712132, + "language_loss": 0.6765331, + "learning_rate": 3.484820706183595e-06, + "loss": 0.69527197, + "num_input_tokens_seen": 92293610, + "step": 4269, + "time_per_iteration": 2.6683998107910156 + }, + { + "auxiliary_loss_clip": 0.0114009, + "auxiliary_loss_mlp": 0.01141557, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00079513, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 3.7350243406764534, + "language_loss": 0.78169179, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80450827, + "num_input_tokens_seen": 92308305, + "step": 4270, + "time_per_iteration": 2.563098192214966 + }, + { + "auxiliary_loss_clip": 0.0110991, + "auxiliary_loss_mlp": 0.01142152, + "balance_loss_clip": 1.00186455, + "balance_loss_mlp": 1.00072169, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 1.7895315186125509, + "language_loss": 0.67179871, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.69431931, + "num_input_tokens_seen": 92329875, + "step": 4271, + "time_per_iteration": 2.790591239929199 + }, + { + "auxiliary_loss_clip": 0.01157076, + "auxiliary_loss_mlp": 0.00748227, + "balance_loss_clip": 1.0019201, + "balance_loss_mlp": 1.00018072, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 1.4926157080321898, + "language_loss": 0.87180281, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.89085579, + "num_input_tokens_seen": 92348780, + "step": 4272, + "time_per_iteration": 2.6004302501678467 + }, + { + "auxiliary_loss_clip": 0.01141112, + "auxiliary_loss_mlp": 0.0114249, + "balance_loss_clip": 1.00200415, + "balance_loss_mlp": 1.00077438, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.7464397050720326, + "language_loss": 0.81461579, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83745182, + "num_input_tokens_seen": 92368175, + "step": 4273, + "time_per_iteration": 2.607706069946289 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00080109, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.5167872212785445, + "language_loss": 0.7704227, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79309368, + "num_input_tokens_seen": 92387755, + "step": 4274, + "time_per_iteration": 2.647649049758911 + }, + { + "auxiliary_loss_clip": 0.0114151, + "auxiliary_loss_mlp": 0.0114043, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00071692, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 2.098300226344309, + "language_loss": 0.83891141, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86173081, + "num_input_tokens_seen": 92409850, + "step": 4275, + "time_per_iteration": 2.6598901748657227 + }, + { + "auxiliary_loss_clip": 0.01141504, + "auxiliary_loss_mlp": 0.01141914, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00067472, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 1.9063671262847919, + "language_loss": 0.78077817, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80361235, + "num_input_tokens_seen": 92431250, + "step": 4276, + "time_per_iteration": 2.6395084857940674 + }, + { + "auxiliary_loss_clip": 0.01157744, + "auxiliary_loss_mlp": 0.01141112, + "balance_loss_clip": 1.0019654, + "balance_loss_mlp": 1.00092244, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.7162557640795502, + "language_loss": 0.79234064, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81532919, + "num_input_tokens_seen": 92452065, + "step": 4277, + "time_per_iteration": 2.612309217453003 + }, + { + "auxiliary_loss_clip": 0.01173634, + "auxiliary_loss_mlp": 0.01141247, + "balance_loss_clip": 1.00201273, + "balance_loss_mlp": 1.00077057, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.259680300171711, + "language_loss": 0.78581953, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80896831, + "num_input_tokens_seen": 92470025, + "step": 4278, + "time_per_iteration": 2.5070760250091553 + }, + { + "auxiliary_loss_clip": 0.01140541, + "auxiliary_loss_mlp": 0.01141538, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00068009, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.6423565259623056, + "language_loss": 0.74832857, + "learning_rate": 3.482208711902952e-06, + "loss": 0.77114928, + "num_input_tokens_seen": 92489825, + "step": 4279, + "time_per_iteration": 2.6454854011535645 + }, + { + "auxiliary_loss_clip": 0.0115711, + "auxiliary_loss_mlp": 0.01141725, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00086725, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 1.8980955155196606, + "language_loss": 0.84986281, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87285113, + "num_input_tokens_seen": 92507270, + "step": 4280, + "time_per_iteration": 2.5533509254455566 + }, + { + "auxiliary_loss_clip": 0.01157008, + "auxiliary_loss_mlp": 0.01141566, + "balance_loss_clip": 1.00193882, + "balance_loss_mlp": 1.00089908, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 3.778243857986138, + "language_loss": 0.78463447, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.80762017, + "num_input_tokens_seen": 92526300, + "step": 4281, + "time_per_iteration": 2.543930768966675 + }, + { + "auxiliary_loss_clip": 0.01124734, + "auxiliary_loss_mlp": 0.01141378, + "balance_loss_clip": 1.00178742, + "balance_loss_mlp": 1.0007112, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 2.437504025339068, + "language_loss": 0.87264544, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89530647, + "num_input_tokens_seen": 92546465, + "step": 4282, + "time_per_iteration": 2.6365160942077637 + }, + { + "auxiliary_loss_clip": 0.01173662, + "auxiliary_loss_mlp": 0.01141492, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.00082517, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.8875056783649762, + "language_loss": 0.70521808, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72836959, + "num_input_tokens_seen": 92567260, + "step": 4283, + "time_per_iteration": 2.554588556289673 + }, + { + "auxiliary_loss_clip": 0.01173538, + "auxiliary_loss_mlp": 0.00748155, + "balance_loss_clip": 1.00206673, + "balance_loss_mlp": 1.00014615, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 1.8406997159352347, + "language_loss": 0.80948055, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82869744, + "num_input_tokens_seen": 92585425, + "step": 4284, + "time_per_iteration": 2.537883996963501 + }, + { + "auxiliary_loss_clip": 0.01098927, + "auxiliary_loss_mlp": 0.01141476, + "balance_loss_clip": 1.00212598, + "balance_loss_mlp": 1.00080872, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 3.672174342022182, + "language_loss": 0.70258993, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.724994, + "num_input_tokens_seen": 92604770, + "step": 4285, + "time_per_iteration": 4.296573162078857 + }, + { + "auxiliary_loss_clip": 0.01142992, + "auxiliary_loss_mlp": 0.01141371, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00089467, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.5802183375838483, + "language_loss": 0.589715, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.6125586, + "num_input_tokens_seen": 92622635, + "step": 4286, + "time_per_iteration": 2.57810378074646 + }, + { + "auxiliary_loss_clip": 0.01156747, + "auxiliary_loss_mlp": 0.01142415, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00098491, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.4574610993894184, + "language_loss": 0.6383093, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66130096, + "num_input_tokens_seen": 92642960, + "step": 4287, + "time_per_iteration": 4.018574237823486 + }, + { + "auxiliary_loss_clip": 0.01140561, + "auxiliary_loss_mlp": 0.01141527, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00076497, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 2.241325408090118, + "language_loss": 0.71547616, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.7382971, + "num_input_tokens_seen": 92662455, + "step": 4288, + "time_per_iteration": 4.002351760864258 + }, + { + "auxiliary_loss_clip": 0.01123656, + "auxiliary_loss_mlp": 0.01140554, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00093651, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.4537067931047212, + "language_loss": 0.77199936, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79464144, + "num_input_tokens_seen": 92683520, + "step": 4289, + "time_per_iteration": 2.6842336654663086 + }, + { + "auxiliary_loss_clip": 0.01173685, + "auxiliary_loss_mlp": 0.00748196, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00018442, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 2.04707022508758, + "language_loss": 0.84317571, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.86239451, + "num_input_tokens_seen": 92701450, + "step": 4290, + "time_per_iteration": 3.9185073375701904 + }, + { + "auxiliary_loss_clip": 0.01127094, + "auxiliary_loss_mlp": 0.01141511, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00093937, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 1.8159118368786278, + "language_loss": 0.72254145, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74522752, + "num_input_tokens_seen": 92720355, + "step": 4291, + "time_per_iteration": 2.610614776611328 + }, + { + "auxiliary_loss_clip": 0.01173764, + "auxiliary_loss_mlp": 0.01140789, + "balance_loss_clip": 1.00211728, + "balance_loss_mlp": 1.0006938, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.3423047260710286, + "language_loss": 0.80743217, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.83057767, + "num_input_tokens_seen": 92736755, + "step": 4292, + "time_per_iteration": 2.4967756271362305 + }, + { + "auxiliary_loss_clip": 0.01173729, + "auxiliary_loss_mlp": 0.01141635, + "balance_loss_clip": 1.00215435, + "balance_loss_mlp": 1.00068212, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 2.009796664027391, + "language_loss": 0.67359126, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69674492, + "num_input_tokens_seen": 92757655, + "step": 4293, + "time_per_iteration": 2.635145664215088 + }, + { + "auxiliary_loss_clip": 0.01123376, + "auxiliary_loss_mlp": 0.01141181, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00099099, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.8536834386821475, + "language_loss": 0.75714397, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77978957, + "num_input_tokens_seen": 92776100, + "step": 4294, + "time_per_iteration": 2.6776671409606934 + }, + { + "auxiliary_loss_clip": 0.01126137, + "auxiliary_loss_mlp": 0.01141152, + "balance_loss_clip": 1.00172675, + "balance_loss_mlp": 1.00077128, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 1.9687541399584374, + "language_loss": 0.81331474, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83598763, + "num_input_tokens_seen": 92798880, + "step": 4295, + "time_per_iteration": 2.7969346046447754 + }, + { + "auxiliary_loss_clip": 0.01042878, + "auxiliary_loss_mlp": 0.01141749, + "balance_loss_clip": 1.00147033, + "balance_loss_mlp": 1.00089121, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 2.238637361170385, + "language_loss": 0.72856903, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.75041527, + "num_input_tokens_seen": 92817750, + "step": 4296, + "time_per_iteration": 2.9778993129730225 + }, + { + "auxiliary_loss_clip": 0.01092399, + "auxiliary_loss_mlp": 0.01141907, + "balance_loss_clip": 1.00164664, + "balance_loss_mlp": 1.00085831, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.5152274072192984, + "language_loss": 0.86672157, + "learning_rate": 3.477492965085067e-06, + "loss": 0.88906467, + "num_input_tokens_seen": 92837995, + "step": 4297, + "time_per_iteration": 3.2508935928344727 + }, + { + "auxiliary_loss_clip": 0.01173833, + "auxiliary_loss_mlp": 0.01141848, + "balance_loss_clip": 1.00213337, + "balance_loss_mlp": 1.00108552, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 1.6822674317819704, + "language_loss": 0.84485996, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86801672, + "num_input_tokens_seen": 92857245, + "step": 4298, + "time_per_iteration": 2.558134078979492 + }, + { + "auxiliary_loss_clip": 0.01156881, + "auxiliary_loss_mlp": 0.00748243, + "balance_loss_clip": 1.00188911, + "balance_loss_mlp": 1.00019431, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.5169167628614737, + "language_loss": 0.83122921, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85028046, + "num_input_tokens_seen": 92873265, + "step": 4299, + "time_per_iteration": 2.5353758335113525 + }, + { + "auxiliary_loss_clip": 0.01141468, + "auxiliary_loss_mlp": 0.01140505, + "balance_loss_clip": 1.00194132, + "balance_loss_mlp": 1.00079131, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.7573257903290145, + "language_loss": 0.82446653, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84728628, + "num_input_tokens_seen": 92890880, + "step": 4300, + "time_per_iteration": 2.562181234359741 + }, + { + "auxiliary_loss_clip": 0.01157025, + "auxiliary_loss_mlp": 0.01141801, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00084841, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.245836095547184, + "language_loss": 0.67051971, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69350797, + "num_input_tokens_seen": 92910770, + "step": 4301, + "time_per_iteration": 2.6451363563537598 + }, + { + "auxiliary_loss_clip": 0.01156671, + "auxiliary_loss_mlp": 0.01141634, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00077629, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.8616912803190133, + "language_loss": 0.81987435, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.84285736, + "num_input_tokens_seen": 92929520, + "step": 4302, + "time_per_iteration": 2.522542953491211 + }, + { + "auxiliary_loss_clip": 0.01108739, + "auxiliary_loss_mlp": 0.01141243, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00095749, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 1.7189856484068515, + "language_loss": 0.91798121, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94048107, + "num_input_tokens_seen": 92947890, + "step": 4303, + "time_per_iteration": 2.652540922164917 + }, + { + "auxiliary_loss_clip": 0.01158284, + "auxiliary_loss_mlp": 0.01141339, + "balance_loss_clip": 1.00193667, + "balance_loss_mlp": 1.00086272, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.6922160148237264, + "language_loss": 0.67118585, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69418204, + "num_input_tokens_seen": 92967690, + "step": 4304, + "time_per_iteration": 2.6294825077056885 + }, + { + "auxiliary_loss_clip": 0.01124042, + "auxiliary_loss_mlp": 0.01141211, + "balance_loss_clip": 1.00176549, + "balance_loss_mlp": 1.00083005, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 1.9053620150718604, + "language_loss": 0.71814084, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.74079335, + "num_input_tokens_seen": 92986830, + "step": 4305, + "time_per_iteration": 2.6669094562530518 + }, + { + "auxiliary_loss_clip": 0.01115096, + "auxiliary_loss_mlp": 0.00748296, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.0002166, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.402526778032767, + "language_loss": 0.76043022, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77906406, + "num_input_tokens_seen": 93002740, + "step": 4306, + "time_per_iteration": 2.6412875652313232 + }, + { + "auxiliary_loss_clip": 0.01137275, + "auxiliary_loss_mlp": 0.01126667, + "balance_loss_clip": 1.00157404, + "balance_loss_mlp": 1.00001955, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8266106658357316, + "language_loss": 0.5709635, + "learning_rate": 3.474865258296403e-06, + "loss": 0.5936029, + "num_input_tokens_seen": 93058645, + "step": 4307, + "time_per_iteration": 3.1375436782836914 + }, + { + "auxiliary_loss_clip": 0.01141538, + "auxiliary_loss_mlp": 0.01140581, + "balance_loss_clip": 1.00198507, + "balance_loss_mlp": 1.00077188, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.9061536765829092, + "language_loss": 0.71601701, + "learning_rate": 3.474602179854327e-06, + "loss": 0.7388382, + "num_input_tokens_seen": 93077140, + "step": 4308, + "time_per_iteration": 2.613628387451172 + }, + { + "auxiliary_loss_clip": 0.01173598, + "auxiliary_loss_mlp": 0.01141176, + "balance_loss_clip": 1.00204837, + "balance_loss_mlp": 1.00089002, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 2.2692887666718367, + "language_loss": 0.84175122, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86489892, + "num_input_tokens_seen": 93093580, + "step": 4309, + "time_per_iteration": 2.5034425258636475 + }, + { + "auxiliary_loss_clip": 0.01156888, + "auxiliary_loss_mlp": 0.01140692, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00097847, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.479369751601758, + "language_loss": 0.84671003, + "learning_rate": 3.474075855228966e-06, + "loss": 0.86968583, + "num_input_tokens_seen": 93112345, + "step": 4310, + "time_per_iteration": 2.5619025230407715 + }, + { + "auxiliary_loss_clip": 0.01157319, + "auxiliary_loss_mlp": 0.01141659, + "balance_loss_clip": 1.0020963, + "balance_loss_mlp": 1.00089657, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 1.958073720610728, + "language_loss": 0.77334166, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79633141, + "num_input_tokens_seen": 93131545, + "step": 4311, + "time_per_iteration": 2.6120598316192627 + }, + { + "auxiliary_loss_clip": 0.01143277, + "auxiliary_loss_mlp": 0.01141446, + "balance_loss_clip": 1.00211644, + "balance_loss_mlp": 1.00077915, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 1.9412028971089792, + "language_loss": 0.72461241, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74745965, + "num_input_tokens_seen": 93150730, + "step": 4312, + "time_per_iteration": 2.664433002471924 + }, + { + "auxiliary_loss_clip": 0.01173537, + "auxiliary_loss_mlp": 0.01140906, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.00071561, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 3.0651812915470775, + "language_loss": 0.70113885, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72428328, + "num_input_tokens_seen": 93167895, + "step": 4313, + "time_per_iteration": 2.502044916152954 + }, + { + "auxiliary_loss_clip": 0.01173383, + "auxiliary_loss_mlp": 0.01140608, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00098956, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.8630903733586186, + "language_loss": 0.80301791, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82615781, + "num_input_tokens_seen": 93187650, + "step": 4314, + "time_per_iteration": 2.5147218704223633 + }, + { + "auxiliary_loss_clip": 0.01124629, + "auxiliary_loss_mlp": 0.01141182, + "balance_loss_clip": 1.00179827, + "balance_loss_mlp": 1.00099194, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 1.926775567621106, + "language_loss": 0.66706777, + "learning_rate": 3.472759065640968e-06, + "loss": 0.68972588, + "num_input_tokens_seen": 93207370, + "step": 4315, + "time_per_iteration": 2.7057442665100098 + }, + { + "auxiliary_loss_clip": 0.01114663, + "auxiliary_loss_mlp": 0.01140102, + "balance_loss_clip": 1.00195062, + "balance_loss_mlp": 1.00105596, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.5134848066809976, + "language_loss": 0.7941854, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81673306, + "num_input_tokens_seen": 93227925, + "step": 4316, + "time_per_iteration": 2.6941301822662354 + }, + { + "auxiliary_loss_clip": 0.01112948, + "auxiliary_loss_mlp": 0.01140982, + "balance_loss_clip": 1.00177753, + "balance_loss_mlp": 1.00079203, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.7228101322136424, + "language_loss": 0.77874565, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.80128503, + "num_input_tokens_seen": 93250020, + "step": 4317, + "time_per_iteration": 2.7004523277282715 + }, + { + "auxiliary_loss_clip": 0.01173669, + "auxiliary_loss_mlp": 0.01141438, + "balance_loss_clip": 1.00210249, + "balance_loss_mlp": 1.00115252, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.4258258849087353, + "language_loss": 0.77284211, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79599315, + "num_input_tokens_seen": 93269070, + "step": 4318, + "time_per_iteration": 2.507023334503174 + }, + { + "auxiliary_loss_clip": 0.01173504, + "auxiliary_loss_mlp": 0.01140229, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00070691, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.6087633025928803, + "language_loss": 0.76100719, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78414452, + "num_input_tokens_seen": 93290250, + "step": 4319, + "time_per_iteration": 2.5378615856170654 + }, + { + "auxiliary_loss_clip": 0.011409, + "auxiliary_loss_mlp": 0.01140345, + "balance_loss_clip": 1.00190413, + "balance_loss_mlp": 1.00091779, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.927898140079663, + "language_loss": 0.76654536, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78935778, + "num_input_tokens_seen": 93310090, + "step": 4320, + "time_per_iteration": 2.6144349575042725 + }, + { + "auxiliary_loss_clip": 0.01126591, + "auxiliary_loss_mlp": 0.01140518, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00089979, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.5319042880537967, + "language_loss": 0.71212435, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73479545, + "num_input_tokens_seen": 93329570, + "step": 4321, + "time_per_iteration": 2.6588151454925537 + }, + { + "auxiliary_loss_clip": 0.01141763, + "auxiliary_loss_mlp": 0.01141068, + "balance_loss_clip": 1.00192595, + "balance_loss_mlp": 1.00078273, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.037766236039147, + "language_loss": 0.74614733, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76897568, + "num_input_tokens_seen": 93347920, + "step": 4322, + "time_per_iteration": 2.5872652530670166 + }, + { + "auxiliary_loss_clip": 0.01125129, + "auxiliary_loss_mlp": 0.01140794, + "balance_loss_clip": 1.00177646, + "balance_loss_mlp": 1.00098491, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 2.078489492510635, + "language_loss": 0.73681068, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75946987, + "num_input_tokens_seen": 93367145, + "step": 4323, + "time_per_iteration": 4.1185302734375 + }, + { + "auxiliary_loss_clip": 0.01156971, + "auxiliary_loss_mlp": 0.0074833, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.00029635, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 1.8280893961069862, + "language_loss": 0.66913015, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.68818319, + "num_input_tokens_seen": 93386555, + "step": 4324, + "time_per_iteration": 2.5636582374572754 + }, + { + "auxiliary_loss_clip": 0.01107816, + "auxiliary_loss_mlp": 0.01140859, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00105071, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 1.9805085116724523, + "language_loss": 0.70599049, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72847724, + "num_input_tokens_seen": 93405590, + "step": 4325, + "time_per_iteration": 4.346478700637817 + }, + { + "auxiliary_loss_clip": 0.01157014, + "auxiliary_loss_mlp": 0.01140575, + "balance_loss_clip": 1.00188887, + "balance_loss_mlp": 1.00076687, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 3.2335271799334513, + "language_loss": 0.72934914, + "learning_rate": 3.469857215756257e-06, + "loss": 0.75232506, + "num_input_tokens_seen": 93424750, + "step": 4326, + "time_per_iteration": 4.119899034500122 + }, + { + "auxiliary_loss_clip": 0.01142706, + "auxiliary_loss_mlp": 0.00748052, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.0001905, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 2.041121154853046, + "language_loss": 0.86355293, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88246047, + "num_input_tokens_seen": 93443465, + "step": 4327, + "time_per_iteration": 4.006536960601807 + }, + { + "auxiliary_loss_clip": 0.0117361, + "auxiliary_loss_mlp": 0.00748162, + "balance_loss_clip": 1.00208187, + "balance_loss_mlp": 1.00020385, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 1.5074958227377353, + "language_loss": 0.80455816, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82377589, + "num_input_tokens_seen": 93462580, + "step": 4328, + "time_per_iteration": 2.5237858295440674 + }, + { + "auxiliary_loss_clip": 0.01141845, + "auxiliary_loss_mlp": 0.00748085, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00018859, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.541860061163631, + "language_loss": 0.88037121, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89927053, + "num_input_tokens_seen": 93482790, + "step": 4329, + "time_per_iteration": 2.637540102005005 + }, + { + "auxiliary_loss_clip": 0.01173522, + "auxiliary_loss_mlp": 0.01140063, + "balance_loss_clip": 1.00206792, + "balance_loss_mlp": 1.00101733, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.078991790031582, + "language_loss": 0.7762742, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79941005, + "num_input_tokens_seen": 93498795, + "step": 4330, + "time_per_iteration": 2.5509111881256104 + }, + { + "auxiliary_loss_clip": 0.01173502, + "auxiliary_loss_mlp": 0.01141392, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.00110686, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 1.4315401365286229, + "language_loss": 0.75441539, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77756441, + "num_input_tokens_seen": 93518335, + "step": 4331, + "time_per_iteration": 2.5148637294769287 + }, + { + "auxiliary_loss_clip": 0.01146187, + "auxiliary_loss_mlp": 0.01140405, + "balance_loss_clip": 1.00222206, + "balance_loss_mlp": 1.00097799, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.403755117424989, + "language_loss": 0.69007659, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71294254, + "num_input_tokens_seen": 93539170, + "step": 4332, + "time_per_iteration": 2.624136447906494 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.01140808, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00080824, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 1.7971171288912318, + "language_loss": 0.79175168, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81443048, + "num_input_tokens_seen": 93558480, + "step": 4333, + "time_per_iteration": 2.7111318111419678 + }, + { + "auxiliary_loss_clip": 0.01173349, + "auxiliary_loss_mlp": 0.01140312, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00097966, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.6873999363640453, + "language_loss": 0.80608749, + "learning_rate": 3.467742542694501e-06, + "loss": 0.82922411, + "num_input_tokens_seen": 93575220, + "step": 4334, + "time_per_iteration": 2.4823074340820312 + }, + { + "auxiliary_loss_clip": 0.0114109, + "auxiliary_loss_mlp": 0.011406, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00079131, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.7461136660734087, + "language_loss": 0.7978158, + "learning_rate": 3.46747795800024e-06, + "loss": 0.8206327, + "num_input_tokens_seen": 93597015, + "step": 4335, + "time_per_iteration": 2.6476809978485107 + }, + { + "auxiliary_loss_clip": 0.01155046, + "auxiliary_loss_mlp": 0.01126833, + "balance_loss_clip": 1.00151587, + "balance_loss_mlp": 1.00018501, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.8387871698811712, + "language_loss": 0.60752833, + "learning_rate": 3.467213317659068e-06, + "loss": 0.63034707, + "num_input_tokens_seen": 93657775, + "step": 4336, + "time_per_iteration": 3.1161370277404785 + }, + { + "auxiliary_loss_clip": 0.01124801, + "auxiliary_loss_mlp": 0.01140821, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00091672, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 2.0427912846329734, + "language_loss": 0.77465534, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79731154, + "num_input_tokens_seen": 93676145, + "step": 4337, + "time_per_iteration": 2.6266016960144043 + }, + { + "auxiliary_loss_clip": 0.0114012, + "auxiliary_loss_mlp": 0.01140625, + "balance_loss_clip": 1.00181949, + "balance_loss_mlp": 1.0008167, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.037208598479417, + "language_loss": 0.74853665, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.77134407, + "num_input_tokens_seen": 93692480, + "step": 4338, + "time_per_iteration": 2.5549027919769287 + }, + { + "auxiliary_loss_clip": 0.0115786, + "auxiliary_loss_mlp": 0.01140771, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00096226, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.39453321106328, + "language_loss": 0.80748689, + "learning_rate": 3.466419062854447e-06, + "loss": 0.83047324, + "num_input_tokens_seen": 93710165, + "step": 4339, + "time_per_iteration": 2.5456385612487793 + }, + { + "auxiliary_loss_clip": 0.0110831, + "auxiliary_loss_mlp": 0.01139852, + "balance_loss_clip": 1.00167203, + "balance_loss_mlp": 1.00099742, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.5925849758859754, + "language_loss": 0.765136, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78761762, + "num_input_tokens_seen": 93730185, + "step": 4340, + "time_per_iteration": 2.6951792240142822 + }, + { + "auxiliary_loss_clip": 0.01095377, + "auxiliary_loss_mlp": 0.01140929, + "balance_loss_clip": 1.00172615, + "balance_loss_mlp": 1.00092959, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.682290199548833, + "language_loss": 0.82370913, + "learning_rate": 3.465889281600845e-06, + "loss": 0.8460722, + "num_input_tokens_seen": 93747690, + "step": 4341, + "time_per_iteration": 2.757678985595703 + }, + { + "auxiliary_loss_clip": 0.01173487, + "auxiliary_loss_mlp": 0.0114028, + "balance_loss_clip": 1.00203156, + "balance_loss_mlp": 1.0008533, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 1.8942493807436378, + "language_loss": 0.76490718, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78804481, + "num_input_tokens_seen": 93767405, + "step": 4342, + "time_per_iteration": 2.5564303398132324 + }, + { + "auxiliary_loss_clip": 0.01162544, + "auxiliary_loss_mlp": 0.01140061, + "balance_loss_clip": 1.00208914, + "balance_loss_mlp": 1.00063348, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.8531526269013754, + "language_loss": 0.65894532, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68197143, + "num_input_tokens_seen": 93789950, + "step": 4343, + "time_per_iteration": 2.6901726722717285 + }, + { + "auxiliary_loss_clip": 0.01094416, + "auxiliary_loss_mlp": 0.01140781, + "balance_loss_clip": 1.00156808, + "balance_loss_mlp": 1.00087714, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.060192545604871, + "language_loss": 0.73436856, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75672054, + "num_input_tokens_seen": 93807835, + "step": 4344, + "time_per_iteration": 2.677281141281128 + }, + { + "auxiliary_loss_clip": 0.0117365, + "auxiliary_loss_mlp": 0.01140872, + "balance_loss_clip": 1.00212705, + "balance_loss_mlp": 1.00087225, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 2.082250810466758, + "language_loss": 0.86740673, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.89055198, + "num_input_tokens_seen": 93825670, + "step": 4345, + "time_per_iteration": 2.5344059467315674 + }, + { + "auxiliary_loss_clip": 0.0114065, + "auxiliary_loss_mlp": 0.01140473, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00066388, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 2.0734475280882685, + "language_loss": 0.76306927, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78588045, + "num_input_tokens_seen": 93844045, + "step": 4346, + "time_per_iteration": 2.578491449356079 + }, + { + "auxiliary_loss_clip": 0.0115755, + "auxiliary_loss_mlp": 0.01140093, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00085628, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.525018874314906, + "language_loss": 0.75740218, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78037858, + "num_input_tokens_seen": 93864380, + "step": 4347, + "time_per_iteration": 2.5937447547912598 + }, + { + "auxiliary_loss_clip": 0.01126675, + "auxiliary_loss_mlp": 0.0114047, + "balance_loss_clip": 1.00200105, + "balance_loss_mlp": 1.0007565, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.3295496415018833, + "language_loss": 0.73440492, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75707638, + "num_input_tokens_seen": 93885475, + "step": 4348, + "time_per_iteration": 2.68035626411438 + }, + { + "auxiliary_loss_clip": 0.01123387, + "auxiliary_loss_mlp": 0.01141488, + "balance_loss_clip": 1.00165248, + "balance_loss_mlp": 1.00101209, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.864626360766458, + "language_loss": 0.91185498, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93450379, + "num_input_tokens_seen": 93905545, + "step": 4349, + "time_per_iteration": 2.6651153564453125 + }, + { + "auxiliary_loss_clip": 0.0116272, + "auxiliary_loss_mlp": 0.01139978, + "balance_loss_clip": 1.00238752, + "balance_loss_mlp": 1.00074196, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.736287777898867, + "language_loss": 0.80244768, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82547468, + "num_input_tokens_seen": 93924185, + "step": 4350, + "time_per_iteration": 2.5652787685394287 + }, + { + "auxiliary_loss_clip": 0.01156792, + "auxiliary_loss_mlp": 0.01140338, + "balance_loss_clip": 1.00193214, + "balance_loss_mlp": 1.00091076, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.6694468165049179, + "language_loss": 0.62104309, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64401442, + "num_input_tokens_seen": 93942825, + "step": 4351, + "time_per_iteration": 2.5411362648010254 + }, + { + "auxiliary_loss_clip": 0.01158192, + "auxiliary_loss_mlp": 0.011405, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.0007863, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 1.840725172502485, + "language_loss": 0.83434534, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85733223, + "num_input_tokens_seen": 93962045, + "step": 4352, + "time_per_iteration": 2.556131601333618 + }, + { + "auxiliary_loss_clip": 0.01153361, + "auxiliary_loss_mlp": 0.01125982, + "balance_loss_clip": 1.00146031, + "balance_loss_mlp": 1.00009727, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.7935962932811489, + "language_loss": 0.70635188, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72914529, + "num_input_tokens_seen": 94021175, + "step": 4353, + "time_per_iteration": 3.019699811935425 + }, + { + "auxiliary_loss_clip": 0.01142657, + "auxiliary_loss_mlp": 0.01140547, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.00092912, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 2.820656215767681, + "language_loss": 0.77486485, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79769695, + "num_input_tokens_seen": 94043370, + "step": 4354, + "time_per_iteration": 2.6604111194610596 + }, + { + "auxiliary_loss_clip": 0.01109874, + "auxiliary_loss_mlp": 0.01140945, + "balance_loss_clip": 1.0019058, + "balance_loss_mlp": 1.00104058, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 1.7608438219188831, + "language_loss": 0.68467468, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70718288, + "num_input_tokens_seen": 94063510, + "step": 4355, + "time_per_iteration": 2.697868824005127 + }, + { + "auxiliary_loss_clip": 0.01108864, + "auxiliary_loss_mlp": 0.01140012, + "balance_loss_clip": 1.00177765, + "balance_loss_mlp": 1.00087094, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.8844725055134677, + "language_loss": 0.67067206, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69316077, + "num_input_tokens_seen": 94083865, + "step": 4356, + "time_per_iteration": 2.7051944732666016 + }, + { + "auxiliary_loss_clip": 0.01153319, + "auxiliary_loss_mlp": 0.01126021, + "balance_loss_clip": 1.00154662, + "balance_loss_mlp": 1.0001359, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6756992070134784, + "language_loss": 0.53214484, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.5549382, + "num_input_tokens_seen": 94144095, + "step": 4357, + "time_per_iteration": 3.0501749515533447 + }, + { + "auxiliary_loss_clip": 0.01158318, + "auxiliary_loss_mlp": 0.01140497, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00078416, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.7251591042479282, + "language_loss": 0.84039533, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86338341, + "num_input_tokens_seen": 94163035, + "step": 4358, + "time_per_iteration": 2.618708610534668 + }, + { + "auxiliary_loss_clip": 0.0114303, + "auxiliary_loss_mlp": 0.01140949, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00075936, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.0867771535766475, + "language_loss": 0.67044902, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69328874, + "num_input_tokens_seen": 94182520, + "step": 4359, + "time_per_iteration": 2.651634931564331 + }, + { + "auxiliary_loss_clip": 0.01141404, + "auxiliary_loss_mlp": 0.01140901, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.0009017, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.825428209867569, + "language_loss": 0.78140593, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80422902, + "num_input_tokens_seen": 94201795, + "step": 4360, + "time_per_iteration": 4.125424385070801 + }, + { + "auxiliary_loss_clip": 0.01141273, + "auxiliary_loss_mlp": 0.0114025, + "balance_loss_clip": 1.00183594, + "balance_loss_mlp": 1.00091863, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.9270010713543946, + "language_loss": 0.67993867, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70275384, + "num_input_tokens_seen": 94222390, + "step": 4361, + "time_per_iteration": 2.6550681591033936 + }, + { + "auxiliary_loss_clip": 0.01156939, + "auxiliary_loss_mlp": 0.01140644, + "balance_loss_clip": 1.00204146, + "balance_loss_mlp": 1.00102568, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.783623171601286, + "language_loss": 0.84544474, + "learning_rate": 3.46031316964119e-06, + "loss": 0.8684206, + "num_input_tokens_seen": 94239980, + "step": 4362, + "time_per_iteration": 2.5289878845214844 + }, + { + "auxiliary_loss_clip": 0.0112481, + "auxiliary_loss_mlp": 0.01140328, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00099611, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.805659011139174, + "language_loss": 0.6501193, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67277062, + "num_input_tokens_seen": 94260715, + "step": 4363, + "time_per_iteration": 4.066460609436035 + }, + { + "auxiliary_loss_clip": 0.01137954, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_clip": 1.00169146, + "balance_loss_mlp": 1.00000846, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8859608587031673, + "language_loss": 0.6115945, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.634233, + "num_input_tokens_seen": 94321285, + "step": 4364, + "time_per_iteration": 3.267756938934326 + }, + { + "auxiliary_loss_clip": 0.01173503, + "auxiliary_loss_mlp": 0.01140464, + "balance_loss_clip": 1.00209033, + "balance_loss_mlp": 1.00084662, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 3.3848682222443482, + "language_loss": 0.72273993, + "learning_rate": 3.459514586533184e-06, + "loss": 0.74587959, + "num_input_tokens_seen": 94335420, + "step": 4365, + "time_per_iteration": 3.921780824661255 + }, + { + "auxiliary_loss_clip": 0.01142016, + "auxiliary_loss_mlp": 0.00748141, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00018501, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.604084164117998, + "language_loss": 0.768502, + "learning_rate": 3.459248281460509e-06, + "loss": 0.78740358, + "num_input_tokens_seen": 94357440, + "step": 4366, + "time_per_iteration": 2.6644773483276367 + }, + { + "auxiliary_loss_clip": 0.01173451, + "auxiliary_loss_mlp": 0.01140416, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00079858, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 1.6501492188750595, + "language_loss": 0.76030552, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78344417, + "num_input_tokens_seen": 94375690, + "step": 4367, + "time_per_iteration": 2.4873476028442383 + }, + { + "auxiliary_loss_clip": 0.01158057, + "auxiliary_loss_mlp": 0.01139749, + "balance_loss_clip": 1.002002, + "balance_loss_mlp": 1.0007031, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 2.018816238055587, + "language_loss": 0.69251609, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71549422, + "num_input_tokens_seen": 94393190, + "step": 4368, + "time_per_iteration": 2.536879777908325 + }, + { + "auxiliary_loss_clip": 0.01140859, + "auxiliary_loss_mlp": 0.01139698, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00084293, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 2.1370959734547337, + "language_loss": 0.78808647, + "learning_rate": 3.458449034273841e-06, + "loss": 0.81089211, + "num_input_tokens_seen": 94410975, + "step": 4369, + "time_per_iteration": 2.609696626663208 + }, + { + "auxiliary_loss_clip": 0.01140736, + "auxiliary_loss_mlp": 0.01140346, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.0009191, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 1.9799646595003486, + "language_loss": 0.83379269, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.8566035, + "num_input_tokens_seen": 94429985, + "step": 4370, + "time_per_iteration": 2.6440978050231934 + }, + { + "auxiliary_loss_clip": 0.01158192, + "auxiliary_loss_mlp": 0.01140746, + "balance_loss_clip": 1.00195575, + "balance_loss_mlp": 1.0009377, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 1.6666132039417672, + "language_loss": 0.712538, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73552734, + "num_input_tokens_seen": 94448660, + "step": 4371, + "time_per_iteration": 2.5341720581054688 + }, + { + "auxiliary_loss_clip": 0.01170042, + "auxiliary_loss_mlp": 0.01125887, + "balance_loss_clip": 1.00170088, + "balance_loss_mlp": 1.00000215, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.71040713828208, + "language_loss": 0.56425011, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58720946, + "num_input_tokens_seen": 94515630, + "step": 4372, + "time_per_iteration": 3.232717752456665 + }, + { + "auxiliary_loss_clip": 0.01140292, + "auxiliary_loss_mlp": 0.01139494, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00073457, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.642164566114738, + "language_loss": 0.77713954, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79993743, + "num_input_tokens_seen": 94535385, + "step": 4373, + "time_per_iteration": 2.672053813934326 + }, + { + "auxiliary_loss_clip": 0.0112323, + "auxiliary_loss_mlp": 0.01139931, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00098109, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 2.2262721171853515, + "language_loss": 0.71310532, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73573697, + "num_input_tokens_seen": 94552650, + "step": 4374, + "time_per_iteration": 2.6678836345672607 + }, + { + "auxiliary_loss_clip": 0.01140562, + "auxiliary_loss_mlp": 0.01140775, + "balance_loss_clip": 1.0020175, + "balance_loss_mlp": 1.00087094, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 1.794780492527642, + "language_loss": 0.81012523, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83293855, + "num_input_tokens_seen": 94574075, + "step": 4375, + "time_per_iteration": 2.6390159130096436 + }, + { + "auxiliary_loss_clip": 0.0114139, + "auxiliary_loss_mlp": 0.01139168, + "balance_loss_clip": 1.00195825, + "balance_loss_mlp": 1.00069427, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 2.315579960787764, + "language_loss": 0.66646945, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.68927503, + "num_input_tokens_seen": 94594255, + "step": 4376, + "time_per_iteration": 2.70491099357605 + }, + { + "auxiliary_loss_clip": 0.01113755, + "auxiliary_loss_mlp": 0.01140699, + "balance_loss_clip": 1.00222301, + "balance_loss_mlp": 1.00098586, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 2.4282388376392667, + "language_loss": 0.69586265, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71840715, + "num_input_tokens_seen": 94611410, + "step": 4377, + "time_per_iteration": 2.649545669555664 + }, + { + "auxiliary_loss_clip": 0.01156678, + "auxiliary_loss_mlp": 0.01140133, + "balance_loss_clip": 1.00199878, + "balance_loss_mlp": 1.00080097, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.7277463163375368, + "language_loss": 0.78532922, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.80829734, + "num_input_tokens_seen": 94636575, + "step": 4378, + "time_per_iteration": 2.8085758686065674 + }, + { + "auxiliary_loss_clip": 0.01140158, + "auxiliary_loss_mlp": 0.01140281, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.00094914, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.040979464815911, + "language_loss": 0.76742315, + "learning_rate": 3.455781283723846e-06, + "loss": 0.79022753, + "num_input_tokens_seen": 94654345, + "step": 4379, + "time_per_iteration": 2.5857441425323486 + }, + { + "auxiliary_loss_clip": 0.01123689, + "auxiliary_loss_mlp": 0.01141194, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00090885, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.2104568597399017, + "language_loss": 0.77873534, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.80138415, + "num_input_tokens_seen": 94673985, + "step": 4380, + "time_per_iteration": 2.665771722793579 + }, + { + "auxiliary_loss_clip": 0.01142608, + "auxiliary_loss_mlp": 0.01140285, + "balance_loss_clip": 1.00183046, + "balance_loss_mlp": 1.00076199, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 1.9009756346797435, + "language_loss": 0.64447784, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.66730672, + "num_input_tokens_seen": 94693145, + "step": 4381, + "time_per_iteration": 2.659759283065796 + }, + { + "auxiliary_loss_clip": 0.01158081, + "auxiliary_loss_mlp": 0.01140193, + "balance_loss_clip": 1.0020597, + "balance_loss_mlp": 1.00086069, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.9431243679150083, + "language_loss": 0.82770324, + "learning_rate": 3.454979881632595e-06, + "loss": 0.85068595, + "num_input_tokens_seen": 94710185, + "step": 4382, + "time_per_iteration": 2.536336660385132 + }, + { + "auxiliary_loss_clip": 0.01124911, + "auxiliary_loss_mlp": 0.01140788, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.00088418, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.4545512495663866, + "language_loss": 0.6965912, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.71924818, + "num_input_tokens_seen": 94730280, + "step": 4383, + "time_per_iteration": 2.752204179763794 + }, + { + "auxiliary_loss_clip": 0.01158138, + "auxiliary_loss_mlp": 0.01140054, + "balance_loss_clip": 1.00214481, + "balance_loss_mlp": 1.00081801, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.8805599791806091, + "language_loss": 0.69257843, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71556032, + "num_input_tokens_seen": 94748560, + "step": 4384, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.01157546, + "auxiliary_loss_mlp": 0.01139494, + "balance_loss_clip": 1.00205672, + "balance_loss_mlp": 1.00073409, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.2545649579975136, + "language_loss": 0.70175809, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72472847, + "num_input_tokens_seen": 94767570, + "step": 4385, + "time_per_iteration": 2.5968434810638428 + }, + { + "auxiliary_loss_clip": 0.0112611, + "auxiliary_loss_mlp": 0.01140391, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00105882, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.8683825916425363, + "language_loss": 0.85493708, + "learning_rate": 3.453910573136482e-06, + "loss": 0.8776021, + "num_input_tokens_seen": 94784985, + "step": 4386, + "time_per_iteration": 2.645568370819092 + }, + { + "auxiliary_loss_clip": 0.01140914, + "auxiliary_loss_mlp": 0.01140262, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.0008353, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.4709558307801838, + "language_loss": 0.76688707, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.7896989, + "num_input_tokens_seen": 94802545, + "step": 4387, + "time_per_iteration": 2.583768367767334 + }, + { + "auxiliary_loss_clip": 0.01156905, + "auxiliary_loss_mlp": 0.01140633, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00111008, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 1.9068708395092189, + "language_loss": 0.76054776, + "learning_rate": 3.453375588053264e-06, + "loss": 0.7835232, + "num_input_tokens_seen": 94820730, + "step": 4388, + "time_per_iteration": 2.5601751804351807 + }, + { + "auxiliary_loss_clip": 0.01173382, + "auxiliary_loss_mlp": 0.01139876, + "balance_loss_clip": 1.00209343, + "balance_loss_mlp": 1.00063944, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 2.104449130581893, + "language_loss": 0.86300349, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88613605, + "num_input_tokens_seen": 94839175, + "step": 4389, + "time_per_iteration": 2.5205957889556885 + }, + { + "auxiliary_loss_clip": 0.01153728, + "auxiliary_loss_mlp": 0.01125872, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 0.99998707, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 7.176854176195039, + "language_loss": 0.60347903, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62627506, + "num_input_tokens_seen": 94898865, + "step": 4390, + "time_per_iteration": 3.1556625366210938 + }, + { + "auxiliary_loss_clip": 0.01146864, + "auxiliary_loss_mlp": 0.01140605, + "balance_loss_clip": 1.00211477, + "balance_loss_mlp": 1.00089192, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.5468826942257592, + "language_loss": 0.7720319, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79490656, + "num_input_tokens_seen": 94917490, + "step": 4391, + "time_per_iteration": 2.6108341217041016 + }, + { + "auxiliary_loss_clip": 0.01123068, + "auxiliary_loss_mlp": 0.00747626, + "balance_loss_clip": 1.00176406, + "balance_loss_mlp": 0.99999285, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.8353548719532118, + "language_loss": 0.58803403, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60674095, + "num_input_tokens_seen": 94969065, + "step": 4392, + "time_per_iteration": 3.1761152744293213 + }, + { + "auxiliary_loss_clip": 0.01158175, + "auxiliary_loss_mlp": 0.01139904, + "balance_loss_clip": 1.00217116, + "balance_loss_mlp": 1.00076342, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 1.7177373025789748, + "language_loss": 0.68928093, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.71226168, + "num_input_tokens_seen": 94988540, + "step": 4393, + "time_per_iteration": 2.588771104812622 + }, + { + "auxiliary_loss_clip": 0.01156837, + "auxiliary_loss_mlp": 0.01141413, + "balance_loss_clip": 1.00199497, + "balance_loss_mlp": 1.00084174, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.8023768445830444, + "language_loss": 0.83540285, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.85838532, + "num_input_tokens_seen": 95004810, + "step": 4394, + "time_per_iteration": 2.5499749183654785 + }, + { + "auxiliary_loss_clip": 0.01141409, + "auxiliary_loss_mlp": 0.01140914, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00081968, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 5.926179654972435, + "language_loss": 0.70063043, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72345358, + "num_input_tokens_seen": 95024085, + "step": 4395, + "time_per_iteration": 2.5974113941192627 + }, + { + "auxiliary_loss_clip": 0.01125821, + "auxiliary_loss_mlp": 0.01140102, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.0006752, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.6979039625723344, + "language_loss": 0.86744034, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89009964, + "num_input_tokens_seen": 95042515, + "step": 4396, + "time_per_iteration": 2.6549599170684814 + }, + { + "auxiliary_loss_clip": 0.01093203, + "auxiliary_loss_mlp": 0.01125898, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00001264, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.8032931008987383, + "language_loss": 0.55062318, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57281423, + "num_input_tokens_seen": 95094835, + "step": 4397, + "time_per_iteration": 3.0655534267425537 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.0114086, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00124192, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.035083977054888, + "language_loss": 0.78332686, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80631107, + "num_input_tokens_seen": 95113480, + "step": 4398, + "time_per_iteration": 4.324145555496216 + }, + { + "auxiliary_loss_clip": 0.01156493, + "auxiliary_loss_mlp": 0.01140391, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00067735, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 1.609454266912927, + "language_loss": 0.67205793, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69502676, + "num_input_tokens_seen": 95132580, + "step": 4399, + "time_per_iteration": 2.5587544441223145 + }, + { + "auxiliary_loss_clip": 0.0112495, + "auxiliary_loss_mlp": 0.01140353, + "balance_loss_clip": 1.00189197, + "balance_loss_mlp": 1.00092578, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 1.6054239821236356, + "language_loss": 0.8628695, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.8855226, + "num_input_tokens_seen": 95152375, + "step": 4400, + "time_per_iteration": 2.6467063426971436 + }, + { + "auxiliary_loss_clip": 0.01141408, + "auxiliary_loss_mlp": 0.01140952, + "balance_loss_clip": 1.00196934, + "balance_loss_mlp": 1.00085711, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 1.787516875962333, + "language_loss": 0.75858349, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.78140712, + "num_input_tokens_seen": 95170265, + "step": 4401, + "time_per_iteration": 3.9887208938598633 + }, + { + "auxiliary_loss_clip": 0.01125141, + "auxiliary_loss_mlp": 0.01141191, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00109625, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.755549014951038, + "language_loss": 0.88334703, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90601039, + "num_input_tokens_seen": 95188655, + "step": 4402, + "time_per_iteration": 2.6613359451293945 + }, + { + "auxiliary_loss_clip": 0.01129776, + "auxiliary_loss_mlp": 0.01140819, + "balance_loss_clip": 1.00207627, + "balance_loss_mlp": 1.00091457, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 2.9185759495210046, + "language_loss": 0.78012371, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80282962, + "num_input_tokens_seen": 95209615, + "step": 4403, + "time_per_iteration": 4.570936441421509 + }, + { + "auxiliary_loss_clip": 0.01157941, + "auxiliary_loss_mlp": 0.01140667, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.00085783, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 3.348228302583534, + "language_loss": 0.88096976, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.90395588, + "num_input_tokens_seen": 95227810, + "step": 4404, + "time_per_iteration": 2.559532880783081 + }, + { + "auxiliary_loss_clip": 0.0114233, + "auxiliary_loss_mlp": 0.01139902, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00076115, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7001594370927975, + "language_loss": 0.758039, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78086138, + "num_input_tokens_seen": 95245890, + "step": 4405, + "time_per_iteration": 2.5872669219970703 + }, + { + "auxiliary_loss_clip": 0.01173551, + "auxiliary_loss_mlp": 0.0114077, + "balance_loss_clip": 1.00223923, + "balance_loss_mlp": 1.00067496, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 2.0796139710504518, + "language_loss": 0.6977374, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72088057, + "num_input_tokens_seen": 95264955, + "step": 4406, + "time_per_iteration": 2.509122610092163 + }, + { + "auxiliary_loss_clip": 0.01141261, + "auxiliary_loss_mlp": 0.01140243, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.000911, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.6044107740511806, + "language_loss": 0.83620453, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85901958, + "num_input_tokens_seen": 95284245, + "step": 4407, + "time_per_iteration": 2.619755744934082 + }, + { + "auxiliary_loss_clip": 0.01125656, + "auxiliary_loss_mlp": 0.01140094, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00057197, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.6295088484652775, + "language_loss": 0.75922555, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.781883, + "num_input_tokens_seen": 95307125, + "step": 4408, + "time_per_iteration": 2.8029098510742188 + }, + { + "auxiliary_loss_clip": 0.0112616, + "auxiliary_loss_mlp": 0.01139888, + "balance_loss_clip": 1.00198781, + "balance_loss_mlp": 1.00065184, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.7674810075041345, + "language_loss": 0.7077499, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73041034, + "num_input_tokens_seen": 95329150, + "step": 4409, + "time_per_iteration": 2.8018784523010254 + }, + { + "auxiliary_loss_clip": 0.01156389, + "auxiliary_loss_mlp": 0.01139805, + "balance_loss_clip": 1.00195193, + "balance_loss_mlp": 1.00066423, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 2.088961589061443, + "language_loss": 0.73559028, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75855219, + "num_input_tokens_seen": 95349880, + "step": 4410, + "time_per_iteration": 2.5918092727661133 + }, + { + "auxiliary_loss_clip": 0.01158156, + "auxiliary_loss_mlp": 0.01140434, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00091195, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.9704051719823428, + "language_loss": 0.73245203, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75543797, + "num_input_tokens_seen": 95368570, + "step": 4411, + "time_per_iteration": 2.5521037578582764 + }, + { + "auxiliary_loss_clip": 0.01113717, + "auxiliary_loss_mlp": 0.01140104, + "balance_loss_clip": 1.00196278, + "balance_loss_mlp": 1.00077224, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.026392365530762, + "language_loss": 0.82061428, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84315252, + "num_input_tokens_seen": 95387065, + "step": 4412, + "time_per_iteration": 2.675788164138794 + }, + { + "auxiliary_loss_clip": 0.01156351, + "auxiliary_loss_mlp": 0.01140485, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00124836, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.6415506420596453, + "language_loss": 0.74247277, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76544118, + "num_input_tokens_seen": 95406345, + "step": 4413, + "time_per_iteration": 2.554436445236206 + }, + { + "auxiliary_loss_clip": 0.01155217, + "auxiliary_loss_mlp": 0.01125959, + "balance_loss_clip": 1.00200796, + "balance_loss_mlp": 1.00007427, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8708338780845918, + "language_loss": 0.56989872, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59271049, + "num_input_tokens_seen": 95463595, + "step": 4414, + "time_per_iteration": 3.0897347927093506 + }, + { + "auxiliary_loss_clip": 0.01141053, + "auxiliary_loss_mlp": 0.0113971, + "balance_loss_clip": 1.00198817, + "balance_loss_mlp": 1.00095057, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.6170832908623938, + "language_loss": 0.74370587, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76651347, + "num_input_tokens_seen": 95484115, + "step": 4415, + "time_per_iteration": 2.6802210807800293 + }, + { + "auxiliary_loss_clip": 0.01109604, + "auxiliary_loss_mlp": 0.01140714, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00080967, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 1.9967466463738797, + "language_loss": 0.86852479, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.89102799, + "num_input_tokens_seen": 95501435, + "step": 4416, + "time_per_iteration": 2.6728591918945312 + }, + { + "auxiliary_loss_clip": 0.01158013, + "auxiliary_loss_mlp": 0.01140831, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.00102234, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6201571091211313, + "language_loss": 0.76341105, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78639948, + "num_input_tokens_seen": 95520135, + "step": 4417, + "time_per_iteration": 2.5698814392089844 + }, + { + "auxiliary_loss_clip": 0.01139921, + "auxiliary_loss_mlp": 0.01140308, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00078559, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.4943934013052118, + "language_loss": 0.79969835, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82250059, + "num_input_tokens_seen": 95541705, + "step": 4418, + "time_per_iteration": 2.63368821144104 + }, + { + "auxiliary_loss_clip": 0.0115786, + "auxiliary_loss_mlp": 0.01140535, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.00091732, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 1.9426617585472465, + "language_loss": 0.6694355, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69241947, + "num_input_tokens_seen": 95560300, + "step": 4419, + "time_per_iteration": 2.5770699977874756 + }, + { + "auxiliary_loss_clip": 0.01173336, + "auxiliary_loss_mlp": 0.01140411, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00088835, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 2.389976320815573, + "language_loss": 0.79024619, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81338364, + "num_input_tokens_seen": 95580150, + "step": 4420, + "time_per_iteration": 2.6074209213256836 + }, + { + "auxiliary_loss_clip": 0.01139877, + "auxiliary_loss_mlp": 0.01140166, + "balance_loss_clip": 1.00198781, + "balance_loss_mlp": 1.00083411, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 1.957011598414368, + "language_loss": 0.81732428, + "learning_rate": 3.444516567560673e-06, + "loss": 0.84012473, + "num_input_tokens_seen": 95597570, + "step": 4421, + "time_per_iteration": 2.5926308631896973 + }, + { + "auxiliary_loss_clip": 0.01156595, + "auxiliary_loss_mlp": 0.01139556, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00079644, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.4480611143419382, + "language_loss": 0.65544212, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67840356, + "num_input_tokens_seen": 95619415, + "step": 4422, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.01156777, + "auxiliary_loss_mlp": 0.01140828, + "balance_loss_clip": 1.00190651, + "balance_loss_mlp": 1.00092399, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.1245956797073138, + "language_loss": 0.73749864, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76047468, + "num_input_tokens_seen": 95639155, + "step": 4423, + "time_per_iteration": 2.6289761066436768 + }, + { + "auxiliary_loss_clip": 0.01157863, + "auxiliary_loss_mlp": 0.01140843, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.00093865, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.522494819991859, + "language_loss": 0.77930558, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80229259, + "num_input_tokens_seen": 95663320, + "step": 4424, + "time_per_iteration": 2.7797791957855225 + }, + { + "auxiliary_loss_clip": 0.01157887, + "auxiliary_loss_mlp": 0.01140536, + "balance_loss_clip": 1.00205362, + "balance_loss_mlp": 1.0010134, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 1.8931008578759518, + "language_loss": 0.79013085, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81311512, + "num_input_tokens_seen": 95680260, + "step": 4425, + "time_per_iteration": 2.5278522968292236 + }, + { + "auxiliary_loss_clip": 0.0113989, + "auxiliary_loss_mlp": 0.01140034, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00108349, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.4762239527141359, + "language_loss": 0.79502189, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.81782115, + "num_input_tokens_seen": 95701140, + "step": 4426, + "time_per_iteration": 2.6253082752227783 + }, + { + "auxiliary_loss_clip": 0.01173383, + "auxiliary_loss_mlp": 0.01140843, + "balance_loss_clip": 1.00217891, + "balance_loss_mlp": 1.0010345, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.699061462144729, + "language_loss": 0.76882297, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79196525, + "num_input_tokens_seen": 95722060, + "step": 4427, + "time_per_iteration": 2.5715928077697754 + }, + { + "auxiliary_loss_clip": 0.01123452, + "auxiliary_loss_mlp": 0.01139357, + "balance_loss_clip": 1.00184119, + "balance_loss_mlp": 1.00078821, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.5099912609433792, + "language_loss": 0.76871884, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.79134691, + "num_input_tokens_seen": 95742495, + "step": 4428, + "time_per_iteration": 2.686248779296875 + }, + { + "auxiliary_loss_clip": 0.01125654, + "auxiliary_loss_mlp": 0.00748319, + "balance_loss_clip": 1.00179636, + "balance_loss_mlp": 1.0003643, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.7310177489012297, + "language_loss": 0.82551134, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.8442511, + "num_input_tokens_seen": 95761510, + "step": 4429, + "time_per_iteration": 2.6262271404266357 + }, + { + "auxiliary_loss_clip": 0.01124912, + "auxiliary_loss_mlp": 0.01140311, + "balance_loss_clip": 1.00181472, + "balance_loss_mlp": 1.00069296, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 1.6543280898816197, + "language_loss": 0.71953309, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74218535, + "num_input_tokens_seen": 95782385, + "step": 4430, + "time_per_iteration": 2.632938861846924 + }, + { + "auxiliary_loss_clip": 0.01173145, + "auxiliary_loss_mlp": 0.01140898, + "balance_loss_clip": 1.00205457, + "balance_loss_mlp": 1.00080276, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 1.8820399731510167, + "language_loss": 0.8201009, + "learning_rate": 3.441820222206035e-06, + "loss": 0.84324133, + "num_input_tokens_seen": 95800595, + "step": 4431, + "time_per_iteration": 2.5086987018585205 + }, + { + "auxiliary_loss_clip": 0.0115673, + "auxiliary_loss_mlp": 0.01140815, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00100684, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.1819859933550605, + "language_loss": 0.76309061, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78606606, + "num_input_tokens_seen": 95818480, + "step": 4432, + "time_per_iteration": 2.5671005249023438 + }, + { + "auxiliary_loss_clip": 0.01110602, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00068736, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 1.8370709343974156, + "language_loss": 0.83030367, + "learning_rate": 3.441280296720154e-06, + "loss": 0.85281467, + "num_input_tokens_seen": 95837205, + "step": 4433, + "time_per_iteration": 2.695155382156372 + }, + { + "auxiliary_loss_clip": 0.01158019, + "auxiliary_loss_mlp": 0.01140951, + "balance_loss_clip": 1.00205028, + "balance_loss_mlp": 1.00104654, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.0073410451105893, + "language_loss": 0.76365906, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78664875, + "num_input_tokens_seen": 95858395, + "step": 4434, + "time_per_iteration": 2.624159336090088 + }, + { + "auxiliary_loss_clip": 0.01173246, + "auxiliary_loss_mlp": 0.01140082, + "balance_loss_clip": 1.00213337, + "balance_loss_mlp": 1.00094092, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 1.772389999781916, + "language_loss": 0.82321262, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84634584, + "num_input_tokens_seen": 95877875, + "step": 4435, + "time_per_iteration": 2.5452985763549805 + }, + { + "auxiliary_loss_clip": 0.01114025, + "auxiliary_loss_mlp": 0.01141224, + "balance_loss_clip": 1.0017792, + "balance_loss_mlp": 1.00132012, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.0152305513983193, + "language_loss": 0.87335759, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89591008, + "num_input_tokens_seen": 95895820, + "step": 4436, + "time_per_iteration": 4.095678806304932 + }, + { + "auxiliary_loss_clip": 0.01141026, + "auxiliary_loss_mlp": 0.01140587, + "balance_loss_clip": 1.00176978, + "balance_loss_mlp": 1.00077796, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.422399052234263, + "language_loss": 0.78706217, + "learning_rate": 3.440199789988407e-06, + "loss": 0.80987823, + "num_input_tokens_seen": 95918025, + "step": 4437, + "time_per_iteration": 2.646040201187134 + }, + { + "auxiliary_loss_clip": 0.01093803, + "auxiliary_loss_mlp": 0.01140172, + "balance_loss_clip": 1.00170064, + "balance_loss_mlp": 1.00103092, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 1.9713972707963185, + "language_loss": 0.64152849, + "learning_rate": 3.439929526748556e-06, + "loss": 0.66386825, + "num_input_tokens_seen": 95937725, + "step": 4438, + "time_per_iteration": 4.437206268310547 + }, + { + "auxiliary_loss_clip": 0.01094245, + "auxiliary_loss_mlp": 0.01140592, + "balance_loss_clip": 1.00172842, + "balance_loss_mlp": 1.00087857, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.8821471512035686, + "language_loss": 0.75714153, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77948993, + "num_input_tokens_seen": 95956335, + "step": 4439, + "time_per_iteration": 3.054136037826538 + }, + { + "auxiliary_loss_clip": 0.01090282, + "auxiliary_loss_mlp": 0.01140253, + "balance_loss_clip": 1.00150359, + "balance_loss_mlp": 1.00063503, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.8014203377278608, + "language_loss": 0.7138567, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.73616207, + "num_input_tokens_seen": 95977135, + "step": 4440, + "time_per_iteration": 4.133834600448608 + }, + { + "auxiliary_loss_clip": 0.01141314, + "auxiliary_loss_mlp": 0.01140024, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00088251, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.8928149285269835, + "language_loss": 0.6686846, + "learning_rate": 3.439118409456376e-06, + "loss": 0.69149798, + "num_input_tokens_seen": 95995435, + "step": 4441, + "time_per_iteration": 3.976530075073242 + }, + { + "auxiliary_loss_clip": 0.01157775, + "auxiliary_loss_mlp": 0.01140366, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00074792, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.6146062309133595, + "language_loss": 0.76045883, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78344023, + "num_input_tokens_seen": 96016340, + "step": 4442, + "time_per_iteration": 2.6333162784576416 + }, + { + "auxiliary_loss_clip": 0.01090561, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_clip": 1.00194848, + "balance_loss_mlp": 1.00019073, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.9296624644522629, + "language_loss": 0.61251712, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63467586, + "num_input_tokens_seen": 96071205, + "step": 4443, + "time_per_iteration": 3.2277004718780518 + }, + { + "auxiliary_loss_clip": 0.01125938, + "auxiliary_loss_mlp": 0.01140417, + "balance_loss_clip": 1.0019058, + "balance_loss_mlp": 1.00079918, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.4661545275873356, + "language_loss": 0.7621519, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78481543, + "num_input_tokens_seen": 96094240, + "step": 4444, + "time_per_iteration": 2.8453755378723145 + }, + { + "auxiliary_loss_clip": 0.01156489, + "auxiliary_loss_mlp": 0.0114015, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00072253, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 1.580521303563002, + "language_loss": 0.80369675, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82666314, + "num_input_tokens_seen": 96114105, + "step": 4445, + "time_per_iteration": 2.600614309310913 + }, + { + "auxiliary_loss_clip": 0.0114046, + "auxiliary_loss_mlp": 0.01140406, + "balance_loss_clip": 1.001881, + "balance_loss_mlp": 1.00069332, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 1.7899516873969998, + "language_loss": 0.88989544, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91270411, + "num_input_tokens_seen": 96132140, + "step": 4446, + "time_per_iteration": 2.5972442626953125 + }, + { + "auxiliary_loss_clip": 0.01157515, + "auxiliary_loss_mlp": 0.01139494, + "balance_loss_clip": 1.00204599, + "balance_loss_mlp": 1.00073445, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 2.535034342821622, + "language_loss": 0.68418336, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70715344, + "num_input_tokens_seen": 96152090, + "step": 4447, + "time_per_iteration": 2.621061086654663 + }, + { + "auxiliary_loss_clip": 0.0115667, + "auxiliary_loss_mlp": 0.01140151, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.00062823, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.8374025120884887, + "language_loss": 0.83033925, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85330743, + "num_input_tokens_seen": 96170015, + "step": 4448, + "time_per_iteration": 2.5870797634124756 + }, + { + "auxiliary_loss_clip": 0.01125013, + "auxiliary_loss_mlp": 0.01139687, + "balance_loss_clip": 1.00167251, + "balance_loss_mlp": 1.00092745, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 1.5746177696217196, + "language_loss": 0.84382522, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86647224, + "num_input_tokens_seen": 96188065, + "step": 4449, + "time_per_iteration": 2.6519789695739746 + }, + { + "auxiliary_loss_clip": 0.01140565, + "auxiliary_loss_mlp": 0.01140823, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.0008235, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.571685365786515, + "language_loss": 0.8382163, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86103022, + "num_input_tokens_seen": 96205780, + "step": 4450, + "time_per_iteration": 2.6254723072052 + }, + { + "auxiliary_loss_clip": 0.01124751, + "auxiliary_loss_mlp": 0.01139353, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.00087941, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 2.136174002155836, + "language_loss": 0.80806601, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83070701, + "num_input_tokens_seen": 96224990, + "step": 4451, + "time_per_iteration": 2.6556477546691895 + }, + { + "auxiliary_loss_clip": 0.01156631, + "auxiliary_loss_mlp": 0.01139653, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00089383, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.638228246505315, + "language_loss": 0.86259526, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88555801, + "num_input_tokens_seen": 96245345, + "step": 4452, + "time_per_iteration": 2.615912914276123 + }, + { + "auxiliary_loss_clip": 0.01145654, + "auxiliary_loss_mlp": 0.01140176, + "balance_loss_clip": 1.00207281, + "balance_loss_mlp": 1.00065303, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 1.8993896367886032, + "language_loss": 0.83303201, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85589027, + "num_input_tokens_seen": 96259000, + "step": 4453, + "time_per_iteration": 2.5654001235961914 + }, + { + "auxiliary_loss_clip": 0.01156371, + "auxiliary_loss_mlp": 0.01140034, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00098848, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.7776585488576502, + "language_loss": 0.79123843, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81420255, + "num_input_tokens_seen": 96277000, + "step": 4454, + "time_per_iteration": 2.560483932495117 + }, + { + "auxiliary_loss_clip": 0.01140377, + "auxiliary_loss_mlp": 0.01140687, + "balance_loss_clip": 1.00186479, + "balance_loss_mlp": 1.0009737, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.5380778896504834, + "language_loss": 0.72825557, + "learning_rate": 3.435326705894206e-06, + "loss": 0.75106621, + "num_input_tokens_seen": 96297010, + "step": 4455, + "time_per_iteration": 2.6250104904174805 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.01139607, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.0008477, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5026757558798578, + "language_loss": 0.73680824, + "learning_rate": 3.435055461383471e-06, + "loss": 0.75945598, + "num_input_tokens_seen": 96315780, + "step": 4456, + "time_per_iteration": 2.6589207649230957 + }, + { + "auxiliary_loss_clip": 0.011567, + "auxiliary_loss_mlp": 0.01140486, + "balance_loss_clip": 1.0020206, + "balance_loss_mlp": 1.00086844, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.134321071299531, + "language_loss": 0.70737112, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73034304, + "num_input_tokens_seen": 96333465, + "step": 4457, + "time_per_iteration": 2.5453782081604004 + }, + { + "auxiliary_loss_clip": 0.01126131, + "auxiliary_loss_mlp": 0.01141202, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00091648, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 1.537451637721719, + "language_loss": 0.79012394, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81279731, + "num_input_tokens_seen": 96352005, + "step": 4458, + "time_per_iteration": 2.641789674758911 + }, + { + "auxiliary_loss_clip": 0.01120236, + "auxiliary_loss_mlp": 0.01125084, + "balance_loss_clip": 1.00159144, + "balance_loss_mlp": 0.99996167, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8657267483301497, + "language_loss": 0.58667469, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60912788, + "num_input_tokens_seen": 96406265, + "step": 4459, + "time_per_iteration": 3.185652494430542 + }, + { + "auxiliary_loss_clip": 0.0111534, + "auxiliary_loss_mlp": 0.0113977, + "balance_loss_clip": 1.00195205, + "balance_loss_mlp": 1.00091553, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 1.830852571166847, + "language_loss": 0.84967971, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87223083, + "num_input_tokens_seen": 96425225, + "step": 4460, + "time_per_iteration": 2.679380416870117 + }, + { + "auxiliary_loss_clip": 0.01157686, + "auxiliary_loss_mlp": 0.01139981, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00083983, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 2.209940333251825, + "language_loss": 0.68381894, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70679557, + "num_input_tokens_seen": 96443780, + "step": 4461, + "time_per_iteration": 2.569861888885498 + }, + { + "auxiliary_loss_clip": 0.01130352, + "auxiliary_loss_mlp": 0.01140992, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00108767, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.5307521931996277, + "language_loss": 0.6691978, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69191122, + "num_input_tokens_seen": 96464530, + "step": 4462, + "time_per_iteration": 2.666903495788574 + }, + { + "auxiliary_loss_clip": 0.0112344, + "auxiliary_loss_mlp": 0.01139897, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00075626, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.8337490429163616, + "language_loss": 0.69644356, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71907699, + "num_input_tokens_seen": 96483345, + "step": 4463, + "time_per_iteration": 2.6837098598480225 + }, + { + "auxiliary_loss_clip": 0.0114234, + "auxiliary_loss_mlp": 0.01139814, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00076795, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.501047869723069, + "language_loss": 0.77970093, + "learning_rate": 3.432883547133931e-06, + "loss": 0.80252248, + "num_input_tokens_seen": 96498305, + "step": 4464, + "time_per_iteration": 2.5469603538513184 + }, + { + "auxiliary_loss_clip": 0.01156796, + "auxiliary_loss_mlp": 0.01140178, + "balance_loss_clip": 1.00192773, + "balance_loss_mlp": 1.00084615, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 2.038308440788285, + "language_loss": 0.70530933, + "learning_rate": 3.432611813236704e-06, + "loss": 0.72827905, + "num_input_tokens_seen": 96519740, + "step": 4465, + "time_per_iteration": 2.606666088104248 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01125142, + "balance_loss_clip": 1.00166893, + "balance_loss_mlp": 1.00002027, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6727795294607612, + "language_loss": 0.53143775, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55406809, + "num_input_tokens_seen": 96588870, + "step": 4466, + "time_per_iteration": 3.3355164527893066 + }, + { + "auxiliary_loss_clip": 0.01141013, + "auxiliary_loss_mlp": 0.01139846, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.0008955, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.0102214121953774, + "language_loss": 0.74437749, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.76718605, + "num_input_tokens_seen": 96605100, + "step": 4467, + "time_per_iteration": 2.5888328552246094 + }, + { + "auxiliary_loss_clip": 0.01141212, + "auxiliary_loss_mlp": 0.00748281, + "balance_loss_clip": 1.00195324, + "balance_loss_mlp": 1.00038481, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.650092945790467, + "language_loss": 0.80178744, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82068241, + "num_input_tokens_seen": 96621410, + "step": 4468, + "time_per_iteration": 2.606257915496826 + }, + { + "auxiliary_loss_clip": 0.01170119, + "auxiliary_loss_mlp": 0.01125062, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 0.99993974, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.845070756955585, + "language_loss": 0.59612715, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61907887, + "num_input_tokens_seen": 96684810, + "step": 4469, + "time_per_iteration": 3.1934757232666016 + }, + { + "auxiliary_loss_clip": 0.01173172, + "auxiliary_loss_mlp": 0.01140367, + "balance_loss_clip": 1.0019989, + "balance_loss_mlp": 1.00084472, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.0661622306695087, + "language_loss": 0.81545043, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83858579, + "num_input_tokens_seen": 96701920, + "step": 4470, + "time_per_iteration": 2.6328787803649902 + }, + { + "auxiliary_loss_clip": 0.01139878, + "auxiliary_loss_mlp": 0.01138991, + "balance_loss_clip": 1.00171447, + "balance_loss_mlp": 1.00070822, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.8320338063419555, + "language_loss": 0.82778418, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.85057282, + "num_input_tokens_seen": 96721260, + "step": 4471, + "time_per_iteration": 2.6605234146118164 + }, + { + "auxiliary_loss_clip": 0.0115656, + "auxiliary_loss_mlp": 0.01139242, + "balance_loss_clip": 1.00184751, + "balance_loss_mlp": 1.00067353, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 2.0336415564070345, + "language_loss": 0.6948781, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71783608, + "num_input_tokens_seen": 96740385, + "step": 4472, + "time_per_iteration": 2.6469902992248535 + }, + { + "auxiliary_loss_clip": 0.01173228, + "auxiliary_loss_mlp": 0.01139814, + "balance_loss_clip": 1.00212455, + "balance_loss_mlp": 1.0009594, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.5908807865072707, + "language_loss": 0.67931199, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70244235, + "num_input_tokens_seen": 96761860, + "step": 4473, + "time_per_iteration": 4.028048753738403 + }, + { + "auxiliary_loss_clip": 0.01140013, + "auxiliary_loss_mlp": 0.01139512, + "balance_loss_clip": 1.00192511, + "balance_loss_mlp": 1.00103831, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.746949904704517, + "language_loss": 0.83186126, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85465658, + "num_input_tokens_seen": 96781890, + "step": 4474, + "time_per_iteration": 2.593252658843994 + }, + { + "auxiliary_loss_clip": 0.01162382, + "auxiliary_loss_mlp": 0.01139545, + "balance_loss_clip": 1.00232601, + "balance_loss_mlp": 1.00097632, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 1.8289822059852991, + "language_loss": 0.70725209, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.7302714, + "num_input_tokens_seen": 96800390, + "step": 4475, + "time_per_iteration": 2.5458264350891113 + }, + { + "auxiliary_loss_clip": 0.01123292, + "auxiliary_loss_mlp": 0.00748316, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00031257, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.6946988391159084, + "language_loss": 0.73430884, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75302494, + "num_input_tokens_seen": 96816685, + "step": 4476, + "time_per_iteration": 2.632106304168701 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01140041, + "balance_loss_clip": 1.00181997, + "balance_loss_mlp": 1.00070882, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 2.8534558738418676, + "language_loss": 0.80386126, + "learning_rate": 3.429346772085922e-06, + "loss": 0.82666159, + "num_input_tokens_seen": 96836285, + "step": 4477, + "time_per_iteration": 4.121751546859741 + }, + { + "auxiliary_loss_clip": 0.01107522, + "auxiliary_loss_mlp": 0.011397, + "balance_loss_clip": 1.00164199, + "balance_loss_mlp": 1.00084543, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 3.520233376231923, + "language_loss": 0.64776754, + "learning_rate": 3.429074332770984e-06, + "loss": 0.67023981, + "num_input_tokens_seen": 96857745, + "step": 4478, + "time_per_iteration": 4.371646404266357 + }, + { + "auxiliary_loss_clip": 0.01157747, + "auxiliary_loss_mlp": 0.0113954, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00087571, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.759687697998221, + "language_loss": 0.80511594, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.82808888, + "num_input_tokens_seen": 96877295, + "step": 4479, + "time_per_iteration": 2.565804958343506 + }, + { + "auxiliary_loss_clip": 0.01141268, + "auxiliary_loss_mlp": 0.00748338, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00045788, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.017789732278415, + "language_loss": 0.80880868, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.82770473, + "num_input_tokens_seen": 96896160, + "step": 4480, + "time_per_iteration": 2.6292834281921387 + }, + { + "auxiliary_loss_clip": 0.01124677, + "auxiliary_loss_mlp": 0.01139744, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00079393, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.9309547323372027, + "language_loss": 0.77898538, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80162966, + "num_input_tokens_seen": 96915410, + "step": 4481, + "time_per_iteration": 2.6430912017822266 + }, + { + "auxiliary_loss_clip": 0.01157892, + "auxiliary_loss_mlp": 0.01139806, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.00095105, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 1.6231563009844596, + "language_loss": 0.74106956, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76404655, + "num_input_tokens_seen": 96937865, + "step": 4482, + "time_per_iteration": 2.60017728805542 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01139324, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00085068, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 2.6968753107143284, + "language_loss": 0.72202498, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74483204, + "num_input_tokens_seen": 96957710, + "step": 4483, + "time_per_iteration": 2.6442067623138428 + }, + { + "auxiliary_loss_clip": 0.01157907, + "auxiliary_loss_mlp": 0.01140239, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00100267, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.078921640432159, + "language_loss": 0.86715531, + "learning_rate": 3.427438559239605e-06, + "loss": 0.89013678, + "num_input_tokens_seen": 96975890, + "step": 4484, + "time_per_iteration": 2.5728280544281006 + }, + { + "auxiliary_loss_clip": 0.01156433, + "auxiliary_loss_mlp": 0.01139459, + "balance_loss_clip": 1.00188625, + "balance_loss_mlp": 1.00079489, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.4988733561980387, + "language_loss": 0.66639805, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68935692, + "num_input_tokens_seen": 96998595, + "step": 4485, + "time_per_iteration": 2.661001682281494 + }, + { + "auxiliary_loss_clip": 0.01142656, + "auxiliary_loss_mlp": 0.01139994, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00104392, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.2809576515450636, + "language_loss": 0.73179013, + "learning_rate": 3.426892868256604e-06, + "loss": 0.75461662, + "num_input_tokens_seen": 97013715, + "step": 4486, + "time_per_iteration": 2.5533103942871094 + }, + { + "auxiliary_loss_clip": 0.01173201, + "auxiliary_loss_mlp": 0.01140173, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.001127, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.747959368878589, + "language_loss": 0.83759534, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.8607291, + "num_input_tokens_seen": 97031570, + "step": 4487, + "time_per_iteration": 2.5197606086730957 + }, + { + "auxiliary_loss_clip": 0.01125591, + "auxiliary_loss_mlp": 0.01140191, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00085974, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.1250816935327506, + "language_loss": 0.72196662, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.74462438, + "num_input_tokens_seen": 97049815, + "step": 4488, + "time_per_iteration": 2.661161184310913 + }, + { + "auxiliary_loss_clip": 0.0107563, + "auxiliary_loss_mlp": 0.01139511, + "balance_loss_clip": 1.00174868, + "balance_loss_mlp": 1.00103784, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.8472519167754196, + "language_loss": 0.8352598, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85741127, + "num_input_tokens_seen": 97067570, + "step": 4489, + "time_per_iteration": 2.7845458984375 + }, + { + "auxiliary_loss_clip": 0.01140464, + "auxiliary_loss_mlp": 0.01140105, + "balance_loss_clip": 1.00204206, + "balance_loss_mlp": 1.0011543, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.1862144681324995, + "language_loss": 0.9025619, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92536759, + "num_input_tokens_seen": 97082180, + "step": 4490, + "time_per_iteration": 2.589413642883301 + }, + { + "auxiliary_loss_clip": 0.01092413, + "auxiliary_loss_mlp": 0.01138887, + "balance_loss_clip": 1.00166798, + "balance_loss_mlp": 1.00079489, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 7.5270754716424895, + "language_loss": 0.73386455, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75617754, + "num_input_tokens_seen": 97103470, + "step": 4491, + "time_per_iteration": 2.8760697841644287 + }, + { + "auxiliary_loss_clip": 0.01173203, + "auxiliary_loss_mlp": 0.01139643, + "balance_loss_clip": 1.00213659, + "balance_loss_mlp": 1.00078797, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 2.156680625887406, + "language_loss": 0.74344522, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.76657367, + "num_input_tokens_seen": 97118100, + "step": 4492, + "time_per_iteration": 2.5119450092315674 + }, + { + "auxiliary_loss_clip": 0.0114122, + "auxiliary_loss_mlp": 0.01138734, + "balance_loss_clip": 1.0019871, + "balance_loss_mlp": 1.00083292, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 2.165513179458135, + "language_loss": 0.89133418, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91413367, + "num_input_tokens_seen": 97136765, + "step": 4493, + "time_per_iteration": 2.6229612827301025 + }, + { + "auxiliary_loss_clip": 0.01157515, + "auxiliary_loss_mlp": 0.01139232, + "balance_loss_clip": 1.00212073, + "balance_loss_mlp": 1.0009495, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.3887974104946064, + "language_loss": 0.71254754, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73551506, + "num_input_tokens_seen": 97157470, + "step": 4494, + "time_per_iteration": 2.6163296699523926 + }, + { + "auxiliary_loss_clip": 0.01139594, + "auxiliary_loss_mlp": 0.01139309, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.0008359, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 1.8296246466027026, + "language_loss": 0.86569512, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88848412, + "num_input_tokens_seen": 97176905, + "step": 4495, + "time_per_iteration": 2.644303321838379 + }, + { + "auxiliary_loss_clip": 0.01172924, + "auxiliary_loss_mlp": 0.01139138, + "balance_loss_clip": 1.00199091, + "balance_loss_mlp": 1.00095105, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.586890645254338, + "language_loss": 0.76563907, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78875965, + "num_input_tokens_seen": 97196380, + "step": 4496, + "time_per_iteration": 2.526273250579834 + }, + { + "auxiliary_loss_clip": 0.01169785, + "auxiliary_loss_mlp": 0.0112442, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00006092, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 1.9663596276081603, + "language_loss": 0.50196618, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52490819, + "num_input_tokens_seen": 97260100, + "step": 4497, + "time_per_iteration": 3.1777184009552 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01139305, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00083113, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.7851582454704893, + "language_loss": 0.72257274, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74508768, + "num_input_tokens_seen": 97277935, + "step": 4498, + "time_per_iteration": 2.683899164199829 + }, + { + "auxiliary_loss_clip": 0.01123806, + "auxiliary_loss_mlp": 0.01124308, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 0.99994892, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7549931794668147, + "language_loss": 0.59240818, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61488932, + "num_input_tokens_seen": 97338845, + "step": 4499, + "time_per_iteration": 3.278693437576294 + }, + { + "auxiliary_loss_clip": 0.01140881, + "auxiliary_loss_mlp": 0.01138893, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00080132, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 2.223120030608766, + "language_loss": 0.73491955, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75771731, + "num_input_tokens_seen": 97356640, + "step": 4500, + "time_per_iteration": 2.615485429763794 + }, + { + "auxiliary_loss_clip": 0.01142169, + "auxiliary_loss_mlp": 0.01138784, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.0008831, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 2.355785571286761, + "language_loss": 0.81475329, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.8375628, + "num_input_tokens_seen": 97372585, + "step": 4501, + "time_per_iteration": 2.5459821224212646 + }, + { + "auxiliary_loss_clip": 0.01124199, + "auxiliary_loss_mlp": 0.01139871, + "balance_loss_clip": 1.00187194, + "balance_loss_mlp": 1.00092077, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.6198025151130484, + "language_loss": 0.72557604, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74821681, + "num_input_tokens_seen": 97393315, + "step": 4502, + "time_per_iteration": 2.648101568222046 + }, + { + "auxiliary_loss_clip": 0.0114003, + "auxiliary_loss_mlp": 0.01139171, + "balance_loss_clip": 1.00177526, + "balance_loss_mlp": 1.00060213, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.8630043126422875, + "language_loss": 0.68438333, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70717537, + "num_input_tokens_seen": 97417860, + "step": 4503, + "time_per_iteration": 2.7912216186523438 + }, + { + "auxiliary_loss_clip": 0.01109427, + "auxiliary_loss_mlp": 0.01138926, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00083351, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 2.571685788381272, + "language_loss": 0.68004298, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70252657, + "num_input_tokens_seen": 97436780, + "step": 4504, + "time_per_iteration": 2.678136110305786 + }, + { + "auxiliary_loss_clip": 0.01157186, + "auxiliary_loss_mlp": 0.01138941, + "balance_loss_clip": 1.00200903, + "balance_loss_mlp": 1.00103998, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.4820258845970193, + "language_loss": 0.75314379, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77610505, + "num_input_tokens_seen": 97456190, + "step": 4505, + "time_per_iteration": 2.5506577491760254 + }, + { + "auxiliary_loss_clip": 0.01173064, + "auxiliary_loss_mlp": 0.01139091, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00090408, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 2.154634321742605, + "language_loss": 0.73423707, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75735855, + "num_input_tokens_seen": 97474545, + "step": 4506, + "time_per_iteration": 2.5063934326171875 + }, + { + "auxiliary_loss_clip": 0.01140743, + "auxiliary_loss_mlp": 0.01138875, + "balance_loss_clip": 1.00192535, + "balance_loss_mlp": 1.00068748, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.0056756272806266, + "language_loss": 0.80764532, + "learning_rate": 3.421150061716715e-06, + "loss": 0.83044147, + "num_input_tokens_seen": 97494520, + "step": 4507, + "time_per_iteration": 2.6330063343048096 + }, + { + "auxiliary_loss_clip": 0.01138252, + "auxiliary_loss_mlp": 0.01123608, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00001192, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7391635732218422, + "language_loss": 0.50888121, + "learning_rate": 3.420876001185698e-06, + "loss": 0.53149974, + "num_input_tokens_seen": 97552455, + "step": 4508, + "time_per_iteration": 3.086585521697998 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01138517, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00090218, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 1.8881343297090722, + "language_loss": 0.74513042, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76750284, + "num_input_tokens_seen": 97572650, + "step": 4509, + "time_per_iteration": 2.7439260482788086 + }, + { + "auxiliary_loss_clip": 0.01156334, + "auxiliary_loss_mlp": 0.01137661, + "balance_loss_clip": 1.0019201, + "balance_loss_mlp": 1.00061762, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.6259231813925494, + "language_loss": 0.71431994, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73725986, + "num_input_tokens_seen": 97591150, + "step": 4510, + "time_per_iteration": 2.5630087852478027 + }, + { + "auxiliary_loss_clip": 0.01156154, + "auxiliary_loss_mlp": 0.01138442, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00082695, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 2.355023128006471, + "language_loss": 0.70637697, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72932291, + "num_input_tokens_seen": 97607410, + "step": 4511, + "time_per_iteration": 3.9379470348358154 + }, + { + "auxiliary_loss_clip": 0.01145667, + "auxiliary_loss_mlp": 0.01139263, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00088489, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.3310085565167924, + "language_loss": 0.81108797, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83393729, + "num_input_tokens_seen": 97626870, + "step": 4512, + "time_per_iteration": 2.659346342086792 + }, + { + "auxiliary_loss_clip": 0.01172935, + "auxiliary_loss_mlp": 0.01138327, + "balance_loss_clip": 1.00204813, + "balance_loss_mlp": 1.00071228, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.4181712187315345, + "language_loss": 0.80410838, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82722098, + "num_input_tokens_seen": 97646595, + "step": 4513, + "time_per_iteration": 2.589907646179199 + }, + { + "auxiliary_loss_clip": 0.01141058, + "auxiliary_loss_mlp": 0.01138167, + "balance_loss_clip": 1.00191295, + "balance_loss_mlp": 1.00083828, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 2.148075175176967, + "language_loss": 0.87595308, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.8987453, + "num_input_tokens_seen": 97665485, + "step": 4514, + "time_per_iteration": 4.031371116638184 + }, + { + "auxiliary_loss_clip": 0.01140817, + "auxiliary_loss_mlp": 0.01138834, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00093246, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 1.5797253631915225, + "language_loss": 0.91855234, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94134885, + "num_input_tokens_seen": 97683800, + "step": 4515, + "time_per_iteration": 2.5884203910827637 + }, + { + "auxiliary_loss_clip": 0.0110973, + "auxiliary_loss_mlp": 0.01139639, + "balance_loss_clip": 1.00194907, + "balance_loss_mlp": 1.00135589, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.2355958612753897, + "language_loss": 0.73570764, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.75820136, + "num_input_tokens_seen": 97700505, + "step": 4516, + "time_per_iteration": 5.453780651092529 + }, + { + "auxiliary_loss_clip": 0.01156424, + "auxiliary_loss_mlp": 0.01138662, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00085604, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 1.8724719140663806, + "language_loss": 0.75646049, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.77941132, + "num_input_tokens_seen": 97717410, + "step": 4517, + "time_per_iteration": 2.5362918376922607 + }, + { + "auxiliary_loss_clip": 0.011271, + "auxiliary_loss_mlp": 0.01138347, + "balance_loss_clip": 1.00194776, + "balance_loss_mlp": 1.00082684, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.3878593093734755, + "language_loss": 0.76723289, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.78988737, + "num_input_tokens_seen": 97734545, + "step": 4518, + "time_per_iteration": 2.634251117706299 + }, + { + "auxiliary_loss_clip": 0.01141431, + "auxiliary_loss_mlp": 0.01138989, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00080168, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 2.082612249397289, + "language_loss": 0.68260539, + "learning_rate": 3.41785778156811e-06, + "loss": 0.70540959, + "num_input_tokens_seen": 97754000, + "step": 4519, + "time_per_iteration": 2.5950980186462402 + }, + { + "auxiliary_loss_clip": 0.01156116, + "auxiliary_loss_mlp": 0.01137812, + "balance_loss_clip": 1.00181592, + "balance_loss_mlp": 1.00067377, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 5.433973440892773, + "language_loss": 0.75626457, + "learning_rate": 3.417583075166451e-06, + "loss": 0.77920389, + "num_input_tokens_seen": 97772080, + "step": 4520, + "time_per_iteration": 2.5792059898376465 + }, + { + "auxiliary_loss_clip": 0.0115777, + "auxiliary_loss_mlp": 0.01139121, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00093389, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.3842703997911, + "language_loss": 0.76396441, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78693336, + "num_input_tokens_seen": 97789370, + "step": 4521, + "time_per_iteration": 2.547776699066162 + }, + { + "auxiliary_loss_clip": 0.01142446, + "auxiliary_loss_mlp": 0.01139607, + "balance_loss_clip": 1.00200737, + "balance_loss_mlp": 1.00103867, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.207266498012508, + "language_loss": 0.75570321, + "learning_rate": 3.417033501108875e-06, + "loss": 0.77852374, + "num_input_tokens_seen": 97807385, + "step": 4522, + "time_per_iteration": 2.5841641426086426 + }, + { + "auxiliary_loss_clip": 0.0117305, + "auxiliary_loss_mlp": 0.0113859, + "balance_loss_clip": 1.00210905, + "balance_loss_mlp": 1.00068831, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 1.659966702774036, + "language_loss": 0.72799718, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75111353, + "num_input_tokens_seen": 97827930, + "step": 4523, + "time_per_iteration": 2.5289173126220703 + }, + { + "auxiliary_loss_clip": 0.0114087, + "auxiliary_loss_mlp": 0.01138525, + "balance_loss_clip": 1.00181353, + "balance_loss_mlp": 1.00071907, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.4967882953001865, + "language_loss": 0.74435693, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76715088, + "num_input_tokens_seen": 97847440, + "step": 4524, + "time_per_iteration": 2.590869903564453 + }, + { + "auxiliary_loss_clip": 0.01173102, + "auxiliary_loss_mlp": 0.0113844, + "balance_loss_clip": 1.00207162, + "balance_loss_mlp": 1.00092041, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 2.2833532404605745, + "language_loss": 0.76487041, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78798586, + "num_input_tokens_seen": 97867620, + "step": 4525, + "time_per_iteration": 2.5396974086761475 + }, + { + "auxiliary_loss_clip": 0.01157754, + "auxiliary_loss_mlp": 0.01137924, + "balance_loss_clip": 1.0019902, + "balance_loss_mlp": 1.00097644, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.7326802988106291, + "language_loss": 0.81514823, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.83810502, + "num_input_tokens_seen": 97884345, + "step": 4526, + "time_per_iteration": 2.534726619720459 + }, + { + "auxiliary_loss_clip": 0.01173144, + "auxiliary_loss_mlp": 0.01139175, + "balance_loss_clip": 1.00201273, + "balance_loss_mlp": 1.00079703, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 2.137677763845774, + "language_loss": 0.76965088, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79277408, + "num_input_tokens_seen": 97901500, + "step": 4527, + "time_per_iteration": 2.4896068572998047 + }, + { + "auxiliary_loss_clip": 0.01124765, + "auxiliary_loss_mlp": 0.00748169, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00031495, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.1695744787849365, + "language_loss": 0.82041359, + "learning_rate": 3.415383489652503e-06, + "loss": 0.83914292, + "num_input_tokens_seen": 97917800, + "step": 4528, + "time_per_iteration": 2.6398775577545166 + }, + { + "auxiliary_loss_clip": 0.01142557, + "auxiliary_loss_mlp": 0.01138035, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00089669, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.8822598019579444, + "language_loss": 0.77631402, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79911995, + "num_input_tokens_seen": 97937225, + "step": 4529, + "time_per_iteration": 2.6418988704681396 + }, + { + "auxiliary_loss_clip": 0.0113956, + "auxiliary_loss_mlp": 0.0113875, + "balance_loss_clip": 1.00170875, + "balance_loss_mlp": 1.00103927, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 1.9795415312758975, + "language_loss": 0.81964695, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84243011, + "num_input_tokens_seen": 97956845, + "step": 4530, + "time_per_iteration": 2.5783519744873047 + }, + { + "auxiliary_loss_clip": 0.01156563, + "auxiliary_loss_mlp": 0.01138454, + "balance_loss_clip": 1.00205123, + "balance_loss_mlp": 1.00083899, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 1.9906877685783848, + "language_loss": 0.91273308, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93568325, + "num_input_tokens_seen": 97972465, + "step": 4531, + "time_per_iteration": 2.5392377376556396 + }, + { + "auxiliary_loss_clip": 0.01156002, + "auxiliary_loss_mlp": 0.01139121, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00093389, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 1.8102661129421702, + "language_loss": 0.76512849, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78807974, + "num_input_tokens_seen": 97990770, + "step": 4532, + "time_per_iteration": 2.563649892807007 + }, + { + "auxiliary_loss_clip": 0.0112465, + "auxiliary_loss_mlp": 0.0113827, + "balance_loss_clip": 1.00180602, + "balance_loss_mlp": 1.00075006, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 2.2984167940383715, + "language_loss": 0.88945663, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91208583, + "num_input_tokens_seen": 98005775, + "step": 4533, + "time_per_iteration": 2.6237401962280273 + }, + { + "auxiliary_loss_clip": 0.01140738, + "auxiliary_loss_mlp": 0.01138097, + "balance_loss_clip": 1.00186491, + "balance_loss_mlp": 1.00067306, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 1.6927117932770537, + "language_loss": 0.71443683, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73722517, + "num_input_tokens_seen": 98025750, + "step": 4534, + "time_per_iteration": 2.6596665382385254 + }, + { + "auxiliary_loss_clip": 0.01139518, + "auxiliary_loss_mlp": 0.01139063, + "balance_loss_clip": 1.00166965, + "balance_loss_mlp": 1.00078034, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.7594674082794375, + "language_loss": 0.91455603, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93734181, + "num_input_tokens_seen": 98044955, + "step": 4535, + "time_per_iteration": 2.6057310104370117 + }, + { + "auxiliary_loss_clip": 0.01140987, + "auxiliary_loss_mlp": 0.01138845, + "balance_loss_clip": 1.00202882, + "balance_loss_mlp": 1.00084853, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.6309377728181382, + "language_loss": 0.73019993, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.75299823, + "num_input_tokens_seen": 98065860, + "step": 4536, + "time_per_iteration": 2.6948583126068115 + }, + { + "auxiliary_loss_clip": 0.01157708, + "auxiliary_loss_mlp": 0.01138785, + "balance_loss_clip": 1.00201094, + "balance_loss_mlp": 1.00078797, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.6836056751173805, + "language_loss": 0.71360451, + "learning_rate": 3.41290485034781e-06, + "loss": 0.7365694, + "num_input_tokens_seen": 98085450, + "step": 4537, + "time_per_iteration": 2.688528060913086 + }, + { + "auxiliary_loss_clip": 0.01141123, + "auxiliary_loss_mlp": 0.01138606, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00080037, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.7724395269151465, + "language_loss": 0.77725494, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80005223, + "num_input_tokens_seen": 98099115, + "step": 4538, + "time_per_iteration": 2.5753018856048584 + }, + { + "auxiliary_loss_clip": 0.01156012, + "auxiliary_loss_mlp": 0.0113843, + "balance_loss_clip": 1.00187671, + "balance_loss_mlp": 1.00090981, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 2.5777701205567736, + "language_loss": 0.90293241, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92587686, + "num_input_tokens_seen": 98118415, + "step": 4539, + "time_per_iteration": 2.5575366020202637 + }, + { + "auxiliary_loss_clip": 0.0114106, + "auxiliary_loss_mlp": 0.01138736, + "balance_loss_clip": 1.00194716, + "balance_loss_mlp": 1.00083423, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 1.812475906397251, + "language_loss": 0.87925816, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90205616, + "num_input_tokens_seen": 98136300, + "step": 4540, + "time_per_iteration": 2.5683934688568115 + }, + { + "auxiliary_loss_clip": 0.01157275, + "auxiliary_loss_mlp": 0.00748172, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00025582, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 5.641601664795083, + "language_loss": 0.82240367, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.8414582, + "num_input_tokens_seen": 98154580, + "step": 4541, + "time_per_iteration": 2.5368194580078125 + }, + { + "auxiliary_loss_clip": 0.01140489, + "auxiliary_loss_mlp": 0.01137829, + "balance_loss_clip": 1.00181174, + "balance_loss_mlp": 1.00078559, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.8273005519582786, + "language_loss": 0.79878336, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.82156658, + "num_input_tokens_seen": 98173115, + "step": 4542, + "time_per_iteration": 2.592381477355957 + }, + { + "auxiliary_loss_clip": 0.01141101, + "auxiliary_loss_mlp": 0.01138273, + "balance_loss_clip": 1.00201595, + "balance_loss_mlp": 1.00084853, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.23470935484249, + "language_loss": 0.8957063, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91850001, + "num_input_tokens_seen": 98190260, + "step": 4543, + "time_per_iteration": 2.687500238418579 + }, + { + "auxiliary_loss_clip": 0.01141337, + "auxiliary_loss_mlp": 0.0074815, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00035143, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.825827459055268, + "language_loss": 0.63452244, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65341729, + "num_input_tokens_seen": 98207115, + "step": 4544, + "time_per_iteration": 2.5693788528442383 + }, + { + "auxiliary_loss_clip": 0.01139673, + "auxiliary_loss_mlp": 0.01138495, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00107038, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.584298422371849, + "language_loss": 0.70093077, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72371244, + "num_input_tokens_seen": 98230610, + "step": 4545, + "time_per_iteration": 2.7273406982421875 + }, + { + "auxiliary_loss_clip": 0.0114373, + "auxiliary_loss_mlp": 0.01123693, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00009716, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7167824777425879, + "language_loss": 0.61621225, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63888645, + "num_input_tokens_seen": 98293585, + "step": 4546, + "time_per_iteration": 3.1881630420684814 + }, + { + "auxiliary_loss_clip": 0.01066279, + "auxiliary_loss_mlp": 0.01138771, + "balance_loss_clip": 1.00190055, + "balance_loss_mlp": 1.00106072, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 2.271053423500276, + "language_loss": 0.64986515, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67191571, + "num_input_tokens_seen": 98311680, + "step": 4547, + "time_per_iteration": 2.7980587482452393 + }, + { + "auxiliary_loss_clip": 0.01140446, + "auxiliary_loss_mlp": 0.00748113, + "balance_loss_clip": 1.00187826, + "balance_loss_mlp": 1.000368, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 2.223958359873512, + "language_loss": 0.77368462, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.79257023, + "num_input_tokens_seen": 98330770, + "step": 4548, + "time_per_iteration": 4.0776801109313965 + }, + { + "auxiliary_loss_clip": 0.01139604, + "auxiliary_loss_mlp": 0.01138014, + "balance_loss_clip": 1.00196469, + "balance_loss_mlp": 1.00087607, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 2.146428756745969, + "language_loss": 0.82641155, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84918773, + "num_input_tokens_seen": 98349860, + "step": 4549, + "time_per_iteration": 2.6337790489196777 + }, + { + "auxiliary_loss_clip": 0.01156424, + "auxiliary_loss_mlp": 0.01138833, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00064588, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.0033308226346835, + "language_loss": 0.70762748, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73058003, + "num_input_tokens_seen": 98367040, + "step": 4550, + "time_per_iteration": 2.56184458732605 + }, + { + "auxiliary_loss_clip": 0.0112326, + "auxiliary_loss_mlp": 0.01138167, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.00083816, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.039967826644632, + "language_loss": 0.78829992, + "learning_rate": 3.409040566039563e-06, + "loss": 0.81091416, + "num_input_tokens_seen": 98384010, + "step": 4551, + "time_per_iteration": 4.069150447845459 + }, + { + "auxiliary_loss_clip": 0.01124013, + "auxiliary_loss_mlp": 0.01138125, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00089169, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.6905620382205933, + "language_loss": 0.70334858, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72596997, + "num_input_tokens_seen": 98399625, + "step": 4552, + "time_per_iteration": 2.625523805618286 + }, + { + "auxiliary_loss_clip": 0.01139747, + "auxiliary_loss_mlp": 0.01138208, + "balance_loss_clip": 1.00193107, + "balance_loss_mlp": 1.00078356, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.1313232232963095, + "language_loss": 0.71722817, + "learning_rate": 3.408487669858431e-06, + "loss": 0.7400077, + "num_input_tokens_seen": 98417310, + "step": 4553, + "time_per_iteration": 4.008981227874756 + }, + { + "auxiliary_loss_clip": 0.01156215, + "auxiliary_loss_mlp": 0.01137674, + "balance_loss_clip": 1.00190365, + "balance_loss_mlp": 1.00082123, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.5670242578404936, + "language_loss": 0.59084475, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61378372, + "num_input_tokens_seen": 98438670, + "step": 4554, + "time_per_iteration": 3.980691909790039 + }, + { + "auxiliary_loss_clip": 0.01139941, + "auxiliary_loss_mlp": 0.01138545, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00073886, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.691128959343268, + "language_loss": 0.74158788, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76437271, + "num_input_tokens_seen": 98456060, + "step": 4555, + "time_per_iteration": 2.611393928527832 + }, + { + "auxiliary_loss_clip": 0.01156252, + "auxiliary_loss_mlp": 0.0113852, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.0007143, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.741691491559673, + "language_loss": 0.7733683, + "learning_rate": 3.407657925038002e-06, + "loss": 0.79631603, + "num_input_tokens_seen": 98473765, + "step": 4556, + "time_per_iteration": 2.5882153511047363 + }, + { + "auxiliary_loss_clip": 0.01157719, + "auxiliary_loss_mlp": 0.01139328, + "balance_loss_clip": 1.00196087, + "balance_loss_mlp": 1.00085413, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 2.2296142538043573, + "language_loss": 0.82308346, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84605384, + "num_input_tokens_seen": 98490590, + "step": 4557, + "time_per_iteration": 2.70719051361084 + }, + { + "auxiliary_loss_clip": 0.0111176, + "auxiliary_loss_mlp": 0.01138222, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00070238, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.7855725601699164, + "language_loss": 0.7274071, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74990696, + "num_input_tokens_seen": 98510590, + "step": 4558, + "time_per_iteration": 2.6814374923706055 + }, + { + "auxiliary_loss_clip": 0.01139514, + "auxiliary_loss_mlp": 0.01137854, + "balance_loss_clip": 1.00169778, + "balance_loss_mlp": 1.00100136, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.059715145908022, + "language_loss": 0.67974126, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70251501, + "num_input_tokens_seen": 98527875, + "step": 4559, + "time_per_iteration": 2.6536359786987305 + }, + { + "auxiliary_loss_clip": 0.01141549, + "auxiliary_loss_mlp": 0.0113809, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00095212, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 1.7201824489088247, + "language_loss": 0.72083521, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.7436316, + "num_input_tokens_seen": 98547575, + "step": 4560, + "time_per_iteration": 2.6197004318237305 + }, + { + "auxiliary_loss_clip": 0.01139388, + "auxiliary_loss_mlp": 0.01138401, + "balance_loss_clip": 1.00181913, + "balance_loss_mlp": 1.00078595, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.6353366000283012, + "language_loss": 0.8142395, + "learning_rate": 3.406273949573303e-06, + "loss": 0.8370173, + "num_input_tokens_seen": 98566290, + "step": 4561, + "time_per_iteration": 2.6525139808654785 + }, + { + "auxiliary_loss_clip": 0.01172849, + "auxiliary_loss_mlp": 0.01139211, + "balance_loss_clip": 1.00202262, + "balance_loss_mlp": 1.00102401, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 2.324464957247882, + "language_loss": 0.75199604, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77511668, + "num_input_tokens_seen": 98586255, + "step": 4562, + "time_per_iteration": 2.5773069858551025 + }, + { + "auxiliary_loss_clip": 0.01172834, + "auxiliary_loss_mlp": 0.0113839, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.00067949, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.494554933731816, + "language_loss": 0.74360043, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76671267, + "num_input_tokens_seen": 98606030, + "step": 4563, + "time_per_iteration": 2.515460729598999 + }, + { + "auxiliary_loss_clip": 0.0112549, + "auxiliary_loss_mlp": 0.01139517, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00075746, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.7735939037289077, + "language_loss": 0.62482542, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64747554, + "num_input_tokens_seen": 98625225, + "step": 4564, + "time_per_iteration": 2.639781951904297 + }, + { + "auxiliary_loss_clip": 0.01140704, + "auxiliary_loss_mlp": 0.01139072, + "balance_loss_clip": 1.0017978, + "balance_loss_mlp": 1.00088418, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.6066390905535082, + "language_loss": 0.78529948, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80809724, + "num_input_tokens_seen": 98649470, + "step": 4565, + "time_per_iteration": 2.746579647064209 + }, + { + "auxiliary_loss_clip": 0.01109168, + "auxiliary_loss_mlp": 0.01138321, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00089669, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 1.837229998065121, + "language_loss": 0.68502378, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70749867, + "num_input_tokens_seen": 98666915, + "step": 4566, + "time_per_iteration": 2.6361894607543945 + }, + { + "auxiliary_loss_clip": 0.01157072, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_clip": 1.00199115, + "balance_loss_mlp": 1.00122845, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.6690899455795822, + "language_loss": 0.60664046, + "learning_rate": 3.404611419371723e-06, + "loss": 0.6295948, + "num_input_tokens_seen": 98688240, + "step": 4567, + "time_per_iteration": 2.6143763065338135 + }, + { + "auxiliary_loss_clip": 0.01157461, + "auxiliary_loss_mlp": 0.01138698, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00070179, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.7437061012303665, + "language_loss": 0.82403338, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84699494, + "num_input_tokens_seen": 98708245, + "step": 4568, + "time_per_iteration": 2.5736260414123535 + }, + { + "auxiliary_loss_clip": 0.01156304, + "auxiliary_loss_mlp": 0.0113893, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00064707, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 1.9377293320161295, + "language_loss": 0.68356586, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.70651817, + "num_input_tokens_seen": 98724575, + "step": 4569, + "time_per_iteration": 2.5721986293792725 + }, + { + "auxiliary_loss_clip": 0.01142085, + "auxiliary_loss_mlp": 0.01138329, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00071418, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.0249135095931097, + "language_loss": 0.71097475, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73377883, + "num_input_tokens_seen": 98740700, + "step": 4570, + "time_per_iteration": 2.569708824157715 + }, + { + "auxiliary_loss_clip": 0.01120931, + "auxiliary_loss_mlp": 0.01123594, + "balance_loss_clip": 1.00143039, + "balance_loss_mlp": 0.99999774, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7245185275399679, + "language_loss": 0.55810237, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.58054763, + "num_input_tokens_seen": 98803030, + "step": 4571, + "time_per_iteration": 3.292773962020874 + }, + { + "auxiliary_loss_clip": 0.01108914, + "auxiliary_loss_mlp": 0.01138891, + "balance_loss_clip": 1.00176561, + "balance_loss_mlp": 1.00099015, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 2.293248641307409, + "language_loss": 0.7791065, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.8015846, + "num_input_tokens_seen": 98820505, + "step": 4572, + "time_per_iteration": 2.661435842514038 + }, + { + "auxiliary_loss_clip": 0.01172616, + "auxiliary_loss_mlp": 0.01137424, + "balance_loss_clip": 1.00195515, + "balance_loss_mlp": 1.00085795, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.559927184638123, + "language_loss": 0.81240118, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83550167, + "num_input_tokens_seen": 98842150, + "step": 4573, + "time_per_iteration": 2.563830614089966 + }, + { + "auxiliary_loss_clip": 0.01156325, + "auxiliary_loss_mlp": 0.01138108, + "balance_loss_clip": 1.00183475, + "balance_loss_mlp": 1.00068378, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.6245798439885373, + "language_loss": 0.79244745, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81539178, + "num_input_tokens_seen": 98861050, + "step": 4574, + "time_per_iteration": 2.543797016143799 + }, + { + "auxiliary_loss_clip": 0.01107568, + "auxiliary_loss_mlp": 0.01138375, + "balance_loss_clip": 1.00168443, + "balance_loss_mlp": 1.00123656, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 1.935757489668191, + "language_loss": 0.73867404, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76113349, + "num_input_tokens_seen": 98879695, + "step": 4575, + "time_per_iteration": 2.7617321014404297 + }, + { + "auxiliary_loss_clip": 0.01140421, + "auxiliary_loss_mlp": 0.01137587, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00083041, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.823860262648916, + "language_loss": 0.71756482, + "learning_rate": 3.402114029526814e-06, + "loss": 0.74034494, + "num_input_tokens_seen": 98902035, + "step": 4576, + "time_per_iteration": 2.748080015182495 + }, + { + "auxiliary_loss_clip": 0.01123932, + "auxiliary_loss_mlp": 0.00748187, + "balance_loss_clip": 1.00166154, + "balance_loss_mlp": 1.00050998, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.560231595427206, + "language_loss": 0.73245955, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75118071, + "num_input_tokens_seen": 98921835, + "step": 4577, + "time_per_iteration": 2.6995491981506348 + }, + { + "auxiliary_loss_clip": 0.01139428, + "auxiliary_loss_mlp": 0.0113807, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00074112, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 2.727499043227873, + "language_loss": 0.76258051, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78535545, + "num_input_tokens_seen": 98939610, + "step": 4578, + "time_per_iteration": 2.653251886367798 + }, + { + "auxiliary_loss_clip": 0.01139773, + "auxiliary_loss_mlp": 0.01138537, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00082636, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 1.3448740899165164, + "language_loss": 0.66293049, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68571359, + "num_input_tokens_seen": 98962250, + "step": 4579, + "time_per_iteration": 2.6503491401672363 + }, + { + "auxiliary_loss_clip": 0.0112481, + "auxiliary_loss_mlp": 0.01138517, + "balance_loss_clip": 1.00176144, + "balance_loss_mlp": 1.00118816, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.7705987281039248, + "language_loss": 0.79573357, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.81836689, + "num_input_tokens_seen": 98981845, + "step": 4580, + "time_per_iteration": 2.664217472076416 + }, + { + "auxiliary_loss_clip": 0.01156184, + "auxiliary_loss_mlp": 0.01137937, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00070369, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4750223087630108, + "language_loss": 0.6742065, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.69714767, + "num_input_tokens_seen": 99001855, + "step": 4581, + "time_per_iteration": 2.561741590499878 + }, + { + "auxiliary_loss_clip": 0.01145549, + "auxiliary_loss_mlp": 0.01138044, + "balance_loss_clip": 1.00227594, + "balance_loss_mlp": 1.00090551, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.6186788086978623, + "language_loss": 0.77867436, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80151027, + "num_input_tokens_seen": 99019880, + "step": 4582, + "time_per_iteration": 2.5382747650146484 + }, + { + "auxiliary_loss_clip": 0.01110128, + "auxiliary_loss_mlp": 0.01137534, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00087297, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.5763131502017729, + "language_loss": 0.84031785, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86279452, + "num_input_tokens_seen": 99037570, + "step": 4583, + "time_per_iteration": 2.656719207763672 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.01138424, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.00090384, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 1.7113576715406302, + "language_loss": 0.67190087, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.6948477, + "num_input_tokens_seen": 99056875, + "step": 4584, + "time_per_iteration": 2.579813003540039 + }, + { + "auxiliary_loss_clip": 0.01110159, + "auxiliary_loss_mlp": 0.01137221, + "balance_loss_clip": 1.00172567, + "balance_loss_mlp": 1.00065482, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.7472920252506912, + "language_loss": 0.76917082, + "learning_rate": 3.399612333050327e-06, + "loss": 0.79164463, + "num_input_tokens_seen": 99074685, + "step": 4585, + "time_per_iteration": 2.6808643341064453 + }, + { + "auxiliary_loss_clip": 0.01156367, + "auxiliary_loss_mlp": 0.00748213, + "balance_loss_clip": 1.00201225, + "balance_loss_mlp": 1.00047171, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.6067103640335683, + "language_loss": 0.72075951, + "learning_rate": 3.399334101267362e-06, + "loss": 0.73980522, + "num_input_tokens_seen": 99095300, + "step": 4586, + "time_per_iteration": 3.987569808959961 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01137802, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00075901, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.7503556940214149, + "language_loss": 0.80492568, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82769918, + "num_input_tokens_seen": 99115965, + "step": 4587, + "time_per_iteration": 2.606482744216919 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.0113747, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00071287, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 1.8294258327507003, + "language_loss": 0.83502197, + "learning_rate": 3.398777478523316e-06, + "loss": 0.85797226, + "num_input_tokens_seen": 99134265, + "step": 4588, + "time_per_iteration": 2.552936315536499 + }, + { + "auxiliary_loss_clip": 0.01125593, + "auxiliary_loss_mlp": 0.01137479, + "balance_loss_clip": 1.00177658, + "balance_loss_mlp": 1.00062728, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3106592105998804, + "language_loss": 0.75735784, + "learning_rate": 3.398499087583342e-06, + "loss": 0.77998853, + "num_input_tokens_seen": 99156185, + "step": 4589, + "time_per_iteration": 4.07940936088562 + }, + { + "auxiliary_loss_clip": 0.01162001, + "auxiliary_loss_mlp": 0.01137686, + "balance_loss_clip": 1.00214839, + "balance_loss_mlp": 1.00092947, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.8536588803619123, + "language_loss": 0.88661695, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90961385, + "num_input_tokens_seen": 99176735, + "step": 4590, + "time_per_iteration": 2.5742642879486084 + }, + { + "auxiliary_loss_clip": 0.01156135, + "auxiliary_loss_mlp": 0.01138278, + "balance_loss_clip": 1.00169945, + "balance_loss_mlp": 1.00094938, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.6760306161487217, + "language_loss": 0.71469164, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73763573, + "num_input_tokens_seen": 99199765, + "step": 4591, + "time_per_iteration": 5.437584400177002 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.01137813, + "balance_loss_clip": 1.00184917, + "balance_loss_mlp": 1.00086498, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.8878727647526308, + "language_loss": 0.80744046, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.83004963, + "num_input_tokens_seen": 99218435, + "step": 4592, + "time_per_iteration": 2.644988536834717 + }, + { + "auxiliary_loss_clip": 0.01154048, + "auxiliary_loss_mlp": 0.00747585, + "balance_loss_clip": 1.00157785, + "balance_loss_mlp": 1.00004041, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7486248711785155, + "language_loss": 0.61618364, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63519996, + "num_input_tokens_seen": 99276200, + "step": 4593, + "time_per_iteration": 3.109114170074463 + }, + { + "auxiliary_loss_clip": 0.01156505, + "auxiliary_loss_mlp": 0.0113766, + "balance_loss_clip": 1.00193799, + "balance_loss_mlp": 1.00080752, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.937378301274247, + "language_loss": 0.77058816, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79352987, + "num_input_tokens_seen": 99297625, + "step": 4594, + "time_per_iteration": 2.635770320892334 + }, + { + "auxiliary_loss_clip": 0.01156084, + "auxiliary_loss_mlp": 0.01137581, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00082397, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.4175786649634574, + "language_loss": 0.91675872, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93969536, + "num_input_tokens_seen": 99315790, + "step": 4595, + "time_per_iteration": 2.524070978164673 + }, + { + "auxiliary_loss_clip": 0.01156211, + "auxiliary_loss_mlp": 0.01138762, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00095582, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.7971124578388453, + "language_loss": 0.69203556, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71498537, + "num_input_tokens_seen": 99334615, + "step": 4596, + "time_per_iteration": 2.5525805950164795 + }, + { + "auxiliary_loss_clip": 0.01140944, + "auxiliary_loss_mlp": 0.01138604, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00070214, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 1.682146787110782, + "language_loss": 0.63956213, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.66235763, + "num_input_tokens_seen": 99356685, + "step": 4597, + "time_per_iteration": 2.7207767963409424 + }, + { + "auxiliary_loss_clip": 0.01172647, + "auxiliary_loss_mlp": 0.01137463, + "balance_loss_clip": 1.00201309, + "balance_loss_mlp": 1.00089717, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 1.7781078096844993, + "language_loss": 0.86090446, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88400555, + "num_input_tokens_seen": 99374810, + "step": 4598, + "time_per_iteration": 2.513015031814575 + }, + { + "auxiliary_loss_clip": 0.01172796, + "auxiliary_loss_mlp": 0.01137917, + "balance_loss_clip": 1.00200987, + "balance_loss_mlp": 1.00068355, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.5935900540127927, + "language_loss": 0.79926932, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82237649, + "num_input_tokens_seen": 99391290, + "step": 4599, + "time_per_iteration": 2.491215229034424 + }, + { + "auxiliary_loss_clip": 0.0114191, + "auxiliary_loss_mlp": 0.01137864, + "balance_loss_clip": 1.00186193, + "balance_loss_mlp": 1.00091648, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.7873170392843476, + "language_loss": 0.79012561, + "learning_rate": 3.395433289506639e-06, + "loss": 0.81292331, + "num_input_tokens_seen": 99409120, + "step": 4600, + "time_per_iteration": 2.586880922317505 + }, + { + "auxiliary_loss_clip": 0.01123324, + "auxiliary_loss_mlp": 0.01138316, + "balance_loss_clip": 1.00176895, + "balance_loss_mlp": 1.00098705, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 2.0941162928608996, + "language_loss": 0.72939825, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75201464, + "num_input_tokens_seen": 99426180, + "step": 4601, + "time_per_iteration": 2.599174737930298 + }, + { + "auxiliary_loss_clip": 0.01155992, + "auxiliary_loss_mlp": 0.01137432, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00086594, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.7300149669541434, + "language_loss": 0.79843855, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.82137287, + "num_input_tokens_seen": 99447720, + "step": 4602, + "time_per_iteration": 2.590912103652954 + }, + { + "auxiliary_loss_clip": 0.01140557, + "auxiliary_loss_mlp": 0.01138277, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00094748, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.068910753590096, + "language_loss": 0.77026105, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.7930494, + "num_input_tokens_seen": 99464720, + "step": 4603, + "time_per_iteration": 2.555016279220581 + }, + { + "auxiliary_loss_clip": 0.01139398, + "auxiliary_loss_mlp": 0.01137312, + "balance_loss_clip": 1.00178552, + "balance_loss_mlp": 1.00093615, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.5112400818875622, + "language_loss": 0.81606698, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83883411, + "num_input_tokens_seen": 99482310, + "step": 4604, + "time_per_iteration": 2.573714017868042 + }, + { + "auxiliary_loss_clip": 0.011074, + "auxiliary_loss_mlp": 0.01137168, + "balance_loss_clip": 1.00173628, + "balance_loss_mlp": 1.0007925, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.612943586492413, + "language_loss": 0.69576305, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.71820873, + "num_input_tokens_seen": 99501255, + "step": 4605, + "time_per_iteration": 2.701474905014038 + }, + { + "auxiliary_loss_clip": 0.01152939, + "auxiliary_loss_mlp": 0.01123137, + "balance_loss_clip": 1.00176835, + "balance_loss_mlp": 1.00030351, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.6958178474959935, + "language_loss": 0.57223821, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59499896, + "num_input_tokens_seen": 99568925, + "step": 4606, + "time_per_iteration": 3.229825019836426 + }, + { + "auxiliary_loss_clip": 0.01139611, + "auxiliary_loss_mlp": 0.01138351, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00092649, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.2177988331964067, + "language_loss": 0.69525671, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71803629, + "num_input_tokens_seen": 99588455, + "step": 4607, + "time_per_iteration": 2.6315953731536865 + }, + { + "auxiliary_loss_clip": 0.01156129, + "auxiliary_loss_mlp": 0.01137162, + "balance_loss_clip": 1.00176299, + "balance_loss_mlp": 1.00078654, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 1.6798654932312602, + "language_loss": 0.70047289, + "learning_rate": 3.393199595837555e-06, + "loss": 0.72340578, + "num_input_tokens_seen": 99609355, + "step": 4608, + "time_per_iteration": 2.5847549438476562 + }, + { + "auxiliary_loss_clip": 0.01092194, + "auxiliary_loss_mlp": 0.01137291, + "balance_loss_clip": 1.00160384, + "balance_loss_mlp": 1.00072515, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.6790122272939707, + "language_loss": 0.7293855, + "learning_rate": 3.392920146281499e-06, + "loss": 0.75168031, + "num_input_tokens_seen": 99628780, + "step": 4609, + "time_per_iteration": 2.7289390563964844 + }, + { + "auxiliary_loss_clip": 0.01126858, + "auxiliary_loss_mlp": 0.01137622, + "balance_loss_clip": 1.00180042, + "balance_loss_mlp": 1.00096059, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 5.55941870943445, + "language_loss": 0.83511865, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.85776347, + "num_input_tokens_seen": 99644545, + "step": 4610, + "time_per_iteration": 2.607409954071045 + }, + { + "auxiliary_loss_clip": 0.0109342, + "auxiliary_loss_mlp": 0.00748317, + "balance_loss_clip": 1.0017277, + "balance_loss_mlp": 1.00054443, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.0093684905569265, + "language_loss": 0.69061577, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.70903313, + "num_input_tokens_seen": 99663125, + "step": 4611, + "time_per_iteration": 2.7261528968811035 + }, + { + "auxiliary_loss_clip": 0.01172608, + "auxiliary_loss_mlp": 0.01136398, + "balance_loss_clip": 1.00199544, + "balance_loss_mlp": 1.00078559, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 1.8535970782118554, + "language_loss": 0.73508549, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75817549, + "num_input_tokens_seen": 99682645, + "step": 4612, + "time_per_iteration": 2.5341289043426514 + }, + { + "auxiliary_loss_clip": 0.01155938, + "auxiliary_loss_mlp": 0.00748201, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00058484, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.178577182825712, + "language_loss": 0.66226673, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68130803, + "num_input_tokens_seen": 99700520, + "step": 4613, + "time_per_iteration": 2.557677745819092 + }, + { + "auxiliary_loss_clip": 0.0110933, + "auxiliary_loss_mlp": 0.01137627, + "balance_loss_clip": 1.00174975, + "balance_loss_mlp": 1.00077438, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.6239910682293495, + "language_loss": 0.79441136, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81688094, + "num_input_tokens_seen": 99720355, + "step": 4614, + "time_per_iteration": 2.692451238632202 + }, + { + "auxiliary_loss_clip": 0.01157329, + "auxiliary_loss_mlp": 0.0113826, + "balance_loss_clip": 1.00195944, + "balance_loss_mlp": 1.00093079, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.14766410135202, + "language_loss": 0.79954535, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.82250118, + "num_input_tokens_seen": 99736090, + "step": 4615, + "time_per_iteration": 2.5279839038848877 + }, + { + "auxiliary_loss_clip": 0.01125112, + "auxiliary_loss_mlp": 0.01137893, + "balance_loss_clip": 1.0017693, + "balance_loss_mlp": 1.00084996, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.497290962925232, + "language_loss": 0.63511193, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.65774202, + "num_input_tokens_seen": 99751805, + "step": 4616, + "time_per_iteration": 2.6120951175689697 + }, + { + "auxiliary_loss_clip": 0.0115613, + "auxiliary_loss_mlp": 0.01137299, + "balance_loss_clip": 1.00177717, + "balance_loss_mlp": 1.0008285, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 2.144008229733497, + "language_loss": 0.8221491, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84508342, + "num_input_tokens_seen": 99770610, + "step": 4617, + "time_per_iteration": 2.5534605979919434 + }, + { + "auxiliary_loss_clip": 0.0117255, + "auxiliary_loss_mlp": 0.01137179, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00080359, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 1.8619684862374033, + "language_loss": 0.76728773, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79038501, + "num_input_tokens_seen": 99787305, + "step": 4618, + "time_per_iteration": 2.54752516746521 + }, + { + "auxiliary_loss_clip": 0.01172783, + "auxiliary_loss_mlp": 0.01137478, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.00091171, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.674037029168375, + "language_loss": 0.84668005, + "learning_rate": 3.390122747388459e-06, + "loss": 0.86978263, + "num_input_tokens_seen": 99808940, + "step": 4619, + "time_per_iteration": 2.561586380004883 + }, + { + "auxiliary_loss_clip": 0.01140261, + "auxiliary_loss_mlp": 0.01136657, + "balance_loss_clip": 1.00171328, + "balance_loss_mlp": 1.00075817, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.4272679400740504, + "language_loss": 0.76870185, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.79147106, + "num_input_tokens_seen": 99829575, + "step": 4620, + "time_per_iteration": 2.623443365097046 + }, + { + "auxiliary_loss_clip": 0.01125666, + "auxiliary_loss_mlp": 0.01137018, + "balance_loss_clip": 1.00168598, + "balance_loss_mlp": 1.00064278, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 2.381999221123952, + "language_loss": 0.78567708, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80830383, + "num_input_tokens_seen": 99847575, + "step": 4621, + "time_per_iteration": 2.6196415424346924 + }, + { + "auxiliary_loss_clip": 0.01124347, + "auxiliary_loss_mlp": 0.01137635, + "balance_loss_clip": 1.00167465, + "balance_loss_mlp": 1.00097382, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 2.192117420281536, + "language_loss": 0.87729466, + "learning_rate": 3.389282499322611e-06, + "loss": 0.8999145, + "num_input_tokens_seen": 99864995, + "step": 4622, + "time_per_iteration": 2.6660847663879395 + }, + { + "auxiliary_loss_clip": 0.01106651, + "auxiliary_loss_mlp": 0.01137349, + "balance_loss_clip": 1.00156057, + "balance_loss_mlp": 1.00078332, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.677842104639196, + "language_loss": 0.81201667, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83445668, + "num_input_tokens_seen": 99881540, + "step": 4623, + "time_per_iteration": 2.6348230838775635 + }, + { + "auxiliary_loss_clip": 0.01124555, + "auxiliary_loss_mlp": 0.01137474, + "balance_loss_clip": 1.00173819, + "balance_loss_mlp": 1.00081265, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 2.0102193368794117, + "language_loss": 0.81595814, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83857834, + "num_input_tokens_seen": 99899595, + "step": 4624, + "time_per_iteration": 3.979332208633423 + }, + { + "auxiliary_loss_clip": 0.01140774, + "auxiliary_loss_mlp": 0.00748172, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00049651, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 2.193264530597419, + "language_loss": 0.76835454, + "learning_rate": 3.388441777121191e-06, + "loss": 0.78724396, + "num_input_tokens_seen": 99913020, + "step": 4625, + "time_per_iteration": 2.552074670791626 + }, + { + "auxiliary_loss_clip": 0.01142201, + "auxiliary_loss_mlp": 0.01136302, + "balance_loss_clip": 1.00184166, + "balance_loss_mlp": 1.00078523, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 1.8940063827230014, + "language_loss": 0.6992811, + "learning_rate": 3.388161431073511e-06, + "loss": 0.7220661, + "num_input_tokens_seen": 99931405, + "step": 4626, + "time_per_iteration": 2.5778074264526367 + }, + { + "auxiliary_loss_clip": 0.01109007, + "auxiliary_loss_mlp": 0.01137728, + "balance_loss_clip": 1.00171208, + "balance_loss_mlp": 1.00068498, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.12845337626462, + "language_loss": 0.93286681, + "learning_rate": 3.38788103238661e-06, + "loss": 0.95533419, + "num_input_tokens_seen": 99948100, + "step": 4627, + "time_per_iteration": 4.053231954574585 + }, + { + "auxiliary_loss_clip": 0.01172655, + "auxiliary_loss_mlp": 0.0113741, + "balance_loss_clip": 1.00191593, + "balance_loss_mlp": 1.00074852, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 1.7175272562975084, + "language_loss": 0.8542614, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87736201, + "num_input_tokens_seen": 99966470, + "step": 4628, + "time_per_iteration": 2.5731372833251953 + }, + { + "auxiliary_loss_clip": 0.01126331, + "auxiliary_loss_mlp": 0.01136085, + "balance_loss_clip": 1.00173676, + "balance_loss_mlp": 1.00085425, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.446389013843189, + "language_loss": 0.79142022, + "learning_rate": 3.387320077137679e-06, + "loss": 0.8140443, + "num_input_tokens_seen": 99985930, + "step": 4629, + "time_per_iteration": 5.464744567871094 + }, + { + "auxiliary_loss_clip": 0.01125843, + "auxiliary_loss_mlp": 0.01135685, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00083542, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.5134611732975594, + "language_loss": 0.84545606, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86807132, + "num_input_tokens_seen": 100006235, + "step": 4630, + "time_per_iteration": 2.750981569290161 + }, + { + "auxiliary_loss_clip": 0.01140326, + "auxiliary_loss_mlp": 0.01137085, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00071001, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.8062032580505796, + "language_loss": 0.81156039, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83433455, + "num_input_tokens_seen": 100023655, + "step": 4631, + "time_per_iteration": 2.5977299213409424 + }, + { + "auxiliary_loss_clip": 0.01172772, + "auxiliary_loss_mlp": 0.01137789, + "balance_loss_clip": 1.00206089, + "balance_loss_mlp": 1.00074625, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 1.7153993021315277, + "language_loss": 0.71683615, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73994178, + "num_input_tokens_seen": 100043280, + "step": 4632, + "time_per_iteration": 2.54538893699646 + }, + { + "auxiliary_loss_clip": 0.01155403, + "auxiliary_loss_mlp": 0.01135764, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00091457, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.6895733096740972, + "language_loss": 0.82231045, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84522212, + "num_input_tokens_seen": 100057690, + "step": 4633, + "time_per_iteration": 2.4947776794433594 + }, + { + "auxiliary_loss_clip": 0.0114033, + "auxiliary_loss_mlp": 0.01136345, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00063705, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.6404094422051252, + "language_loss": 0.87829852, + "learning_rate": 3.385916768573529e-06, + "loss": 0.90106529, + "num_input_tokens_seen": 100075875, + "step": 4634, + "time_per_iteration": 2.6041135787963867 + }, + { + "auxiliary_loss_clip": 0.01140516, + "auxiliary_loss_mlp": 0.01137875, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00073683, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.5818571615758805, + "language_loss": 0.76872706, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79151094, + "num_input_tokens_seen": 100092930, + "step": 4635, + "time_per_iteration": 2.5968871116638184 + }, + { + "auxiliary_loss_clip": 0.01172575, + "auxiliary_loss_mlp": 0.01136758, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.0009551, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.5027343227405594, + "language_loss": 0.65438426, + "learning_rate": 3.385355077194637e-06, + "loss": 0.6774776, + "num_input_tokens_seen": 100110790, + "step": 4636, + "time_per_iteration": 2.5195586681365967 + }, + { + "auxiliary_loss_clip": 0.01157465, + "auxiliary_loss_mlp": 0.01137947, + "balance_loss_clip": 1.0019238, + "balance_loss_mlp": 1.00071287, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.6164539181344852, + "language_loss": 0.84180129, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.86475539, + "num_input_tokens_seen": 100126970, + "step": 4637, + "time_per_iteration": 2.540222644805908 + }, + { + "auxiliary_loss_clip": 0.0114174, + "auxiliary_loss_mlp": 0.0113661, + "balance_loss_clip": 1.00180173, + "balance_loss_mlp": 1.00090218, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.4332707489612888, + "language_loss": 0.75650853, + "learning_rate": 3.384793175684533e-06, + "loss": 0.77929199, + "num_input_tokens_seen": 100146720, + "step": 4638, + "time_per_iteration": 2.6111197471618652 + }, + { + "auxiliary_loss_clip": 0.01157254, + "auxiliary_loss_mlp": 0.01137217, + "balance_loss_clip": 1.00182259, + "balance_loss_mlp": 1.00103199, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.5535517030854797, + "language_loss": 0.7116403, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73458505, + "num_input_tokens_seen": 100165920, + "step": 4639, + "time_per_iteration": 2.536466121673584 + }, + { + "auxiliary_loss_clip": 0.01156876, + "auxiliary_loss_mlp": 0.01136861, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00067651, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.476763990456925, + "language_loss": 0.66075903, + "learning_rate": 3.384231064128447e-06, + "loss": 0.68369639, + "num_input_tokens_seen": 100185525, + "step": 4640, + "time_per_iteration": 2.605890989303589 + }, + { + "auxiliary_loss_clip": 0.01155959, + "auxiliary_loss_mlp": 0.01136854, + "balance_loss_clip": 1.00184059, + "balance_loss_mlp": 1.00066924, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 2.82241218803579, + "language_loss": 0.72052538, + "learning_rate": 3.383949929609804e-06, + "loss": 0.7434535, + "num_input_tokens_seen": 100204850, + "step": 4641, + "time_per_iteration": 2.554229736328125 + }, + { + "auxiliary_loss_clip": 0.01125378, + "auxiliary_loss_mlp": 0.0113732, + "balance_loss_clip": 1.00173187, + "balance_loss_mlp": 1.00065851, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.8110604659757807, + "language_loss": 0.74635577, + "learning_rate": 3.383668742611641e-06, + "loss": 0.76898277, + "num_input_tokens_seen": 100224520, + "step": 4642, + "time_per_iteration": 2.622844696044922 + }, + { + "auxiliary_loss_clip": 0.01126399, + "auxiliary_loss_mlp": 0.0113758, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00091839, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 2.0165506716583024, + "language_loss": 0.85793614, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.8805759, + "num_input_tokens_seen": 100243935, + "step": 4643, + "time_per_iteration": 2.6358771324157715 + }, + { + "auxiliary_loss_clip": 0.01124478, + "auxiliary_loss_mlp": 0.01137224, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00094378, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 1.6560612383320412, + "language_loss": 0.83121026, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85382724, + "num_input_tokens_seen": 100262290, + "step": 4644, + "time_per_iteration": 2.62138032913208 + }, + { + "auxiliary_loss_clip": 0.01157117, + "auxiliary_loss_mlp": 0.01136982, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00070155, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 1.8609309249071864, + "language_loss": 0.79679817, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81973916, + "num_input_tokens_seen": 100280015, + "step": 4645, + "time_per_iteration": 2.5328643321990967 + }, + { + "auxiliary_loss_clip": 0.0113754, + "auxiliary_loss_mlp": 0.01122217, + "balance_loss_clip": 1.00161529, + "balance_loss_mlp": 1.00014687, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7729322872552953, + "language_loss": 0.62278944, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64538705, + "num_input_tokens_seen": 100338935, + "step": 4646, + "time_per_iteration": 3.1349170207977295 + }, + { + "auxiliary_loss_clip": 0.01139094, + "auxiliary_loss_mlp": 0.01135859, + "balance_loss_clip": 1.00178301, + "balance_loss_mlp": 1.00072312, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.5249723025485618, + "language_loss": 0.89452469, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91727424, + "num_input_tokens_seen": 100359905, + "step": 4647, + "time_per_iteration": 2.64844012260437 + }, + { + "auxiliary_loss_clip": 0.01156834, + "auxiliary_loss_mlp": 0.01137408, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.00084209, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.5456796048829915, + "language_loss": 0.8715291, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89447153, + "num_input_tokens_seen": 100376955, + "step": 4648, + "time_per_iteration": 2.5459213256835938 + }, + { + "auxiliary_loss_clip": 0.01156074, + "auxiliary_loss_mlp": 0.0113755, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.00069821, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 2.142371037043384, + "language_loss": 0.7296797, + "learning_rate": 3.38169896509385e-06, + "loss": 0.75261599, + "num_input_tokens_seen": 100397545, + "step": 4649, + "time_per_iteration": 2.6124236583709717 + }, + { + "auxiliary_loss_clip": 0.01123049, + "auxiliary_loss_mlp": 0.01136853, + "balance_loss_clip": 1.00162613, + "balance_loss_mlp": 1.00085878, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.1109148188196984, + "language_loss": 0.80768293, + "learning_rate": 3.381417358643549e-06, + "loss": 0.83028191, + "num_input_tokens_seen": 100415080, + "step": 4650, + "time_per_iteration": 2.610332489013672 + }, + { + "auxiliary_loss_clip": 0.01137324, + "auxiliary_loss_mlp": 0.00747548, + "balance_loss_clip": 1.00241446, + "balance_loss_mlp": 1.00010514, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.817946484844447, + "language_loss": 0.58850241, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60735112, + "num_input_tokens_seen": 100471105, + "step": 4651, + "time_per_iteration": 3.2231433391571045 + }, + { + "auxiliary_loss_clip": 0.01156074, + "auxiliary_loss_mlp": 0.01137313, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00084198, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.5389136485937729, + "language_loss": 0.74469364, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76762748, + "num_input_tokens_seen": 100492520, + "step": 4652, + "time_per_iteration": 2.6090192794799805 + }, + { + "auxiliary_loss_clip": 0.01172724, + "auxiliary_loss_mlp": 0.01137452, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00107682, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.329777577885941, + "language_loss": 0.79809844, + "learning_rate": 3.380572225034461e-06, + "loss": 0.82120019, + "num_input_tokens_seen": 100512870, + "step": 4653, + "time_per_iteration": 2.6627514362335205 + }, + { + "auxiliary_loss_clip": 0.01138953, + "auxiliary_loss_mlp": 0.01137239, + "balance_loss_clip": 1.00172961, + "balance_loss_mlp": 1.00086379, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.0472370374538267, + "language_loss": 0.78756231, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81032419, + "num_input_tokens_seen": 100531655, + "step": 4654, + "time_per_iteration": 2.588128089904785 + }, + { + "auxiliary_loss_clip": 0.01108722, + "auxiliary_loss_mlp": 0.01137959, + "balance_loss_clip": 1.00165963, + "balance_loss_mlp": 1.00082088, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 5.178594803957837, + "language_loss": 0.80780399, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83027077, + "num_input_tokens_seen": 100548005, + "step": 4655, + "time_per_iteration": 2.6742658615112305 + }, + { + "auxiliary_loss_clip": 0.0112387, + "auxiliary_loss_mlp": 0.00748229, + "balance_loss_clip": 1.00161958, + "balance_loss_mlp": 1.00041127, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.5777024641420845, + "language_loss": 0.81480056, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.8335216, + "num_input_tokens_seen": 100567980, + "step": 4656, + "time_per_iteration": 2.7308506965637207 + }, + { + "auxiliary_loss_clip": 0.01139602, + "auxiliary_loss_mlp": 0.01136806, + "balance_loss_clip": 1.00166392, + "balance_loss_mlp": 1.00081229, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6872224489572432, + "language_loss": 0.83457565, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85733974, + "num_input_tokens_seen": 100588630, + "step": 4657, + "time_per_iteration": 2.6266767978668213 + }, + { + "auxiliary_loss_clip": 0.01124967, + "auxiliary_loss_mlp": 0.01137328, + "balance_loss_clip": 1.00174046, + "balance_loss_mlp": 1.00085688, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.6477967241556264, + "language_loss": 0.63902956, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66165257, + "num_input_tokens_seen": 100608775, + "step": 4658, + "time_per_iteration": 2.7666993141174316 + }, + { + "auxiliary_loss_clip": 0.01157296, + "auxiliary_loss_mlp": 0.01137604, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00084734, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.7730607764995425, + "language_loss": 0.78331321, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80626225, + "num_input_tokens_seen": 100627975, + "step": 4659, + "time_per_iteration": 2.5441110134124756 + }, + { + "auxiliary_loss_clip": 0.01123992, + "auxiliary_loss_mlp": 0.01138083, + "balance_loss_clip": 1.0018338, + "balance_loss_mlp": 1.00084949, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.684417262013102, + "language_loss": 0.79533952, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81796026, + "num_input_tokens_seen": 100645430, + "step": 4660, + "time_per_iteration": 2.6500468254089355 + }, + { + "auxiliary_loss_clip": 0.01123744, + "auxiliary_loss_mlp": 0.01136778, + "balance_loss_clip": 1.00177026, + "balance_loss_mlp": 1.0007844, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.9018476795228787, + "language_loss": 0.80171967, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82432491, + "num_input_tokens_seen": 100663775, + "step": 4661, + "time_per_iteration": 2.6112475395202637 + }, + { + "auxiliary_loss_clip": 0.01140397, + "auxiliary_loss_mlp": 0.01138225, + "balance_loss_clip": 1.00206637, + "balance_loss_mlp": 1.00108695, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.7001886825550658, + "language_loss": 0.7871182, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.80990446, + "num_input_tokens_seen": 100686085, + "step": 4662, + "time_per_iteration": 4.076837778091431 + }, + { + "auxiliary_loss_clip": 0.01141231, + "auxiliary_loss_mlp": 0.01137808, + "balance_loss_clip": 1.00182474, + "balance_loss_mlp": 1.00085998, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 1.5183052837733806, + "language_loss": 0.69856668, + "learning_rate": 3.377751711782227e-06, + "loss": 0.72135699, + "num_input_tokens_seen": 100705135, + "step": 4663, + "time_per_iteration": 2.613271713256836 + }, + { + "auxiliary_loss_clip": 0.01145464, + "auxiliary_loss_mlp": 0.01137887, + "balance_loss_clip": 1.00258636, + "balance_loss_mlp": 1.00084364, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.789990892892017, + "language_loss": 0.77725774, + "learning_rate": 3.377469372935791e-06, + "loss": 0.80009127, + "num_input_tokens_seen": 100724960, + "step": 4664, + "time_per_iteration": 4.015566825866699 + }, + { + "auxiliary_loss_clip": 0.01128904, + "auxiliary_loss_mlp": 0.01136871, + "balance_loss_clip": 1.00237417, + "balance_loss_mlp": 1.00078201, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.9929231389763993, + "language_loss": 0.79132056, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81397831, + "num_input_tokens_seen": 100741995, + "step": 4665, + "time_per_iteration": 2.5895774364471436 + }, + { + "auxiliary_loss_clip": 0.01157118, + "auxiliary_loss_mlp": 0.01137434, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00077236, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 1.713978147501336, + "language_loss": 0.80613375, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82907927, + "num_input_tokens_seen": 100758985, + "step": 4666, + "time_per_iteration": 5.394855499267578 + }, + { + "auxiliary_loss_clip": 0.01124289, + "auxiliary_loss_mlp": 0.01137959, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00091636, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 1.8663641657169063, + "language_loss": 0.84525454, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86787707, + "num_input_tokens_seen": 100777820, + "step": 4667, + "time_per_iteration": 2.6111018657684326 + }, + { + "auxiliary_loss_clip": 0.01125536, + "auxiliary_loss_mlp": 0.0074811, + "balance_loss_clip": 1.0018276, + "balance_loss_mlp": 1.00037408, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.528530605553866, + "language_loss": 0.79371345, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81244987, + "num_input_tokens_seen": 100798205, + "step": 4668, + "time_per_iteration": 2.6945712566375732 + }, + { + "auxiliary_loss_clip": 0.01096899, + "auxiliary_loss_mlp": 0.0113792, + "balance_loss_clip": 1.00199497, + "balance_loss_mlp": 1.00068641, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 1.383488927746776, + "language_loss": 0.76169449, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78404266, + "num_input_tokens_seen": 100819800, + "step": 4669, + "time_per_iteration": 2.7462291717529297 + }, + { + "auxiliary_loss_clip": 0.01155966, + "auxiliary_loss_mlp": 0.01137725, + "balance_loss_clip": 1.00185394, + "balance_loss_mlp": 1.00106382, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 1.9779908048798163, + "language_loss": 0.78686762, + "learning_rate": 3.375774243322725e-06, + "loss": 0.80980456, + "num_input_tokens_seen": 100837880, + "step": 4670, + "time_per_iteration": 2.5597572326660156 + }, + { + "auxiliary_loss_clip": 0.01124168, + "auxiliary_loss_mlp": 0.01137452, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00079107, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 2.1097834835907006, + "language_loss": 0.79154295, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81415915, + "num_input_tokens_seen": 100856350, + "step": 4671, + "time_per_iteration": 2.6513278484344482 + }, + { + "auxiliary_loss_clip": 0.01157172, + "auxiliary_loss_mlp": 0.01137352, + "balance_loss_clip": 1.00198996, + "balance_loss_mlp": 1.00078607, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.7058974431491043, + "language_loss": 0.7498216, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77276683, + "num_input_tokens_seen": 100876135, + "step": 4672, + "time_per_iteration": 2.6260251998901367 + }, + { + "auxiliary_loss_clip": 0.01141521, + "auxiliary_loss_mlp": 0.01138794, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00098765, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.8541675355266443, + "language_loss": 0.74851596, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77131915, + "num_input_tokens_seen": 100894790, + "step": 4673, + "time_per_iteration": 2.625065803527832 + }, + { + "auxiliary_loss_clip": 0.0115563, + "auxiliary_loss_mlp": 0.01137459, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00079799, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 2.9445722474653606, + "language_loss": 0.72461635, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74754721, + "num_input_tokens_seen": 100915100, + "step": 4674, + "time_per_iteration": 2.565648317337036 + }, + { + "auxiliary_loss_clip": 0.01157299, + "auxiliary_loss_mlp": 0.0113805, + "balance_loss_clip": 1.00191438, + "balance_loss_mlp": 1.00072098, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.913962267628552, + "language_loss": 0.77479738, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79775089, + "num_input_tokens_seen": 100932795, + "step": 4675, + "time_per_iteration": 2.590945243835449 + }, + { + "auxiliary_loss_clip": 0.01172544, + "auxiliary_loss_mlp": 0.01137631, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00077844, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 2.097155582918032, + "language_loss": 0.70527864, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72838044, + "num_input_tokens_seen": 100950505, + "step": 4676, + "time_per_iteration": 2.485564947128296 + }, + { + "auxiliary_loss_clip": 0.01172379, + "auxiliary_loss_mlp": 0.01137022, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00083733, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.577100566646417, + "language_loss": 0.70603591, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72912991, + "num_input_tokens_seen": 100968790, + "step": 4677, + "time_per_iteration": 2.5265066623687744 + }, + { + "auxiliary_loss_clip": 0.01157413, + "auxiliary_loss_mlp": 0.01138397, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00097275, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.5186819950733106, + "language_loss": 0.63287836, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65583646, + "num_input_tokens_seen": 100990205, + "step": 4678, + "time_per_iteration": 2.607574939727783 + }, + { + "auxiliary_loss_clip": 0.01156045, + "auxiliary_loss_mlp": 0.0113712, + "balance_loss_clip": 1.00172305, + "balance_loss_mlp": 1.00084043, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.9033659484857186, + "language_loss": 0.70214868, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.72508037, + "num_input_tokens_seen": 101009815, + "step": 4679, + "time_per_iteration": 2.72627854347229 + }, + { + "auxiliary_loss_clip": 0.01155872, + "auxiliary_loss_mlp": 0.01137153, + "balance_loss_clip": 1.00183713, + "balance_loss_mlp": 1.0007776, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.854302145963408, + "language_loss": 0.75046551, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.77339578, + "num_input_tokens_seen": 101026780, + "step": 4680, + "time_per_iteration": 2.5308632850646973 + }, + { + "auxiliary_loss_clip": 0.01172518, + "auxiliary_loss_mlp": 0.01137328, + "balance_loss_clip": 1.00191796, + "balance_loss_mlp": 1.00085759, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.6488021793442331, + "language_loss": 0.77485621, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79795468, + "num_input_tokens_seen": 101046215, + "step": 4681, + "time_per_iteration": 2.522111415863037 + }, + { + "auxiliary_loss_clip": 0.01156148, + "auxiliary_loss_mlp": 0.0113728, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.00090432, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 1.7972467300932924, + "language_loss": 0.73701191, + "learning_rate": 3.372378352108146e-06, + "loss": 0.75994617, + "num_input_tokens_seen": 101063365, + "step": 4682, + "time_per_iteration": 2.5202085971832275 + }, + { + "auxiliary_loss_clip": 0.01172471, + "auxiliary_loss_mlp": 0.01136703, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00080407, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4066413459933078, + "language_loss": 0.80568933, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.82878107, + "num_input_tokens_seen": 101083835, + "step": 4683, + "time_per_iteration": 2.5348052978515625 + }, + { + "auxiliary_loss_clip": 0.01108038, + "auxiliary_loss_mlp": 0.01138253, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00092363, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.5782728672816622, + "language_loss": 0.76078498, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78324789, + "num_input_tokens_seen": 101101740, + "step": 4684, + "time_per_iteration": 2.659048557281494 + }, + { + "auxiliary_loss_clip": 0.01106714, + "auxiliary_loss_mlp": 0.01136649, + "balance_loss_clip": 1.0015378, + "balance_loss_mlp": 1.00075054, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.682421985743824, + "language_loss": 0.75912887, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.78156251, + "num_input_tokens_seen": 101120480, + "step": 4685, + "time_per_iteration": 2.633256435394287 + }, + { + "auxiliary_loss_clip": 0.01139236, + "auxiliary_loss_mlp": 0.01136833, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00074399, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 2.1943107175970376, + "language_loss": 0.75939673, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78215742, + "num_input_tokens_seen": 101142910, + "step": 4686, + "time_per_iteration": 2.6420187950134277 + }, + { + "auxiliary_loss_clip": 0.01140672, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.00087929, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.331940779035314, + "language_loss": 0.62693, + "learning_rate": 3.370961184640025e-06, + "loss": 0.64971501, + "num_input_tokens_seen": 101160030, + "step": 4687, + "time_per_iteration": 2.5626542568206787 + }, + { + "auxiliary_loss_clip": 0.01139525, + "auxiliary_loss_mlp": 0.01137352, + "balance_loss_clip": 1.0017966, + "balance_loss_mlp": 1.00107217, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 3.694886128360216, + "language_loss": 0.76014578, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.78291464, + "num_input_tokens_seen": 101177675, + "step": 4688, + "time_per_iteration": 2.609288215637207 + }, + { + "auxiliary_loss_clip": 0.01129751, + "auxiliary_loss_mlp": 0.01137061, + "balance_loss_clip": 1.00223374, + "balance_loss_mlp": 1.00078082, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 1.8705167603298078, + "language_loss": 0.78554964, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80821776, + "num_input_tokens_seen": 101192225, + "step": 4689, + "time_per_iteration": 2.5697762966156006 + }, + { + "auxiliary_loss_clip": 0.01125156, + "auxiliary_loss_mlp": 0.0113728, + "balance_loss_clip": 1.00165713, + "balance_loss_mlp": 1.00071442, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 1.585311008615606, + "language_loss": 0.77893519, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80155957, + "num_input_tokens_seen": 101210870, + "step": 4690, + "time_per_iteration": 2.6245217323303223 + }, + { + "auxiliary_loss_clip": 0.01172571, + "auxiliary_loss_mlp": 0.00748155, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00026798, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.6941385585249027, + "language_loss": 0.87611532, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89532256, + "num_input_tokens_seen": 101229965, + "step": 4691, + "time_per_iteration": 2.512693405151367 + }, + { + "auxiliary_loss_clip": 0.0114051, + "auxiliary_loss_mlp": 0.01137757, + "balance_loss_clip": 1.00176275, + "balance_loss_mlp": 1.00080931, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 1.616870182636898, + "language_loss": 0.81622052, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.83900321, + "num_input_tokens_seen": 101250980, + "step": 4692, + "time_per_iteration": 2.6237781047821045 + }, + { + "auxiliary_loss_clip": 0.01128735, + "auxiliary_loss_mlp": 0.01136815, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00072527, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.3592548101789903, + "language_loss": 0.74374211, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.7663976, + "num_input_tokens_seen": 101273335, + "step": 4693, + "time_per_iteration": 2.682243824005127 + }, + { + "auxiliary_loss_clip": 0.01124068, + "auxiliary_loss_mlp": 0.01137052, + "balance_loss_clip": 1.00166631, + "balance_loss_mlp": 1.00058115, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.5108351266459026, + "language_loss": 0.77706361, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79967487, + "num_input_tokens_seen": 101292110, + "step": 4694, + "time_per_iteration": 2.61507248878479 + }, + { + "auxiliary_loss_clip": 0.01155783, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00057709, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 3.0635382475927777, + "language_loss": 0.66311347, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.68604088, + "num_input_tokens_seen": 101312815, + "step": 4695, + "time_per_iteration": 2.6238458156585693 + }, + { + "auxiliary_loss_clip": 0.01140147, + "auxiliary_loss_mlp": 0.01138015, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00087714, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.076586319821319, + "language_loss": 0.75642288, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.77920449, + "num_input_tokens_seen": 101329045, + "step": 4696, + "time_per_iteration": 2.598637342453003 + }, + { + "auxiliary_loss_clip": 0.01130059, + "auxiliary_loss_mlp": 0.0113724, + "balance_loss_clip": 1.00216007, + "balance_loss_mlp": 1.0007689, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 1.524611963799363, + "language_loss": 0.62604737, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64872038, + "num_input_tokens_seen": 101352715, + "step": 4697, + "time_per_iteration": 2.7891790866851807 + }, + { + "auxiliary_loss_clip": 0.01125368, + "auxiliary_loss_mlp": 0.01136728, + "balance_loss_clip": 1.0016861, + "balance_loss_mlp": 1.00073397, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.4042211191125828, + "language_loss": 0.73171902, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75434005, + "num_input_tokens_seen": 101374640, + "step": 4698, + "time_per_iteration": 2.7213480472564697 + }, + { + "auxiliary_loss_clip": 0.01172351, + "auxiliary_loss_mlp": 0.011367, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.0007062, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 1.5300166806146782, + "language_loss": 0.74965608, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77274662, + "num_input_tokens_seen": 101393595, + "step": 4699, + "time_per_iteration": 3.9518966674804688 + }, + { + "auxiliary_loss_clip": 0.01157355, + "auxiliary_loss_mlp": 0.01136946, + "balance_loss_clip": 1.0018568, + "balance_loss_mlp": 1.00066626, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 2.6364353607500175, + "language_loss": 0.80669624, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.82963932, + "num_input_tokens_seen": 101409265, + "step": 4700, + "time_per_iteration": 2.5493056774139404 + }, + { + "auxiliary_loss_clip": 0.01139318, + "auxiliary_loss_mlp": 0.01136747, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00103879, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.930016278918759, + "language_loss": 0.81597984, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.83874047, + "num_input_tokens_seen": 101428365, + "step": 4701, + "time_per_iteration": 2.6244518756866455 + }, + { + "auxiliary_loss_clip": 0.0107672, + "auxiliary_loss_mlp": 0.01136452, + "balance_loss_clip": 1.00153852, + "balance_loss_mlp": 1.00074434, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.344833745848164, + "language_loss": 0.72982001, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75195169, + "num_input_tokens_seen": 101447280, + "step": 4702, + "time_per_iteration": 4.208219528198242 + }, + { + "auxiliary_loss_clip": 0.01172326, + "auxiliary_loss_mlp": 0.01136772, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.0007782, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 2.0245177439243127, + "language_loss": 0.78775311, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.81084406, + "num_input_tokens_seen": 101465435, + "step": 4703, + "time_per_iteration": 2.5080761909484863 + }, + { + "auxiliary_loss_clip": 0.01140778, + "auxiliary_loss_mlp": 0.01136852, + "balance_loss_clip": 1.00183368, + "balance_loss_mlp": 1.00095379, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.5168245319629448, + "language_loss": 0.69195157, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71472788, + "num_input_tokens_seen": 101486355, + "step": 4704, + "time_per_iteration": 5.431425333023071 + }, + { + "auxiliary_loss_clip": 0.01124954, + "auxiliary_loss_mlp": 0.01136108, + "balance_loss_clip": 1.00163436, + "balance_loss_mlp": 1.00078201, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.8772053274522345, + "language_loss": 0.7074337, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.73004436, + "num_input_tokens_seen": 101505875, + "step": 4705, + "time_per_iteration": 2.6902503967285156 + }, + { + "auxiliary_loss_clip": 0.01153063, + "auxiliary_loss_mlp": 0.01121238, + "balance_loss_clip": 1.00216246, + "balance_loss_mlp": 0.99993098, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7277659051625681, + "language_loss": 0.59282339, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61556649, + "num_input_tokens_seen": 101565045, + "step": 4706, + "time_per_iteration": 3.1737282276153564 + }, + { + "auxiliary_loss_clip": 0.01141382, + "auxiliary_loss_mlp": 0.01136082, + "balance_loss_clip": 1.00174522, + "balance_loss_mlp": 1.00094676, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.6865012478978165, + "language_loss": 0.82014656, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84292126, + "num_input_tokens_seen": 101585825, + "step": 4707, + "time_per_iteration": 2.650686025619507 + }, + { + "auxiliary_loss_clip": 0.01140769, + "auxiliary_loss_mlp": 0.011375, + "balance_loss_clip": 1.00174892, + "balance_loss_mlp": 1.00064731, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.3587433987515989, + "language_loss": 0.80422449, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82700717, + "num_input_tokens_seen": 101606105, + "step": 4708, + "time_per_iteration": 2.634678363800049 + }, + { + "auxiliary_loss_clip": 0.01137961, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 0.9999029, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.8807419786051982, + "language_loss": 0.62815082, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.65074253, + "num_input_tokens_seen": 101656875, + "step": 4709, + "time_per_iteration": 3.037078619003296 + }, + { + "auxiliary_loss_clip": 0.01140398, + "auxiliary_loss_mlp": 0.0113687, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.0004946, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.3909737271396327, + "language_loss": 0.73735046, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76012319, + "num_input_tokens_seen": 101676225, + "step": 4710, + "time_per_iteration": 2.604126453399658 + }, + { + "auxiliary_loss_clip": 0.01129742, + "auxiliary_loss_mlp": 0.01137249, + "balance_loss_clip": 1.00211215, + "balance_loss_mlp": 1.00087333, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.6429825412493329, + "language_loss": 0.78965956, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81232953, + "num_input_tokens_seen": 101693710, + "step": 4711, + "time_per_iteration": 2.6448307037353516 + }, + { + "auxiliary_loss_clip": 0.01155817, + "auxiliary_loss_mlp": 0.00748214, + "balance_loss_clip": 1.0018419, + "balance_loss_mlp": 1.00037098, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 1.7242864026786386, + "language_loss": 0.70929682, + "learning_rate": 3.363855879093996e-06, + "loss": 0.72833711, + "num_input_tokens_seen": 101714010, + "step": 4712, + "time_per_iteration": 2.641970157623291 + }, + { + "auxiliary_loss_clip": 0.01172313, + "auxiliary_loss_mlp": 0.01137437, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.0010612, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.8383716139193496, + "language_loss": 0.81560552, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.83870298, + "num_input_tokens_seen": 101732995, + "step": 4713, + "time_per_iteration": 2.522406578063965 + }, + { + "auxiliary_loss_clip": 0.0113936, + "auxiliary_loss_mlp": 0.01137131, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00085092, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.8457373908694352, + "language_loss": 0.75385845, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77662337, + "num_input_tokens_seen": 101751385, + "step": 4714, + "time_per_iteration": 2.602665424346924 + }, + { + "auxiliary_loss_clip": 0.01155995, + "auxiliary_loss_mlp": 0.01136779, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00078464, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.4253339812353893, + "language_loss": 0.78426576, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80719352, + "num_input_tokens_seen": 101773825, + "step": 4715, + "time_per_iteration": 2.642613410949707 + }, + { + "auxiliary_loss_clip": 0.01139985, + "auxiliary_loss_mlp": 0.01136378, + "balance_loss_clip": 1.00172615, + "balance_loss_mlp": 1.00076556, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.78776938456973, + "language_loss": 0.7355113, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.75827497, + "num_input_tokens_seen": 101791920, + "step": 4716, + "time_per_iteration": 2.6438958644866943 + }, + { + "auxiliary_loss_clip": 0.0114035, + "auxiliary_loss_mlp": 0.01137149, + "balance_loss_clip": 1.00170267, + "balance_loss_mlp": 1.0008688, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.7421042151898827, + "language_loss": 0.74823767, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.77101266, + "num_input_tokens_seen": 101809515, + "step": 4717, + "time_per_iteration": 2.577549934387207 + }, + { + "auxiliary_loss_clip": 0.01124823, + "auxiliary_loss_mlp": 0.01137252, + "balance_loss_clip": 1.00167036, + "balance_loss_mlp": 1.00097227, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.8948689885289032, + "language_loss": 0.66972733, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69234806, + "num_input_tokens_seen": 101827735, + "step": 4718, + "time_per_iteration": 2.6720011234283447 + }, + { + "auxiliary_loss_clip": 0.01139215, + "auxiliary_loss_mlp": 0.01137245, + "balance_loss_clip": 1.00169289, + "balance_loss_mlp": 1.00067842, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.7042492788299106, + "language_loss": 0.72472417, + "learning_rate": 3.361860593925566e-06, + "loss": 0.7474888, + "num_input_tokens_seen": 101845970, + "step": 4719, + "time_per_iteration": 2.6355369091033936 + }, + { + "auxiliary_loss_clip": 0.01156657, + "auxiliary_loss_mlp": 0.01135994, + "balance_loss_clip": 1.00187528, + "balance_loss_mlp": 1.00066745, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.8472158341103255, + "language_loss": 0.80149913, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.8244257, + "num_input_tokens_seen": 101865040, + "step": 4720, + "time_per_iteration": 2.5734777450561523 + }, + { + "auxiliary_loss_clip": 0.01157304, + "auxiliary_loss_mlp": 0.01137524, + "balance_loss_clip": 1.00192535, + "balance_loss_mlp": 1.00095785, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.7753648135742348, + "language_loss": 0.78980196, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81275028, + "num_input_tokens_seen": 101883735, + "step": 4721, + "time_per_iteration": 2.5262506008148193 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.00748172, + "balance_loss_clip": 1.00162077, + "balance_loss_mlp": 1.00034416, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.1215570780486384, + "language_loss": 0.82504082, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84359461, + "num_input_tokens_seen": 101903025, + "step": 4722, + "time_per_iteration": 2.693864107131958 + }, + { + "auxiliary_loss_clip": 0.01172455, + "auxiliary_loss_mlp": 0.01137137, + "balance_loss_clip": 1.00198662, + "balance_loss_mlp": 1.00066662, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.8064179840844954, + "language_loss": 0.70255339, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72564936, + "num_input_tokens_seen": 101922255, + "step": 4723, + "time_per_iteration": 2.5653786659240723 + }, + { + "auxiliary_loss_clip": 0.01140639, + "auxiliary_loss_mlp": 0.01136458, + "balance_loss_clip": 1.00169003, + "balance_loss_mlp": 1.00094092, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.6115330150365794, + "language_loss": 0.78788793, + "learning_rate": 3.360433840760998e-06, + "loss": 0.81065893, + "num_input_tokens_seen": 101943100, + "step": 4724, + "time_per_iteration": 2.691124439239502 + }, + { + "auxiliary_loss_clip": 0.01140188, + "auxiliary_loss_mlp": 0.01136512, + "balance_loss_clip": 1.0017674, + "balance_loss_mlp": 1.00089908, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.9742945314968428, + "language_loss": 0.92095196, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94371891, + "num_input_tokens_seen": 101963160, + "step": 4725, + "time_per_iteration": 2.6423704624176025 + }, + { + "auxiliary_loss_clip": 0.01155898, + "auxiliary_loss_mlp": 0.0113711, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00073457, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 1.6068512852248225, + "language_loss": 0.88627768, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90920776, + "num_input_tokens_seen": 101984300, + "step": 4726, + "time_per_iteration": 2.6239333152770996 + }, + { + "auxiliary_loss_clip": 0.01155936, + "auxiliary_loss_mlp": 0.01137171, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00079513, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 1.776412651304606, + "language_loss": 0.78753996, + "learning_rate": 3.359577169722238e-06, + "loss": 0.810471, + "num_input_tokens_seen": 102005765, + "step": 4727, + "time_per_iteration": 2.777651071548462 + }, + { + "auxiliary_loss_clip": 0.01157075, + "auxiliary_loss_mlp": 0.01136218, + "balance_loss_clip": 1.00191402, + "balance_loss_mlp": 1.00079656, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.37493722139243, + "language_loss": 0.66183817, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68477106, + "num_input_tokens_seen": 102022755, + "step": 4728, + "time_per_iteration": 2.5927624702453613 + }, + { + "auxiliary_loss_clip": 0.01123531, + "auxiliary_loss_mlp": 0.01136804, + "balance_loss_clip": 1.00171971, + "balance_loss_mlp": 1.00071478, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.6810562739788906, + "language_loss": 0.76363349, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78623688, + "num_input_tokens_seen": 102041850, + "step": 4729, + "time_per_iteration": 2.627307176589966 + }, + { + "auxiliary_loss_clip": 0.01139138, + "auxiliary_loss_mlp": 0.01137252, + "balance_loss_clip": 1.00161946, + "balance_loss_mlp": 1.0007813, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.6091775513020954, + "language_loss": 0.66557848, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68834239, + "num_input_tokens_seen": 102059500, + "step": 4730, + "time_per_iteration": 2.603987455368042 + }, + { + "auxiliary_loss_clip": 0.01139497, + "auxiliary_loss_mlp": 0.01137652, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00070477, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.6472220171737293, + "language_loss": 0.74980623, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.77257776, + "num_input_tokens_seen": 102080460, + "step": 4731, + "time_per_iteration": 2.6301116943359375 + }, + { + "auxiliary_loss_clip": 0.01123091, + "auxiliary_loss_mlp": 0.01136936, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00075102, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.4922336507219158, + "language_loss": 0.83717167, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85977191, + "num_input_tokens_seen": 102100950, + "step": 4732, + "time_per_iteration": 2.685349702835083 + }, + { + "auxiliary_loss_clip": 0.01155706, + "auxiliary_loss_mlp": 0.01136914, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00082493, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.7205561367546909, + "language_loss": 0.78448629, + "learning_rate": 3.357862435944109e-06, + "loss": 0.80741251, + "num_input_tokens_seen": 102119345, + "step": 4733, + "time_per_iteration": 2.5355892181396484 + }, + { + "auxiliary_loss_clip": 0.01172449, + "auxiliary_loss_mlp": 0.01136943, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00075793, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.175277333498536, + "language_loss": 0.70954192, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73263586, + "num_input_tokens_seen": 102139050, + "step": 4734, + "time_per_iteration": 2.541051149368286 + }, + { + "auxiliary_loss_clip": 0.01139752, + "auxiliary_loss_mlp": 0.01136498, + "balance_loss_clip": 1.00169849, + "balance_loss_mlp": 1.00050437, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.9784381478597255, + "language_loss": 0.73775309, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.76051557, + "num_input_tokens_seen": 102157935, + "step": 4735, + "time_per_iteration": 2.584413766860962 + }, + { + "auxiliary_loss_clip": 0.01138787, + "auxiliary_loss_mlp": 0.01137109, + "balance_loss_clip": 1.00176787, + "balance_loss_mlp": 1.00092387, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.7020337878143188, + "language_loss": 0.79509288, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81785178, + "num_input_tokens_seen": 102175325, + "step": 4736, + "time_per_iteration": 2.553067922592163 + }, + { + "auxiliary_loss_clip": 0.01172385, + "auxiliary_loss_mlp": 0.01136607, + "balance_loss_clip": 1.00197971, + "balance_loss_mlp": 1.00089872, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.4898490408224214, + "language_loss": 0.59743071, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.62052065, + "num_input_tokens_seen": 102196625, + "step": 4737, + "time_per_iteration": 3.960453510284424 + }, + { + "auxiliary_loss_clip": 0.01157106, + "auxiliary_loss_mlp": 0.01136241, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.00081968, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.9194305417232544, + "language_loss": 0.86115479, + "learning_rate": 3.356432075047052e-06, + "loss": 0.88408828, + "num_input_tokens_seen": 102214975, + "step": 4738, + "time_per_iteration": 2.5750250816345215 + }, + { + "auxiliary_loss_clip": 0.01124606, + "auxiliary_loss_mlp": 0.0113686, + "balance_loss_clip": 1.00169194, + "balance_loss_mlp": 1.00086606, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 1.9910710870624537, + "language_loss": 0.89748609, + "learning_rate": 3.356145848516118e-06, + "loss": 0.92010075, + "num_input_tokens_seen": 102231885, + "step": 4739, + "time_per_iteration": 2.6280977725982666 + }, + { + "auxiliary_loss_clip": 0.01155466, + "auxiliary_loss_mlp": 0.01135776, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00102186, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.3527345886062143, + "language_loss": 0.72182357, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74473596, + "num_input_tokens_seen": 102252725, + "step": 4740, + "time_per_iteration": 4.005504846572876 + }, + { + "auxiliary_loss_clip": 0.01140461, + "auxiliary_loss_mlp": 0.01136651, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00084829, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.4343892116472317, + "language_loss": 0.7772615, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80003262, + "num_input_tokens_seen": 102271730, + "step": 4741, + "time_per_iteration": 2.593909978866577 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01137497, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00073993, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.579212628305734, + "language_loss": 0.75854099, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78114641, + "num_input_tokens_seen": 102291325, + "step": 4742, + "time_per_iteration": 5.510449647903442 + }, + { + "auxiliary_loss_clip": 0.01172393, + "auxiliary_loss_mlp": 0.01137081, + "balance_loss_clip": 1.00186265, + "balance_loss_mlp": 1.00099146, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 2.148339749926195, + "language_loss": 0.57250887, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59560359, + "num_input_tokens_seen": 102309000, + "step": 4743, + "time_per_iteration": 2.496788740158081 + }, + { + "auxiliary_loss_clip": 0.01124895, + "auxiliary_loss_mlp": 0.01137128, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00094378, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 1.6814486149353463, + "language_loss": 0.74610168, + "learning_rate": 3.354713944700797e-06, + "loss": 0.76872188, + "num_input_tokens_seen": 102329240, + "step": 4744, + "time_per_iteration": 2.660712480545044 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01136723, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00092018, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.129174057926319, + "language_loss": 0.7762866, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79921007, + "num_input_tokens_seen": 102344440, + "step": 4745, + "time_per_iteration": 2.5306642055511475 + }, + { + "auxiliary_loss_clip": 0.01156073, + "auxiliary_loss_mlp": 0.01136157, + "balance_loss_clip": 1.00202799, + "balance_loss_mlp": 1.00083101, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.6931820598712457, + "language_loss": 0.82199764, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.84491998, + "num_input_tokens_seen": 102360985, + "step": 4746, + "time_per_iteration": 2.5126078128814697 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.01137001, + "balance_loss_clip": 1.00166535, + "balance_loss_mlp": 1.00072145, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.6268552162271668, + "language_loss": 0.79069102, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.813133, + "num_input_tokens_seen": 102380320, + "step": 4747, + "time_per_iteration": 2.670212745666504 + }, + { + "auxiliary_loss_clip": 0.01151938, + "auxiliary_loss_mlp": 0.01121232, + "balance_loss_clip": 1.00142574, + "balance_loss_mlp": 0.99992508, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7806280276203423, + "language_loss": 0.60500634, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62773812, + "num_input_tokens_seen": 102439140, + "step": 4748, + "time_per_iteration": 3.1216068267822266 + }, + { + "auxiliary_loss_clip": 0.01172256, + "auxiliary_loss_mlp": 0.01136757, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00076282, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.2391167075033294, + "language_loss": 0.79740363, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82049376, + "num_input_tokens_seen": 102450990, + "step": 4749, + "time_per_iteration": 2.4793903827667236 + }, + { + "auxiliary_loss_clip": 0.01156705, + "auxiliary_loss_mlp": 0.01136819, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00082517, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 2.155132044952185, + "language_loss": 0.70183575, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72477102, + "num_input_tokens_seen": 102471820, + "step": 4750, + "time_per_iteration": 2.6634116172790527 + }, + { + "auxiliary_loss_clip": 0.01155724, + "auxiliary_loss_mlp": 0.01135951, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00062478, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.5642337094456906, + "language_loss": 0.81784153, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.84075832, + "num_input_tokens_seen": 102492625, + "step": 4751, + "time_per_iteration": 2.666288375854492 + }, + { + "auxiliary_loss_clip": 0.01172255, + "auxiliary_loss_mlp": 0.01136149, + "balance_loss_clip": 1.00192535, + "balance_loss_mlp": 1.00082302, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 2.1623104461570266, + "language_loss": 0.80058807, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82367206, + "num_input_tokens_seen": 102514145, + "step": 4752, + "time_per_iteration": 2.688952684402466 + }, + { + "auxiliary_loss_clip": 0.01157077, + "auxiliary_loss_mlp": 0.01136559, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00075579, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.7833999891869785, + "language_loss": 0.78525794, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.80819434, + "num_input_tokens_seen": 102532365, + "step": 4753, + "time_per_iteration": 2.569817543029785 + }, + { + "auxiliary_loss_clip": 0.01172391, + "auxiliary_loss_mlp": 0.01136989, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00080431, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.2864201304841973, + "language_loss": 0.89557725, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91867107, + "num_input_tokens_seen": 102548425, + "step": 4754, + "time_per_iteration": 2.4868063926696777 + }, + { + "auxiliary_loss_clip": 0.01155428, + "auxiliary_loss_mlp": 0.011363, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00087845, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.8970550713649312, + "language_loss": 0.82266593, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84558332, + "num_input_tokens_seen": 102566370, + "step": 4755, + "time_per_iteration": 2.5289132595062256 + }, + { + "auxiliary_loss_clip": 0.01107229, + "auxiliary_loss_mlp": 0.01136155, + "balance_loss_clip": 1.00164986, + "balance_loss_mlp": 1.00092435, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.4453596592778108, + "language_loss": 0.83725733, + "learning_rate": 3.351272138300922e-06, + "loss": 0.85969114, + "num_input_tokens_seen": 102588715, + "step": 4756, + "time_per_iteration": 2.7244598865509033 + }, + { + "auxiliary_loss_clip": 0.01122725, + "auxiliary_loss_mlp": 0.01120609, + "balance_loss_clip": 1.00217438, + "balance_loss_mlp": 1.00006413, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8596954499257383, + "language_loss": 0.61021942, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63265276, + "num_input_tokens_seen": 102656715, + "step": 4757, + "time_per_iteration": 3.363975763320923 + }, + { + "auxiliary_loss_clip": 0.01172211, + "auxiliary_loss_mlp": 0.01136054, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00072718, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 1.9050373832304752, + "language_loss": 0.65860939, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.681692, + "num_input_tokens_seen": 102676545, + "step": 4758, + "time_per_iteration": 2.5374109745025635 + }, + { + "auxiliary_loss_clip": 0.01155653, + "auxiliary_loss_mlp": 0.01137417, + "balance_loss_clip": 1.00175869, + "balance_loss_mlp": 1.00085092, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.499627929708258, + "language_loss": 0.6305337, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65346438, + "num_input_tokens_seen": 102702875, + "step": 4759, + "time_per_iteration": 2.7023627758026123 + }, + { + "auxiliary_loss_clip": 0.01156907, + "auxiliary_loss_mlp": 0.00748169, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.00025058, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.706376412443021, + "language_loss": 0.74341029, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76246107, + "num_input_tokens_seen": 102723160, + "step": 4760, + "time_per_iteration": 2.587519884109497 + }, + { + "auxiliary_loss_clip": 0.01139291, + "auxiliary_loss_mlp": 0.01135585, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00083053, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.8385386540160369, + "language_loss": 0.72596872, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74871743, + "num_input_tokens_seen": 102743855, + "step": 4761, + "time_per_iteration": 2.6321072578430176 + }, + { + "auxiliary_loss_clip": 0.0107633, + "auxiliary_loss_mlp": 0.01136077, + "balance_loss_clip": 1.00155103, + "balance_loss_mlp": 1.00084651, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.02716711399834, + "language_loss": 0.74444163, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76656568, + "num_input_tokens_seen": 102761370, + "step": 4762, + "time_per_iteration": 2.780661106109619 + }, + { + "auxiliary_loss_clip": 0.0112303, + "auxiliary_loss_mlp": 0.01136146, + "balance_loss_clip": 1.00177312, + "balance_loss_mlp": 1.00091457, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.4750744117733756, + "language_loss": 0.76226914, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78486097, + "num_input_tokens_seen": 102780885, + "step": 4763, + "time_per_iteration": 2.6311206817626953 + }, + { + "auxiliary_loss_clip": 0.01125249, + "auxiliary_loss_mlp": 0.01136923, + "balance_loss_clip": 1.00177705, + "balance_loss_mlp": 1.00073838, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.887331775738905, + "language_loss": 0.76674324, + "learning_rate": 3.348973500311086e-06, + "loss": 0.78936493, + "num_input_tokens_seen": 102801000, + "step": 4764, + "time_per_iteration": 2.6577725410461426 + }, + { + "auxiliary_loss_clip": 0.01129787, + "auxiliary_loss_mlp": 0.01137108, + "balance_loss_clip": 1.00256014, + "balance_loss_mlp": 1.00101876, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 2.6766194975302904, + "language_loss": 0.71065444, + "learning_rate": 3.348685940258466e-06, + "loss": 0.7333234, + "num_input_tokens_seen": 102820230, + "step": 4765, + "time_per_iteration": 2.64414381980896 + }, + { + "auxiliary_loss_clip": 0.01155395, + "auxiliary_loss_mlp": 0.01135739, + "balance_loss_clip": 1.00175786, + "balance_loss_mlp": 1.00069904, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.6691378935190742, + "language_loss": 0.76470059, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.78761196, + "num_input_tokens_seen": 102842670, + "step": 4766, + "time_per_iteration": 2.6491737365722656 + }, + { + "auxiliary_loss_clip": 0.01155404, + "auxiliary_loss_mlp": 0.01135574, + "balance_loss_clip": 1.00181472, + "balance_loss_mlp": 1.00062883, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.5300059727440398, + "language_loss": 0.77409101, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79700077, + "num_input_tokens_seen": 102864480, + "step": 4767, + "time_per_iteration": 2.5997540950775146 + }, + { + "auxiliary_loss_clip": 0.01172181, + "auxiliary_loss_mlp": 0.01136701, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.00089812, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 1.9898356230746808, + "language_loss": 0.65121746, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67430627, + "num_input_tokens_seen": 102883740, + "step": 4768, + "time_per_iteration": 2.5141782760620117 + }, + { + "auxiliary_loss_clip": 0.0114662, + "auxiliary_loss_mlp": 0.01136878, + "balance_loss_clip": 1.00229335, + "balance_loss_mlp": 1.00088358, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.5979410184967506, + "language_loss": 0.70791388, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.73074889, + "num_input_tokens_seen": 102902945, + "step": 4769, + "time_per_iteration": 2.5735137462615967 + }, + { + "auxiliary_loss_clip": 0.01107921, + "auxiliary_loss_mlp": 0.01136654, + "balance_loss_clip": 1.00181317, + "balance_loss_mlp": 1.00085068, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.621436789695172, + "language_loss": 0.75026071, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77270639, + "num_input_tokens_seen": 102922405, + "step": 4770, + "time_per_iteration": 2.6944031715393066 + }, + { + "auxiliary_loss_clip": 0.01113032, + "auxiliary_loss_mlp": 0.01137088, + "balance_loss_clip": 1.00195682, + "balance_loss_mlp": 1.00090373, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 2.561000147638771, + "language_loss": 0.67161882, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69411993, + "num_input_tokens_seen": 102938980, + "step": 4771, + "time_per_iteration": 2.7213380336761475 + }, + { + "auxiliary_loss_clip": 0.01154105, + "auxiliary_loss_mlp": 0.01120581, + "balance_loss_clip": 1.00216413, + "balance_loss_mlp": 1.0000366, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7777917709538605, + "language_loss": 0.56891024, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.59165704, + "num_input_tokens_seen": 103000405, + "step": 4772, + "time_per_iteration": 3.0969595909118652 + }, + { + "auxiliary_loss_clip": 0.01091971, + "auxiliary_loss_mlp": 0.00748147, + "balance_loss_clip": 1.00178874, + "balance_loss_mlp": 1.00028133, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.5394115847072745, + "language_loss": 0.83272684, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85112798, + "num_input_tokens_seen": 103017970, + "step": 4773, + "time_per_iteration": 2.7513234615325928 + }, + { + "auxiliary_loss_clip": 0.01172265, + "auxiliary_loss_mlp": 0.01136302, + "balance_loss_clip": 1.00182021, + "balance_loss_mlp": 1.00078535, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.1655909641536057, + "language_loss": 0.77367312, + "learning_rate": 3.34609559969027e-06, + "loss": 0.79675877, + "num_input_tokens_seen": 103036385, + "step": 4774, + "time_per_iteration": 2.549586296081543 + }, + { + "auxiliary_loss_clip": 0.01144844, + "auxiliary_loss_mlp": 0.01136588, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.00087976, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 2.0824649473083636, + "language_loss": 0.73111445, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75392878, + "num_input_tokens_seen": 103052170, + "step": 4775, + "time_per_iteration": 3.9300332069396973 + }, + { + "auxiliary_loss_clip": 0.0115692, + "auxiliary_loss_mlp": 0.01136397, + "balance_loss_clip": 1.00183964, + "balance_loss_mlp": 1.00087976, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.5792680054199209, + "language_loss": 0.8791613, + "learning_rate": 3.34551940668778e-06, + "loss": 0.90209448, + "num_input_tokens_seen": 103070510, + "step": 4776, + "time_per_iteration": 2.563828945159912 + }, + { + "auxiliary_loss_clip": 0.01161345, + "auxiliary_loss_mlp": 0.01136186, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00104988, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 3.0568380317211346, + "language_loss": 0.74230504, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76528037, + "num_input_tokens_seen": 103089590, + "step": 4777, + "time_per_iteration": 2.532836675643921 + }, + { + "auxiliary_loss_clip": 0.01140751, + "auxiliary_loss_mlp": 0.01137343, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00115824, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 39.93592477417479, + "language_loss": 0.80224764, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82502854, + "num_input_tokens_seen": 103109080, + "step": 4778, + "time_per_iteration": 3.995548725128174 + }, + { + "auxiliary_loss_clip": 0.01139493, + "auxiliary_loss_mlp": 0.01136088, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00095284, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.565611904692416, + "language_loss": 0.74167156, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.7644273, + "num_input_tokens_seen": 103127755, + "step": 4779, + "time_per_iteration": 5.382736921310425 + }, + { + "auxiliary_loss_clip": 0.01146197, + "auxiliary_loss_mlp": 0.01137109, + "balance_loss_clip": 1.00201511, + "balance_loss_mlp": 1.00092411, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5323199736188504, + "language_loss": 0.76035416, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78318727, + "num_input_tokens_seen": 103147035, + "step": 4780, + "time_per_iteration": 2.5813167095184326 + }, + { + "auxiliary_loss_clip": 0.01124452, + "auxiliary_loss_mlp": 0.01136194, + "balance_loss_clip": 1.0017091, + "balance_loss_mlp": 1.00077271, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.6502608229916362, + "language_loss": 0.81061566, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83322209, + "num_input_tokens_seen": 103165410, + "step": 4781, + "time_per_iteration": 2.6111247539520264 + }, + { + "auxiliary_loss_clip": 0.01113555, + "auxiliary_loss_mlp": 0.01136783, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.0007894, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 2.9417416962091116, + "language_loss": 0.86360431, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88610768, + "num_input_tokens_seen": 103183710, + "step": 4782, + "time_per_iteration": 2.642238140106201 + }, + { + "auxiliary_loss_clip": 0.01122798, + "auxiliary_loss_mlp": 0.01137163, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00107419, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.4748612241073655, + "language_loss": 0.71368313, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73628271, + "num_input_tokens_seen": 103203790, + "step": 4783, + "time_per_iteration": 2.661774158477783 + }, + { + "auxiliary_loss_clip": 0.01139157, + "auxiliary_loss_mlp": 0.01136959, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00106001, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 1.5951374087262407, + "language_loss": 0.77110022, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79386139, + "num_input_tokens_seen": 103223925, + "step": 4784, + "time_per_iteration": 2.644925355911255 + }, + { + "auxiliary_loss_clip": 0.01123494, + "auxiliary_loss_mlp": 0.01136112, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00068998, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.3599667418715196, + "language_loss": 0.75914562, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.78174174, + "num_input_tokens_seen": 103244760, + "step": 4785, + "time_per_iteration": 2.674037456512451 + }, + { + "auxiliary_loss_clip": 0.01172182, + "auxiliary_loss_mlp": 0.01135857, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00081718, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.941100759940293, + "language_loss": 0.83336151, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85644192, + "num_input_tokens_seen": 103261995, + "step": 4786, + "time_per_iteration": 2.5843825340270996 + }, + { + "auxiliary_loss_clip": 0.01123821, + "auxiliary_loss_mlp": 0.00748067, + "balance_loss_clip": 1.00175869, + "balance_loss_mlp": 1.00030971, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.8346091905136728, + "language_loss": 0.79702908, + "learning_rate": 3.342346699429516e-06, + "loss": 0.81574792, + "num_input_tokens_seen": 103279780, + "step": 4787, + "time_per_iteration": 2.6322178840637207 + }, + { + "auxiliary_loss_clip": 0.01139042, + "auxiliary_loss_mlp": 0.01136476, + "balance_loss_clip": 1.00174105, + "balance_loss_mlp": 1.00086343, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 1.8688877590148665, + "language_loss": 0.83148813, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85424334, + "num_input_tokens_seen": 103300580, + "step": 4788, + "time_per_iteration": 2.636012554168701 + }, + { + "auxiliary_loss_clip": 0.01107593, + "auxiliary_loss_mlp": 0.01136896, + "balance_loss_clip": 1.00175941, + "balance_loss_mlp": 1.00080705, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.797661531745767, + "language_loss": 0.73573756, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75818253, + "num_input_tokens_seen": 103320430, + "step": 4789, + "time_per_iteration": 2.7138760089874268 + }, + { + "auxiliary_loss_clip": 0.01155417, + "auxiliary_loss_mlp": 0.01135923, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00078702, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.5976192955579973, + "language_loss": 0.83998907, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86290246, + "num_input_tokens_seen": 103337695, + "step": 4790, + "time_per_iteration": 2.5760269165039062 + }, + { + "auxiliary_loss_clip": 0.01157057, + "auxiliary_loss_mlp": 0.01136506, + "balance_loss_clip": 1.00190794, + "balance_loss_mlp": 1.00079823, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.6482962904351415, + "language_loss": 0.77977514, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80271077, + "num_input_tokens_seen": 103357010, + "step": 4791, + "time_per_iteration": 2.5550618171691895 + }, + { + "auxiliary_loss_clip": 0.0113898, + "auxiliary_loss_mlp": 0.01135821, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.00068581, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 2.1881853524525847, + "language_loss": 0.70233941, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.7250874, + "num_input_tokens_seen": 103375600, + "step": 4792, + "time_per_iteration": 2.618878126144409 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01136699, + "balance_loss_clip": 1.00167346, + "balance_loss_mlp": 1.0008955, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 1.871210874542379, + "language_loss": 0.79221368, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81465453, + "num_input_tokens_seen": 103395225, + "step": 4793, + "time_per_iteration": 2.710476875305176 + }, + { + "auxiliary_loss_clip": 0.01139828, + "auxiliary_loss_mlp": 0.01136754, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00095046, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.5732877942834809, + "language_loss": 0.78087139, + "learning_rate": 3.340324496161797e-06, + "loss": 0.80363721, + "num_input_tokens_seen": 103417245, + "step": 4794, + "time_per_iteration": 2.8031413555145264 + }, + { + "auxiliary_loss_clip": 0.01155684, + "auxiliary_loss_mlp": 0.01136618, + "balance_loss_clip": 1.00184917, + "balance_loss_mlp": 1.00119579, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.0760664768871413, + "language_loss": 0.83118463, + "learning_rate": 3.340035406592074e-06, + "loss": 0.85410762, + "num_input_tokens_seen": 103435500, + "step": 4795, + "time_per_iteration": 2.5601747035980225 + }, + { + "auxiliary_loss_clip": 0.01155331, + "auxiliary_loss_mlp": 0.01135651, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00089645, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 1.6499560514403941, + "language_loss": 0.74382758, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76673734, + "num_input_tokens_seen": 103451040, + "step": 4796, + "time_per_iteration": 2.564227819442749 + }, + { + "auxiliary_loss_clip": 0.01155638, + "auxiliary_loss_mlp": 0.0113691, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.00063014, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.9393522227805569, + "language_loss": 0.73038077, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.75330627, + "num_input_tokens_seen": 103471330, + "step": 4797, + "time_per_iteration": 2.5853829383850098 + }, + { + "auxiliary_loss_clip": 0.01124744, + "auxiliary_loss_mlp": 0.0074811, + "balance_loss_clip": 1.00176358, + "balance_loss_mlp": 1.00037014, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 1.9521184496265351, + "language_loss": 0.74661303, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76534152, + "num_input_tokens_seen": 103488060, + "step": 4798, + "time_per_iteration": 2.6663331985473633 + }, + { + "auxiliary_loss_clip": 0.01155719, + "auxiliary_loss_mlp": 0.01137064, + "balance_loss_clip": 1.0017823, + "balance_loss_mlp": 1.00097466, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 3.0465232475747137, + "language_loss": 0.64721233, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67014015, + "num_input_tokens_seen": 103503600, + "step": 4799, + "time_per_iteration": 2.5545973777770996 + }, + { + "auxiliary_loss_clip": 0.01172272, + "auxiliary_loss_mlp": 0.01136607, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00099468, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 1.6732201243526288, + "language_loss": 0.8235392, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84662795, + "num_input_tokens_seen": 103524195, + "step": 4800, + "time_per_iteration": 2.5392959117889404 + }, + { + "auxiliary_loss_clip": 0.01126385, + "auxiliary_loss_mlp": 0.01136344, + "balance_loss_clip": 1.00181448, + "balance_loss_mlp": 1.00092244, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 2.1889420324323434, + "language_loss": 0.9093622, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93198949, + "num_input_tokens_seen": 103545235, + "step": 4801, + "time_per_iteration": 2.7021186351776123 + }, + { + "auxiliary_loss_clip": 0.01145079, + "auxiliary_loss_mlp": 0.0074822, + "balance_loss_clip": 1.00207627, + "balance_loss_mlp": 1.00038505, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.081709889972092, + "language_loss": 0.73811674, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75704974, + "num_input_tokens_seen": 103563305, + "step": 4802, + "time_per_iteration": 2.6133952140808105 + }, + { + "auxiliary_loss_clip": 0.0112184, + "auxiliary_loss_mlp": 0.01121225, + "balance_loss_clip": 1.00198734, + "balance_loss_mlp": 0.99991816, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.780096190644507, + "language_loss": 0.63006568, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65249634, + "num_input_tokens_seen": 103625025, + "step": 4803, + "time_per_iteration": 3.1756224632263184 + }, + { + "auxiliary_loss_clip": 0.01109174, + "auxiliary_loss_mlp": 0.01136462, + "balance_loss_clip": 1.00170016, + "balance_loss_mlp": 1.00094509, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.7359193395843628, + "language_loss": 0.70455611, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72701252, + "num_input_tokens_seen": 103644235, + "step": 4804, + "time_per_iteration": 2.6546151638031006 + }, + { + "auxiliary_loss_clip": 0.01155492, + "auxiliary_loss_mlp": 0.01136455, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.00065136, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 2.5039921650304526, + "language_loss": 0.6798408, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70276028, + "num_input_tokens_seen": 103664700, + "step": 4805, + "time_per_iteration": 2.5697991847991943 + }, + { + "auxiliary_loss_clip": 0.01155526, + "auxiliary_loss_mlp": 0.01136421, + "balance_loss_clip": 1.00181043, + "balance_loss_mlp": 1.00090408, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.4335705577954514, + "language_loss": 0.69356638, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71648586, + "num_input_tokens_seen": 103686595, + "step": 4806, + "time_per_iteration": 2.6615800857543945 + }, + { + "auxiliary_loss_clip": 0.011409, + "auxiliary_loss_mlp": 0.01135993, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.000857, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.3456967122020582, + "language_loss": 0.71226585, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73503482, + "num_input_tokens_seen": 103707525, + "step": 4807, + "time_per_iteration": 2.6586718559265137 + }, + { + "auxiliary_loss_clip": 0.01122148, + "auxiliary_loss_mlp": 0.01136577, + "balance_loss_clip": 1.00159347, + "balance_loss_mlp": 1.00077379, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.689193648354279, + "language_loss": 0.81062555, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83321279, + "num_input_tokens_seen": 103727905, + "step": 4808, + "time_per_iteration": 2.629716157913208 + }, + { + "auxiliary_loss_clip": 0.01123346, + "auxiliary_loss_mlp": 0.01135956, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00091577, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.4818687109264315, + "language_loss": 0.77927965, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80187273, + "num_input_tokens_seen": 103748335, + "step": 4809, + "time_per_iteration": 2.6327409744262695 + }, + { + "auxiliary_loss_clip": 0.01108191, + "auxiliary_loss_mlp": 0.01136328, + "balance_loss_clip": 1.00159907, + "balance_loss_mlp": 1.00071573, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.662276555414668, + "language_loss": 0.78629446, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80873966, + "num_input_tokens_seen": 103767020, + "step": 4810, + "time_per_iteration": 2.6925482749938965 + }, + { + "auxiliary_loss_clip": 0.011065, + "auxiliary_loss_mlp": 0.01136315, + "balance_loss_clip": 1.00164652, + "balance_loss_mlp": 1.00089335, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 1.5835503112575975, + "language_loss": 0.76888955, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.7913177, + "num_input_tokens_seen": 103786355, + "step": 4811, + "time_per_iteration": 2.678738594055176 + }, + { + "auxiliary_loss_clip": 0.01155591, + "auxiliary_loss_mlp": 0.0113601, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00077879, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.5047681967697188, + "language_loss": 0.76935762, + "learning_rate": 3.335113118275117e-06, + "loss": 0.79227364, + "num_input_tokens_seen": 103809345, + "step": 4812, + "time_per_iteration": 2.6245226860046387 + }, + { + "auxiliary_loss_clip": 0.011219, + "auxiliary_loss_mlp": 0.01121369, + "balance_loss_clip": 1.00302172, + "balance_loss_mlp": 1.00006199, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8605585134078444, + "language_loss": 0.60331595, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62574863, + "num_input_tokens_seen": 103871180, + "step": 4813, + "time_per_iteration": 4.740080118179321 + }, + { + "auxiliary_loss_clip": 0.01125775, + "auxiliary_loss_mlp": 0.01136695, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00070071, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 2.199983248718009, + "language_loss": 0.82233381, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84495854, + "num_input_tokens_seen": 103889040, + "step": 4814, + "time_per_iteration": 2.6357486248016357 + }, + { + "auxiliary_loss_clip": 0.01105909, + "auxiliary_loss_mlp": 0.01136683, + "balance_loss_clip": 1.00155652, + "balance_loss_mlp": 1.00088024, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.6550929890312052, + "language_loss": 0.71986049, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74228644, + "num_input_tokens_seen": 103910380, + "step": 4815, + "time_per_iteration": 4.1396214962005615 + }, + { + "auxiliary_loss_clip": 0.01156826, + "auxiliary_loss_mlp": 0.01135699, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00075388, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.449185792208843, + "language_loss": 0.70028365, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.7232089, + "num_input_tokens_seen": 103929955, + "step": 4816, + "time_per_iteration": 4.061363935470581 + }, + { + "auxiliary_loss_clip": 0.01140966, + "auxiliary_loss_mlp": 0.01137257, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.00097656, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 2.205280528661648, + "language_loss": 0.74651474, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.769297, + "num_input_tokens_seen": 103948020, + "step": 4817, + "time_per_iteration": 4.028920888900757 + }, + { + "auxiliary_loss_clip": 0.01122244, + "auxiliary_loss_mlp": 0.0113685, + "balance_loss_clip": 1.00163662, + "balance_loss_mlp": 1.00095177, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.7712871968546764, + "language_loss": 0.76206362, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78465462, + "num_input_tokens_seen": 103968740, + "step": 4818, + "time_per_iteration": 2.6933059692382812 + }, + { + "auxiliary_loss_clip": 0.01096103, + "auxiliary_loss_mlp": 0.01136885, + "balance_loss_clip": 1.00205326, + "balance_loss_mlp": 1.00079584, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 2.1066811274658677, + "language_loss": 0.79424322, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.81657308, + "num_input_tokens_seen": 103986005, + "step": 4819, + "time_per_iteration": 2.7003157138824463 + }, + { + "auxiliary_loss_clip": 0.01124485, + "auxiliary_loss_mlp": 0.01137921, + "balance_loss_clip": 1.00188315, + "balance_loss_mlp": 1.00078321, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.8597791024417536, + "language_loss": 0.78973085, + "learning_rate": 3.332791681244776e-06, + "loss": 0.81235492, + "num_input_tokens_seen": 104005070, + "step": 4820, + "time_per_iteration": 2.6202518939971924 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01137081, + "balance_loss_clip": 1.00189734, + "balance_loss_mlp": 1.00070584, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.0230501643805994, + "language_loss": 0.72425961, + "learning_rate": 3.332501274072231e-06, + "loss": 0.74674428, + "num_input_tokens_seen": 104022945, + "step": 4821, + "time_per_iteration": 2.6352498531341553 + }, + { + "auxiliary_loss_clip": 0.0115578, + "auxiliary_loss_mlp": 0.01136565, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00085759, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.8130860949171215, + "language_loss": 0.71854663, + "learning_rate": 3.332210816371104e-06, + "loss": 0.7414701, + "num_input_tokens_seen": 104042080, + "step": 4822, + "time_per_iteration": 2.562732458114624 + }, + { + "auxiliary_loss_clip": 0.01155558, + "auxiliary_loss_mlp": 0.01136756, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.00095236, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.8752082783064712, + "language_loss": 0.66068405, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68360722, + "num_input_tokens_seen": 104060975, + "step": 4823, + "time_per_iteration": 2.530271530151367 + }, + { + "auxiliary_loss_clip": 0.01140461, + "auxiliary_loss_mlp": 0.01136023, + "balance_loss_clip": 1.00174415, + "balance_loss_mlp": 1.00079226, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 1.6854106119249401, + "language_loss": 0.80945468, + "learning_rate": 3.331629749427164e-06, + "loss": 0.83221948, + "num_input_tokens_seen": 104081395, + "step": 4824, + "time_per_iteration": 2.6027023792266846 + }, + { + "auxiliary_loss_clip": 0.01172085, + "auxiliary_loss_mlp": 0.0113667, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.00077128, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 2.211494109376846, + "language_loss": 0.72341359, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74650115, + "num_input_tokens_seen": 104099995, + "step": 4825, + "time_per_iteration": 2.5187582969665527 + }, + { + "auxiliary_loss_clip": 0.01172358, + "auxiliary_loss_mlp": 0.0113653, + "balance_loss_clip": 1.002038, + "balance_loss_mlp": 1.00072694, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 14.598462910598984, + "language_loss": 0.73419476, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75728369, + "num_input_tokens_seen": 104118930, + "step": 4826, + "time_per_iteration": 2.4796154499053955 + }, + { + "auxiliary_loss_clip": 0.01155664, + "auxiliary_loss_mlp": 0.0113654, + "balance_loss_clip": 1.00181115, + "balance_loss_mlp": 1.00083196, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 2.270831902436639, + "language_loss": 0.68501502, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70793706, + "num_input_tokens_seen": 104136940, + "step": 4827, + "time_per_iteration": 2.558641195297241 + }, + { + "auxiliary_loss_clip": 0.0115716, + "auxiliary_loss_mlp": 0.01136267, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00084531, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 3.2981335694815677, + "language_loss": 0.79853731, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82147163, + "num_input_tokens_seen": 104154280, + "step": 4828, + "time_per_iteration": 2.5198938846588135 + }, + { + "auxiliary_loss_clip": 0.01172334, + "auxiliary_loss_mlp": 0.01136463, + "balance_loss_clip": 1.00207329, + "balance_loss_mlp": 1.00085068, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 1.8805023030884143, + "language_loss": 0.80514038, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82822829, + "num_input_tokens_seen": 104172605, + "step": 4829, + "time_per_iteration": 2.492274045944214 + }, + { + "auxiliary_loss_clip": 0.01141374, + "auxiliary_loss_mlp": 0.01135578, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.00063324, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.7993176273013483, + "language_loss": 0.82342654, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84619606, + "num_input_tokens_seen": 104194120, + "step": 4830, + "time_per_iteration": 2.6083180904388428 + }, + { + "auxiliary_loss_clip": 0.01155583, + "auxiliary_loss_mlp": 0.01136756, + "balance_loss_clip": 1.00189507, + "balance_loss_mlp": 1.00104821, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.7970079176389993, + "language_loss": 0.79683733, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81976068, + "num_input_tokens_seen": 104210875, + "step": 4831, + "time_per_iteration": 2.531663656234741 + }, + { + "auxiliary_loss_clip": 0.01172064, + "auxiliary_loss_mlp": 0.01136299, + "balance_loss_clip": 1.00196302, + "balance_loss_mlp": 1.00078189, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.961944732685704, + "language_loss": 0.74314511, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.7662288, + "num_input_tokens_seen": 104229875, + "step": 4832, + "time_per_iteration": 2.5222387313842773 + }, + { + "auxiliary_loss_clip": 0.01139992, + "auxiliary_loss_mlp": 0.01135206, + "balance_loss_clip": 1.00171876, + "balance_loss_mlp": 1.00073814, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6249206304767685, + "language_loss": 0.76315689, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78590888, + "num_input_tokens_seen": 104250405, + "step": 4833, + "time_per_iteration": 2.588409185409546 + }, + { + "auxiliary_loss_clip": 0.01140204, + "auxiliary_loss_mlp": 0.01135335, + "balance_loss_clip": 1.00174999, + "balance_loss_mlp": 1.00067663, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 2.2499324482953704, + "language_loss": 0.64328241, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.66603786, + "num_input_tokens_seen": 104269185, + "step": 4834, + "time_per_iteration": 2.5622150897979736 + }, + { + "auxiliary_loss_clip": 0.01144641, + "auxiliary_loss_mlp": 0.01134694, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00079846, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.4576573292284238, + "language_loss": 0.71588296, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73867631, + "num_input_tokens_seen": 104289400, + "step": 4835, + "time_per_iteration": 2.619579553604126 + }, + { + "auxiliary_loss_clip": 0.01141026, + "auxiliary_loss_mlp": 0.01134766, + "balance_loss_clip": 1.00173366, + "balance_loss_mlp": 1.00077462, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 1.7015895285667209, + "language_loss": 0.79690444, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81966233, + "num_input_tokens_seen": 104310485, + "step": 4836, + "time_per_iteration": 2.6263368129730225 + }, + { + "auxiliary_loss_clip": 0.01124649, + "auxiliary_loss_mlp": 0.01135614, + "balance_loss_clip": 1.00181592, + "balance_loss_mlp": 1.00095487, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.6883906588127877, + "language_loss": 0.81279421, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.83539689, + "num_input_tokens_seen": 104327330, + "step": 4837, + "time_per_iteration": 2.622164487838745 + }, + { + "auxiliary_loss_clip": 0.01139116, + "auxiliary_loss_mlp": 0.01135065, + "balance_loss_clip": 1.00157189, + "balance_loss_mlp": 1.00059736, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 2.145721573967771, + "language_loss": 0.67812014, + "learning_rate": 3.327556630259381e-06, + "loss": 0.70086193, + "num_input_tokens_seen": 104350350, + "step": 4838, + "time_per_iteration": 2.7116525173187256 + }, + { + "auxiliary_loss_clip": 0.01172146, + "auxiliary_loss_mlp": 0.00748159, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00044823, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.7021605152067054, + "language_loss": 0.71294981, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73215288, + "num_input_tokens_seen": 104369995, + "step": 4839, + "time_per_iteration": 2.5688962936401367 + }, + { + "auxiliary_loss_clip": 0.0117197, + "auxiliary_loss_mlp": 0.01135546, + "balance_loss_clip": 1.00180376, + "balance_loss_mlp": 1.0006963, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.9049248238392864, + "language_loss": 0.75963336, + "learning_rate": 3.326973949928776e-06, + "loss": 0.78270853, + "num_input_tokens_seen": 104392285, + "step": 4840, + "time_per_iteration": 2.6306910514831543 + }, + { + "auxiliary_loss_clip": 0.01123758, + "auxiliary_loss_mlp": 0.01134974, + "balance_loss_clip": 1.001737, + "balance_loss_mlp": 1.00069666, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.8700042540371216, + "language_loss": 0.59893441, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62152171, + "num_input_tokens_seen": 104412640, + "step": 4841, + "time_per_iteration": 2.733769416809082 + }, + { + "auxiliary_loss_clip": 0.01138795, + "auxiliary_loss_mlp": 0.01135238, + "balance_loss_clip": 1.00169015, + "balance_loss_mlp": 1.00077021, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.3932801001437658, + "language_loss": 0.71401978, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73676014, + "num_input_tokens_seen": 104435245, + "step": 4842, + "time_per_iteration": 2.68208384513855 + }, + { + "auxiliary_loss_clip": 0.01156789, + "auxiliary_loss_mlp": 0.01135188, + "balance_loss_clip": 1.00190842, + "balance_loss_mlp": 1.00071955, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.5500590612550023, + "language_loss": 0.72779071, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.75071043, + "num_input_tokens_seen": 104455395, + "step": 4843, + "time_per_iteration": 2.5576834678649902 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01135236, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00076771, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.0755363575584584, + "language_loss": 0.58044791, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60292274, + "num_input_tokens_seen": 104473350, + "step": 4844, + "time_per_iteration": 2.6715798377990723 + }, + { + "auxiliary_loss_clip": 0.01155713, + "auxiliary_loss_mlp": 0.01135218, + "balance_loss_clip": 1.00200438, + "balance_loss_mlp": 1.00065446, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.5017876592148833, + "language_loss": 0.86484194, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88775128, + "num_input_tokens_seen": 104492265, + "step": 4845, + "time_per_iteration": 2.6053924560546875 + }, + { + "auxiliary_loss_clip": 0.01138833, + "auxiliary_loss_mlp": 0.01136094, + "balance_loss_clip": 1.00163257, + "balance_loss_mlp": 1.00076771, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.654693802806543, + "language_loss": 0.66897422, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.69172347, + "num_input_tokens_seen": 104510755, + "step": 4846, + "time_per_iteration": 2.584339141845703 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.01135335, + "balance_loss_clip": 1.00197887, + "balance_loss_mlp": 1.00067616, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.7740355083661772, + "language_loss": 0.70277399, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72557473, + "num_input_tokens_seen": 104530830, + "step": 4847, + "time_per_iteration": 2.6035892963409424 + }, + { + "auxiliary_loss_clip": 0.011556, + "auxiliary_loss_mlp": 0.0113526, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00069642, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 1.4713103993968295, + "language_loss": 0.74033678, + "learning_rate": 3.324641216731237e-06, + "loss": 0.7632454, + "num_input_tokens_seen": 104550115, + "step": 4848, + "time_per_iteration": 2.5777134895324707 + }, + { + "auxiliary_loss_clip": 0.01156606, + "auxiliary_loss_mlp": 0.0113536, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00070071, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.460872634555824, + "language_loss": 0.76791263, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.79083234, + "num_input_tokens_seen": 104566255, + "step": 4849, + "time_per_iteration": 2.5279791355133057 + }, + { + "auxiliary_loss_clip": 0.01157011, + "auxiliary_loss_mlp": 0.01135918, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00097334, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 3.361984808121805, + "language_loss": 0.78435999, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80728924, + "num_input_tokens_seen": 104585235, + "step": 4850, + "time_per_iteration": 3.9000039100646973 + }, + { + "auxiliary_loss_clip": 0.01138333, + "auxiliary_loss_mlp": 0.01135471, + "balance_loss_clip": 1.00176549, + "balance_loss_mlp": 1.00081182, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.9508847110645458, + "language_loss": 0.75545257, + "learning_rate": 3.323765612674296e-06, + "loss": 0.77819061, + "num_input_tokens_seen": 104605315, + "step": 4851, + "time_per_iteration": 2.620931625366211 + }, + { + "auxiliary_loss_clip": 0.01156347, + "auxiliary_loss_mlp": 0.01134987, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.0008055, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.3748098037209606, + "language_loss": 0.77128911, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79420245, + "num_input_tokens_seen": 104626055, + "step": 4852, + "time_per_iteration": 2.6903293132781982 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.01135469, + "balance_loss_clip": 1.00170135, + "balance_loss_mlp": 1.00081015, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.6112164584493955, + "language_loss": 0.78115749, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80389619, + "num_input_tokens_seen": 104646005, + "step": 4853, + "time_per_iteration": 2.6986236572265625 + }, + { + "auxiliary_loss_clip": 0.01122098, + "auxiliary_loss_mlp": 0.01135537, + "balance_loss_clip": 1.00170493, + "balance_loss_mlp": 1.00087798, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.3131343653673544, + "language_loss": 0.88307464, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90565097, + "num_input_tokens_seen": 104661620, + "step": 4854, + "time_per_iteration": 6.833186388015747 + }, + { + "auxiliary_loss_clip": 0.01155397, + "auxiliary_loss_mlp": 0.01134975, + "balance_loss_clip": 1.00171542, + "balance_loss_mlp": 1.00098395, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.7088636940669444, + "language_loss": 0.86241549, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88531923, + "num_input_tokens_seen": 104681445, + "step": 4855, + "time_per_iteration": 2.5924150943756104 + }, + { + "auxiliary_loss_clip": 0.01158326, + "auxiliary_loss_mlp": 0.01120829, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.00028467, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.7973529350183479, + "language_loss": 0.60218906, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62498063, + "num_input_tokens_seen": 104747945, + "step": 4856, + "time_per_iteration": 3.229647636413574 + }, + { + "auxiliary_loss_clip": 0.01140091, + "auxiliary_loss_mlp": 0.00748243, + "balance_loss_clip": 1.00178075, + "balance_loss_mlp": 1.00056052, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 1.935968362922129, + "language_loss": 0.67851049, + "learning_rate": 3.322013049531664e-06, + "loss": 0.69739383, + "num_input_tokens_seen": 104766225, + "step": 4857, + "time_per_iteration": 2.5937280654907227 + }, + { + "auxiliary_loss_clip": 0.01156734, + "auxiliary_loss_mlp": 0.00748053, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00045645, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 1.8182351016072953, + "language_loss": 0.83911729, + "learning_rate": 3.321720780151895e-06, + "loss": 0.85816514, + "num_input_tokens_seen": 104785345, + "step": 4858, + "time_per_iteration": 2.6228199005126953 + }, + { + "auxiliary_loss_clip": 0.01171998, + "auxiliary_loss_mlp": 0.0113556, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00090098, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 1.8616929287829398, + "language_loss": 0.77612698, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79920256, + "num_input_tokens_seen": 104804560, + "step": 4859, + "time_per_iteration": 2.5371487140655518 + }, + { + "auxiliary_loss_clip": 0.01112126, + "auxiliary_loss_mlp": 0.0113583, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00069404, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.206194759041001, + "language_loss": 0.68782967, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.71030927, + "num_input_tokens_seen": 104821105, + "step": 4860, + "time_per_iteration": 2.7315244674682617 + }, + { + "auxiliary_loss_clip": 0.01138621, + "auxiliary_loss_mlp": 0.01135426, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00067186, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.0758153930113927, + "language_loss": 0.75784171, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78058207, + "num_input_tokens_seen": 104841440, + "step": 4861, + "time_per_iteration": 2.712857961654663 + }, + { + "auxiliary_loss_clip": 0.01156642, + "auxiliary_loss_mlp": 0.01135308, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00084031, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.6849098017510997, + "language_loss": 0.91156912, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93448859, + "num_input_tokens_seen": 104858210, + "step": 4862, + "time_per_iteration": 2.534480571746826 + }, + { + "auxiliary_loss_clip": 0.01156806, + "auxiliary_loss_mlp": 0.01134682, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.0006907, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.078794868991894, + "language_loss": 0.73337686, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75629175, + "num_input_tokens_seen": 104875620, + "step": 4863, + "time_per_iteration": 2.641770601272583 + }, + { + "auxiliary_loss_clip": 0.01079981, + "auxiliary_loss_mlp": 0.01134722, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00092161, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.8910137345869513, + "language_loss": 0.77808881, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80023581, + "num_input_tokens_seen": 104894600, + "step": 4864, + "time_per_iteration": 2.800466537475586 + }, + { + "auxiliary_loss_clip": 0.01126013, + "auxiliary_loss_mlp": 0.01136172, + "balance_loss_clip": 1.00181925, + "balance_loss_mlp": 1.00103688, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 1.500885263951918, + "language_loss": 0.81618196, + "learning_rate": 3.319673491760429e-06, + "loss": 0.83880377, + "num_input_tokens_seen": 104914530, + "step": 4865, + "time_per_iteration": 2.680366277694702 + }, + { + "auxiliary_loss_clip": 0.01093898, + "auxiliary_loss_mlp": 0.01135316, + "balance_loss_clip": 1.00170159, + "balance_loss_mlp": 1.00075221, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.921444542028595, + "language_loss": 0.8510257, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87331778, + "num_input_tokens_seen": 104933460, + "step": 4866, + "time_per_iteration": 2.728256940841675 + }, + { + "auxiliary_loss_clip": 0.01140094, + "auxiliary_loss_mlp": 0.01135105, + "balance_loss_clip": 1.00195372, + "balance_loss_mlp": 1.00063705, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.6757699839659828, + "language_loss": 0.75580454, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77855647, + "num_input_tokens_seen": 104954495, + "step": 4867, + "time_per_iteration": 2.717449188232422 + }, + { + "auxiliary_loss_clip": 0.01089276, + "auxiliary_loss_mlp": 0.01135352, + "balance_loss_clip": 1.00147474, + "balance_loss_mlp": 1.00097883, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 1.842235245033242, + "language_loss": 0.73425555, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75650179, + "num_input_tokens_seen": 104971915, + "step": 4868, + "time_per_iteration": 2.711564779281616 + }, + { + "auxiliary_loss_clip": 0.01106479, + "auxiliary_loss_mlp": 0.01134951, + "balance_loss_clip": 1.00176704, + "balance_loss_mlp": 1.00057864, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.3557284781230654, + "language_loss": 0.74396515, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76637936, + "num_input_tokens_seen": 104991335, + "step": 4869, + "time_per_iteration": 2.7374587059020996 + }, + { + "auxiliary_loss_clip": 0.01140248, + "auxiliary_loss_mlp": 0.01134495, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00069463, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.490707142437028, + "language_loss": 0.76546907, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78821653, + "num_input_tokens_seen": 105012015, + "step": 4870, + "time_per_iteration": 2.6501612663269043 + }, + { + "auxiliary_loss_clip": 0.01156856, + "auxiliary_loss_mlp": 0.01136281, + "balance_loss_clip": 1.00197446, + "balance_loss_mlp": 1.00104976, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 1.9282382587256357, + "language_loss": 0.67943794, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.70236933, + "num_input_tokens_seen": 105031460, + "step": 4871, + "time_per_iteration": 2.5723509788513184 + }, + { + "auxiliary_loss_clip": 0.01141315, + "auxiliary_loss_mlp": 0.01135468, + "balance_loss_clip": 1.00177205, + "balance_loss_mlp": 1.00071383, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 2.068588678449902, + "language_loss": 0.7731384, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79590631, + "num_input_tokens_seen": 105052965, + "step": 4872, + "time_per_iteration": 2.6776018142700195 + }, + { + "auxiliary_loss_clip": 0.0109009, + "auxiliary_loss_mlp": 0.01136251, + "balance_loss_clip": 1.00164282, + "balance_loss_mlp": 1.00073409, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 2.0344985458692246, + "language_loss": 0.73132086, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75358421, + "num_input_tokens_seen": 105071840, + "step": 4873, + "time_per_iteration": 2.703610420227051 + }, + { + "auxiliary_loss_clip": 0.01155331, + "auxiliary_loss_mlp": 0.01135945, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00080919, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.9248206980277638, + "language_loss": 0.77513951, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.79805225, + "num_input_tokens_seen": 105089445, + "step": 4874, + "time_per_iteration": 2.5888991355895996 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01136467, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.0009501, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.7202669557323502, + "language_loss": 0.76919997, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79164088, + "num_input_tokens_seen": 105106210, + "step": 4875, + "time_per_iteration": 2.6509361267089844 + }, + { + "auxiliary_loss_clip": 0.01156875, + "auxiliary_loss_mlp": 0.01135581, + "balance_loss_clip": 1.00199628, + "balance_loss_mlp": 1.00073123, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.518738046570063, + "language_loss": 0.68752384, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71044838, + "num_input_tokens_seen": 105124200, + "step": 4876, + "time_per_iteration": 2.5591256618499756 + }, + { + "auxiliary_loss_clip": 0.01156784, + "auxiliary_loss_mlp": 0.01135126, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00094438, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 1.890727737771053, + "language_loss": 0.81856138, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84148043, + "num_input_tokens_seen": 105140400, + "step": 4877, + "time_per_iteration": 2.5323703289031982 + }, + { + "auxiliary_loss_clip": 0.01156492, + "auxiliary_loss_mlp": 0.01136208, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00069118, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 2.050697472320738, + "language_loss": 0.67455423, + "learning_rate": 3.315864882155911e-06, + "loss": 0.69748127, + "num_input_tokens_seen": 105157535, + "step": 4878, + "time_per_iteration": 2.553011894226074 + }, + { + "auxiliary_loss_clip": 0.01129622, + "auxiliary_loss_mlp": 0.0113627, + "balance_loss_clip": 1.00213671, + "balance_loss_mlp": 1.00084817, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.7576978532742187, + "language_loss": 0.73517144, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.75783038, + "num_input_tokens_seen": 105175185, + "step": 4879, + "time_per_iteration": 2.6641905307769775 + }, + { + "auxiliary_loss_clip": 0.01125273, + "auxiliary_loss_mlp": 0.0074821, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.00055337, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 1.9132405960400647, + "language_loss": 0.66040754, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.67914242, + "num_input_tokens_seen": 105194540, + "step": 4880, + "time_per_iteration": 2.7440223693847656 + }, + { + "auxiliary_loss_clip": 0.01156923, + "auxiliary_loss_mlp": 0.01135861, + "balance_loss_clip": 1.00196087, + "balance_loss_mlp": 1.000916, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.3939532777062005, + "language_loss": 0.70236945, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72529727, + "num_input_tokens_seen": 105213215, + "step": 4881, + "time_per_iteration": 2.6202192306518555 + }, + { + "auxiliary_loss_clip": 0.01140222, + "auxiliary_loss_mlp": 0.00748172, + "balance_loss_clip": 1.00180435, + "balance_loss_mlp": 1.0004952, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 1.5155503233936176, + "language_loss": 0.83650339, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85538733, + "num_input_tokens_seen": 105231585, + "step": 4882, + "time_per_iteration": 2.6345133781433105 + }, + { + "auxiliary_loss_clip": 0.01172202, + "auxiliary_loss_mlp": 0.01136382, + "balance_loss_clip": 1.00204468, + "balance_loss_mlp": 1.00076938, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.1623449518945397, + "language_loss": 0.71085262, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73393846, + "num_input_tokens_seen": 105250120, + "step": 4883, + "time_per_iteration": 2.5312139987945557 + }, + { + "auxiliary_loss_clip": 0.01138681, + "auxiliary_loss_mlp": 0.01135833, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.00069726, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 2.931197262652371, + "language_loss": 0.92349076, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.9462359, + "num_input_tokens_seen": 105266065, + "step": 4884, + "time_per_iteration": 2.623988151550293 + }, + { + "auxiliary_loss_clip": 0.01156826, + "auxiliary_loss_mlp": 0.01136051, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00072455, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.018894076988367, + "language_loss": 0.73141938, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75434816, + "num_input_tokens_seen": 105282155, + "step": 4885, + "time_per_iteration": 2.5526418685913086 + }, + { + "auxiliary_loss_clip": 0.01157044, + "auxiliary_loss_mlp": 0.01136137, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00090623, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 1.7880580262707435, + "language_loss": 0.84686112, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.86979294, + "num_input_tokens_seen": 105299225, + "step": 4886, + "time_per_iteration": 2.552210569381714 + }, + { + "auxiliary_loss_clip": 0.01123627, + "auxiliary_loss_mlp": 0.01135652, + "balance_loss_clip": 1.00168014, + "balance_loss_mlp": 1.00080252, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.192499622561973, + "language_loss": 0.77115613, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79374892, + "num_input_tokens_seen": 105315710, + "step": 4887, + "time_per_iteration": 3.9556567668914795 + }, + { + "auxiliary_loss_clip": 0.01138927, + "auxiliary_loss_mlp": 0.01135531, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00096703, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.1236830669817683, + "language_loss": 0.79263806, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.81538266, + "num_input_tokens_seen": 105333505, + "step": 4888, + "time_per_iteration": 2.5613222122192383 + }, + { + "auxiliary_loss_clip": 0.01139369, + "auxiliary_loss_mlp": 0.01135643, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00069845, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.3598637987680666, + "language_loss": 0.55243886, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57518899, + "num_input_tokens_seen": 105355605, + "step": 4889, + "time_per_iteration": 2.7604918479919434 + }, + { + "auxiliary_loss_clip": 0.0115569, + "auxiliary_loss_mlp": 0.01136078, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.0007515, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.708103192279056, + "language_loss": 0.84690619, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86982387, + "num_input_tokens_seen": 105374225, + "step": 4890, + "time_per_iteration": 2.5553951263427734 + }, + { + "auxiliary_loss_clip": 0.01155551, + "auxiliary_loss_mlp": 0.01135949, + "balance_loss_clip": 1.00192249, + "balance_loss_mlp": 1.00090873, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.70227530858564, + "language_loss": 0.72806191, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.75097692, + "num_input_tokens_seen": 105391565, + "step": 4891, + "time_per_iteration": 3.910762071609497 + }, + { + "auxiliary_loss_clip": 0.01172115, + "auxiliary_loss_mlp": 0.01135914, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00096893, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.8716507038451398, + "language_loss": 0.77098548, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79406577, + "num_input_tokens_seen": 105409840, + "step": 4892, + "time_per_iteration": 5.363738298416138 + }, + { + "auxiliary_loss_clip": 0.01172071, + "auxiliary_loss_mlp": 0.0113524, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00067687, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 2.4154536258871504, + "language_loss": 0.78269321, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80576628, + "num_input_tokens_seen": 105428645, + "step": 4893, + "time_per_iteration": 2.5577123165130615 + }, + { + "auxiliary_loss_clip": 0.01122132, + "auxiliary_loss_mlp": 0.01135778, + "balance_loss_clip": 1.00175369, + "balance_loss_mlp": 1.00083327, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.816120632728856, + "language_loss": 0.84734637, + "learning_rate": 3.311165788957864e-06, + "loss": 0.86992544, + "num_input_tokens_seen": 105447480, + "step": 4894, + "time_per_iteration": 2.7181267738342285 + }, + { + "auxiliary_loss_clip": 0.01155315, + "auxiliary_loss_mlp": 0.01135901, + "balance_loss_clip": 1.00186145, + "balance_loss_mlp": 1.00086093, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.6161972707282524, + "language_loss": 0.90192747, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92483968, + "num_input_tokens_seen": 105464600, + "step": 4895, + "time_per_iteration": 2.5725300312042236 + }, + { + "auxiliary_loss_clip": 0.01155574, + "auxiliary_loss_mlp": 0.01135882, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00074661, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 1.7564944221800607, + "language_loss": 0.86315322, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88606787, + "num_input_tokens_seen": 105481510, + "step": 4896, + "time_per_iteration": 2.5461153984069824 + }, + { + "auxiliary_loss_clip": 0.01156372, + "auxiliary_loss_mlp": 0.01136342, + "balance_loss_clip": 1.00191724, + "balance_loss_mlp": 1.00101602, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 1.836619888528329, + "language_loss": 0.7329827, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.7559098, + "num_input_tokens_seen": 105501390, + "step": 4897, + "time_per_iteration": 2.568344831466675 + }, + { + "auxiliary_loss_clip": 0.0115681, + "auxiliary_loss_mlp": 0.0113639, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00087333, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.8945539874327328, + "language_loss": 0.7386781, + "learning_rate": 3.309989025093813e-06, + "loss": 0.76161015, + "num_input_tokens_seen": 105519600, + "step": 4898, + "time_per_iteration": 2.5248634815216064 + }, + { + "auxiliary_loss_clip": 0.01155913, + "auxiliary_loss_mlp": 0.01136959, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00096512, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.7140358885210287, + "language_loss": 0.70188355, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72481227, + "num_input_tokens_seen": 105535970, + "step": 4899, + "time_per_iteration": 2.5333797931671143 + }, + { + "auxiliary_loss_clip": 0.0113978, + "auxiliary_loss_mlp": 0.00748335, + "balance_loss_clip": 1.0018295, + "balance_loss_mlp": 1.00070536, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 3.268153342413128, + "language_loss": 0.79162526, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.81050646, + "num_input_tokens_seen": 105556735, + "step": 4900, + "time_per_iteration": 2.6235625743865967 + }, + { + "auxiliary_loss_clip": 0.01141545, + "auxiliary_loss_mlp": 0.01135358, + "balance_loss_clip": 1.0018779, + "balance_loss_mlp": 1.00079441, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.713013052233413, + "language_loss": 0.80370069, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.82646972, + "num_input_tokens_seen": 105574875, + "step": 4901, + "time_per_iteration": 2.6455512046813965 + }, + { + "auxiliary_loss_clip": 0.01138388, + "auxiliary_loss_mlp": 0.01134775, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00068808, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 2.0772288994463466, + "language_loss": 0.57590133, + "learning_rate": 3.308811466431157e-06, + "loss": 0.59863299, + "num_input_tokens_seen": 105594225, + "step": 4902, + "time_per_iteration": 2.6377410888671875 + }, + { + "auxiliary_loss_clip": 0.011387, + "auxiliary_loss_mlp": 0.0113532, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.00085175, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.5978765017385708, + "language_loss": 0.7573992, + "learning_rate": 3.308516952661925e-06, + "loss": 0.78013945, + "num_input_tokens_seen": 105614000, + "step": 4903, + "time_per_iteration": 2.5951812267303467 + }, + { + "auxiliary_loss_clip": 0.01139959, + "auxiliary_loss_mlp": 0.01136263, + "balance_loss_clip": 1.00181413, + "balance_loss_mlp": 1.00074542, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 4.581671441699315, + "language_loss": 0.624735, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64749718, + "num_input_tokens_seen": 105634575, + "step": 4904, + "time_per_iteration": 2.6376774311065674 + }, + { + "auxiliary_loss_clip": 0.01155249, + "auxiliary_loss_mlp": 0.01135702, + "balance_loss_clip": 1.00185359, + "balance_loss_mlp": 1.00085282, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.6477433580869878, + "language_loss": 0.73439932, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75730884, + "num_input_tokens_seen": 105654385, + "step": 4905, + "time_per_iteration": 2.5805394649505615 + }, + { + "auxiliary_loss_clip": 0.01125146, + "auxiliary_loss_mlp": 0.01135773, + "balance_loss_clip": 1.0017947, + "balance_loss_mlp": 1.00063729, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 3.028151351641667, + "language_loss": 0.81560701, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.83821619, + "num_input_tokens_seen": 105673570, + "step": 4906, + "time_per_iteration": 2.6713197231292725 + }, + { + "auxiliary_loss_clip": 0.011145, + "auxiliary_loss_mlp": 0.01135908, + "balance_loss_clip": 1.00232661, + "balance_loss_mlp": 1.00086784, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 2.3544995539000673, + "language_loss": 0.87347537, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.8959794, + "num_input_tokens_seen": 105691940, + "step": 4907, + "time_per_iteration": 2.680039882659912 + }, + { + "auxiliary_loss_clip": 0.01172149, + "auxiliary_loss_mlp": 0.01136372, + "balance_loss_clip": 1.00197518, + "balance_loss_mlp": 1.00085485, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 1.8824519375527182, + "language_loss": 0.81877917, + "learning_rate": 3.307043639752782e-06, + "loss": 0.84186435, + "num_input_tokens_seen": 105709825, + "step": 4908, + "time_per_iteration": 2.504615306854248 + }, + { + "auxiliary_loss_clip": 0.01168344, + "auxiliary_loss_mlp": 0.01119934, + "balance_loss_clip": 1.00168085, + "balance_loss_mlp": 1.00015223, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7756379175796425, + "language_loss": 0.57289743, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59578019, + "num_input_tokens_seen": 105766880, + "step": 4909, + "time_per_iteration": 2.979006767272949 + }, + { + "auxiliary_loss_clip": 0.01155399, + "auxiliary_loss_mlp": 0.00748296, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00067866, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.5384178823261252, + "language_loss": 0.86780465, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88684154, + "num_input_tokens_seen": 105786875, + "step": 4910, + "time_per_iteration": 2.582227945327759 + }, + { + "auxiliary_loss_clip": 0.01155174, + "auxiliary_loss_mlp": 0.01135064, + "balance_loss_clip": 1.00189519, + "balance_loss_mlp": 1.00078714, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.665390584323697, + "language_loss": 0.72954249, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75244486, + "num_input_tokens_seen": 105805315, + "step": 4911, + "time_per_iteration": 2.6067512035369873 + }, + { + "auxiliary_loss_clip": 0.01155539, + "auxiliary_loss_mlp": 0.01135849, + "balance_loss_clip": 1.00197875, + "balance_loss_mlp": 1.00080919, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 2.1413986704424373, + "language_loss": 0.90172613, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92464006, + "num_input_tokens_seen": 105825125, + "step": 4912, + "time_per_iteration": 2.554272413253784 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01135654, + "balance_loss_clip": 1.00183845, + "balance_loss_mlp": 1.00109053, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.39882855097555, + "language_loss": 0.83331692, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85608453, + "num_input_tokens_seen": 105846085, + "step": 4913, + "time_per_iteration": 2.647143840789795 + }, + { + "auxiliary_loss_clip": 0.01171891, + "auxiliary_loss_mlp": 0.011355, + "balance_loss_clip": 1.00192177, + "balance_loss_mlp": 1.00074577, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 2.0728278996533627, + "language_loss": 0.76897216, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79204607, + "num_input_tokens_seen": 105865400, + "step": 4914, + "time_per_iteration": 2.5423357486724854 + }, + { + "auxiliary_loss_clip": 0.01139465, + "auxiliary_loss_mlp": 0.01135116, + "balance_loss_clip": 1.0017463, + "balance_loss_mlp": 1.00064826, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.6899430983339954, + "language_loss": 0.81566703, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83841288, + "num_input_tokens_seen": 105887920, + "step": 4915, + "time_per_iteration": 2.767091989517212 + }, + { + "auxiliary_loss_clip": 0.0107512, + "auxiliary_loss_mlp": 0.01136401, + "balance_loss_clip": 1.00167572, + "balance_loss_mlp": 1.00069284, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.8016088170813287, + "language_loss": 0.84559119, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.8677063, + "num_input_tokens_seen": 105904035, + "step": 4916, + "time_per_iteration": 2.7582144737243652 + }, + { + "auxiliary_loss_clip": 0.01156746, + "auxiliary_loss_mlp": 0.01134743, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00065613, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 4.490325098826126, + "language_loss": 0.69693172, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71984667, + "num_input_tokens_seen": 105922685, + "step": 4917, + "time_per_iteration": 2.6037323474884033 + }, + { + "auxiliary_loss_clip": 0.01140159, + "auxiliary_loss_mlp": 0.011357, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00065947, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 1.9202591971917855, + "language_loss": 0.91286314, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93562168, + "num_input_tokens_seen": 105940425, + "step": 4918, + "time_per_iteration": 2.596339702606201 + }, + { + "auxiliary_loss_clip": 0.01171957, + "auxiliary_loss_mlp": 0.01135236, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00076771, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.9305404681196545, + "language_loss": 0.72210455, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74517643, + "num_input_tokens_seen": 105960550, + "step": 4919, + "time_per_iteration": 2.5549967288970947 + }, + { + "auxiliary_loss_clip": 0.01139881, + "auxiliary_loss_mlp": 0.01134926, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00074399, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.7600095369031972, + "language_loss": 0.75901169, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.78175974, + "num_input_tokens_seen": 105978820, + "step": 4920, + "time_per_iteration": 2.5861988067626953 + }, + { + "auxiliary_loss_clip": 0.01139468, + "auxiliary_loss_mlp": 0.01136191, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00086427, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.2767932132587916, + "language_loss": 0.68419635, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.70695293, + "num_input_tokens_seen": 105997545, + "step": 4921, + "time_per_iteration": 2.627505302429199 + }, + { + "auxiliary_loss_clip": 0.0113875, + "auxiliary_loss_mlp": 0.01135478, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.00081897, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8004178032843876, + "language_loss": 0.7426241, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76536638, + "num_input_tokens_seen": 106015320, + "step": 4922, + "time_per_iteration": 2.5772531032562256 + }, + { + "auxiliary_loss_clip": 0.01172113, + "auxiliary_loss_mlp": 0.00748256, + "balance_loss_clip": 1.00199926, + "balance_loss_mlp": 1.00062537, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 1.8903582399377796, + "language_loss": 0.76520538, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78440905, + "num_input_tokens_seen": 106034555, + "step": 4923, + "time_per_iteration": 2.5587964057922363 + }, + { + "auxiliary_loss_clip": 0.01139181, + "auxiliary_loss_mlp": 0.01135386, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00072682, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.8847704800085936, + "language_loss": 0.86587346, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88861907, + "num_input_tokens_seen": 106054200, + "step": 4924, + "time_per_iteration": 2.6381521224975586 + }, + { + "auxiliary_loss_clip": 0.01156142, + "auxiliary_loss_mlp": 0.01134595, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00069892, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 2.1448402312161114, + "language_loss": 0.8198849, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84279227, + "num_input_tokens_seen": 106074700, + "step": 4925, + "time_per_iteration": 3.8979127407073975 + }, + { + "auxiliary_loss_clip": 0.01109543, + "auxiliary_loss_mlp": 0.01134787, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00070024, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 3.4260267584474153, + "language_loss": 0.86608189, + "learning_rate": 3.301729463727452e-06, + "loss": 0.88852513, + "num_input_tokens_seen": 106091415, + "step": 4926, + "time_per_iteration": 2.6274490356445312 + }, + { + "auxiliary_loss_clip": 0.01123411, + "auxiliary_loss_mlp": 0.01134665, + "balance_loss_clip": 1.00170445, + "balance_loss_mlp": 1.00076938, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 1.8793018263477586, + "language_loss": 0.86071807, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88329881, + "num_input_tokens_seen": 106109135, + "step": 4927, + "time_per_iteration": 2.6089065074920654 + }, + { + "auxiliary_loss_clip": 0.0115605, + "auxiliary_loss_mlp": 0.01134483, + "balance_loss_clip": 1.00187683, + "balance_loss_mlp": 1.00087333, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.6598105334657345, + "language_loss": 0.80689448, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.82979977, + "num_input_tokens_seen": 106125750, + "step": 4928, + "time_per_iteration": 2.551400899887085 + }, + { + "auxiliary_loss_clip": 0.01139696, + "auxiliary_loss_mlp": 0.01135566, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00071681, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 3.3699101274168988, + "language_loss": 0.72430217, + "learning_rate": 3.300842211064773e-06, + "loss": 0.74705482, + "num_input_tokens_seen": 106142835, + "step": 4929, + "time_per_iteration": 5.424392223358154 + }, + { + "auxiliary_loss_clip": 0.01141345, + "auxiliary_loss_mlp": 0.01135666, + "balance_loss_clip": 1.00181842, + "balance_loss_mlp": 1.00091195, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.0746118959172417, + "language_loss": 0.72025692, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74302709, + "num_input_tokens_seen": 106160680, + "step": 4930, + "time_per_iteration": 3.9664244651794434 + }, + { + "auxiliary_loss_clip": 0.01123152, + "auxiliary_loss_mlp": 0.01120985, + "balance_loss_clip": 1.00292718, + "balance_loss_mlp": 1.00044048, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8278853453970841, + "language_loss": 0.60731637, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62975776, + "num_input_tokens_seen": 106224415, + "step": 4931, + "time_per_iteration": 3.1623072624206543 + }, + { + "auxiliary_loss_clip": 0.01089298, + "auxiliary_loss_mlp": 0.01119813, + "balance_loss_clip": 1.00147426, + "balance_loss_mlp": 1.00003171, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.737157981319484, + "language_loss": 0.52350283, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54559398, + "num_input_tokens_seen": 106279140, + "step": 4932, + "time_per_iteration": 3.4329991340637207 + }, + { + "auxiliary_loss_clip": 0.0115661, + "auxiliary_loss_mlp": 0.01135403, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00064826, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 1.7789801602063706, + "language_loss": 0.813586, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83650619, + "num_input_tokens_seen": 106298190, + "step": 4933, + "time_per_iteration": 2.884345769882202 + }, + { + "auxiliary_loss_clip": 0.01124945, + "auxiliary_loss_mlp": 0.01134197, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00068283, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.6381309406742304, + "language_loss": 0.75404978, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77664119, + "num_input_tokens_seen": 106319065, + "step": 4934, + "time_per_iteration": 2.702188491821289 + }, + { + "auxiliary_loss_clip": 0.01140593, + "auxiliary_loss_mlp": 0.01135734, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00107551, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.747393491536317, + "language_loss": 0.62210596, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64486921, + "num_input_tokens_seen": 106338040, + "step": 4935, + "time_per_iteration": 2.580047607421875 + }, + { + "auxiliary_loss_clip": 0.01156721, + "auxiliary_loss_mlp": 0.01134946, + "balance_loss_clip": 1.00196922, + "balance_loss_mlp": 1.00066876, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.6064767295595301, + "language_loss": 0.79764819, + "learning_rate": 3.2987702288932e-06, + "loss": 0.82056493, + "num_input_tokens_seen": 106358900, + "step": 4936, + "time_per_iteration": 2.63297438621521 + }, + { + "auxiliary_loss_clip": 0.0110574, + "auxiliary_loss_mlp": 0.01136052, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00091684, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.5582266200009098, + "language_loss": 0.73801196, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76042986, + "num_input_tokens_seen": 106381805, + "step": 4937, + "time_per_iteration": 2.8037641048431396 + }, + { + "auxiliary_loss_clip": 0.01105795, + "auxiliary_loss_mlp": 0.01135219, + "balance_loss_clip": 1.00163233, + "balance_loss_mlp": 1.00075054, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.859005033816613, + "language_loss": 0.78055918, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80296928, + "num_input_tokens_seen": 106402365, + "step": 4938, + "time_per_iteration": 2.7485203742980957 + }, + { + "auxiliary_loss_clip": 0.01139725, + "auxiliary_loss_mlp": 0.01135581, + "balance_loss_clip": 1.00187004, + "balance_loss_mlp": 1.00073147, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.628509512176665, + "language_loss": 0.77149606, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79424912, + "num_input_tokens_seen": 106419800, + "step": 4939, + "time_per_iteration": 2.5763649940490723 + }, + { + "auxiliary_loss_clip": 0.01126391, + "auxiliary_loss_mlp": 0.01135137, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.00066924, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.529140582999319, + "language_loss": 0.78126431, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80387956, + "num_input_tokens_seen": 106440300, + "step": 4940, + "time_per_iteration": 2.6798322200775146 + }, + { + "auxiliary_loss_clip": 0.01140377, + "auxiliary_loss_mlp": 0.01135715, + "balance_loss_clip": 1.00201094, + "balance_loss_mlp": 1.00067425, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 1.6269986253242308, + "language_loss": 0.75434923, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77711016, + "num_input_tokens_seen": 106460035, + "step": 4941, + "time_per_iteration": 2.620731830596924 + }, + { + "auxiliary_loss_clip": 0.01155498, + "auxiliary_loss_mlp": 0.01135943, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00080752, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.2755205634941613, + "language_loss": 0.73824763, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.7611621, + "num_input_tokens_seen": 106481095, + "step": 4942, + "time_per_iteration": 2.742408037185669 + }, + { + "auxiliary_loss_clip": 0.01123343, + "auxiliary_loss_mlp": 0.01136033, + "balance_loss_clip": 1.0016396, + "balance_loss_mlp": 1.00108778, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.8682057029616224, + "language_loss": 0.70095551, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72354925, + "num_input_tokens_seen": 106501590, + "step": 4943, + "time_per_iteration": 2.6700916290283203 + }, + { + "auxiliary_loss_clip": 0.01139644, + "auxiliary_loss_mlp": 0.0113543, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00067616, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 2.0206132609435787, + "language_loss": 0.79467106, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.81742179, + "num_input_tokens_seen": 106519430, + "step": 4944, + "time_per_iteration": 2.5694632530212402 + }, + { + "auxiliary_loss_clip": 0.0113986, + "auxiliary_loss_mlp": 0.01134503, + "balance_loss_clip": 1.00183964, + "balance_loss_mlp": 1.00079799, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 1.966230820562287, + "language_loss": 0.83372188, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85646552, + "num_input_tokens_seen": 106535870, + "step": 4945, + "time_per_iteration": 2.585768222808838 + }, + { + "auxiliary_loss_clip": 0.01106602, + "auxiliary_loss_mlp": 0.01134677, + "balance_loss_clip": 1.00170934, + "balance_loss_mlp": 1.00068545, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 1.7932444978290338, + "language_loss": 0.6712209, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.69363368, + "num_input_tokens_seen": 106553560, + "step": 4946, + "time_per_iteration": 2.65975022315979 + }, + { + "auxiliary_loss_clip": 0.01156753, + "auxiliary_loss_mlp": 0.00748323, + "balance_loss_clip": 1.00210786, + "balance_loss_mlp": 1.00056016, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.902319465806219, + "language_loss": 0.73926169, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75831246, + "num_input_tokens_seen": 106574115, + "step": 4947, + "time_per_iteration": 2.610941171646118 + }, + { + "auxiliary_loss_clip": 0.01121728, + "auxiliary_loss_mlp": 0.01135375, + "balance_loss_clip": 1.00174594, + "balance_loss_mlp": 1.00090671, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.9125264194820706, + "language_loss": 0.73022813, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75279915, + "num_input_tokens_seen": 106593070, + "step": 4948, + "time_per_iteration": 2.65250825881958 + }, + { + "auxiliary_loss_clip": 0.01171887, + "auxiliary_loss_mlp": 0.0113436, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00065494, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 1.8682931648858934, + "language_loss": 0.84254026, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86560273, + "num_input_tokens_seen": 106610695, + "step": 4949, + "time_per_iteration": 2.5184922218322754 + }, + { + "auxiliary_loss_clip": 0.0115526, + "auxiliary_loss_mlp": 0.01134668, + "balance_loss_clip": 1.0017947, + "balance_loss_mlp": 1.00077271, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 3.7032694615748243, + "language_loss": 0.71468341, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73758268, + "num_input_tokens_seen": 106631300, + "step": 4950, + "time_per_iteration": 2.5895168781280518 + }, + { + "auxiliary_loss_clip": 0.01106207, + "auxiliary_loss_mlp": 0.01133623, + "balance_loss_clip": 1.00159347, + "balance_loss_mlp": 1.00077629, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 3.280753867640262, + "language_loss": 0.82450294, + "learning_rate": 3.294322145875789e-06, + "loss": 0.8469013, + "num_input_tokens_seen": 106650065, + "step": 4951, + "time_per_iteration": 2.6717870235443115 + }, + { + "auxiliary_loss_clip": 0.01139632, + "auxiliary_loss_mlp": 0.01134349, + "balance_loss_clip": 1.0017159, + "balance_loss_mlp": 1.00064445, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.2530899100481285, + "language_loss": 0.73355663, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.75629646, + "num_input_tokens_seen": 106668230, + "step": 4952, + "time_per_iteration": 2.624558448791504 + }, + { + "auxiliary_loss_clip": 0.01075898, + "auxiliary_loss_mlp": 0.01134516, + "balance_loss_clip": 1.00156689, + "balance_loss_mlp": 1.00081134, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.8151340712124882, + "language_loss": 0.84241205, + "learning_rate": 3.293728232937228e-06, + "loss": 0.8645162, + "num_input_tokens_seen": 106687785, + "step": 4953, + "time_per_iteration": 2.751319646835327 + }, + { + "auxiliary_loss_clip": 0.01141416, + "auxiliary_loss_mlp": 0.01135092, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00071943, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 1.8831078639184087, + "language_loss": 0.73875594, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.76152098, + "num_input_tokens_seen": 106706875, + "step": 4954, + "time_per_iteration": 2.587479591369629 + }, + { + "auxiliary_loss_clip": 0.01171772, + "auxiliary_loss_mlp": 0.01134445, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00064421, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.871322552036669, + "language_loss": 0.75490606, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77796823, + "num_input_tokens_seen": 106725105, + "step": 4955, + "time_per_iteration": 2.5269417762756348 + }, + { + "auxiliary_loss_clip": 0.01106965, + "auxiliary_loss_mlp": 0.01135227, + "balance_loss_clip": 1.00169611, + "balance_loss_mlp": 1.00066316, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 1.9577977659567927, + "language_loss": 0.72497612, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74739802, + "num_input_tokens_seen": 106744780, + "step": 4956, + "time_per_iteration": 2.672292709350586 + }, + { + "auxiliary_loss_clip": 0.01155276, + "auxiliary_loss_mlp": 0.01136153, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00073087, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 2.1030003858615087, + "language_loss": 0.79155195, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81446624, + "num_input_tokens_seen": 106764670, + "step": 4957, + "time_per_iteration": 2.5796284675598145 + }, + { + "auxiliary_loss_clip": 0.0115671, + "auxiliary_loss_mlp": 0.01134716, + "balance_loss_clip": 1.00189531, + "balance_loss_mlp": 1.00072479, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.4268426260158296, + "language_loss": 0.69896996, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.72188413, + "num_input_tokens_seen": 106783695, + "step": 4958, + "time_per_iteration": 2.5720739364624023 + }, + { + "auxiliary_loss_clip": 0.01123163, + "auxiliary_loss_mlp": 0.01135021, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.00083923, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.593051075936656, + "language_loss": 0.78377187, + "learning_rate": 3.291945317082743e-06, + "loss": 0.80635369, + "num_input_tokens_seen": 106803150, + "step": 4959, + "time_per_iteration": 2.7079551219940186 + }, + { + "auxiliary_loss_clip": 0.01156468, + "auxiliary_loss_mlp": 0.01135449, + "balance_loss_clip": 1.00185502, + "balance_loss_mlp": 1.00079, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.6705282664351444, + "language_loss": 0.79578406, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81870323, + "num_input_tokens_seen": 106820705, + "step": 4960, + "time_per_iteration": 2.562166452407837 + }, + { + "auxiliary_loss_clip": 0.01127633, + "auxiliary_loss_mlp": 0.01135327, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00076354, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.2131681087433352, + "language_loss": 0.7393108, + "learning_rate": 3.291350619752129e-06, + "loss": 0.76194036, + "num_input_tokens_seen": 106837335, + "step": 4961, + "time_per_iteration": 2.6335225105285645 + }, + { + "auxiliary_loss_clip": 0.01156667, + "auxiliary_loss_mlp": 0.01135181, + "balance_loss_clip": 1.00191462, + "balance_loss_mlp": 1.00071239, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.7609320234069863, + "language_loss": 0.6147325, + "learning_rate": 3.291053197628967e-06, + "loss": 0.63765097, + "num_input_tokens_seen": 106856250, + "step": 4962, + "time_per_iteration": 2.560237407684326 + }, + { + "auxiliary_loss_clip": 0.01155262, + "auxiliary_loss_mlp": 0.0113543, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.00096154, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.670183376769717, + "language_loss": 0.83039844, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85330534, + "num_input_tokens_seen": 106873370, + "step": 4963, + "time_per_iteration": 4.276941537857056 + }, + { + "auxiliary_loss_clip": 0.01122512, + "auxiliary_loss_mlp": 0.01134981, + "balance_loss_clip": 1.0017941, + "balance_loss_mlp": 1.00070381, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.015860807595351, + "language_loss": 0.66182858, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68440348, + "num_input_tokens_seen": 106890330, + "step": 4964, + "time_per_iteration": 2.6582205295562744 + }, + { + "auxiliary_loss_clip": 0.01155263, + "auxiliary_loss_mlp": 0.01133902, + "balance_loss_clip": 1.00184262, + "balance_loss_mlp": 1.0005784, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.919480710960893, + "language_loss": 0.7135148, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.73640645, + "num_input_tokens_seen": 106909190, + "step": 4965, + "time_per_iteration": 2.5583012104034424 + }, + { + "auxiliary_loss_clip": 0.01172017, + "auxiliary_loss_mlp": 0.01135439, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00097108, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 1.9224550515461714, + "language_loss": 0.66478086, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68785548, + "num_input_tokens_seen": 106927825, + "step": 4966, + "time_per_iteration": 3.8891773223876953 + }, + { + "auxiliary_loss_clip": 0.0117194, + "auxiliary_loss_mlp": 0.01135058, + "balance_loss_clip": 1.00204289, + "balance_loss_mlp": 1.00078022, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 4.608879218599512, + "language_loss": 0.74003226, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76310223, + "num_input_tokens_seen": 106943155, + "step": 4967, + "time_per_iteration": 3.891979694366455 + }, + { + "auxiliary_loss_clip": 0.01141291, + "auxiliary_loss_mlp": 0.01134973, + "balance_loss_clip": 1.00183821, + "balance_loss_mlp": 1.00060046, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 2.066205663348724, + "language_loss": 0.71096611, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73372871, + "num_input_tokens_seen": 106960295, + "step": 4968, + "time_per_iteration": 2.564767599105835 + }, + { + "auxiliary_loss_clip": 0.0115676, + "auxiliary_loss_mlp": 0.01134912, + "balance_loss_clip": 1.00185788, + "balance_loss_mlp": 1.00053966, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 2.9662533143456287, + "language_loss": 0.76663065, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.78954732, + "num_input_tokens_seen": 106982870, + "step": 4969, + "time_per_iteration": 2.627812385559082 + }, + { + "auxiliary_loss_clip": 0.01171841, + "auxiliary_loss_mlp": 0.01133802, + "balance_loss_clip": 1.00197244, + "balance_loss_mlp": 1.00076437, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.786824207398042, + "language_loss": 0.70084023, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72389668, + "num_input_tokens_seen": 107002405, + "step": 4970, + "time_per_iteration": 2.5067079067230225 + }, + { + "auxiliary_loss_clip": 0.01155524, + "auxiliary_loss_mlp": 0.01135696, + "balance_loss_clip": 1.00191498, + "balance_loss_mlp": 1.00065601, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.148542506218196, + "language_loss": 0.84786856, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87078077, + "num_input_tokens_seen": 107017310, + "step": 4971, + "time_per_iteration": 2.532839059829712 + }, + { + "auxiliary_loss_clip": 0.01139609, + "auxiliary_loss_mlp": 0.01134495, + "balance_loss_clip": 1.00180137, + "balance_loss_mlp": 1.00088525, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.7671722139643204, + "language_loss": 0.79599154, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81873262, + "num_input_tokens_seen": 107034645, + "step": 4972, + "time_per_iteration": 2.5776922702789307 + }, + { + "auxiliary_loss_clip": 0.01171711, + "auxiliary_loss_mlp": 0.0113433, + "balance_loss_clip": 1.00200737, + "balance_loss_mlp": 1.00100672, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.813081542425027, + "language_loss": 0.85334671, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87640709, + "num_input_tokens_seen": 107051125, + "step": 4973, + "time_per_iteration": 2.480255126953125 + }, + { + "auxiliary_loss_clip": 0.01139731, + "auxiliary_loss_mlp": 0.01134151, + "balance_loss_clip": 1.00187838, + "balance_loss_mlp": 1.00063694, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.708751463115721, + "language_loss": 0.77274364, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79548246, + "num_input_tokens_seen": 107068815, + "step": 4974, + "time_per_iteration": 2.572138547897339 + }, + { + "auxiliary_loss_clip": 0.01139818, + "auxiliary_loss_mlp": 0.00748043, + "balance_loss_clip": 1.00187433, + "balance_loss_mlp": 1.00044358, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 2.1429864695799554, + "language_loss": 0.72166032, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74053895, + "num_input_tokens_seen": 107090420, + "step": 4975, + "time_per_iteration": 2.6720879077911377 + }, + { + "auxiliary_loss_clip": 0.01156134, + "auxiliary_loss_mlp": 0.01133666, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00081944, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 2.438759907118582, + "language_loss": 0.76134706, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78424507, + "num_input_tokens_seen": 107107255, + "step": 4976, + "time_per_iteration": 2.5312957763671875 + }, + { + "auxiliary_loss_clip": 0.01155213, + "auxiliary_loss_mlp": 0.01134256, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00083661, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.3047328540754073, + "language_loss": 0.86218047, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88507521, + "num_input_tokens_seen": 107123840, + "step": 4977, + "time_per_iteration": 2.560718297958374 + }, + { + "auxiliary_loss_clip": 0.01138183, + "auxiliary_loss_mlp": 0.01134199, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00087488, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.6442178498160163, + "language_loss": 0.68616068, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.70888448, + "num_input_tokens_seen": 107143475, + "step": 4978, + "time_per_iteration": 2.614088535308838 + }, + { + "auxiliary_loss_clip": 0.01140185, + "auxiliary_loss_mlp": 0.01134998, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.0006249, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 1.7859896345820865, + "language_loss": 0.75712293, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.7798748, + "num_input_tokens_seen": 107161725, + "step": 4979, + "time_per_iteration": 2.5970470905303955 + }, + { + "auxiliary_loss_clip": 0.01112693, + "auxiliary_loss_mlp": 0.01134969, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00069141, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.8724063961494328, + "language_loss": 0.68597114, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70844781, + "num_input_tokens_seen": 107183935, + "step": 4980, + "time_per_iteration": 2.7507877349853516 + }, + { + "auxiliary_loss_clip": 0.01155052, + "auxiliary_loss_mlp": 0.00748068, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00038171, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 2.0193606772753347, + "language_loss": 0.73558885, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75462008, + "num_input_tokens_seen": 107204285, + "step": 4981, + "time_per_iteration": 2.6209750175476074 + }, + { + "auxiliary_loss_clip": 0.01155164, + "auxiliary_loss_mlp": 0.01135101, + "balance_loss_clip": 1.0018183, + "balance_loss_mlp": 1.00082314, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.500810884056911, + "language_loss": 0.86382478, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.88672745, + "num_input_tokens_seen": 107225265, + "step": 4982, + "time_per_iteration": 2.572840929031372 + }, + { + "auxiliary_loss_clip": 0.01144651, + "auxiliary_loss_mlp": 0.01135097, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00062919, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.156398118296353, + "language_loss": 0.86238146, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88517892, + "num_input_tokens_seen": 107241335, + "step": 4983, + "time_per_iteration": 2.580474615097046 + }, + { + "auxiliary_loss_clip": 0.01155774, + "auxiliary_loss_mlp": 0.01134112, + "balance_loss_clip": 1.00204587, + "balance_loss_mlp": 1.00078881, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 1.8695319881565882, + "language_loss": 0.78437936, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80727816, + "num_input_tokens_seen": 107259375, + "step": 4984, + "time_per_iteration": 2.593153476715088 + }, + { + "auxiliary_loss_clip": 0.01140259, + "auxiliary_loss_mlp": 0.01134578, + "balance_loss_clip": 1.00201356, + "balance_loss_mlp": 1.00068223, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.5502032162295762, + "language_loss": 0.7838676, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80661595, + "num_input_tokens_seen": 107279890, + "step": 4985, + "time_per_iteration": 2.6240010261535645 + }, + { + "auxiliary_loss_clip": 0.01107886, + "auxiliary_loss_mlp": 0.01135197, + "balance_loss_clip": 1.00171018, + "balance_loss_mlp": 1.00063312, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 1.8845566767368707, + "language_loss": 0.71355963, + "learning_rate": 3.283900405580837e-06, + "loss": 0.73599041, + "num_input_tokens_seen": 107303430, + "step": 4986, + "time_per_iteration": 2.9396846294403076 + }, + { + "auxiliary_loss_clip": 0.0113839, + "auxiliary_loss_mlp": 0.01135131, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00085342, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.6939480645332323, + "language_loss": 0.73013663, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75287187, + "num_input_tokens_seen": 107323700, + "step": 4987, + "time_per_iteration": 2.5899128913879395 + }, + { + "auxiliary_loss_clip": 0.01140612, + "auxiliary_loss_mlp": 0.01134084, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00066471, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 3.3984732257566312, + "language_loss": 0.80176461, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82451153, + "num_input_tokens_seen": 107341965, + "step": 4988, + "time_per_iteration": 2.580296754837036 + }, + { + "auxiliary_loss_clip": 0.01145647, + "auxiliary_loss_mlp": 0.00748103, + "balance_loss_clip": 1.00198817, + "balance_loss_mlp": 1.00034451, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 2.1216183518115725, + "language_loss": 0.70719159, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72612911, + "num_input_tokens_seen": 107362615, + "step": 4989, + "time_per_iteration": 2.613300085067749 + }, + { + "auxiliary_loss_clip": 0.01123864, + "auxiliary_loss_mlp": 0.01134643, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00074744, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 1.8692717318503589, + "language_loss": 0.85049403, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87307912, + "num_input_tokens_seen": 107378980, + "step": 4990, + "time_per_iteration": 2.663691997528076 + }, + { + "auxiliary_loss_clip": 0.01154867, + "auxiliary_loss_mlp": 0.01134377, + "balance_loss_clip": 1.00176585, + "balance_loss_mlp": 1.00067198, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.638625756109297, + "language_loss": 0.66655493, + "learning_rate": 3.28240670566841e-06, + "loss": 0.6894474, + "num_input_tokens_seen": 107397640, + "step": 4991, + "time_per_iteration": 2.5921318531036377 + }, + { + "auxiliary_loss_clip": 0.0114027, + "auxiliary_loss_mlp": 0.01135376, + "balance_loss_clip": 1.00178015, + "balance_loss_mlp": 1.00062132, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.8691446532386415, + "language_loss": 0.78802633, + "learning_rate": 3.28210781975363e-06, + "loss": 0.81078273, + "num_input_tokens_seen": 107416020, + "step": 4992, + "time_per_iteration": 2.5991785526275635 + }, + { + "auxiliary_loss_clip": 0.01171928, + "auxiliary_loss_mlp": 0.01134336, + "balance_loss_clip": 1.0020473, + "balance_loss_mlp": 1.00072575, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 2.23511940716091, + "language_loss": 0.82370985, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84677249, + "num_input_tokens_seen": 107436340, + "step": 4993, + "time_per_iteration": 2.5365777015686035 + }, + { + "auxiliary_loss_clip": 0.01106848, + "auxiliary_loss_mlp": 0.01135596, + "balance_loss_clip": 1.00166225, + "balance_loss_mlp": 1.00093758, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.290432764755534, + "language_loss": 0.86009336, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88251781, + "num_input_tokens_seen": 107454585, + "step": 4994, + "time_per_iteration": 2.67293381690979 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01134541, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00064516, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.4241308813227007, + "language_loss": 0.81007761, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83271813, + "num_input_tokens_seen": 107477180, + "step": 4995, + "time_per_iteration": 2.7004499435424805 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.01134043, + "balance_loss_clip": 1.00202596, + "balance_loss_mlp": 1.00052905, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.837128770834425, + "language_loss": 0.67400086, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69678783, + "num_input_tokens_seen": 107500250, + "step": 4996, + "time_per_iteration": 2.7686917781829834 + }, + { + "auxiliary_loss_clip": 0.01138569, + "auxiliary_loss_mlp": 0.0113438, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00067508, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.88778980297705, + "language_loss": 0.7571044, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77983391, + "num_input_tokens_seen": 107520070, + "step": 4997, + "time_per_iteration": 2.6020941734313965 + }, + { + "auxiliary_loss_clip": 0.01155738, + "auxiliary_loss_mlp": 0.01134659, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.00095367, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.8653434120585257, + "language_loss": 0.77724206, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.8001461, + "num_input_tokens_seen": 107539285, + "step": 4998, + "time_per_iteration": 2.578019618988037 + }, + { + "auxiliary_loss_clip": 0.01171787, + "auxiliary_loss_mlp": 0.01134319, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.00080466, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.599425464063368, + "language_loss": 0.73222792, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75528896, + "num_input_tokens_seen": 107560260, + "step": 4999, + "time_per_iteration": 2.5617220401763916 + }, + { + "auxiliary_loss_clip": 0.01156167, + "auxiliary_loss_mlp": 0.01134645, + "balance_loss_clip": 1.00189793, + "balance_loss_mlp": 1.00084496, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.7149968812977046, + "language_loss": 0.75767094, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.78057909, + "num_input_tokens_seen": 107579260, + "step": 5000, + "time_per_iteration": 2.578686475753784 + }, + { + "auxiliary_loss_clip": 0.01171678, + "auxiliary_loss_mlp": 0.01133968, + "balance_loss_clip": 1.00190699, + "balance_loss_mlp": 1.00093019, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.8909355588160917, + "language_loss": 0.82039905, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.84345543, + "num_input_tokens_seen": 107595245, + "step": 5001, + "time_per_iteration": 3.895315408706665 + }, + { + "auxiliary_loss_clip": 0.0115544, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00080657, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.5934709027994114, + "language_loss": 0.80768359, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.83058596, + "num_input_tokens_seen": 107613985, + "step": 5002, + "time_per_iteration": 2.561387777328491 + }, + { + "auxiliary_loss_clip": 0.01106227, + "auxiliary_loss_mlp": 0.01135151, + "balance_loss_clip": 1.00163114, + "balance_loss_mlp": 1.00058782, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 13.65188612671832, + "language_loss": 0.71310526, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73551905, + "num_input_tokens_seen": 107631435, + "step": 5003, + "time_per_iteration": 2.691052198410034 + }, + { + "auxiliary_loss_clip": 0.011229, + "auxiliary_loss_mlp": 0.0113468, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.00078368, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.840470203057187, + "language_loss": 0.70533288, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72790867, + "num_input_tokens_seen": 107650530, + "step": 5004, + "time_per_iteration": 4.081324100494385 + }, + { + "auxiliary_loss_clip": 0.01138678, + "auxiliary_loss_mlp": 0.01135055, + "balance_loss_clip": 1.0018121, + "balance_loss_mlp": 1.00077748, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.6381104053020232, + "language_loss": 0.82368106, + "learning_rate": 3.278217882782715e-06, + "loss": 0.84641838, + "num_input_tokens_seen": 107662240, + "step": 5005, + "time_per_iteration": 3.9239439964294434 + }, + { + "auxiliary_loss_clip": 0.01155147, + "auxiliary_loss_mlp": 0.01134297, + "balance_loss_clip": 1.00190163, + "balance_loss_mlp": 1.00078297, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.6981810465908644, + "language_loss": 0.74567556, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.76856995, + "num_input_tokens_seen": 107680330, + "step": 5006, + "time_per_iteration": 4.018820285797119 + }, + { + "auxiliary_loss_clip": 0.01123205, + "auxiliary_loss_mlp": 0.00748051, + "balance_loss_clip": 1.00163639, + "balance_loss_mlp": 1.00028002, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 1.872060501299888, + "language_loss": 0.70820832, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.72692096, + "num_input_tokens_seen": 107700020, + "step": 5007, + "time_per_iteration": 2.693542003631592 + }, + { + "auxiliary_loss_clip": 0.01155139, + "auxiliary_loss_mlp": 0.01134265, + "balance_loss_clip": 1.00182557, + "balance_loss_mlp": 1.00065553, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.0908661341685, + "language_loss": 0.7599299, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78282392, + "num_input_tokens_seen": 107718575, + "step": 5008, + "time_per_iteration": 2.5524168014526367 + }, + { + "auxiliary_loss_clip": 0.01155116, + "auxiliary_loss_mlp": 0.01134588, + "balance_loss_clip": 1.00191867, + "balance_loss_mlp": 1.00069189, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 2.500225972233851, + "language_loss": 0.84599853, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.86889553, + "num_input_tokens_seen": 107738635, + "step": 5009, + "time_per_iteration": 2.6695516109466553 + }, + { + "auxiliary_loss_clip": 0.01156736, + "auxiliary_loss_mlp": 0.01135149, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00068116, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 2.168409844654902, + "language_loss": 0.83443236, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85735124, + "num_input_tokens_seen": 107753415, + "step": 5010, + "time_per_iteration": 2.5325846672058105 + }, + { + "auxiliary_loss_clip": 0.01123551, + "auxiliary_loss_mlp": 0.011345, + "balance_loss_clip": 1.00178063, + "balance_loss_mlp": 1.00060415, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 2.8574154437423225, + "language_loss": 0.85258698, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87516749, + "num_input_tokens_seen": 107773840, + "step": 5011, + "time_per_iteration": 2.6617045402526855 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01134938, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.00066054, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 1.8571406306277727, + "language_loss": 0.72243673, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74519455, + "num_input_tokens_seen": 107792020, + "step": 5012, + "time_per_iteration": 2.572854518890381 + }, + { + "auxiliary_loss_clip": 0.01154968, + "auxiliary_loss_mlp": 0.01134101, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00068212, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 2.203188659391897, + "language_loss": 0.87616026, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89905101, + "num_input_tokens_seen": 107809595, + "step": 5013, + "time_per_iteration": 2.5538554191589355 + }, + { + "auxiliary_loss_clip": 0.01141232, + "auxiliary_loss_mlp": 0.01135157, + "balance_loss_clip": 1.00182211, + "balance_loss_mlp": 1.00059319, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 5.885395849777121, + "language_loss": 0.82608056, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.84884441, + "num_input_tokens_seen": 107827230, + "step": 5014, + "time_per_iteration": 2.557253122329712 + }, + { + "auxiliary_loss_clip": 0.01122711, + "auxiliary_loss_mlp": 0.01134462, + "balance_loss_clip": 1.00170386, + "balance_loss_mlp": 1.00056672, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 2.3083362557971667, + "language_loss": 0.68570018, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70827186, + "num_input_tokens_seen": 107847195, + "step": 5015, + "time_per_iteration": 2.6900548934936523 + }, + { + "auxiliary_loss_clip": 0.0113979, + "auxiliary_loss_mlp": 0.01134917, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00063944, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.07842161314167, + "language_loss": 0.74742609, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.77017313, + "num_input_tokens_seen": 107866420, + "step": 5016, + "time_per_iteration": 2.6170058250427246 + }, + { + "auxiliary_loss_clip": 0.01156217, + "auxiliary_loss_mlp": 0.01134625, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00063396, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.3945810419186961, + "language_loss": 0.65507519, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67798364, + "num_input_tokens_seen": 107889090, + "step": 5017, + "time_per_iteration": 2.7089638710021973 + }, + { + "auxiliary_loss_clip": 0.01130863, + "auxiliary_loss_mlp": 0.01134488, + "balance_loss_clip": 1.0021522, + "balance_loss_mlp": 1.00087821, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 1.7890593491687532, + "language_loss": 0.68696523, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.70961869, + "num_input_tokens_seen": 107907520, + "step": 5018, + "time_per_iteration": 2.641829013824463 + }, + { + "auxiliary_loss_clip": 0.0117162, + "auxiliary_loss_mlp": 0.01133643, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.00070107, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 2.125617433705282, + "language_loss": 0.78785574, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81090844, + "num_input_tokens_seen": 107925650, + "step": 5019, + "time_per_iteration": 2.517993688583374 + }, + { + "auxiliary_loss_clip": 0.01138844, + "auxiliary_loss_mlp": 0.01134353, + "balance_loss_clip": 1.00175226, + "balance_loss_mlp": 1.00074339, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 1.8424061151352997, + "language_loss": 0.69745028, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.7201823, + "num_input_tokens_seen": 107943975, + "step": 5020, + "time_per_iteration": 2.6427133083343506 + }, + { + "auxiliary_loss_clip": 0.01171969, + "auxiliary_loss_mlp": 0.01135237, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00076938, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 10.1342154752688, + "language_loss": 0.78342378, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.8064959, + "num_input_tokens_seen": 107962950, + "step": 5021, + "time_per_iteration": 2.521512508392334 + }, + { + "auxiliary_loss_clip": 0.01156232, + "auxiliary_loss_mlp": 0.01134837, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00065529, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 1.9565826742615569, + "language_loss": 0.75627363, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.77918434, + "num_input_tokens_seen": 107979700, + "step": 5022, + "time_per_iteration": 2.53674578666687 + }, + { + "auxiliary_loss_clip": 0.01171796, + "auxiliary_loss_mlp": 0.01135066, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00097954, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.9077348397311034, + "language_loss": 0.69833207, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72140062, + "num_input_tokens_seen": 107996645, + "step": 5023, + "time_per_iteration": 2.4779770374298096 + }, + { + "auxiliary_loss_clip": 0.01139949, + "auxiliary_loss_mlp": 0.01134774, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00059199, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 1.8342175331543218, + "language_loss": 0.71564353, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.7383908, + "num_input_tokens_seen": 108015020, + "step": 5024, + "time_per_iteration": 2.602956533432007 + }, + { + "auxiliary_loss_clip": 0.01155125, + "auxiliary_loss_mlp": 0.01134493, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.0007875, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 2.042871977306312, + "language_loss": 0.74052525, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76342142, + "num_input_tokens_seen": 108036430, + "step": 5025, + "time_per_iteration": 2.654399871826172 + }, + { + "auxiliary_loss_clip": 0.01155955, + "auxiliary_loss_mlp": 0.01134141, + "balance_loss_clip": 1.00193846, + "balance_loss_mlp": 1.00081754, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.5846872388513882, + "language_loss": 0.67015076, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.6930517, + "num_input_tokens_seen": 108054250, + "step": 5026, + "time_per_iteration": 2.613293170928955 + }, + { + "auxiliary_loss_clip": 0.01156652, + "auxiliary_loss_mlp": 0.01134217, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.00089312, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.6106768894163852, + "language_loss": 0.85059679, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87350547, + "num_input_tokens_seen": 108071495, + "step": 5027, + "time_per_iteration": 2.536404609680176 + }, + { + "auxiliary_loss_clip": 0.01139297, + "auxiliary_loss_mlp": 0.01134469, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00076342, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.6993094555557022, + "language_loss": 0.78616142, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80889904, + "num_input_tokens_seen": 108092135, + "step": 5028, + "time_per_iteration": 2.633631944656372 + }, + { + "auxiliary_loss_clip": 0.01139739, + "auxiliary_loss_mlp": 0.01134924, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00074172, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 1.9620477883327656, + "language_loss": 0.76675951, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.78950608, + "num_input_tokens_seen": 108112945, + "step": 5029, + "time_per_iteration": 2.710007905960083 + }, + { + "auxiliary_loss_clip": 0.01124754, + "auxiliary_loss_mlp": 0.01134786, + "balance_loss_clip": 1.00170815, + "balance_loss_mlp": 1.00060368, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 2.077169898337065, + "language_loss": 0.81789994, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84049535, + "num_input_tokens_seen": 108130325, + "step": 5030, + "time_per_iteration": 2.650266647338867 + }, + { + "auxiliary_loss_clip": 0.01108197, + "auxiliary_loss_mlp": 0.00748101, + "balance_loss_clip": 1.00172305, + "balance_loss_mlp": 1.00031209, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.6911093283173684, + "language_loss": 0.69425863, + "learning_rate": 3.270413459468905e-06, + "loss": 0.7128216, + "num_input_tokens_seen": 108150300, + "step": 5031, + "time_per_iteration": 2.6701645851135254 + }, + { + "auxiliary_loss_clip": 0.01156496, + "auxiliary_loss_mlp": 0.01134829, + "balance_loss_clip": 1.00188434, + "balance_loss_mlp": 1.00055146, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 2.9329302430303255, + "language_loss": 0.82586193, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84877515, + "num_input_tokens_seen": 108170330, + "step": 5032, + "time_per_iteration": 2.575505256652832 + }, + { + "auxiliary_loss_clip": 0.01123812, + "auxiliary_loss_mlp": 0.0113571, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00085998, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.0788815615457463, + "language_loss": 0.73842049, + "learning_rate": 3.269811767783906e-06, + "loss": 0.76101565, + "num_input_tokens_seen": 108191265, + "step": 5033, + "time_per_iteration": 2.660703182220459 + }, + { + "auxiliary_loss_clip": 0.01154943, + "auxiliary_loss_mlp": 0.01133984, + "balance_loss_clip": 1.00184083, + "balance_loss_mlp": 1.0008508, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.5475154691487365, + "language_loss": 0.73822784, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76111716, + "num_input_tokens_seen": 108211615, + "step": 5034, + "time_per_iteration": 2.6065168380737305 + }, + { + "auxiliary_loss_clip": 0.01171719, + "auxiliary_loss_mlp": 0.01134489, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00059354, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.9175446829757588, + "language_loss": 0.71619564, + "learning_rate": 3.269209883493352e-06, + "loss": 0.73925769, + "num_input_tokens_seen": 108231080, + "step": 5035, + "time_per_iteration": 2.5360074043273926 + }, + { + "auxiliary_loss_clip": 0.01156395, + "auxiliary_loss_mlp": 0.01133925, + "balance_loss_clip": 1.00181675, + "balance_loss_mlp": 1.00069714, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 3.500281835781347, + "language_loss": 0.87844729, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.9013505, + "num_input_tokens_seen": 108251125, + "step": 5036, + "time_per_iteration": 2.586090326309204 + }, + { + "auxiliary_loss_clip": 0.01123, + "auxiliary_loss_mlp": 0.01134063, + "balance_loss_clip": 1.00168359, + "balance_loss_mlp": 1.00083458, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 1.6058776720434758, + "language_loss": 0.7720874, + "learning_rate": 3.268607806688536e-06, + "loss": 0.79465806, + "num_input_tokens_seen": 108272545, + "step": 5037, + "time_per_iteration": 2.651214599609375 + }, + { + "auxiliary_loss_clip": 0.01128985, + "auxiliary_loss_mlp": 0.01135488, + "balance_loss_clip": 1.00210452, + "balance_loss_mlp": 1.00082946, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.004259212803606, + "language_loss": 0.77541912, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79806387, + "num_input_tokens_seen": 108289725, + "step": 5038, + "time_per_iteration": 2.603332996368408 + }, + { + "auxiliary_loss_clip": 0.01139294, + "auxiliary_loss_mlp": 0.01134339, + "balance_loss_clip": 1.00174618, + "balance_loss_mlp": 1.00072885, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 1.968822638226372, + "language_loss": 0.73805726, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76079363, + "num_input_tokens_seen": 108310690, + "step": 5039, + "time_per_iteration": 4.013906002044678 + }, + { + "auxiliary_loss_clip": 0.01171672, + "auxiliary_loss_mlp": 0.00748028, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00030708, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 2.365377834560806, + "language_loss": 0.80023301, + "learning_rate": 3.267704330716847e-06, + "loss": 0.81943005, + "num_input_tokens_seen": 108328905, + "step": 5040, + "time_per_iteration": 2.555400848388672 + }, + { + "auxiliary_loss_clip": 0.01140027, + "auxiliary_loss_mlp": 0.01134, + "balance_loss_clip": 1.00194979, + "balance_loss_mlp": 1.00067675, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.690249755336865, + "language_loss": 0.81622028, + "learning_rate": 3.267403075901438e-06, + "loss": 0.83896053, + "num_input_tokens_seen": 108346680, + "step": 5041, + "time_per_iteration": 2.6332242488861084 + }, + { + "auxiliary_loss_clip": 0.01105189, + "auxiliary_loss_mlp": 0.01118225, + "balance_loss_clip": 1.00125647, + "balance_loss_mlp": 0.99996942, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7533991836163875, + "language_loss": 0.5948391, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61707324, + "num_input_tokens_seen": 108413885, + "step": 5042, + "time_per_iteration": 4.721556186676025 + }, + { + "auxiliary_loss_clip": 0.01171947, + "auxiliary_loss_mlp": 0.01135199, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00073111, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.8609688648898048, + "language_loss": 0.71061987, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73369133, + "num_input_tokens_seen": 108433640, + "step": 5043, + "time_per_iteration": 3.952672243118286 + }, + { + "auxiliary_loss_clip": 0.01106412, + "auxiliary_loss_mlp": 0.01133411, + "balance_loss_clip": 1.00159442, + "balance_loss_mlp": 1.00046921, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.1373402399737205, + "language_loss": 0.69923657, + "learning_rate": 3.266499023140606e-06, + "loss": 0.72163481, + "num_input_tokens_seen": 108452640, + "step": 5044, + "time_per_iteration": 4.084809064865112 + }, + { + "auxiliary_loss_clip": 0.01155056, + "auxiliary_loss_mlp": 0.01134146, + "balance_loss_clip": 1.001876, + "balance_loss_mlp": 1.00053644, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.469358605412997, + "language_loss": 0.7705605, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.7934525, + "num_input_tokens_seen": 108472470, + "step": 5045, + "time_per_iteration": 2.5626938343048096 + }, + { + "auxiliary_loss_clip": 0.01171729, + "auxiliary_loss_mlp": 0.00748077, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00035286, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.925125846120714, + "language_loss": 0.72631478, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74551284, + "num_input_tokens_seen": 108493025, + "step": 5046, + "time_per_iteration": 2.547184467315674 + }, + { + "auxiliary_loss_clip": 0.01161247, + "auxiliary_loss_mlp": 0.01134544, + "balance_loss_clip": 1.0021522, + "balance_loss_mlp": 1.00064826, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 1.8192393149717894, + "language_loss": 0.80802619, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.83098418, + "num_input_tokens_seen": 108513480, + "step": 5047, + "time_per_iteration": 2.566068172454834 + }, + { + "auxiliary_loss_clip": 0.01108085, + "auxiliary_loss_mlp": 0.01134927, + "balance_loss_clip": 1.00161707, + "balance_loss_mlp": 1.00084066, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.89697096230617, + "language_loss": 0.72110748, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74353755, + "num_input_tokens_seen": 108533155, + "step": 5048, + "time_per_iteration": 2.66597318649292 + }, + { + "auxiliary_loss_clip": 0.0113862, + "auxiliary_loss_mlp": 0.01134215, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.00070119, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 2.1266066776969406, + "language_loss": 0.75098354, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.77371192, + "num_input_tokens_seen": 108551900, + "step": 5049, + "time_per_iteration": 2.5637588500976562 + }, + { + "auxiliary_loss_clip": 0.01156196, + "auxiliary_loss_mlp": 0.01134078, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00084972, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.5524868055649947, + "language_loss": 0.82071555, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84361827, + "num_input_tokens_seen": 108574005, + "step": 5050, + "time_per_iteration": 2.620596408843994 + }, + { + "auxiliary_loss_clip": 0.0112352, + "auxiliary_loss_mlp": 0.01134394, + "balance_loss_clip": 1.00173879, + "balance_loss_mlp": 1.00078416, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.1114519083681142, + "language_loss": 0.73659635, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.75917542, + "num_input_tokens_seen": 108592715, + "step": 5051, + "time_per_iteration": 2.6176254749298096 + }, + { + "auxiliary_loss_clip": 0.01091729, + "auxiliary_loss_mlp": 0.00748039, + "balance_loss_clip": 1.00169206, + "balance_loss_mlp": 1.00032961, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.72805342062891, + "language_loss": 0.76392794, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78232563, + "num_input_tokens_seen": 108611770, + "step": 5052, + "time_per_iteration": 2.7441012859344482 + }, + { + "auxiliary_loss_clip": 0.01171927, + "auxiliary_loss_mlp": 0.01134712, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00091183, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.9073781237292213, + "language_loss": 0.82544607, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84851253, + "num_input_tokens_seen": 108629070, + "step": 5053, + "time_per_iteration": 2.5886740684509277 + }, + { + "auxiliary_loss_clip": 0.01138074, + "auxiliary_loss_mlp": 0.01134337, + "balance_loss_clip": 1.00168526, + "balance_loss_mlp": 1.00082302, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5462141669521048, + "language_loss": 0.7096746, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73239875, + "num_input_tokens_seen": 108646315, + "step": 5054, + "time_per_iteration": 2.6284189224243164 + }, + { + "auxiliary_loss_clip": 0.01171731, + "auxiliary_loss_mlp": 0.01134422, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.00071681, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 2.2997102431900283, + "language_loss": 0.6845479, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.70760942, + "num_input_tokens_seen": 108665920, + "step": 5055, + "time_per_iteration": 2.5786612033843994 + }, + { + "auxiliary_loss_clip": 0.01139162, + "auxiliary_loss_mlp": 0.01134744, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00075245, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 2.113069051849257, + "language_loss": 0.68018931, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.70292836, + "num_input_tokens_seen": 108683485, + "step": 5056, + "time_per_iteration": 2.5886523723602295 + }, + { + "auxiliary_loss_clip": 0.0113976, + "auxiliary_loss_mlp": 0.01134109, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.00078535, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.8053498593407693, + "language_loss": 0.8239854, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84672409, + "num_input_tokens_seen": 108702700, + "step": 5057, + "time_per_iteration": 2.6260874271392822 + }, + { + "auxiliary_loss_clip": 0.01139025, + "auxiliary_loss_mlp": 0.01134037, + "balance_loss_clip": 1.00171804, + "balance_loss_mlp": 1.00071383, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 1.6613268903733787, + "language_loss": 0.89134276, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91407341, + "num_input_tokens_seen": 108721860, + "step": 5058, + "time_per_iteration": 2.6336307525634766 + }, + { + "auxiliary_loss_clip": 0.0112343, + "auxiliary_loss_mlp": 0.01134325, + "balance_loss_clip": 1.00180316, + "balance_loss_mlp": 1.00081098, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 1.9055862353589261, + "language_loss": 0.70979393, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73237145, + "num_input_tokens_seen": 108743215, + "step": 5059, + "time_per_iteration": 2.692084789276123 + }, + { + "auxiliary_loss_clip": 0.01093812, + "auxiliary_loss_mlp": 0.01134396, + "balance_loss_clip": 1.00173163, + "balance_loss_mlp": 1.00088143, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 1.639288695356463, + "language_loss": 0.73154938, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75383151, + "num_input_tokens_seen": 108765505, + "step": 5060, + "time_per_iteration": 2.7819671630859375 + }, + { + "auxiliary_loss_clip": 0.01171816, + "auxiliary_loss_mlp": 0.01134215, + "balance_loss_clip": 1.00204062, + "balance_loss_mlp": 1.00079572, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.7374655916926807, + "language_loss": 0.76868123, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79174155, + "num_input_tokens_seen": 108783370, + "step": 5061, + "time_per_iteration": 2.5296764373779297 + }, + { + "auxiliary_loss_clip": 0.01125286, + "auxiliary_loss_mlp": 0.01134671, + "balance_loss_clip": 1.00195348, + "balance_loss_mlp": 1.00067973, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.0035406674910563, + "language_loss": 0.81837493, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84097451, + "num_input_tokens_seen": 108797430, + "step": 5062, + "time_per_iteration": 2.669630289077759 + }, + { + "auxiliary_loss_clip": 0.01171663, + "auxiliary_loss_mlp": 0.0113358, + "balance_loss_clip": 1.00195408, + "balance_loss_mlp": 1.00063777, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 2.705131914937435, + "language_loss": 0.74626541, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76931787, + "num_input_tokens_seen": 108816945, + "step": 5063, + "time_per_iteration": 2.5551116466522217 + }, + { + "auxiliary_loss_clip": 0.01155202, + "auxiliary_loss_mlp": 0.00747947, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00028157, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.7117438505915181, + "language_loss": 0.84390044, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86293191, + "num_input_tokens_seen": 108836615, + "step": 5064, + "time_per_iteration": 2.594148874282837 + }, + { + "auxiliary_loss_clip": 0.01154713, + "auxiliary_loss_mlp": 0.01134216, + "balance_loss_clip": 1.00172019, + "balance_loss_mlp": 1.00070155, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 1.8982830120568976, + "language_loss": 0.75323313, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.77612239, + "num_input_tokens_seen": 108855165, + "step": 5065, + "time_per_iteration": 2.563387870788574 + }, + { + "auxiliary_loss_clip": 0.01140416, + "auxiliary_loss_mlp": 0.01135518, + "balance_loss_clip": 1.00190091, + "balance_loss_mlp": 1.00085926, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 1.5675015940966093, + "language_loss": 0.61824816, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64100754, + "num_input_tokens_seen": 108874690, + "step": 5066, + "time_per_iteration": 2.6726086139678955 + }, + { + "auxiliary_loss_clip": 0.01140096, + "auxiliary_loss_mlp": 0.01135267, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.00089455, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 2.089651357151884, + "language_loss": 0.82892311, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.8516767, + "num_input_tokens_seen": 108893140, + "step": 5067, + "time_per_iteration": 2.5605974197387695 + }, + { + "auxiliary_loss_clip": 0.01171686, + "auxiliary_loss_mlp": 0.01133736, + "balance_loss_clip": 1.00200117, + "balance_loss_mlp": 1.00079393, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 1.8389055821734588, + "language_loss": 0.63274801, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65580225, + "num_input_tokens_seen": 108911880, + "step": 5068, + "time_per_iteration": 2.5345022678375244 + }, + { + "auxiliary_loss_clip": 0.01156038, + "auxiliary_loss_mlp": 0.01134469, + "balance_loss_clip": 1.00182593, + "balance_loss_mlp": 1.00076413, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.8571219739934453, + "language_loss": 0.74815238, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77105743, + "num_input_tokens_seen": 108930440, + "step": 5069, + "time_per_iteration": 2.5423107147216797 + }, + { + "auxiliary_loss_clip": 0.01130172, + "auxiliary_loss_mlp": 0.01134171, + "balance_loss_clip": 1.00215685, + "balance_loss_mlp": 1.00094223, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 2.4917601031734185, + "language_loss": 0.75509042, + "learning_rate": 3.258645826569261e-06, + "loss": 0.7777338, + "num_input_tokens_seen": 108949125, + "step": 5070, + "time_per_iteration": 2.649616003036499 + }, + { + "auxiliary_loss_clip": 0.01171948, + "auxiliary_loss_mlp": 0.00748072, + "balance_loss_clip": 1.0019263, + "balance_loss_mlp": 1.00038242, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 2.7120689821597295, + "language_loss": 0.81587553, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83507574, + "num_input_tokens_seen": 108972190, + "step": 5071, + "time_per_iteration": 2.5753748416900635 + }, + { + "auxiliary_loss_clip": 0.01141294, + "auxiliary_loss_mlp": 0.01134241, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00063133, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.6034887038008148, + "language_loss": 0.76058334, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78333873, + "num_input_tokens_seen": 108990325, + "step": 5072, + "time_per_iteration": 2.601001739501953 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.01133987, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00075889, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 1.871905231780759, + "language_loss": 0.70759851, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73016924, + "num_input_tokens_seen": 109009505, + "step": 5073, + "time_per_iteration": 2.6276497840881348 + }, + { + "auxiliary_loss_clip": 0.01155334, + "auxiliary_loss_mlp": 0.01134902, + "balance_loss_clip": 1.00195456, + "balance_loss_mlp": 1.00081503, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.1055403262562296, + "language_loss": 0.76608777, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78899014, + "num_input_tokens_seen": 109026350, + "step": 5074, + "time_per_iteration": 2.5335779190063477 + }, + { + "auxiliary_loss_clip": 0.01138694, + "auxiliary_loss_mlp": 0.01134084, + "balance_loss_clip": 1.00203621, + "balance_loss_mlp": 1.00066519, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.8850272222759759, + "language_loss": 0.7446121, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76733983, + "num_input_tokens_seen": 109044165, + "step": 5075, + "time_per_iteration": 2.6091506481170654 + }, + { + "auxiliary_loss_clip": 0.01171989, + "auxiliary_loss_mlp": 0.01135417, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00056744, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.0519463631409844, + "language_loss": 0.75648224, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77955633, + "num_input_tokens_seen": 109060665, + "step": 5076, + "time_per_iteration": 2.5046229362487793 + }, + { + "auxiliary_loss_clip": 0.01145897, + "auxiliary_loss_mlp": 0.0113479, + "balance_loss_clip": 1.00220561, + "balance_loss_mlp": 1.00079918, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.4920779786196625, + "language_loss": 0.7889657, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81177253, + "num_input_tokens_seen": 109080035, + "step": 5077, + "time_per_iteration": 3.960510015487671 + }, + { + "auxiliary_loss_clip": 0.01107434, + "auxiliary_loss_mlp": 0.01133412, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.0006609, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.739398759657248, + "language_loss": 0.74975067, + "learning_rate": 3.256222958034259e-06, + "loss": 0.77215916, + "num_input_tokens_seen": 109097385, + "step": 5078, + "time_per_iteration": 2.674741268157959 + }, + { + "auxiliary_loss_clip": 0.01108279, + "auxiliary_loss_mlp": 0.0113388, + "balance_loss_clip": 1.00179327, + "balance_loss_mlp": 1.0009383, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 1.9057574353364237, + "language_loss": 0.67126238, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69368398, + "num_input_tokens_seen": 109115495, + "step": 5079, + "time_per_iteration": 4.111735820770264 + }, + { + "auxiliary_loss_clip": 0.01156538, + "auxiliary_loss_mlp": 0.01133713, + "balance_loss_clip": 1.001881, + "balance_loss_mlp": 1.0006758, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.718915676320944, + "language_loss": 0.80079097, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82369351, + "num_input_tokens_seen": 109134235, + "step": 5080, + "time_per_iteration": 4.006433486938477 + }, + { + "auxiliary_loss_clip": 0.0115507, + "auxiliary_loss_mlp": 0.00747974, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00026429, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.223628811282498, + "language_loss": 0.80973053, + "learning_rate": 3.255313596022074e-06, + "loss": 0.82876092, + "num_input_tokens_seen": 109152760, + "step": 5081, + "time_per_iteration": 3.9869184494018555 + }, + { + "auxiliary_loss_clip": 0.01155, + "auxiliary_loss_mlp": 0.01133802, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.00066948, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.9143863789512146, + "language_loss": 0.71841967, + "learning_rate": 3.255010380132783e-06, + "loss": 0.74130768, + "num_input_tokens_seen": 109173925, + "step": 5082, + "time_per_iteration": 2.629437208175659 + }, + { + "auxiliary_loss_clip": 0.01155054, + "auxiliary_loss_mlp": 0.01134248, + "balance_loss_clip": 1.00184333, + "balance_loss_mlp": 1.00092423, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.8283418424111066, + "language_loss": 0.731722, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75461495, + "num_input_tokens_seen": 109192510, + "step": 5083, + "time_per_iteration": 2.5983803272247314 + }, + { + "auxiliary_loss_clip": 0.01140782, + "auxiliary_loss_mlp": 0.00748047, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00034189, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 1.7046201675626984, + "language_loss": 0.71025747, + "learning_rate": 3.254403805595344e-06, + "loss": 0.72914577, + "num_input_tokens_seen": 109210885, + "step": 5084, + "time_per_iteration": 2.5812220573425293 + }, + { + "auxiliary_loss_clip": 0.01124512, + "auxiliary_loss_mlp": 0.01134271, + "balance_loss_clip": 1.00180602, + "balance_loss_mlp": 1.00056577, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 3.088614471309911, + "language_loss": 0.78179228, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80438006, + "num_input_tokens_seen": 109229180, + "step": 5085, + "time_per_iteration": 2.6337039470672607 + }, + { + "auxiliary_loss_clip": 0.01171664, + "auxiliary_loss_mlp": 0.01133552, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00070488, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 2.2764155492392364, + "language_loss": 0.77903414, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80208629, + "num_input_tokens_seen": 109249510, + "step": 5086, + "time_per_iteration": 2.569772720336914 + }, + { + "auxiliary_loss_clip": 0.01141348, + "auxiliary_loss_mlp": 0.01134046, + "balance_loss_clip": 1.00192952, + "balance_loss_mlp": 1.00072265, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.831143757602002, + "language_loss": 0.76334244, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78609633, + "num_input_tokens_seen": 109268200, + "step": 5087, + "time_per_iteration": 2.592287540435791 + }, + { + "auxiliary_loss_clip": 0.01154956, + "auxiliary_loss_mlp": 0.01133804, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00076652, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 1.7942693336305586, + "language_loss": 0.72598839, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74887604, + "num_input_tokens_seen": 109288370, + "step": 5088, + "time_per_iteration": 2.575270414352417 + }, + { + "auxiliary_loss_clip": 0.01156555, + "auxiliary_loss_mlp": 0.01134615, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.0008142, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.6213912078163553, + "language_loss": 0.79131627, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81422794, + "num_input_tokens_seen": 109306730, + "step": 5089, + "time_per_iteration": 2.509575128555298 + }, + { + "auxiliary_loss_clip": 0.01138435, + "auxiliary_loss_mlp": 0.01134287, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00067699, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 1.66052224797564, + "language_loss": 0.77011669, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79284394, + "num_input_tokens_seen": 109327360, + "step": 5090, + "time_per_iteration": 2.605475425720215 + }, + { + "auxiliary_loss_clip": 0.0113979, + "auxiliary_loss_mlp": 0.01134318, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.00080371, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.8083339857158152, + "language_loss": 0.7627331, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78547418, + "num_input_tokens_seen": 109348135, + "step": 5091, + "time_per_iteration": 2.6495022773742676 + }, + { + "auxiliary_loss_clip": 0.01077979, + "auxiliary_loss_mlp": 0.01133678, + "balance_loss_clip": 1.00155532, + "balance_loss_mlp": 1.00064051, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.9773925539098898, + "language_loss": 0.71822178, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.74033839, + "num_input_tokens_seen": 109366220, + "step": 5092, + "time_per_iteration": 2.7457432746887207 + }, + { + "auxiliary_loss_clip": 0.01138691, + "auxiliary_loss_mlp": 0.01134076, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00065684, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 2.2534618693792754, + "language_loss": 0.82475227, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84748, + "num_input_tokens_seen": 109385260, + "step": 5093, + "time_per_iteration": 2.57580304145813 + }, + { + "auxiliary_loss_clip": 0.01171642, + "auxiliary_loss_mlp": 0.0074813, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.00037789, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 2.0943428923479526, + "language_loss": 0.74690825, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.76610601, + "num_input_tokens_seen": 109405025, + "step": 5094, + "time_per_iteration": 2.580479145050049 + }, + { + "auxiliary_loss_clip": 0.01138151, + "auxiliary_loss_mlp": 0.0113351, + "balance_loss_clip": 1.00184011, + "balance_loss_mlp": 1.00066292, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 1.8545836311774844, + "language_loss": 0.7555747, + "learning_rate": 3.251064247058868e-06, + "loss": 0.77829123, + "num_input_tokens_seen": 109422465, + "step": 5095, + "time_per_iteration": 2.5953826904296875 + }, + { + "auxiliary_loss_clip": 0.0115511, + "auxiliary_loss_mlp": 0.01134094, + "balance_loss_clip": 1.00201893, + "balance_loss_mlp": 1.00086594, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 2.017786963504048, + "language_loss": 0.80378151, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82667351, + "num_input_tokens_seen": 109440575, + "step": 5096, + "time_per_iteration": 2.5667688846588135 + }, + { + "auxiliary_loss_clip": 0.0115506, + "auxiliary_loss_mlp": 0.0113436, + "balance_loss_clip": 1.0018934, + "balance_loss_mlp": 1.00065446, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 2.1787298502081343, + "language_loss": 0.81592083, + "learning_rate": 3.250456437422258e-06, + "loss": 0.83881509, + "num_input_tokens_seen": 109459050, + "step": 5097, + "time_per_iteration": 2.5503787994384766 + }, + { + "auxiliary_loss_clip": 0.01171606, + "auxiliary_loss_mlp": 0.01133728, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.0008812, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 2.1023802232659814, + "language_loss": 0.77773345, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80078685, + "num_input_tokens_seen": 109475860, + "step": 5098, + "time_per_iteration": 2.5373189449310303 + }, + { + "auxiliary_loss_clip": 0.01111452, + "auxiliary_loss_mlp": 0.01133304, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00074315, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 2.1444601676247124, + "language_loss": 0.84025121, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86269879, + "num_input_tokens_seen": 109494760, + "step": 5099, + "time_per_iteration": 2.694584608078003 + }, + { + "auxiliary_loss_clip": 0.01171649, + "auxiliary_loss_mlp": 0.01133773, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00083137, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 1.9211936587544483, + "language_loss": 0.85547042, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87852466, + "num_input_tokens_seen": 109516480, + "step": 5100, + "time_per_iteration": 2.5460917949676514 + }, + { + "auxiliary_loss_clip": 0.01124048, + "auxiliary_loss_mlp": 0.01134299, + "balance_loss_clip": 1.00172615, + "balance_loss_mlp": 1.00068915, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 2.3048068036421667, + "language_loss": 0.79013479, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81271827, + "num_input_tokens_seen": 109534615, + "step": 5101, + "time_per_iteration": 2.689121961593628 + }, + { + "auxiliary_loss_clip": 0.0112354, + "auxiliary_loss_mlp": 0.01133916, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00087857, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.7669221666926909, + "language_loss": 0.80116653, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82374108, + "num_input_tokens_seen": 109554040, + "step": 5102, + "time_per_iteration": 2.624830722808838 + }, + { + "auxiliary_loss_clip": 0.01171781, + "auxiliary_loss_mlp": 0.01133623, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00058568, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.8357241647425842, + "language_loss": 0.88743591, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.91049004, + "num_input_tokens_seen": 109574345, + "step": 5103, + "time_per_iteration": 2.5540847778320312 + }, + { + "auxiliary_loss_clip": 0.01156679, + "auxiliary_loss_mlp": 0.01133628, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00078082, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 2.13856479896901, + "language_loss": 0.74353838, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76644146, + "num_input_tokens_seen": 109593670, + "step": 5104, + "time_per_iteration": 2.5920908451080322 + }, + { + "auxiliary_loss_clip": 0.01156656, + "auxiliary_loss_mlp": 0.00748064, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00029802, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.7373918442807852, + "language_loss": 0.7248255, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.7438727, + "num_input_tokens_seen": 109613385, + "step": 5105, + "time_per_iteration": 2.5980923175811768 + }, + { + "auxiliary_loss_clip": 0.01138408, + "auxiliary_loss_mlp": 0.01133912, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00077939, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 1.8018411026590972, + "language_loss": 0.87391698, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89664018, + "num_input_tokens_seen": 109632395, + "step": 5106, + "time_per_iteration": 2.6298835277557373 + }, + { + "auxiliary_loss_clip": 0.01123464, + "auxiliary_loss_mlp": 0.01134609, + "balance_loss_clip": 1.00174487, + "balance_loss_mlp": 1.00080884, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.1533998490053694, + "language_loss": 0.71033025, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73291093, + "num_input_tokens_seen": 109651380, + "step": 5107, + "time_per_iteration": 2.7252843379974365 + }, + { + "auxiliary_loss_clip": 0.01121139, + "auxiliary_loss_mlp": 0.01133684, + "balance_loss_clip": 1.00159085, + "balance_loss_mlp": 1.00083768, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 2.186309117258601, + "language_loss": 0.72102153, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74356973, + "num_input_tokens_seen": 109670240, + "step": 5108, + "time_per_iteration": 2.607269287109375 + }, + { + "auxiliary_loss_clip": 0.01139671, + "auxiliary_loss_mlp": 0.01133952, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00081921, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.498577958305425, + "language_loss": 0.85693324, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87966943, + "num_input_tokens_seen": 109690810, + "step": 5109, + "time_per_iteration": 2.592449426651001 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01133126, + "balance_loss_clip": 1.00177681, + "balance_loss_mlp": 1.0006603, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.7387195283181092, + "language_loss": 0.67413074, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69685745, + "num_input_tokens_seen": 109711145, + "step": 5110, + "time_per_iteration": 2.625210762023926 + }, + { + "auxiliary_loss_clip": 0.01155258, + "auxiliary_loss_mlp": 0.01133072, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00060654, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.522246604806723, + "language_loss": 0.77012289, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79300618, + "num_input_tokens_seen": 109731425, + "step": 5111, + "time_per_iteration": 2.5802674293518066 + }, + { + "auxiliary_loss_clip": 0.01171849, + "auxiliary_loss_mlp": 0.01133803, + "balance_loss_clip": 1.00201786, + "balance_loss_mlp": 1.00067008, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.8685718627703816, + "language_loss": 0.66926014, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69231665, + "num_input_tokens_seen": 109752720, + "step": 5112, + "time_per_iteration": 2.550708532333374 + }, + { + "auxiliary_loss_clip": 0.01155216, + "auxiliary_loss_mlp": 0.01134739, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00074756, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 2.0830217534214728, + "language_loss": 0.79075074, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81365025, + "num_input_tokens_seen": 109772840, + "step": 5113, + "time_per_iteration": 2.6181740760803223 + }, + { + "auxiliary_loss_clip": 0.01127935, + "auxiliary_loss_mlp": 0.00747942, + "balance_loss_clip": 1.00218153, + "balance_loss_mlp": 1.00024819, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 2.3053156525176375, + "language_loss": 0.76626658, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.78502536, + "num_input_tokens_seen": 109790150, + "step": 5114, + "time_per_iteration": 2.6101574897766113 + }, + { + "auxiliary_loss_clip": 0.01124601, + "auxiliary_loss_mlp": 0.01134169, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.00065446, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 2.3244525189203817, + "language_loss": 0.62026441, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64285213, + "num_input_tokens_seen": 109807985, + "step": 5115, + "time_per_iteration": 4.00491738319397 + }, + { + "auxiliary_loss_clip": 0.01156658, + "auxiliary_loss_mlp": 0.01134194, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00067949, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 3.7522139127282332, + "language_loss": 0.82828832, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.85119677, + "num_input_tokens_seen": 109825920, + "step": 5116, + "time_per_iteration": 2.595174551010132 + }, + { + "auxiliary_loss_clip": 0.01138128, + "auxiliary_loss_mlp": 0.01133817, + "balance_loss_clip": 1.00178742, + "balance_loss_mlp": 1.00097036, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.701552052624358, + "language_loss": 0.75945979, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78217924, + "num_input_tokens_seen": 109846220, + "step": 5117, + "time_per_iteration": 5.408756732940674 + }, + { + "auxiliary_loss_clip": 0.01123282, + "auxiliary_loss_mlp": 0.01134732, + "balance_loss_clip": 1.00189602, + "balance_loss_mlp": 1.00064564, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 6.877973555132433, + "language_loss": 0.71145082, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.73403096, + "num_input_tokens_seen": 109863870, + "step": 5118, + "time_per_iteration": 2.7101497650146484 + }, + { + "auxiliary_loss_clip": 0.01106457, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_clip": 1.00158513, + "balance_loss_mlp": 1.00073421, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.8860574958286338, + "language_loss": 0.74175161, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76414818, + "num_input_tokens_seen": 109883500, + "step": 5119, + "time_per_iteration": 4.090076923370361 + }, + { + "auxiliary_loss_clip": 0.01155459, + "auxiliary_loss_mlp": 0.01134516, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.00100195, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 2.0706143346346324, + "language_loss": 0.80063796, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82353771, + "num_input_tokens_seen": 109904620, + "step": 5120, + "time_per_iteration": 2.6178479194641113 + }, + { + "auxiliary_loss_clip": 0.01156562, + "auxiliary_loss_mlp": 0.01134264, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00094068, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.668393380455095, + "language_loss": 0.79858696, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82149518, + "num_input_tokens_seen": 109922275, + "step": 5121, + "time_per_iteration": 2.5401272773742676 + }, + { + "auxiliary_loss_clip": 0.01140141, + "auxiliary_loss_mlp": 0.01133343, + "balance_loss_clip": 1.00205719, + "balance_loss_mlp": 1.00068736, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.6062555541681371, + "language_loss": 0.82570994, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84844482, + "num_input_tokens_seen": 109944265, + "step": 5122, + "time_per_iteration": 2.652362108230591 + }, + { + "auxiliary_loss_clip": 0.01151096, + "auxiliary_loss_mlp": 0.01117522, + "balance_loss_clip": 1.0015111, + "balance_loss_mlp": 1.00002909, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.739543652893872, + "language_loss": 0.58594191, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60862815, + "num_input_tokens_seen": 110014160, + "step": 5123, + "time_per_iteration": 3.2885754108428955 + }, + { + "auxiliary_loss_clip": 0.01156671, + "auxiliary_loss_mlp": 0.00747919, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.00016403, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5707482688185954, + "language_loss": 0.83307326, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85211909, + "num_input_tokens_seen": 110034865, + "step": 5124, + "time_per_iteration": 2.592540740966797 + }, + { + "auxiliary_loss_clip": 0.01171648, + "auxiliary_loss_mlp": 0.01134182, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00095344, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 1.8973693490636114, + "language_loss": 0.79483318, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.81789154, + "num_input_tokens_seen": 110052930, + "step": 5125, + "time_per_iteration": 2.5282530784606934 + }, + { + "auxiliary_loss_clip": 0.01156503, + "auxiliary_loss_mlp": 0.01134183, + "balance_loss_clip": 1.00191247, + "balance_loss_mlp": 1.00066829, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 3.0193657164946406, + "language_loss": 0.64545977, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66836667, + "num_input_tokens_seen": 110071765, + "step": 5126, + "time_per_iteration": 2.5668928623199463 + }, + { + "auxiliary_loss_clip": 0.01106709, + "auxiliary_loss_mlp": 0.01132512, + "balance_loss_clip": 1.00157785, + "balance_loss_mlp": 1.00071406, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5288115163732265, + "language_loss": 0.86259598, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88498819, + "num_input_tokens_seen": 110092660, + "step": 5127, + "time_per_iteration": 2.6808667182922363 + }, + { + "auxiliary_loss_clip": 0.01111566, + "auxiliary_loss_mlp": 0.01134275, + "balance_loss_clip": 1.00168443, + "balance_loss_mlp": 1.00066507, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.5271985415145735, + "language_loss": 0.68438721, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70684564, + "num_input_tokens_seen": 110114960, + "step": 5128, + "time_per_iteration": 2.7429656982421875 + }, + { + "auxiliary_loss_clip": 0.01155184, + "auxiliary_loss_mlp": 0.00747913, + "balance_loss_clip": 1.00191116, + "balance_loss_mlp": 1.00010109, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.952021337007348, + "language_loss": 0.71236736, + "learning_rate": 3.240705750931993e-06, + "loss": 0.73139834, + "num_input_tokens_seen": 110135750, + "step": 5129, + "time_per_iteration": 2.614699363708496 + }, + { + "auxiliary_loss_clip": 0.01117965, + "auxiliary_loss_mlp": 0.01116739, + "balance_loss_clip": 1.00129807, + "balance_loss_mlp": 1.00000882, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8368429937942256, + "language_loss": 0.59361339, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61596036, + "num_input_tokens_seen": 110189480, + "step": 5130, + "time_per_iteration": 3.1913106441497803 + }, + { + "auxiliary_loss_clip": 0.0113965, + "auxiliary_loss_mlp": 0.01134769, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.0007782, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.3622846793674164, + "language_loss": 0.72491199, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.74765617, + "num_input_tokens_seen": 110206445, + "step": 5131, + "time_per_iteration": 2.619305372238159 + }, + { + "auxiliary_loss_clip": 0.01123114, + "auxiliary_loss_mlp": 0.0113428, + "balance_loss_clip": 1.00178075, + "balance_loss_mlp": 1.00067067, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.4839233609754927, + "language_loss": 0.70830071, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73087466, + "num_input_tokens_seen": 110226845, + "step": 5132, + "time_per_iteration": 2.654238700866699 + }, + { + "auxiliary_loss_clip": 0.011716, + "auxiliary_loss_mlp": 0.00747965, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00016499, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7646305486786422, + "language_loss": 0.89820063, + "learning_rate": 3.239483519913136e-06, + "loss": 0.91739625, + "num_input_tokens_seen": 110244095, + "step": 5133, + "time_per_iteration": 2.512763261795044 + }, + { + "auxiliary_loss_clip": 0.01140103, + "auxiliary_loss_mlp": 0.01134413, + "balance_loss_clip": 1.00183141, + "balance_loss_mlp": 1.00070786, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 2.0738423398280728, + "language_loss": 0.67183566, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69458079, + "num_input_tokens_seen": 110264240, + "step": 5134, + "time_per_iteration": 2.723205089569092 + }, + { + "auxiliary_loss_clip": 0.01155679, + "auxiliary_loss_mlp": 0.01135018, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00074065, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.2895617777117683, + "language_loss": 0.82817769, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85108471, + "num_input_tokens_seen": 110282450, + "step": 5135, + "time_per_iteration": 2.5206425189971924 + }, + { + "auxiliary_loss_clip": 0.01119517, + "auxiliary_loss_mlp": 0.01117468, + "balance_loss_clip": 1.00139916, + "balance_loss_mlp": 0.99997538, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7021312221030432, + "language_loss": 0.55334228, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57571214, + "num_input_tokens_seen": 110343715, + "step": 5136, + "time_per_iteration": 3.2394003868103027 + }, + { + "auxiliary_loss_clip": 0.01138816, + "auxiliary_loss_mlp": 0.00747989, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.00017142, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 2.280214679241994, + "language_loss": 0.75967348, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.77854156, + "num_input_tokens_seen": 110368430, + "step": 5137, + "time_per_iteration": 3.0212390422821045 + }, + { + "auxiliary_loss_clip": 0.01121763, + "auxiliary_loss_mlp": 0.01133182, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.00062084, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 1.8741042323969717, + "language_loss": 0.79589403, + "learning_rate": 3.237954673696424e-06, + "loss": 0.81844354, + "num_input_tokens_seen": 110386735, + "step": 5138, + "time_per_iteration": 2.660506010055542 + }, + { + "auxiliary_loss_clip": 0.01109802, + "auxiliary_loss_mlp": 0.01134713, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00081706, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.4756835078001318, + "language_loss": 0.81054378, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83298892, + "num_input_tokens_seen": 110406820, + "step": 5139, + "time_per_iteration": 2.8074264526367188 + }, + { + "auxiliary_loss_clip": 0.01155315, + "auxiliary_loss_mlp": 0.01134952, + "balance_loss_clip": 1.00174165, + "balance_loss_mlp": 1.00067461, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.0270296085748973, + "language_loss": 0.7709074, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79381007, + "num_input_tokens_seen": 110424225, + "step": 5140, + "time_per_iteration": 2.612678289413452 + }, + { + "auxiliary_loss_clip": 0.01122513, + "auxiliary_loss_mlp": 0.01133256, + "balance_loss_clip": 1.00173545, + "balance_loss_mlp": 1.00088608, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 4.40865509172983, + "language_loss": 0.7825827, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80514038, + "num_input_tokens_seen": 110443310, + "step": 5141, + "time_per_iteration": 2.6537866592407227 + }, + { + "auxiliary_loss_clip": 0.01144271, + "auxiliary_loss_mlp": 0.01134543, + "balance_loss_clip": 1.00186086, + "balance_loss_mlp": 1.00074255, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.0543737335889367, + "language_loss": 0.86697447, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.88976264, + "num_input_tokens_seen": 110460215, + "step": 5142, + "time_per_iteration": 2.6082746982574463 + }, + { + "auxiliary_loss_clip": 0.01155051, + "auxiliary_loss_mlp": 0.01134393, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00078297, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.7860070945027515, + "language_loss": 0.78767675, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81057119, + "num_input_tokens_seen": 110479385, + "step": 5143, + "time_per_iteration": 2.5698070526123047 + }, + { + "auxiliary_loss_clip": 0.01121921, + "auxiliary_loss_mlp": 0.01134216, + "balance_loss_clip": 1.00165045, + "balance_loss_mlp": 1.00070167, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.8718348901163504, + "language_loss": 0.71859491, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74115622, + "num_input_tokens_seen": 110499885, + "step": 5144, + "time_per_iteration": 2.69073748588562 + }, + { + "auxiliary_loss_clip": 0.01156141, + "auxiliary_loss_mlp": 0.01134158, + "balance_loss_clip": 1.00193048, + "balance_loss_mlp": 1.00083423, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.9150000361581252, + "language_loss": 0.73472852, + "learning_rate": 3.235812317696702e-06, + "loss": 0.75763154, + "num_input_tokens_seen": 110519690, + "step": 5145, + "time_per_iteration": 2.624584436416626 + }, + { + "auxiliary_loss_clip": 0.01145679, + "auxiliary_loss_mlp": 0.01133926, + "balance_loss_clip": 1.0018934, + "balance_loss_mlp": 1.00079322, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.7073501027680593, + "language_loss": 0.76373881, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78653485, + "num_input_tokens_seen": 110540520, + "step": 5146, + "time_per_iteration": 2.6189277172088623 + }, + { + "auxiliary_loss_clip": 0.01139785, + "auxiliary_loss_mlp": 0.01133969, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00064492, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 1.817854980740521, + "language_loss": 0.66700536, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68974286, + "num_input_tokens_seen": 110557950, + "step": 5147, + "time_per_iteration": 2.623426675796509 + }, + { + "auxiliary_loss_clip": 0.01155348, + "auxiliary_loss_mlp": 0.01134561, + "balance_loss_clip": 1.00200498, + "balance_loss_mlp": 1.00095105, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 1.6105912458172977, + "language_loss": 0.74203551, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.76493466, + "num_input_tokens_seen": 110578215, + "step": 5148, + "time_per_iteration": 2.660200595855713 + }, + { + "auxiliary_loss_clip": 0.01155171, + "auxiliary_loss_mlp": 0.01134685, + "balance_loss_clip": 1.00184083, + "balance_loss_mlp": 1.00088441, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.21522782670907, + "language_loss": 0.72664678, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74954534, + "num_input_tokens_seen": 110592990, + "step": 5149, + "time_per_iteration": 2.5962347984313965 + }, + { + "auxiliary_loss_clip": 0.01107366, + "auxiliary_loss_mlp": 0.0113473, + "balance_loss_clip": 1.00161481, + "balance_loss_mlp": 1.00083423, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 1.7439873863186837, + "language_loss": 0.8406918, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.86311281, + "num_input_tokens_seen": 110612130, + "step": 5150, + "time_per_iteration": 2.7129523754119873 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01134653, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 1.00066197, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 21.787341637608517, + "language_loss": 0.78343642, + "learning_rate": 3.233974184780424e-06, + "loss": 0.80586398, + "num_input_tokens_seen": 110632045, + "step": 5151, + "time_per_iteration": 2.7879834175109863 + }, + { + "auxiliary_loss_clip": 0.01155033, + "auxiliary_loss_mlp": 0.01134016, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00069213, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 1.836329441013891, + "language_loss": 0.6717205, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69461095, + "num_input_tokens_seen": 110649340, + "step": 5152, + "time_per_iteration": 3.976808547973633 + }, + { + "auxiliary_loss_clip": 0.0108983, + "auxiliary_loss_mlp": 0.01134506, + "balance_loss_clip": 1.00158358, + "balance_loss_mlp": 1.00089586, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 2.3739085950037446, + "language_loss": 0.82654977, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84879309, + "num_input_tokens_seen": 110668450, + "step": 5153, + "time_per_iteration": 2.7398416996002197 + }, + { + "auxiliary_loss_clip": 0.01139598, + "auxiliary_loss_mlp": 0.00747928, + "balance_loss_clip": 1.00171888, + "balance_loss_mlp": 1.00015712, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.654172043705569, + "language_loss": 0.74033105, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.7592063, + "num_input_tokens_seen": 110689410, + "step": 5154, + "time_per_iteration": 2.6134443283081055 + }, + { + "auxiliary_loss_clip": 0.01156543, + "auxiliary_loss_mlp": 0.01134353, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00083852, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 1.9503058093184658, + "language_loss": 0.75891292, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78182191, + "num_input_tokens_seen": 110707350, + "step": 5155, + "time_per_iteration": 3.9562697410583496 + }, + { + "auxiliary_loss_clip": 0.01140785, + "auxiliary_loss_mlp": 0.01134125, + "balance_loss_clip": 1.00182521, + "balance_loss_mlp": 1.00080132, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.7202211784670112, + "language_loss": 0.78922606, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81197518, + "num_input_tokens_seen": 110724910, + "step": 5156, + "time_per_iteration": 3.9697067737579346 + }, + { + "auxiliary_loss_clip": 0.01155613, + "auxiliary_loss_mlp": 0.01135201, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.0008285, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 2.0112234677157748, + "language_loss": 0.74629694, + "learning_rate": 3.23213436733704e-06, + "loss": 0.76920509, + "num_input_tokens_seen": 110744010, + "step": 5157, + "time_per_iteration": 2.5805916786193848 + }, + { + "auxiliary_loss_clip": 0.01125437, + "auxiliary_loss_mlp": 0.01133527, + "balance_loss_clip": 1.00175655, + "balance_loss_mlp": 1.00087118, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 2.1180255265648507, + "language_loss": 0.68716586, + "learning_rate": 3.231827567499327e-06, + "loss": 0.70975548, + "num_input_tokens_seen": 110765835, + "step": 5158, + "time_per_iteration": 2.6920695304870605 + }, + { + "auxiliary_loss_clip": 0.01105866, + "auxiliary_loss_mlp": 0.01133328, + "balance_loss_clip": 1.0015502, + "balance_loss_mlp": 1.0008626, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 2.1039789998177456, + "language_loss": 0.8425898, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.86498177, + "num_input_tokens_seen": 110784655, + "step": 5159, + "time_per_iteration": 2.661914587020874 + }, + { + "auxiliary_loss_clip": 0.01138369, + "auxiliary_loss_mlp": 0.01134453, + "balance_loss_clip": 1.00178587, + "balance_loss_mlp": 1.00074768, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.8592778150953437, + "language_loss": 0.85225391, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87498212, + "num_input_tokens_seen": 110802545, + "step": 5160, + "time_per_iteration": 2.571962833404541 + }, + { + "auxiliary_loss_clip": 0.01155911, + "auxiliary_loss_mlp": 0.0113391, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00077701, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 2.1407250421520017, + "language_loss": 0.75538862, + "learning_rate": 3.230906887766584e-06, + "loss": 0.77828681, + "num_input_tokens_seen": 110820265, + "step": 5161, + "time_per_iteration": 2.580899477005005 + }, + { + "auxiliary_loss_clip": 0.01154989, + "auxiliary_loss_mlp": 0.01134502, + "balance_loss_clip": 1.00177217, + "balance_loss_mlp": 1.00079656, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 2.265920414963336, + "language_loss": 0.81484497, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83773983, + "num_input_tokens_seen": 110836195, + "step": 5162, + "time_per_iteration": 2.5554959774017334 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01133467, + "balance_loss_clip": 1.00174892, + "balance_loss_mlp": 1.00081134, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.6504020978150455, + "language_loss": 0.82707196, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.84995127, + "num_input_tokens_seen": 110856420, + "step": 5163, + "time_per_iteration": 2.554607391357422 + }, + { + "auxiliary_loss_clip": 0.01171772, + "auxiliary_loss_mlp": 0.01134276, + "balance_loss_clip": 1.00201225, + "balance_loss_mlp": 1.00085711, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 3.198773364973101, + "language_loss": 0.76211435, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78517485, + "num_input_tokens_seen": 110876650, + "step": 5164, + "time_per_iteration": 2.539381265640259 + }, + { + "auxiliary_loss_clip": 0.01114085, + "auxiliary_loss_mlp": 0.01134253, + "balance_loss_clip": 1.00214672, + "balance_loss_mlp": 1.00083435, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 2.1394089223822395, + "language_loss": 0.74951267, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.77199602, + "num_input_tokens_seen": 110894445, + "step": 5165, + "time_per_iteration": 2.6566741466522217 + }, + { + "auxiliary_loss_clip": 0.01123856, + "auxiliary_loss_mlp": 0.01133527, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00068045, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.8246148998821339, + "language_loss": 0.76284385, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78541762, + "num_input_tokens_seen": 110912855, + "step": 5166, + "time_per_iteration": 2.6188857555389404 + }, + { + "auxiliary_loss_clip": 0.01139362, + "auxiliary_loss_mlp": 0.0113435, + "balance_loss_clip": 1.00188196, + "balance_loss_mlp": 1.00074065, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.3203219065421963, + "language_loss": 0.73065054, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75338763, + "num_input_tokens_seen": 110928025, + "step": 5167, + "time_per_iteration": 2.5833122730255127 + }, + { + "auxiliary_loss_clip": 0.01102069, + "auxiliary_loss_mlp": 0.01116273, + "balance_loss_clip": 1.00112402, + "balance_loss_mlp": 1.00030577, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.7118515661394418, + "language_loss": 0.53029203, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55247545, + "num_input_tokens_seen": 110992215, + "step": 5168, + "time_per_iteration": 3.359029531478882 + }, + { + "auxiliary_loss_clip": 0.01155166, + "auxiliary_loss_mlp": 0.0113424, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.0007261, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 2.2409308355112727, + "language_loss": 0.78601229, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80890632, + "num_input_tokens_seen": 111010400, + "step": 5169, + "time_per_iteration": 2.8137636184692383 + }, + { + "auxiliary_loss_clip": 0.0113967, + "auxiliary_loss_mlp": 0.01133743, + "balance_loss_clip": 1.00176704, + "balance_loss_mlp": 1.00080097, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 2.213620177762995, + "language_loss": 0.63920206, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66193616, + "num_input_tokens_seen": 111033960, + "step": 5170, + "time_per_iteration": 2.70038104057312 + }, + { + "auxiliary_loss_clip": 0.01124102, + "auxiliary_loss_mlp": 0.00747999, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00013852, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 11.602045481701412, + "language_loss": 0.77249908, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79122007, + "num_input_tokens_seen": 111053265, + "step": 5171, + "time_per_iteration": 2.7186481952667236 + }, + { + "auxiliary_loss_clip": 0.01113063, + "auxiliary_loss_mlp": 0.01133973, + "balance_loss_clip": 1.00200725, + "balance_loss_mlp": 1.00093567, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 3.900655814884326, + "language_loss": 0.83489764, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.85736799, + "num_input_tokens_seen": 111071130, + "step": 5172, + "time_per_iteration": 2.6854441165924072 + }, + { + "auxiliary_loss_clip": 0.01107888, + "auxiliary_loss_mlp": 0.01134039, + "balance_loss_clip": 1.00171137, + "balance_loss_mlp": 1.00081038, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 3.3285637235683234, + "language_loss": 0.84096545, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86338472, + "num_input_tokens_seen": 111089560, + "step": 5173, + "time_per_iteration": 2.66247296333313 + }, + { + "auxiliary_loss_clip": 0.01171419, + "auxiliary_loss_mlp": 0.01132941, + "balance_loss_clip": 1.00190043, + "balance_loss_mlp": 1.00066662, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.8464456847352497, + "language_loss": 0.83524275, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85828638, + "num_input_tokens_seen": 111109960, + "step": 5174, + "time_per_iteration": 2.5584003925323486 + }, + { + "auxiliary_loss_clip": 0.01139444, + "auxiliary_loss_mlp": 0.01133845, + "balance_loss_clip": 1.00183022, + "balance_loss_mlp": 1.00090241, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.0274970026739214, + "language_loss": 0.84888262, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87161547, + "num_input_tokens_seen": 111127960, + "step": 5175, + "time_per_iteration": 2.56512451171875 + }, + { + "auxiliary_loss_clip": 0.01093388, + "auxiliary_loss_mlp": 0.01133958, + "balance_loss_clip": 1.00175214, + "balance_loss_mlp": 1.00072944, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.5836205149186695, + "language_loss": 0.82991993, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85219342, + "num_input_tokens_seen": 111146730, + "step": 5176, + "time_per_iteration": 2.726079225540161 + }, + { + "auxiliary_loss_clip": 0.0115606, + "auxiliary_loss_mlp": 0.01132859, + "balance_loss_clip": 1.00173652, + "balance_loss_mlp": 1.00058484, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 2.089191204112499, + "language_loss": 0.8083356, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.8312248, + "num_input_tokens_seen": 111166295, + "step": 5177, + "time_per_iteration": 2.545534372329712 + }, + { + "auxiliary_loss_clip": 0.01156361, + "auxiliary_loss_mlp": 0.00747886, + "balance_loss_clip": 1.00196123, + "balance_loss_mlp": 1.00011873, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 2.658789898284892, + "language_loss": 0.80664074, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82568324, + "num_input_tokens_seen": 111185665, + "step": 5178, + "time_per_iteration": 2.5735411643981934 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.01134159, + "balance_loss_clip": 1.00187838, + "balance_loss_mlp": 1.00064468, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.7958652227151632, + "language_loss": 0.81125903, + "learning_rate": 3.225373998592471e-06, + "loss": 0.8339895, + "num_input_tokens_seen": 111201615, + "step": 5179, + "time_per_iteration": 2.5877742767333984 + }, + { + "auxiliary_loss_clip": 0.01122568, + "auxiliary_loss_mlp": 0.01133792, + "balance_loss_clip": 1.00172269, + "balance_loss_mlp": 1.00113618, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.6577182803693657, + "language_loss": 0.78378749, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.80635113, + "num_input_tokens_seen": 111220515, + "step": 5180, + "time_per_iteration": 2.6400082111358643 + }, + { + "auxiliary_loss_clip": 0.01108971, + "auxiliary_loss_mlp": 0.01132925, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.00065041, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 2.254093958250164, + "language_loss": 0.83116615, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85358512, + "num_input_tokens_seen": 111240395, + "step": 5181, + "time_per_iteration": 2.72629976272583 + }, + { + "auxiliary_loss_clip": 0.01124452, + "auxiliary_loss_mlp": 0.01133258, + "balance_loss_clip": 1.00175464, + "balance_loss_mlp": 1.0007925, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 2.239843167738143, + "language_loss": 0.7430315, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76560855, + "num_input_tokens_seen": 111261100, + "step": 5182, + "time_per_iteration": 2.70176100730896 + }, + { + "auxiliary_loss_clip": 0.01106675, + "auxiliary_loss_mlp": 0.00747891, + "balance_loss_clip": 1.00175047, + "balance_loss_mlp": 1.00016403, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.8778982429777886, + "language_loss": 0.70163274, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72017843, + "num_input_tokens_seen": 111281320, + "step": 5183, + "time_per_iteration": 2.725311756134033 + }, + { + "auxiliary_loss_clip": 0.01120287, + "auxiliary_loss_mlp": 0.0111589, + "balance_loss_clip": 1.00130904, + "balance_loss_mlp": 0.99992293, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.949484465972351, + "language_loss": 0.59682643, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61918819, + "num_input_tokens_seen": 111341405, + "step": 5184, + "time_per_iteration": 3.25763201713562 + }, + { + "auxiliary_loss_clip": 0.01139852, + "auxiliary_loss_mlp": 0.01133442, + "balance_loss_clip": 1.00179017, + "balance_loss_mlp": 1.00078535, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.252812144803221, + "language_loss": 0.69653058, + "learning_rate": 3.223526353268311e-06, + "loss": 0.71926349, + "num_input_tokens_seen": 111358975, + "step": 5185, + "time_per_iteration": 2.594252109527588 + }, + { + "auxiliary_loss_clip": 0.01125175, + "auxiliary_loss_mlp": 0.01133257, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00088727, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 8.33546240230512, + "language_loss": 0.63484979, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.65743411, + "num_input_tokens_seen": 111375845, + "step": 5186, + "time_per_iteration": 2.6095192432403564 + }, + { + "auxiliary_loss_clip": 0.01139439, + "auxiliary_loss_mlp": 0.0113342, + "balance_loss_clip": 1.00180519, + "balance_loss_mlp": 1.00085902, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 6.52477267613784, + "language_loss": 0.85949516, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88222373, + "num_input_tokens_seen": 111394150, + "step": 5187, + "time_per_iteration": 2.6288983821868896 + }, + { + "auxiliary_loss_clip": 0.01171578, + "auxiliary_loss_mlp": 0.00747901, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00017142, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.5339946165940062, + "language_loss": 0.62943864, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.64863336, + "num_input_tokens_seen": 111418355, + "step": 5188, + "time_per_iteration": 2.661755323410034 + }, + { + "auxiliary_loss_clip": 0.0112355, + "auxiliary_loss_mlp": 0.01133928, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00079513, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.2187986122363084, + "language_loss": 0.82920545, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85178018, + "num_input_tokens_seen": 111435445, + "step": 5189, + "time_per_iteration": 2.636549711227417 + }, + { + "auxiliary_loss_clip": 0.01043813, + "auxiliary_loss_mlp": 0.01132545, + "balance_loss_clip": 1.00149536, + "balance_loss_mlp": 1.00065136, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 2.608774342849776, + "language_loss": 0.79239118, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81415474, + "num_input_tokens_seen": 111453430, + "step": 5190, + "time_per_iteration": 4.453144311904907 + }, + { + "auxiliary_loss_clip": 0.01110554, + "auxiliary_loss_mlp": 0.01133212, + "balance_loss_clip": 1.00185394, + "balance_loss_mlp": 1.00074661, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.549967382571601, + "language_loss": 0.74999529, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77243292, + "num_input_tokens_seen": 111475325, + "step": 5191, + "time_per_iteration": 3.3460559844970703 + }, + { + "auxiliary_loss_clip": 0.011511, + "auxiliary_loss_mlp": 0.00747177, + "balance_loss_clip": 1.00154638, + "balance_loss_mlp": 1.00017059, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8410584311007492, + "language_loss": 0.63946307, + "learning_rate": 3.221368656205247e-06, + "loss": 0.65844584, + "num_input_tokens_seen": 111533960, + "step": 5192, + "time_per_iteration": 4.88750696182251 + }, + { + "auxiliary_loss_clip": 0.01154851, + "auxiliary_loss_mlp": 0.0113359, + "balance_loss_clip": 1.00176287, + "balance_loss_mlp": 1.00064826, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 2.03369452452287, + "language_loss": 0.8016991, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82458347, + "num_input_tokens_seen": 111554055, + "step": 5193, + "time_per_iteration": 4.488238096237183 + }, + { + "auxiliary_loss_clip": 0.01139434, + "auxiliary_loss_mlp": 0.01133783, + "balance_loss_clip": 1.00172043, + "balance_loss_mlp": 1.00064957, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 2.1269883831300853, + "language_loss": 0.71853209, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74126422, + "num_input_tokens_seen": 111574305, + "step": 5194, + "time_per_iteration": 4.136533260345459 + }, + { + "auxiliary_loss_clip": 0.01171544, + "auxiliary_loss_mlp": 0.01133197, + "balance_loss_clip": 1.00193834, + "balance_loss_mlp": 1.00082707, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.6629516098676342, + "language_loss": 0.7643702, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78741759, + "num_input_tokens_seen": 111595680, + "step": 5195, + "time_per_iteration": 2.5518951416015625 + }, + { + "auxiliary_loss_clip": 0.01171662, + "auxiliary_loss_mlp": 0.01133681, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00073862, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.4622489780285879, + "language_loss": 0.77767515, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80072856, + "num_input_tokens_seen": 111618135, + "step": 5196, + "time_per_iteration": 2.5611767768859863 + }, + { + "auxiliary_loss_clip": 0.01135399, + "auxiliary_loss_mlp": 0.00747229, + "balance_loss_clip": 1.00140119, + "balance_loss_mlp": 1.00028312, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7802222544327688, + "language_loss": 0.54873854, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56756479, + "num_input_tokens_seen": 111682220, + "step": 5197, + "time_per_iteration": 3.220964193344116 + }, + { + "auxiliary_loss_clip": 0.01171367, + "auxiliary_loss_mlp": 0.0113263, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00073695, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.8347056527825927, + "language_loss": 0.66166735, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68470734, + "num_input_tokens_seen": 111700815, + "step": 5198, + "time_per_iteration": 2.518357038497925 + }, + { + "auxiliary_loss_clip": 0.01124412, + "auxiliary_loss_mlp": 0.01134031, + "balance_loss_clip": 1.00190365, + "balance_loss_mlp": 1.00070739, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 3.2510587500861874, + "language_loss": 0.69495457, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71753901, + "num_input_tokens_seen": 111718195, + "step": 5199, + "time_per_iteration": 2.603287696838379 + }, + { + "auxiliary_loss_clip": 0.01156306, + "auxiliary_loss_mlp": 0.01133005, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.00082612, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.8409373226588173, + "language_loss": 0.78861177, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81150484, + "num_input_tokens_seen": 111734440, + "step": 5200, + "time_per_iteration": 2.536933183670044 + }, + { + "auxiliary_loss_clip": 0.01155561, + "auxiliary_loss_mlp": 0.0113242, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00071812, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.158053286071484, + "language_loss": 0.82939553, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85227537, + "num_input_tokens_seen": 111751960, + "step": 5201, + "time_per_iteration": 2.5608842372894287 + }, + { + "auxiliary_loss_clip": 0.01171497, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00082946, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.1932473023351324, + "language_loss": 0.68642366, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.70947069, + "num_input_tokens_seen": 111769585, + "step": 5202, + "time_per_iteration": 2.502371072769165 + }, + { + "auxiliary_loss_clip": 0.01171529, + "auxiliary_loss_mlp": 0.01133013, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00073791, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.77505681052282, + "language_loss": 0.83690077, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.85994619, + "num_input_tokens_seen": 111787880, + "step": 5203, + "time_per_iteration": 2.4975693225860596 + }, + { + "auxiliary_loss_clip": 0.01106917, + "auxiliary_loss_mlp": 0.01134006, + "balance_loss_clip": 1.00186384, + "balance_loss_mlp": 1.00068259, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.075333250012248, + "language_loss": 0.6071254, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62953466, + "num_input_tokens_seen": 111805950, + "step": 5204, + "time_per_iteration": 2.7611773014068604 + }, + { + "auxiliary_loss_clip": 0.0113966, + "auxiliary_loss_mlp": 0.01132764, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00068033, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.7964616229329158, + "language_loss": 0.66035259, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68307686, + "num_input_tokens_seen": 111826135, + "step": 5205, + "time_per_iteration": 2.628042459487915 + }, + { + "auxiliary_loss_clip": 0.01156349, + "auxiliary_loss_mlp": 0.01132832, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00093865, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.8635384573019467, + "language_loss": 0.76659662, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78948843, + "num_input_tokens_seen": 111844700, + "step": 5206, + "time_per_iteration": 2.6089260578155518 + }, + { + "auxiliary_loss_clip": 0.01171383, + "auxiliary_loss_mlp": 0.01132742, + "balance_loss_clip": 1.00191367, + "balance_loss_mlp": 1.00075316, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 2.789068321028697, + "language_loss": 0.83029896, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85334021, + "num_input_tokens_seen": 111861585, + "step": 5207, + "time_per_iteration": 2.520988702774048 + }, + { + "auxiliary_loss_clip": 0.01154665, + "auxiliary_loss_mlp": 0.0113265, + "balance_loss_clip": 1.00176525, + "balance_loss_mlp": 1.00075674, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.4980415335377626, + "language_loss": 0.7139467, + "learning_rate": 3.216428261810999e-06, + "loss": 0.7368198, + "num_input_tokens_seen": 111882950, + "step": 5208, + "time_per_iteration": 2.6263320446014404 + }, + { + "auxiliary_loss_clip": 0.01139242, + "auxiliary_loss_mlp": 0.01133207, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00083685, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 2.2359833658941572, + "language_loss": 0.74516779, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.7678923, + "num_input_tokens_seen": 111901640, + "step": 5209, + "time_per_iteration": 2.603424310684204 + }, + { + "auxiliary_loss_clip": 0.01171377, + "auxiliary_loss_mlp": 0.01132576, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00068283, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.731126865030386, + "language_loss": 0.77302408, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79606366, + "num_input_tokens_seen": 111919615, + "step": 5210, + "time_per_iteration": 2.5400519371032715 + }, + { + "auxiliary_loss_clip": 0.01155595, + "auxiliary_loss_mlp": 0.0113233, + "balance_loss_clip": 1.00182164, + "balance_loss_mlp": 1.00081897, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.8089847314162708, + "language_loss": 0.79600847, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81888771, + "num_input_tokens_seen": 111938485, + "step": 5211, + "time_per_iteration": 2.5426573753356934 + }, + { + "auxiliary_loss_clip": 0.01155758, + "auxiliary_loss_mlp": 0.01131712, + "balance_loss_clip": 1.00184095, + "balance_loss_mlp": 1.0006777, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.7266798575879867, + "language_loss": 0.79307455, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81594926, + "num_input_tokens_seen": 111956425, + "step": 5212, + "time_per_iteration": 2.556978940963745 + }, + { + "auxiliary_loss_clip": 0.01145717, + "auxiliary_loss_mlp": 0.01133455, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00098991, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.428195837862687, + "language_loss": 0.70926493, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73205662, + "num_input_tokens_seen": 111975915, + "step": 5213, + "time_per_iteration": 2.6222281455993652 + }, + { + "auxiliary_loss_clip": 0.01154952, + "auxiliary_loss_mlp": 0.01132802, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00071788, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 2.1599988752469104, + "language_loss": 0.77603924, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79891682, + "num_input_tokens_seen": 111995055, + "step": 5214, + "time_per_iteration": 2.552027463912964 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01132281, + "balance_loss_clip": 1.00179422, + "balance_loss_mlp": 1.00067353, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.5767671680957782, + "language_loss": 0.82782942, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.85022014, + "num_input_tokens_seen": 112015830, + "step": 5215, + "time_per_iteration": 2.7229831218719482 + }, + { + "auxiliary_loss_clip": 0.01155152, + "auxiliary_loss_mlp": 0.01132886, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.0007062, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.1103197203425985, + "language_loss": 0.79576278, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81864309, + "num_input_tokens_seen": 112035065, + "step": 5216, + "time_per_iteration": 2.5513851642608643 + }, + { + "auxiliary_loss_clip": 0.01140103, + "auxiliary_loss_mlp": 0.01133571, + "balance_loss_clip": 1.00181639, + "balance_loss_mlp": 1.00062895, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 1.5941208127193494, + "language_loss": 0.68609935, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70883608, + "num_input_tokens_seen": 112058405, + "step": 5217, + "time_per_iteration": 2.650118350982666 + }, + { + "auxiliary_loss_clip": 0.01140511, + "auxiliary_loss_mlp": 0.01132899, + "balance_loss_clip": 1.00186789, + "balance_loss_mlp": 1.00072026, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.6518147291827046, + "language_loss": 0.80609322, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82882726, + "num_input_tokens_seen": 112076420, + "step": 5218, + "time_per_iteration": 2.5925135612487793 + }, + { + "auxiliary_loss_clip": 0.01171378, + "auxiliary_loss_mlp": 0.01133517, + "balance_loss_clip": 1.00191104, + "balance_loss_mlp": 1.00076544, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.3151201652330315, + "language_loss": 0.69152242, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.71457136, + "num_input_tokens_seen": 112090775, + "step": 5219, + "time_per_iteration": 2.5044188499450684 + }, + { + "auxiliary_loss_clip": 0.01139193, + "auxiliary_loss_mlp": 0.01133316, + "balance_loss_clip": 1.00189412, + "balance_loss_mlp": 1.00075555, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 4.766669708101262, + "language_loss": 0.79597282, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81869793, + "num_input_tokens_seen": 112110980, + "step": 5220, + "time_per_iteration": 2.596318006515503 + }, + { + "auxiliary_loss_clip": 0.01156369, + "auxiliary_loss_mlp": 0.01133209, + "balance_loss_clip": 1.00193512, + "balance_loss_mlp": 1.00102997, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.7048099384310245, + "language_loss": 0.7298587, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75275451, + "num_input_tokens_seen": 112129020, + "step": 5221, + "time_per_iteration": 2.533679485321045 + }, + { + "auxiliary_loss_clip": 0.01139631, + "auxiliary_loss_mlp": 0.01132645, + "balance_loss_clip": 1.00176764, + "balance_loss_mlp": 1.00075161, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.7588956549395185, + "language_loss": 0.81922328, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84194613, + "num_input_tokens_seen": 112147865, + "step": 5222, + "time_per_iteration": 2.577420711517334 + }, + { + "auxiliary_loss_clip": 0.01155859, + "auxiliary_loss_mlp": 0.01133383, + "balance_loss_clip": 1.00189602, + "balance_loss_mlp": 1.00091743, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.8124006194928095, + "language_loss": 0.7059626, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72885501, + "num_input_tokens_seen": 112166745, + "step": 5223, + "time_per_iteration": 2.56368350982666 + }, + { + "auxiliary_loss_clip": 0.01156067, + "auxiliary_loss_mlp": 0.00747997, + "balance_loss_clip": 1.00193834, + "balance_loss_mlp": 1.00027478, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.6390014865364728, + "language_loss": 0.80108857, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82012922, + "num_input_tokens_seen": 112185895, + "step": 5224, + "time_per_iteration": 2.5617754459381104 + }, + { + "auxiliary_loss_clip": 0.01154919, + "auxiliary_loss_mlp": 0.01133233, + "balance_loss_clip": 1.00191009, + "balance_loss_mlp": 1.0008626, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 2.4786004624346902, + "language_loss": 0.57557082, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.59845233, + "num_input_tokens_seen": 112204465, + "step": 5225, + "time_per_iteration": 2.666900634765625 + }, + { + "auxiliary_loss_clip": 0.01111709, + "auxiliary_loss_mlp": 0.01132688, + "balance_loss_clip": 1.00181508, + "balance_loss_mlp": 1.00079441, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.9175281177051242, + "language_loss": 0.81713653, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.83958048, + "num_input_tokens_seen": 112221635, + "step": 5226, + "time_per_iteration": 2.6512694358825684 + }, + { + "auxiliary_loss_clip": 0.01155977, + "auxiliary_loss_mlp": 0.01133568, + "balance_loss_clip": 1.00189734, + "balance_loss_mlp": 1.00091195, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 1.7016477773049494, + "language_loss": 0.73375863, + "learning_rate": 3.210546210126141e-06, + "loss": 0.75665408, + "num_input_tokens_seen": 112241240, + "step": 5227, + "time_per_iteration": 2.5562591552734375 + }, + { + "auxiliary_loss_clip": 0.01155502, + "auxiliary_loss_mlp": 0.01134156, + "balance_loss_clip": 1.00211787, + "balance_loss_mlp": 1.00073731, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 2.31454201217416, + "language_loss": 0.67853945, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.70143604, + "num_input_tokens_seen": 112262350, + "step": 5228, + "time_per_iteration": 4.019442081451416 + }, + { + "auxiliary_loss_clip": 0.01138029, + "auxiliary_loss_mlp": 0.01132877, + "balance_loss_clip": 1.0018425, + "balance_loss_mlp": 1.00088871, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 2.4400351999897034, + "language_loss": 0.79924965, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82195878, + "num_input_tokens_seen": 112283710, + "step": 5229, + "time_per_iteration": 2.599735975265503 + }, + { + "auxiliary_loss_clip": 0.01137983, + "auxiliary_loss_mlp": 0.01132906, + "balance_loss_clip": 1.00183046, + "balance_loss_mlp": 1.00063181, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.7094929449809941, + "language_loss": 0.69971132, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72242022, + "num_input_tokens_seen": 112304285, + "step": 5230, + "time_per_iteration": 4.03354811668396 + }, + { + "auxiliary_loss_clip": 0.01122827, + "auxiliary_loss_mlp": 0.01133144, + "balance_loss_clip": 1.00152147, + "balance_loss_mlp": 1.00067902, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.8440217629984859, + "language_loss": 0.79594094, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81850064, + "num_input_tokens_seen": 112325110, + "step": 5231, + "time_per_iteration": 4.162369251251221 + }, + { + "auxiliary_loss_clip": 0.01138791, + "auxiliary_loss_mlp": 0.01133376, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00081563, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 3.483742333781412, + "language_loss": 0.84627271, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.86899436, + "num_input_tokens_seen": 112339855, + "step": 5232, + "time_per_iteration": 2.5659947395324707 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.0113273, + "balance_loss_clip": 1.00168133, + "balance_loss_mlp": 1.00093198, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 2.050342079988463, + "language_loss": 0.79827923, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.820683, + "num_input_tokens_seen": 112358480, + "step": 5233, + "time_per_iteration": 2.6628525257110596 + }, + { + "auxiliary_loss_clip": 0.01108021, + "auxiliary_loss_mlp": 0.01132873, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00088394, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.629118997255372, + "language_loss": 0.70631862, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72872752, + "num_input_tokens_seen": 112382350, + "step": 5234, + "time_per_iteration": 2.955052614212036 + }, + { + "auxiliary_loss_clip": 0.01123125, + "auxiliary_loss_mlp": 0.01133498, + "balance_loss_clip": 1.00174665, + "balance_loss_mlp": 1.00074685, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 2.5815693809829106, + "language_loss": 0.72194028, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74450648, + "num_input_tokens_seen": 112400260, + "step": 5235, + "time_per_iteration": 2.6622400283813477 + }, + { + "auxiliary_loss_clip": 0.01156264, + "auxiliary_loss_mlp": 0.01132725, + "balance_loss_clip": 1.00186217, + "balance_loss_mlp": 1.0008316, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 5.142119131970216, + "language_loss": 0.78900838, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.81189823, + "num_input_tokens_seen": 112419400, + "step": 5236, + "time_per_iteration": 2.559232711791992 + }, + { + "auxiliary_loss_clip": 0.01171496, + "auxiliary_loss_mlp": 0.01132866, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00087786, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 3.1116758927061703, + "language_loss": 0.75539267, + "learning_rate": 3.207443732256881e-06, + "loss": 0.7784363, + "num_input_tokens_seen": 112440825, + "step": 5237, + "time_per_iteration": 2.609492778778076 + }, + { + "auxiliary_loss_clip": 0.0117135, + "auxiliary_loss_mlp": 0.01133069, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.00098491, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 1.984030923684963, + "language_loss": 0.79573309, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.81877732, + "num_input_tokens_seen": 112459180, + "step": 5238, + "time_per_iteration": 2.5156729221343994 + }, + { + "auxiliary_loss_clip": 0.01152514, + "auxiliary_loss_mlp": 0.01116216, + "balance_loss_clip": 1.00167823, + "balance_loss_mlp": 1.00024879, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8440718628416335, + "language_loss": 0.67928541, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.70197272, + "num_input_tokens_seen": 112516680, + "step": 5239, + "time_per_iteration": 3.1453874111175537 + }, + { + "auxiliary_loss_clip": 0.01139073, + "auxiliary_loss_mlp": 0.01133718, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00087166, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.426532344123245, + "language_loss": 0.834656, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.85738391, + "num_input_tokens_seen": 112535895, + "step": 5240, + "time_per_iteration": 2.5902888774871826 + }, + { + "auxiliary_loss_clip": 0.01139209, + "auxiliary_loss_mlp": 0.00748045, + "balance_loss_clip": 1.00193799, + "balance_loss_mlp": 1.0003016, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 1.825703163899262, + "language_loss": 0.81298435, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83185697, + "num_input_tokens_seen": 112557490, + "step": 5241, + "time_per_iteration": 2.7012557983398438 + }, + { + "auxiliary_loss_clip": 0.01171537, + "auxiliary_loss_mlp": 0.01132694, + "balance_loss_clip": 1.00216317, + "balance_loss_mlp": 1.00080049, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.7050074494452576, + "language_loss": 0.74033159, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76337385, + "num_input_tokens_seen": 112577075, + "step": 5242, + "time_per_iteration": 2.575700283050537 + }, + { + "auxiliary_loss_clip": 0.01125565, + "auxiliary_loss_mlp": 0.01133327, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00095725, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9360473775414881, + "language_loss": 0.74229223, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.76488113, + "num_input_tokens_seen": 112597620, + "step": 5243, + "time_per_iteration": 2.696190118789673 + }, + { + "auxiliary_loss_clip": 0.01154817, + "auxiliary_loss_mlp": 0.01132956, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00077724, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 2.1088467903699875, + "language_loss": 0.64495444, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66783214, + "num_input_tokens_seen": 112617150, + "step": 5244, + "time_per_iteration": 2.5749285221099854 + }, + { + "auxiliary_loss_clip": 0.01105125, + "auxiliary_loss_mlp": 0.01133228, + "balance_loss_clip": 1.0015986, + "balance_loss_mlp": 1.00076222, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.144293937226027, + "language_loss": 0.91101134, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93339491, + "num_input_tokens_seen": 112631090, + "step": 5245, + "time_per_iteration": 2.6334919929504395 + }, + { + "auxiliary_loss_clip": 0.01154881, + "auxiliary_loss_mlp": 0.01133596, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00093961, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 2.2387514416162597, + "language_loss": 0.75379646, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77668124, + "num_input_tokens_seen": 112651220, + "step": 5246, + "time_per_iteration": 2.596708297729492 + }, + { + "auxiliary_loss_clip": 0.01171397, + "auxiliary_loss_mlp": 0.01133152, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00087714, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.7612047085213582, + "language_loss": 0.61210132, + "learning_rate": 3.204336675750321e-06, + "loss": 0.6351468, + "num_input_tokens_seen": 112671560, + "step": 5247, + "time_per_iteration": 2.6319401264190674 + }, + { + "auxiliary_loss_clip": 0.01154921, + "auxiliary_loss_mlp": 0.01133286, + "balance_loss_clip": 1.00192988, + "balance_loss_mlp": 1.000916, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.137366152057774, + "language_loss": 0.82202035, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.8449024, + "num_input_tokens_seen": 112689790, + "step": 5248, + "time_per_iteration": 2.5756523609161377 + }, + { + "auxiliary_loss_clip": 0.01138824, + "auxiliary_loss_mlp": 0.01133223, + "balance_loss_clip": 1.00178301, + "balance_loss_mlp": 1.00075793, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.8224321366119514, + "language_loss": 0.85077327, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.87349379, + "num_input_tokens_seen": 112708265, + "step": 5249, + "time_per_iteration": 2.608107805252075 + }, + { + "auxiliary_loss_clip": 0.01127313, + "auxiliary_loss_mlp": 0.01133582, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00073552, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.7025342502135283, + "language_loss": 0.85638559, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87899458, + "num_input_tokens_seen": 112727820, + "step": 5250, + "time_per_iteration": 2.663970708847046 + }, + { + "auxiliary_loss_clip": 0.01138082, + "auxiliary_loss_mlp": 0.01133448, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00079143, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 2.491688972784402, + "language_loss": 0.6845597, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70727503, + "num_input_tokens_seen": 112743140, + "step": 5251, + "time_per_iteration": 2.574738025665283 + }, + { + "auxiliary_loss_clip": 0.01171493, + "auxiliary_loss_mlp": 0.01133055, + "balance_loss_clip": 1.00205457, + "balance_loss_mlp": 1.00078022, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 2.6827334759784676, + "language_loss": 0.78671312, + "learning_rate": 3.202781434189246e-06, + "loss": 0.80975866, + "num_input_tokens_seen": 112764705, + "step": 5252, + "time_per_iteration": 2.563777208328247 + }, + { + "auxiliary_loss_clip": 0.01156281, + "auxiliary_loss_mlp": 0.01132689, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.00089073, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.5676634644001515, + "language_loss": 0.7434175, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76630723, + "num_input_tokens_seen": 112785310, + "step": 5253, + "time_per_iteration": 2.5801427364349365 + }, + { + "auxiliary_loss_clip": 0.01144223, + "auxiliary_loss_mlp": 0.01133613, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.0007658, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.7001788851054365, + "language_loss": 0.73336858, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75614691, + "num_input_tokens_seen": 112802905, + "step": 5254, + "time_per_iteration": 2.616304397583008 + }, + { + "auxiliary_loss_clip": 0.01155096, + "auxiliary_loss_mlp": 0.01133347, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00078595, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 2.4623032472430006, + "language_loss": 0.77700299, + "learning_rate": 3.201847741843128e-06, + "loss": 0.79988742, + "num_input_tokens_seen": 112820305, + "step": 5255, + "time_per_iteration": 2.5315370559692383 + }, + { + "auxiliary_loss_clip": 0.01137848, + "auxiliary_loss_mlp": 0.01133135, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00085998, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.6990976944815976, + "language_loss": 0.78018522, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80289507, + "num_input_tokens_seen": 112841185, + "step": 5256, + "time_per_iteration": 2.6275908946990967 + }, + { + "auxiliary_loss_clip": 0.01121587, + "auxiliary_loss_mlp": 0.01131814, + "balance_loss_clip": 1.00172567, + "balance_loss_mlp": 1.00087464, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.5576328489436797, + "language_loss": 0.7136631, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73619711, + "num_input_tokens_seen": 112860570, + "step": 5257, + "time_per_iteration": 2.6346044540405273 + }, + { + "auxiliary_loss_clip": 0.01155258, + "auxiliary_loss_mlp": 0.01133045, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.00086594, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 2.098252211684908, + "language_loss": 0.76349235, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.7863754, + "num_input_tokens_seen": 112877975, + "step": 5258, + "time_per_iteration": 2.5667693614959717 + }, + { + "auxiliary_loss_clip": 0.01139869, + "auxiliary_loss_mlp": 0.01133408, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00094235, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 1.9383283662794164, + "language_loss": 0.72748768, + "learning_rate": 3.200602180731467e-06, + "loss": 0.75022048, + "num_input_tokens_seen": 112896170, + "step": 5259, + "time_per_iteration": 2.60577392578125 + }, + { + "auxiliary_loss_clip": 0.0114009, + "auxiliary_loss_mlp": 0.00747938, + "balance_loss_clip": 1.0020057, + "balance_loss_mlp": 1.00019634, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 3.1150447849466594, + "language_loss": 0.66765916, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68653941, + "num_input_tokens_seen": 112916180, + "step": 5260, + "time_per_iteration": 2.625223398208618 + }, + { + "auxiliary_loss_clip": 0.01155815, + "auxiliary_loss_mlp": 0.01133025, + "balance_loss_clip": 1.00189054, + "balance_loss_mlp": 1.00065553, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 2.388011102546235, + "language_loss": 0.71911263, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.742001, + "num_input_tokens_seen": 112936745, + "step": 5261, + "time_per_iteration": 2.6029574871063232 + }, + { + "auxiliary_loss_clip": 0.01151186, + "auxiliary_loss_mlp": 0.01116204, + "balance_loss_clip": 1.00168049, + "balance_loss_mlp": 1.00023687, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7412174377875541, + "language_loss": 0.50659734, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52927125, + "num_input_tokens_seen": 112994845, + "step": 5262, + "time_per_iteration": 3.1699817180633545 + }, + { + "auxiliary_loss_clip": 0.01154943, + "auxiliary_loss_mlp": 0.01133544, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.00088787, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.4731017416087786, + "language_loss": 0.84963071, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87251556, + "num_input_tokens_seen": 113015125, + "step": 5263, + "time_per_iteration": 2.6134321689605713 + }, + { + "auxiliary_loss_clip": 0.01139328, + "auxiliary_loss_mlp": 0.01132199, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00087786, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.6131738679578138, + "language_loss": 0.81734711, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.84006238, + "num_input_tokens_seen": 113035535, + "step": 5264, + "time_per_iteration": 2.6418449878692627 + }, + { + "auxiliary_loss_clip": 0.01141072, + "auxiliary_loss_mlp": 0.01133343, + "balance_loss_clip": 1.00202751, + "balance_loss_mlp": 1.00078237, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.7664560963896323, + "language_loss": 0.79281592, + "learning_rate": 3.19873247349167e-06, + "loss": 0.81556004, + "num_input_tokens_seen": 113052720, + "step": 5265, + "time_per_iteration": 2.5866732597351074 + }, + { + "auxiliary_loss_clip": 0.01154913, + "auxiliary_loss_mlp": 0.01133332, + "balance_loss_clip": 1.00207639, + "balance_loss_mlp": 1.00086689, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.709039909896483, + "language_loss": 0.75062549, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77350795, + "num_input_tokens_seen": 113071435, + "step": 5266, + "time_per_iteration": 3.9281272888183594 + }, + { + "auxiliary_loss_clip": 0.01124393, + "auxiliary_loss_mlp": 0.01133603, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00085151, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 1.9781018971030213, + "language_loss": 0.79266787, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81524777, + "num_input_tokens_seen": 113088645, + "step": 5267, + "time_per_iteration": 4.063950300216675 + }, + { + "auxiliary_loss_clip": 0.01134734, + "auxiliary_loss_mlp": 0.01116814, + "balance_loss_clip": 1.00159526, + "balance_loss_mlp": 1.00008476, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7422081489540857, + "language_loss": 0.57867205, + "learning_rate": 3.197797006055478e-06, + "loss": 0.60118747, + "num_input_tokens_seen": 113152775, + "step": 5268, + "time_per_iteration": 3.203831434249878 + }, + { + "auxiliary_loss_clip": 0.01171529, + "auxiliary_loss_mlp": 0.01133021, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00065136, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 2.180066584417587, + "language_loss": 0.73014241, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75318795, + "num_input_tokens_seen": 113171410, + "step": 5269, + "time_per_iteration": 5.281693458557129 + }, + { + "auxiliary_loss_clip": 0.01122534, + "auxiliary_loss_mlp": 0.0113311, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00112176, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.006651042385275, + "language_loss": 0.79703653, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.81959295, + "num_input_tokens_seen": 113189965, + "step": 5270, + "time_per_iteration": 2.636597156524658 + }, + { + "auxiliary_loss_clip": 0.0117162, + "auxiliary_loss_mlp": 0.01133871, + "balance_loss_clip": 1.00206113, + "balance_loss_mlp": 1.00092888, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 2.0623728498621863, + "language_loss": 0.79287714, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.81593204, + "num_input_tokens_seen": 113206355, + "step": 5271, + "time_per_iteration": 2.477233409881592 + }, + { + "auxiliary_loss_clip": 0.0117163, + "auxiliary_loss_mlp": 0.01133437, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00078058, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.9451670360117692, + "language_loss": 0.73237836, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75542909, + "num_input_tokens_seen": 113225440, + "step": 5272, + "time_per_iteration": 2.508765935897827 + }, + { + "auxiliary_loss_clip": 0.01139665, + "auxiliary_loss_mlp": 0.01134023, + "balance_loss_clip": 1.00189781, + "balance_loss_mlp": 1.00089025, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 1.9545307700753678, + "language_loss": 0.68385041, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.70658731, + "num_input_tokens_seen": 113248840, + "step": 5273, + "time_per_iteration": 2.7657504081726074 + }, + { + "auxiliary_loss_clip": 0.01154874, + "auxiliary_loss_mlp": 0.00747899, + "balance_loss_clip": 1.00194228, + "balance_loss_mlp": 1.00029016, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.8277669890814559, + "language_loss": 0.67796648, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69699419, + "num_input_tokens_seen": 113269630, + "step": 5274, + "time_per_iteration": 2.5885326862335205 + }, + { + "auxiliary_loss_clip": 0.01122629, + "auxiliary_loss_mlp": 0.01132393, + "balance_loss_clip": 1.00189793, + "balance_loss_mlp": 1.0009768, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.479524838332729, + "language_loss": 0.80665946, + "learning_rate": 3.195612659536081e-06, + "loss": 0.82920963, + "num_input_tokens_seen": 113291200, + "step": 5275, + "time_per_iteration": 2.658989906311035 + }, + { + "auxiliary_loss_clip": 0.01155861, + "auxiliary_loss_mlp": 0.01132902, + "balance_loss_clip": 1.00191486, + "balance_loss_mlp": 1.00100851, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.6896384238881408, + "language_loss": 0.72636884, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.74925643, + "num_input_tokens_seen": 113310170, + "step": 5276, + "time_per_iteration": 2.5384867191314697 + }, + { + "auxiliary_loss_clip": 0.01139833, + "auxiliary_loss_mlp": 0.01132283, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00067556, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.6257335840981448, + "language_loss": 0.7795226, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80224383, + "num_input_tokens_seen": 113331140, + "step": 5277, + "time_per_iteration": 2.6134650707244873 + }, + { + "auxiliary_loss_clip": 0.01139431, + "auxiliary_loss_mlp": 0.01133695, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00094366, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.6475917578220904, + "language_loss": 0.78420305, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.80693436, + "num_input_tokens_seen": 113350030, + "step": 5278, + "time_per_iteration": 2.563647747039795 + }, + { + "auxiliary_loss_clip": 0.01150521, + "auxiliary_loss_mlp": 0.0111679, + "balance_loss_clip": 1.00167251, + "balance_loss_mlp": 1.00005996, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8729123506148715, + "language_loss": 0.62837666, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.65104973, + "num_input_tokens_seen": 113395820, + "step": 5279, + "time_per_iteration": 2.905203104019165 + }, + { + "auxiliary_loss_clip": 0.0117163, + "auxiliary_loss_mlp": 0.0113393, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00098741, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.5289517291378412, + "language_loss": 0.81097287, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83402848, + "num_input_tokens_seen": 113416835, + "step": 5280, + "time_per_iteration": 2.558966636657715 + }, + { + "auxiliary_loss_clip": 0.01121492, + "auxiliary_loss_mlp": 0.01133525, + "balance_loss_clip": 1.00188529, + "balance_loss_mlp": 1.00125015, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 2.1092185478877843, + "language_loss": 0.78208053, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80463064, + "num_input_tokens_seen": 113440850, + "step": 5281, + "time_per_iteration": 2.7315917015075684 + }, + { + "auxiliary_loss_clip": 0.01139196, + "auxiliary_loss_mlp": 0.01132596, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.00079799, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.5431429384603617, + "language_loss": 0.78524065, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80795854, + "num_input_tokens_seen": 113461000, + "step": 5282, + "time_per_iteration": 2.6429474353790283 + }, + { + "auxiliary_loss_clip": 0.0114116, + "auxiliary_loss_mlp": 0.01133682, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00093079, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 2.6873931272191878, + "language_loss": 0.67468852, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69743693, + "num_input_tokens_seen": 113480820, + "step": 5283, + "time_per_iteration": 2.6506588459014893 + }, + { + "auxiliary_loss_clip": 0.01150623, + "auxiliary_loss_mlp": 0.01116031, + "balance_loss_clip": 1.00134444, + "balance_loss_mlp": 1.00006413, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7551471692516777, + "language_loss": 0.52825278, + "learning_rate": 3.192800950261958e-06, + "loss": 0.55091935, + "num_input_tokens_seen": 113536910, + "step": 5284, + "time_per_iteration": 3.0761075019836426 + }, + { + "auxiliary_loss_clip": 0.01138577, + "auxiliary_loss_mlp": 0.01133651, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00089931, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.6547941484344095, + "language_loss": 0.70377207, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72649431, + "num_input_tokens_seen": 113555480, + "step": 5285, + "time_per_iteration": 2.58259916305542 + }, + { + "auxiliary_loss_clip": 0.01167519, + "auxiliary_loss_mlp": 0.01116076, + "balance_loss_clip": 1.00169277, + "balance_loss_mlp": 1.00010896, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8195158173530596, + "language_loss": 0.6054191, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62825501, + "num_input_tokens_seen": 113616790, + "step": 5286, + "time_per_iteration": 3.114784002304077 + }, + { + "auxiliary_loss_clip": 0.0117158, + "auxiliary_loss_mlp": 0.01132814, + "balance_loss_clip": 1.00195181, + "balance_loss_mlp": 1.00082588, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 2.0178619732126455, + "language_loss": 0.71872532, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74176931, + "num_input_tokens_seen": 113635320, + "step": 5287, + "time_per_iteration": 2.502937078475952 + }, + { + "auxiliary_loss_clip": 0.01155021, + "auxiliary_loss_mlp": 0.01133455, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.00098932, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.9713564858704589, + "language_loss": 0.75342506, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77630985, + "num_input_tokens_seen": 113654000, + "step": 5288, + "time_per_iteration": 2.6127827167510986 + }, + { + "auxiliary_loss_clip": 0.01155479, + "auxiliary_loss_mlp": 0.01131917, + "balance_loss_clip": 1.00178838, + "balance_loss_mlp": 1.00069165, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 7.367003411716477, + "language_loss": 0.87233776, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.8952117, + "num_input_tokens_seen": 113672375, + "step": 5289, + "time_per_iteration": 2.6170501708984375 + }, + { + "auxiliary_loss_clip": 0.01154842, + "auxiliary_loss_mlp": 0.01132377, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.00077021, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.5004496304753951, + "language_loss": 0.67531538, + "learning_rate": 3.190924441478572e-06, + "loss": 0.69818759, + "num_input_tokens_seen": 113692385, + "step": 5290, + "time_per_iteration": 2.6187121868133545 + }, + { + "auxiliary_loss_clip": 0.01138415, + "auxiliary_loss_mlp": 0.01133458, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00089681, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 2.64900561366795, + "language_loss": 0.79663074, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.81934953, + "num_input_tokens_seen": 113712145, + "step": 5291, + "time_per_iteration": 2.6704652309417725 + }, + { + "auxiliary_loss_clip": 0.01107603, + "auxiliary_loss_mlp": 0.01133017, + "balance_loss_clip": 1.00182879, + "balance_loss_mlp": 1.00074279, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.204765060844604, + "language_loss": 0.7952888, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.81769502, + "num_input_tokens_seen": 113731435, + "step": 5292, + "time_per_iteration": 2.6924235820770264 + }, + { + "auxiliary_loss_clip": 0.01160691, + "auxiliary_loss_mlp": 0.01131871, + "balance_loss_clip": 1.00210679, + "balance_loss_mlp": 1.00083613, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.6454915613336638, + "language_loss": 0.74960423, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.77252984, + "num_input_tokens_seen": 113750825, + "step": 5293, + "time_per_iteration": 2.6341428756713867 + }, + { + "auxiliary_loss_clip": 0.01154856, + "auxiliary_loss_mlp": 0.01132079, + "balance_loss_clip": 1.00202882, + "balance_loss_mlp": 1.00094891, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.8657699149923752, + "language_loss": 0.73950195, + "learning_rate": 3.189672532265379e-06, + "loss": 0.7623713, + "num_input_tokens_seen": 113770010, + "step": 5294, + "time_per_iteration": 2.6125786304473877 + }, + { + "auxiliary_loss_clip": 0.01171448, + "auxiliary_loss_mlp": 0.01133302, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00074172, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 2.2019543885812825, + "language_loss": 0.76280797, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78585547, + "num_input_tokens_seen": 113788640, + "step": 5295, + "time_per_iteration": 2.5249571800231934 + }, + { + "auxiliary_loss_clip": 0.01129102, + "auxiliary_loss_mlp": 0.01133608, + "balance_loss_clip": 1.00203526, + "balance_loss_mlp": 1.00104713, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.6331095459957723, + "language_loss": 0.6927489, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71537608, + "num_input_tokens_seen": 113809515, + "step": 5296, + "time_per_iteration": 2.666635036468506 + }, + { + "auxiliary_loss_clip": 0.0113815, + "auxiliary_loss_mlp": 0.01132794, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00080514, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 1.439361588772336, + "language_loss": 0.77197665, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79468608, + "num_input_tokens_seen": 113829770, + "step": 5297, + "time_per_iteration": 2.631108522415161 + }, + { + "auxiliary_loss_clip": 0.01123153, + "auxiliary_loss_mlp": 0.0113225, + "balance_loss_clip": 1.00174522, + "balance_loss_mlp": 1.00064278, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.950797483279179, + "language_loss": 0.78999639, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81255043, + "num_input_tokens_seen": 113849320, + "step": 5298, + "time_per_iteration": 2.6812119483947754 + }, + { + "auxiliary_loss_clip": 0.01139311, + "auxiliary_loss_mlp": 0.01132951, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 1.00086725, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 1.7386278666075936, + "language_loss": 0.74218893, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76491147, + "num_input_tokens_seen": 113867860, + "step": 5299, + "time_per_iteration": 2.602646589279175 + }, + { + "auxiliary_loss_clip": 0.01140594, + "auxiliary_loss_mlp": 0.01132969, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00088501, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 2.717519608753523, + "language_loss": 0.78189844, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80463403, + "num_input_tokens_seen": 113886375, + "step": 5300, + "time_per_iteration": 2.6188817024230957 + }, + { + "auxiliary_loss_clip": 0.01139602, + "auxiliary_loss_mlp": 0.01132989, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00080955, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 2.4245460233379545, + "language_loss": 0.83746016, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.8601861, + "num_input_tokens_seen": 113904065, + "step": 5301, + "time_per_iteration": 2.5909132957458496 + }, + { + "auxiliary_loss_clip": 0.01154995, + "auxiliary_loss_mlp": 0.01133672, + "balance_loss_clip": 1.00204885, + "balance_loss_mlp": 1.00101566, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.4271688973833827, + "language_loss": 0.76897728, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79186392, + "num_input_tokens_seen": 113918415, + "step": 5302, + "time_per_iteration": 2.5198841094970703 + }, + { + "auxiliary_loss_clip": 0.01171323, + "auxiliary_loss_mlp": 0.01131602, + "balance_loss_clip": 1.00196016, + "balance_loss_mlp": 1.0006628, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.583420804766359, + "language_loss": 0.79223788, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81526709, + "num_input_tokens_seen": 113938135, + "step": 5303, + "time_per_iteration": 2.535245418548584 + }, + { + "auxiliary_loss_clip": 0.01155138, + "auxiliary_loss_mlp": 0.01133735, + "balance_loss_clip": 1.00193453, + "balance_loss_mlp": 1.00098395, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 1.880917462180581, + "language_loss": 0.72938865, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75227737, + "num_input_tokens_seen": 113957125, + "step": 5304, + "time_per_iteration": 3.9086360931396484 + }, + { + "auxiliary_loss_clip": 0.01141248, + "auxiliary_loss_mlp": 0.01132355, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00084305, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 3.5049737331223842, + "language_loss": 0.71540511, + "learning_rate": 3.186226062434068e-06, + "loss": 0.73814106, + "num_input_tokens_seen": 113974875, + "step": 5305, + "time_per_iteration": 4.042706489562988 + }, + { + "auxiliary_loss_clip": 0.01139789, + "auxiliary_loss_mlp": 0.01132357, + "balance_loss_clip": 1.00198722, + "balance_loss_mlp": 1.00075042, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.8848746766846252, + "language_loss": 0.63575506, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.65847653, + "num_input_tokens_seen": 113994450, + "step": 5306, + "time_per_iteration": 4.05835747718811 + }, + { + "auxiliary_loss_clip": 0.01139578, + "auxiliary_loss_mlp": 0.01132779, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00079083, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.339719645445926, + "language_loss": 0.79363698, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81636059, + "num_input_tokens_seen": 114013945, + "step": 5307, + "time_per_iteration": 4.04279637336731 + }, + { + "auxiliary_loss_clip": 0.01137906, + "auxiliary_loss_mlp": 0.01131964, + "balance_loss_clip": 1.0018084, + "balance_loss_mlp": 1.00083423, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.7695517378155095, + "language_loss": 0.7796042, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.8023029, + "num_input_tokens_seen": 114031375, + "step": 5308, + "time_per_iteration": 2.593778133392334 + }, + { + "auxiliary_loss_clip": 0.01155319, + "auxiliary_loss_mlp": 0.0113411, + "balance_loss_clip": 1.00209343, + "balance_loss_mlp": 1.00078654, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 2.2021944500720263, + "language_loss": 0.73652303, + "learning_rate": 3.184971450390961e-06, + "loss": 0.75941736, + "num_input_tokens_seen": 114048465, + "step": 5309, + "time_per_iteration": 2.5282089710235596 + }, + { + "auxiliary_loss_clip": 0.01155822, + "auxiliary_loss_mlp": 0.01132348, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00064516, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.944738161015455, + "language_loss": 0.82601804, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84889972, + "num_input_tokens_seen": 114068415, + "step": 5310, + "time_per_iteration": 2.6365647315979004 + }, + { + "auxiliary_loss_clip": 0.01137986, + "auxiliary_loss_mlp": 0.01132271, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.0007596, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.6348091989135562, + "language_loss": 0.78219414, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80489671, + "num_input_tokens_seen": 114088565, + "step": 5311, + "time_per_iteration": 2.6328818798065186 + }, + { + "auxiliary_loss_clip": 0.01124246, + "auxiliary_loss_mlp": 0.01132296, + "balance_loss_clip": 1.00178671, + "balance_loss_mlp": 1.00068855, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 2.006569385893893, + "language_loss": 0.846021, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86858636, + "num_input_tokens_seen": 114107160, + "step": 5312, + "time_per_iteration": 2.641171932220459 + }, + { + "auxiliary_loss_clip": 0.01124564, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_clip": 1.00192416, + "balance_loss_mlp": 1.00092554, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.211570900680031, + "language_loss": 0.78195435, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80453199, + "num_input_tokens_seen": 114123420, + "step": 5313, + "time_per_iteration": 2.5895631313323975 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01132648, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00065982, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.4501452273927287, + "language_loss": 0.85984612, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88272345, + "num_input_tokens_seen": 114139230, + "step": 5314, + "time_per_iteration": 2.546783924102783 + }, + { + "auxiliary_loss_clip": 0.01140452, + "auxiliary_loss_mlp": 0.01132215, + "balance_loss_clip": 1.00192297, + "balance_loss_mlp": 1.0007031, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.7044850804539822, + "language_loss": 0.7966969, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.81942356, + "num_input_tokens_seen": 114159290, + "step": 5315, + "time_per_iteration": 2.599233627319336 + }, + { + "auxiliary_loss_clip": 0.01124204, + "auxiliary_loss_mlp": 0.0113305, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00096631, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 1.7994443222234098, + "language_loss": 0.67521387, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69778639, + "num_input_tokens_seen": 114177655, + "step": 5316, + "time_per_iteration": 2.645766496658325 + }, + { + "auxiliary_loss_clip": 0.01156202, + "auxiliary_loss_mlp": 0.01132812, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00082338, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.7681098898708043, + "language_loss": 0.69152611, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71441627, + "num_input_tokens_seen": 114200880, + "step": 5317, + "time_per_iteration": 2.64241886138916 + }, + { + "auxiliary_loss_clip": 0.01134503, + "auxiliary_loss_mlp": 0.01116174, + "balance_loss_clip": 1.00134587, + "balance_loss_mlp": 1.00020754, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7275702525859586, + "language_loss": 0.53096026, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55346704, + "num_input_tokens_seen": 114267145, + "step": 5318, + "time_per_iteration": 3.296952486038208 + }, + { + "auxiliary_loss_clip": 0.01171347, + "auxiliary_loss_mlp": 0.01131908, + "balance_loss_clip": 1.00201797, + "balance_loss_mlp": 1.00087285, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.7301766002214838, + "language_loss": 0.83891201, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86194456, + "num_input_tokens_seen": 114284630, + "step": 5319, + "time_per_iteration": 2.497004270553589 + }, + { + "auxiliary_loss_clip": 0.01156007, + "auxiliary_loss_mlp": 0.01132068, + "balance_loss_clip": 1.0018965, + "balance_loss_mlp": 1.0009377, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.6708080792151936, + "language_loss": 0.63332283, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65620351, + "num_input_tokens_seen": 114305830, + "step": 5320, + "time_per_iteration": 2.653164863586426 + }, + { + "auxiliary_loss_clip": 0.01145793, + "auxiliary_loss_mlp": 0.01132726, + "balance_loss_clip": 1.00245762, + "balance_loss_mlp": 1.00092769, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 2.446542589001865, + "language_loss": 0.70578253, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72856766, + "num_input_tokens_seen": 114325165, + "step": 5321, + "time_per_iteration": 2.60286545753479 + }, + { + "auxiliary_loss_clip": 0.01171709, + "auxiliary_loss_mlp": 0.00748019, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.00037193, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 2.679296656025683, + "language_loss": 0.85790944, + "learning_rate": 3.180888999963749e-06, + "loss": 0.87710673, + "num_input_tokens_seen": 114341310, + "step": 5322, + "time_per_iteration": 2.714458465576172 + }, + { + "auxiliary_loss_clip": 0.0113977, + "auxiliary_loss_mlp": 0.01131796, + "balance_loss_clip": 1.00195098, + "balance_loss_mlp": 1.00066578, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.9612839823861659, + "language_loss": 0.83340502, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85612071, + "num_input_tokens_seen": 114360355, + "step": 5323, + "time_per_iteration": 2.609316349029541 + }, + { + "auxiliary_loss_clip": 0.01160755, + "auxiliary_loss_mlp": 0.01131836, + "balance_loss_clip": 1.00232005, + "balance_loss_mlp": 1.00070572, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.7784859354195917, + "language_loss": 0.7834692, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.80639505, + "num_input_tokens_seen": 114379220, + "step": 5324, + "time_per_iteration": 2.575770139694214 + }, + { + "auxiliary_loss_clip": 0.01140326, + "auxiliary_loss_mlp": 0.01132238, + "balance_loss_clip": 1.00182533, + "balance_loss_mlp": 1.00063145, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 2.377216700783692, + "language_loss": 0.80249786, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82522351, + "num_input_tokens_seen": 114396365, + "step": 5325, + "time_per_iteration": 2.60025954246521 + }, + { + "auxiliary_loss_clip": 0.01155061, + "auxiliary_loss_mlp": 0.0113258, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00078273, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 5.88667241505894, + "language_loss": 0.74734259, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77021897, + "num_input_tokens_seen": 114416780, + "step": 5326, + "time_per_iteration": 2.638927698135376 + }, + { + "auxiliary_loss_clip": 0.01125487, + "auxiliary_loss_mlp": 0.01131419, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.0007652, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.5754661101183813, + "language_loss": 0.81375808, + "learning_rate": 3.179316810218701e-06, + "loss": 0.8363272, + "num_input_tokens_seen": 114437405, + "step": 5327, + "time_per_iteration": 2.7645676136016846 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01131647, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.0008024, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.5290573074977674, + "language_loss": 0.77874684, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80129546, + "num_input_tokens_seen": 114458505, + "step": 5328, + "time_per_iteration": 2.7277486324310303 + }, + { + "auxiliary_loss_clip": 0.01107172, + "auxiliary_loss_mlp": 0.01132679, + "balance_loss_clip": 1.00180554, + "balance_loss_mlp": 1.00078607, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 1.7989846559033793, + "language_loss": 0.7414341, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76383257, + "num_input_tokens_seen": 114479050, + "step": 5329, + "time_per_iteration": 2.70953106880188 + }, + { + "auxiliary_loss_clip": 0.01139286, + "auxiliary_loss_mlp": 0.01131182, + "balance_loss_clip": 1.00185716, + "balance_loss_mlp": 1.00071919, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 1.579987765192929, + "language_loss": 0.71041489, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73311961, + "num_input_tokens_seen": 114497415, + "step": 5330, + "time_per_iteration": 2.570835828781128 + }, + { + "auxiliary_loss_clip": 0.01106999, + "auxiliary_loss_mlp": 0.01133009, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00092542, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.6979386256980133, + "language_loss": 0.79918784, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82158792, + "num_input_tokens_seen": 114518785, + "step": 5331, + "time_per_iteration": 2.782383680343628 + }, + { + "auxiliary_loss_clip": 0.01150754, + "auxiliary_loss_mlp": 0.011154, + "balance_loss_clip": 1.0013088, + "balance_loss_mlp": 1.00019574, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8380886304888242, + "language_loss": 0.57793999, + "learning_rate": 3.177743502478447e-06, + "loss": 0.60060149, + "num_input_tokens_seen": 114577710, + "step": 5332, + "time_per_iteration": 3.1422460079193115 + }, + { + "auxiliary_loss_clip": 0.01124269, + "auxiliary_loss_mlp": 0.01132138, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00072193, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.6267696970582763, + "language_loss": 0.73160696, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75417107, + "num_input_tokens_seen": 114598640, + "step": 5333, + "time_per_iteration": 2.741576910018921 + }, + { + "auxiliary_loss_clip": 0.01139058, + "auxiliary_loss_mlp": 0.01132309, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00089252, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.6723508507541567, + "language_loss": 0.70389467, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72660828, + "num_input_tokens_seen": 114618780, + "step": 5334, + "time_per_iteration": 2.621494770050049 + }, + { + "auxiliary_loss_clip": 0.01108106, + "auxiliary_loss_mlp": 0.01131584, + "balance_loss_clip": 1.00170064, + "balance_loss_mlp": 1.00074017, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 2.511600915692247, + "language_loss": 0.77014786, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79254472, + "num_input_tokens_seen": 114637525, + "step": 5335, + "time_per_iteration": 2.7073922157287598 + }, + { + "auxiliary_loss_clip": 0.01154726, + "auxiliary_loss_mlp": 0.0113204, + "balance_loss_clip": 1.00196815, + "balance_loss_mlp": 1.00071883, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.506909822057997, + "language_loss": 0.68374825, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70661592, + "num_input_tokens_seen": 114659705, + "step": 5336, + "time_per_iteration": 2.6798272132873535 + }, + { + "auxiliary_loss_clip": 0.01124175, + "auxiliary_loss_mlp": 0.01132598, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00099134, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.7101633835282866, + "language_loss": 0.79086214, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81342995, + "num_input_tokens_seen": 114678340, + "step": 5337, + "time_per_iteration": 2.6497321128845215 + }, + { + "auxiliary_loss_clip": 0.01154407, + "auxiliary_loss_mlp": 0.01130974, + "balance_loss_clip": 1.00179708, + "balance_loss_mlp": 1.00070167, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.6945678710637806, + "language_loss": 0.74293727, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76579106, + "num_input_tokens_seen": 114696980, + "step": 5338, + "time_per_iteration": 2.5666468143463135 + }, + { + "auxiliary_loss_clip": 0.0114521, + "auxiliary_loss_mlp": 0.01131526, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00077724, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 2.0938233894253964, + "language_loss": 0.62664735, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.64941466, + "num_input_tokens_seen": 114717330, + "step": 5339, + "time_per_iteration": 2.6723930835723877 + }, + { + "auxiliary_loss_clip": 0.01171277, + "auxiliary_loss_mlp": 0.01132355, + "balance_loss_clip": 1.00191081, + "balance_loss_mlp": 1.00074756, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 2.2133525458960452, + "language_loss": 0.81362003, + "learning_rate": 3.175223888387192e-06, + "loss": 0.83665639, + "num_input_tokens_seen": 114736320, + "step": 5340, + "time_per_iteration": 2.4969892501831055 + }, + { + "auxiliary_loss_clip": 0.0112997, + "auxiliary_loss_mlp": 0.01132351, + "balance_loss_clip": 1.00212824, + "balance_loss_mlp": 1.00074339, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.6803486837393768, + "language_loss": 0.76161432, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.7842375, + "num_input_tokens_seen": 114754575, + "step": 5341, + "time_per_iteration": 3.9787464141845703 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01131431, + "balance_loss_clip": 1.00179851, + "balance_loss_mlp": 1.00068259, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 3.732466088735156, + "language_loss": 0.78998512, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81251287, + "num_input_tokens_seen": 114773590, + "step": 5342, + "time_per_iteration": 2.6507699489593506 + }, + { + "auxiliary_loss_clip": 0.01139148, + "auxiliary_loss_mlp": 0.01132467, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00076425, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 2.597073595420955, + "language_loss": 0.74258232, + "learning_rate": 3.174278297458438e-06, + "loss": 0.76529849, + "num_input_tokens_seen": 114790775, + "step": 5343, + "time_per_iteration": 4.065151691436768 + }, + { + "auxiliary_loss_clip": 0.01091514, + "auxiliary_loss_mlp": 0.01131679, + "balance_loss_clip": 1.00163364, + "balance_loss_mlp": 1.0007391, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.5326259062483305, + "language_loss": 0.82525313, + "learning_rate": 3.173963011408748e-06, + "loss": 0.84748495, + "num_input_tokens_seen": 114809835, + "step": 5344, + "time_per_iteration": 4.297851324081421 + }, + { + "auxiliary_loss_clip": 0.0110632, + "auxiliary_loss_mlp": 0.01131742, + "balance_loss_clip": 1.00150776, + "balance_loss_mlp": 1.00061202, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.2892704615030546, + "language_loss": 0.79610586, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81848651, + "num_input_tokens_seen": 114826505, + "step": 5345, + "time_per_iteration": 4.070428848266602 + }, + { + "auxiliary_loss_clip": 0.01139192, + "auxiliary_loss_mlp": 0.01131966, + "balance_loss_clip": 1.00168359, + "balance_loss_mlp": 1.00074053, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 5.161746095844152, + "language_loss": 0.82943738, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85214895, + "num_input_tokens_seen": 114846140, + "step": 5346, + "time_per_iteration": 2.6284286975860596 + }, + { + "auxiliary_loss_clip": 0.01128528, + "auxiliary_loss_mlp": 0.01131861, + "balance_loss_clip": 1.00205827, + "balance_loss_mlp": 1.00073075, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 2.7525698968381156, + "language_loss": 0.81462151, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83722544, + "num_input_tokens_seen": 114866660, + "step": 5347, + "time_per_iteration": 2.6609485149383545 + }, + { + "auxiliary_loss_clip": 0.01156113, + "auxiliary_loss_mlp": 0.01131695, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00085044, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 2.1939891361927035, + "language_loss": 0.79738951, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82026762, + "num_input_tokens_seen": 114882820, + "step": 5348, + "time_per_iteration": 2.5345981121063232 + }, + { + "auxiliary_loss_clip": 0.0113949, + "auxiliary_loss_mlp": 0.01132876, + "balance_loss_clip": 1.00199902, + "balance_loss_mlp": 1.0011735, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 2.0358595499795418, + "language_loss": 0.84711885, + "learning_rate": 3.172385913647542e-06, + "loss": 0.86984253, + "num_input_tokens_seen": 114900745, + "step": 5349, + "time_per_iteration": 2.5981638431549072 + }, + { + "auxiliary_loss_clip": 0.0114131, + "auxiliary_loss_mlp": 0.01132095, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.0007745, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 3.0890827593558603, + "language_loss": 0.80533838, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82807243, + "num_input_tokens_seen": 114917940, + "step": 5350, + "time_per_iteration": 2.5839133262634277 + }, + { + "auxiliary_loss_clip": 0.01154506, + "auxiliary_loss_mlp": 0.01131747, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00080705, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 3.255342756594892, + "language_loss": 0.79862189, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82148445, + "num_input_tokens_seen": 114937735, + "step": 5351, + "time_per_iteration": 2.6879079341888428 + }, + { + "auxiliary_loss_clip": 0.01121029, + "auxiliary_loss_mlp": 0.01132386, + "balance_loss_clip": 1.00171733, + "balance_loss_mlp": 1.00087476, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 2.0249035247033524, + "language_loss": 0.75740618, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77994037, + "num_input_tokens_seen": 114956630, + "step": 5352, + "time_per_iteration": 2.6312592029571533 + }, + { + "auxiliary_loss_clip": 0.01107287, + "auxiliary_loss_mlp": 0.01132007, + "balance_loss_clip": 1.00164127, + "balance_loss_mlp": 1.00078154, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 2.186464146884272, + "language_loss": 0.81960881, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.8420018, + "num_input_tokens_seen": 114976470, + "step": 5353, + "time_per_iteration": 2.6842548847198486 + }, + { + "auxiliary_loss_clip": 0.01073085, + "auxiliary_loss_mlp": 0.01131371, + "balance_loss_clip": 1.00152349, + "balance_loss_mlp": 1.00062227, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.4730877980651067, + "language_loss": 0.7313711, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.7534157, + "num_input_tokens_seen": 114996710, + "step": 5354, + "time_per_iteration": 2.787593126296997 + }, + { + "auxiliary_loss_clip": 0.0112266, + "auxiliary_loss_mlp": 0.01131694, + "balance_loss_clip": 1.00177228, + "balance_loss_mlp": 1.00065935, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 2.255539567986953, + "language_loss": 0.83676088, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.85930443, + "num_input_tokens_seen": 115015775, + "step": 5355, + "time_per_iteration": 2.653465986251831 + }, + { + "auxiliary_loss_clip": 0.01171498, + "auxiliary_loss_mlp": 0.01132401, + "balance_loss_clip": 1.0020709, + "balance_loss_mlp": 1.00088882, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 1.8354341463217199, + "language_loss": 0.71466422, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.7377032, + "num_input_tokens_seen": 115034265, + "step": 5356, + "time_per_iteration": 2.5121524333953857 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01133571, + "balance_loss_clip": 1.00176907, + "balance_loss_mlp": 1.00081992, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.4026733596360925, + "language_loss": 0.67780197, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70021296, + "num_input_tokens_seen": 115051945, + "step": 5357, + "time_per_iteration": 2.699122905731201 + }, + { + "auxiliary_loss_clip": 0.01141211, + "auxiliary_loss_mlp": 0.01115216, + "balance_loss_clip": 1.00174332, + "balance_loss_mlp": 1.0000124, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7130715535395978, + "language_loss": 0.58242625, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60499048, + "num_input_tokens_seen": 115119090, + "step": 5358, + "time_per_iteration": 3.2508604526519775 + }, + { + "auxiliary_loss_clip": 0.01089567, + "auxiliary_loss_mlp": 0.01131552, + "balance_loss_clip": 1.00156951, + "balance_loss_mlp": 1.00070786, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 1.5688248838129246, + "language_loss": 0.83445334, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85666454, + "num_input_tokens_seen": 115137755, + "step": 5359, + "time_per_iteration": 2.753706216812134 + }, + { + "auxiliary_loss_clip": 0.01155716, + "auxiliary_loss_mlp": 0.01131003, + "balance_loss_clip": 1.00189924, + "balance_loss_mlp": 1.00063586, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.926122586942323, + "language_loss": 0.79193377, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81480098, + "num_input_tokens_seen": 115158150, + "step": 5360, + "time_per_iteration": 2.600154399871826 + }, + { + "auxiliary_loss_clip": 0.01150349, + "auxiliary_loss_mlp": 0.01114355, + "balance_loss_clip": 1.00120282, + "balance_loss_mlp": 0.99991417, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.654013380136555, + "language_loss": 0.56988209, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59252918, + "num_input_tokens_seen": 115212755, + "step": 5361, + "time_per_iteration": 2.9744770526885986 + }, + { + "auxiliary_loss_clip": 0.01090076, + "auxiliary_loss_mlp": 0.01132281, + "balance_loss_clip": 1.00165594, + "balance_loss_mlp": 1.00086451, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.8115469348976037, + "language_loss": 0.71609747, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73832095, + "num_input_tokens_seen": 115233090, + "step": 5362, + "time_per_iteration": 2.7769615650177 + }, + { + "auxiliary_loss_clip": 0.01155931, + "auxiliary_loss_mlp": 0.01131417, + "balance_loss_clip": 1.00191581, + "balance_loss_mlp": 1.0007633, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.7937769351524466, + "language_loss": 0.73839557, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76126909, + "num_input_tokens_seen": 115252645, + "step": 5363, + "time_per_iteration": 2.638692855834961 + }, + { + "auxiliary_loss_clip": 0.01154891, + "auxiliary_loss_mlp": 0.01131864, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00082958, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.6647814532352343, + "language_loss": 0.75960231, + "learning_rate": 3.167647957801365e-06, + "loss": 0.78246987, + "num_input_tokens_seen": 115269085, + "step": 5364, + "time_per_iteration": 2.6141750812530518 + }, + { + "auxiliary_loss_clip": 0.01138876, + "auxiliary_loss_mlp": 0.01131846, + "balance_loss_clip": 1.00178981, + "balance_loss_mlp": 1.00081086, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 3.6626683324816875, + "language_loss": 0.7676816, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79038882, + "num_input_tokens_seen": 115286470, + "step": 5365, + "time_per_iteration": 2.621121644973755 + }, + { + "auxiliary_loss_clip": 0.01138975, + "auxiliary_loss_mlp": 0.01132056, + "balance_loss_clip": 1.00196278, + "balance_loss_mlp": 1.00083089, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 2.1814691347969055, + "language_loss": 0.7666294, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.78933966, + "num_input_tokens_seen": 115307000, + "step": 5366, + "time_per_iteration": 2.6467783451080322 + }, + { + "auxiliary_loss_clip": 0.01139185, + "auxiliary_loss_mlp": 0.01131628, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00068831, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.7169284498549615, + "language_loss": 0.71712101, + "learning_rate": 3.166699169850055e-06, + "loss": 0.73982918, + "num_input_tokens_seen": 115325925, + "step": 5367, + "time_per_iteration": 2.6218149662017822 + }, + { + "auxiliary_loss_clip": 0.01171103, + "auxiliary_loss_mlp": 0.01131249, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00088215, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.776394456452768, + "language_loss": 0.74463415, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.7676577, + "num_input_tokens_seen": 115343705, + "step": 5368, + "time_per_iteration": 2.5167877674102783 + }, + { + "auxiliary_loss_clip": 0.01141084, + "auxiliary_loss_mlp": 0.01130956, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.00077987, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.9487884258735135, + "language_loss": 0.78596151, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.80868185, + "num_input_tokens_seen": 115364170, + "step": 5369, + "time_per_iteration": 2.686479091644287 + }, + { + "auxiliary_loss_clip": 0.01122823, + "auxiliary_loss_mlp": 0.01131434, + "balance_loss_clip": 1.00189698, + "balance_loss_mlp": 1.00058961, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.9107001949774165, + "language_loss": 0.83122289, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85376549, + "num_input_tokens_seen": 115382495, + "step": 5370, + "time_per_iteration": 2.8307723999023438 + }, + { + "auxiliary_loss_clip": 0.01171348, + "auxiliary_loss_mlp": 0.01131242, + "balance_loss_clip": 1.00207651, + "balance_loss_mlp": 1.00077891, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 1.945122308519968, + "language_loss": 0.8297714, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85279727, + "num_input_tokens_seen": 115399450, + "step": 5371, + "time_per_iteration": 2.5245003700256348 + }, + { + "auxiliary_loss_clip": 0.01154707, + "auxiliary_loss_mlp": 0.00747886, + "balance_loss_clip": 1.00181425, + "balance_loss_mlp": 1.00030315, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 5.918266948533116, + "language_loss": 0.8859427, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90496862, + "num_input_tokens_seen": 115417700, + "step": 5372, + "time_per_iteration": 2.5730338096618652 + }, + { + "auxiliary_loss_clip": 0.0117143, + "auxiliary_loss_mlp": 0.01131847, + "balance_loss_clip": 1.00205827, + "balance_loss_mlp": 1.00081253, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 2.0694135139983265, + "language_loss": 0.72996753, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75300032, + "num_input_tokens_seen": 115435840, + "step": 5373, + "time_per_iteration": 2.5848729610443115 + }, + { + "auxiliary_loss_clip": 0.01139685, + "auxiliary_loss_mlp": 0.01131174, + "balance_loss_clip": 1.00191283, + "balance_loss_mlp": 1.00071132, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.631082451022684, + "language_loss": 0.81150991, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.8342185, + "num_input_tokens_seen": 115454210, + "step": 5374, + "time_per_iteration": 2.568380832672119 + }, + { + "auxiliary_loss_clip": 0.01122868, + "auxiliary_loss_mlp": 0.01131298, + "balance_loss_clip": 1.00171936, + "balance_loss_mlp": 1.00064445, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.1272402448890473, + "language_loss": 0.87563485, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89817655, + "num_input_tokens_seen": 115471785, + "step": 5375, + "time_per_iteration": 2.6662771701812744 + }, + { + "auxiliary_loss_clip": 0.01171427, + "auxiliary_loss_mlp": 0.01132182, + "balance_loss_clip": 1.00200987, + "balance_loss_mlp": 1.0005753, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 1.8757887489105958, + "language_loss": 0.75787199, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78090811, + "num_input_tokens_seen": 115491405, + "step": 5376, + "time_per_iteration": 2.5178909301757812 + }, + { + "auxiliary_loss_clip": 0.01124068, + "auxiliary_loss_mlp": 0.01130603, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.00071263, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 4.860439314999478, + "language_loss": 0.66654021, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.68908691, + "num_input_tokens_seen": 115511555, + "step": 5377, + "time_per_iteration": 2.6487693786621094 + }, + { + "auxiliary_loss_clip": 0.01123813, + "auxiliary_loss_mlp": 0.01131267, + "balance_loss_clip": 1.00181758, + "balance_loss_mlp": 1.00080431, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.820294960713909, + "language_loss": 0.72272265, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74527341, + "num_input_tokens_seen": 115532860, + "step": 5378, + "time_per_iteration": 4.060098171234131 + }, + { + "auxiliary_loss_clip": 0.01155054, + "auxiliary_loss_mlp": 0.01132024, + "balance_loss_clip": 1.00193059, + "balance_loss_mlp": 1.00070262, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 2.1609319487260303, + "language_loss": 0.81494534, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.83781612, + "num_input_tokens_seen": 115553850, + "step": 5379, + "time_per_iteration": 2.600064277648926 + }, + { + "auxiliary_loss_clip": 0.01161055, + "auxiliary_loss_mlp": 0.01131709, + "balance_loss_clip": 1.00264966, + "balance_loss_mlp": 1.00077009, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.6035646073455716, + "language_loss": 0.78468525, + "learning_rate": 3.162583158454388e-06, + "loss": 0.8076129, + "num_input_tokens_seen": 115575530, + "step": 5380, + "time_per_iteration": 4.078200340270996 + }, + { + "auxiliary_loss_clip": 0.01154824, + "auxiliary_loss_mlp": 0.01131961, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00092661, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.7163693952739663, + "language_loss": 0.76916808, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79203594, + "num_input_tokens_seen": 115594885, + "step": 5381, + "time_per_iteration": 2.5840952396392822 + }, + { + "auxiliary_loss_clip": 0.01154412, + "auxiliary_loss_mlp": 0.01130902, + "balance_loss_clip": 1.00189006, + "balance_loss_mlp": 1.00072575, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 2.646474755130634, + "language_loss": 0.71585792, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.73871112, + "num_input_tokens_seen": 115614080, + "step": 5382, + "time_per_iteration": 4.142355680465698 + }, + { + "auxiliary_loss_clip": 0.01145509, + "auxiliary_loss_mlp": 0.01131672, + "balance_loss_clip": 1.00227022, + "balance_loss_mlp": 1.00082767, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.108364878941707, + "language_loss": 0.70336616, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72613794, + "num_input_tokens_seen": 115632820, + "step": 5383, + "time_per_iteration": 2.620046854019165 + }, + { + "auxiliary_loss_clip": 0.0115443, + "auxiliary_loss_mlp": 0.0113108, + "balance_loss_clip": 1.0017724, + "balance_loss_mlp": 1.00090337, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 1.773762201410151, + "language_loss": 0.78604817, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80890328, + "num_input_tokens_seen": 115652860, + "step": 5384, + "time_per_iteration": 2.563230037689209 + }, + { + "auxiliary_loss_clip": 0.01091195, + "auxiliary_loss_mlp": 0.0113189, + "balance_loss_clip": 1.00155234, + "balance_loss_mlp": 1.00085509, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.3195691124128546, + "language_loss": 0.74962318, + "learning_rate": 3.16099809186998e-06, + "loss": 0.77185404, + "num_input_tokens_seen": 115670940, + "step": 5385, + "time_per_iteration": 2.673978805541992 + }, + { + "auxiliary_loss_clip": 0.01137627, + "auxiliary_loss_mlp": 0.01131516, + "balance_loss_clip": 1.00182354, + "balance_loss_mlp": 1.00076723, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 2.918384356463738, + "language_loss": 0.72136152, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74405289, + "num_input_tokens_seen": 115691155, + "step": 5386, + "time_per_iteration": 2.6809170246124268 + }, + { + "auxiliary_loss_clip": 0.01171147, + "auxiliary_loss_mlp": 0.01131674, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.0006392, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 2.507356344119409, + "language_loss": 0.94151539, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96454358, + "num_input_tokens_seen": 115710340, + "step": 5387, + "time_per_iteration": 2.564467430114746 + }, + { + "auxiliary_loss_clip": 0.0115555, + "auxiliary_loss_mlp": 0.01132415, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.000808, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 3.042990595378941, + "language_loss": 0.77502739, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79790711, + "num_input_tokens_seen": 115726745, + "step": 5388, + "time_per_iteration": 2.5456695556640625 + }, + { + "auxiliary_loss_clip": 0.01139315, + "auxiliary_loss_mlp": 0.01131588, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00064862, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 1.8672616076382311, + "language_loss": 0.71554625, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.73825526, + "num_input_tokens_seen": 115749385, + "step": 5389, + "time_per_iteration": 2.706775665283203 + }, + { + "auxiliary_loss_clip": 0.0112228, + "auxiliary_loss_mlp": 0.01131453, + "balance_loss_clip": 1.00165486, + "balance_loss_mlp": 1.00070477, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 2.148020246441977, + "language_loss": 0.8077271, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83026445, + "num_input_tokens_seen": 115768105, + "step": 5390, + "time_per_iteration": 2.6078009605407715 + }, + { + "auxiliary_loss_clip": 0.01141247, + "auxiliary_loss_mlp": 0.0113222, + "balance_loss_clip": 1.00209498, + "balance_loss_mlp": 1.00070882, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 2.1056228228775655, + "language_loss": 0.72593015, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.74866486, + "num_input_tokens_seen": 115787340, + "step": 5391, + "time_per_iteration": 2.6067440509796143 + }, + { + "auxiliary_loss_clip": 0.01139248, + "auxiliary_loss_mlp": 0.01130742, + "balance_loss_clip": 1.00180197, + "balance_loss_mlp": 1.00075579, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.8031789723031122, + "language_loss": 0.77423167, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79693156, + "num_input_tokens_seen": 115805565, + "step": 5392, + "time_per_iteration": 2.576993465423584 + }, + { + "auxiliary_loss_clip": 0.0113924, + "auxiliary_loss_mlp": 0.01131594, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00065446, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 1.7072384931768556, + "language_loss": 0.62338734, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64609569, + "num_input_tokens_seen": 115826725, + "step": 5393, + "time_per_iteration": 2.6854865550994873 + }, + { + "auxiliary_loss_clip": 0.01154489, + "auxiliary_loss_mlp": 0.01131193, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00063491, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.601827953350207, + "language_loss": 0.82507253, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84792936, + "num_input_tokens_seen": 115846955, + "step": 5394, + "time_per_iteration": 2.6282167434692383 + }, + { + "auxiliary_loss_clip": 0.01138912, + "auxiliary_loss_mlp": 0.01131102, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00102091, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.7010918613187251, + "language_loss": 0.81579733, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83849752, + "num_input_tokens_seen": 115865975, + "step": 5395, + "time_per_iteration": 2.6266660690307617 + }, + { + "auxiliary_loss_clip": 0.01156065, + "auxiliary_loss_mlp": 0.01130997, + "balance_loss_clip": 1.0020653, + "balance_loss_mlp": 1.00082028, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 1.7842055880146486, + "language_loss": 0.83102858, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85389912, + "num_input_tokens_seen": 115884950, + "step": 5396, + "time_per_iteration": 2.5823707580566406 + }, + { + "auxiliary_loss_clip": 0.01122871, + "auxiliary_loss_mlp": 0.01132329, + "balance_loss_clip": 1.00184941, + "balance_loss_mlp": 1.00081706, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 6.78249098605009, + "language_loss": 0.76174319, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.7842952, + "num_input_tokens_seen": 115904170, + "step": 5397, + "time_per_iteration": 2.616176128387451 + }, + { + "auxiliary_loss_clip": 0.01121715, + "auxiliary_loss_mlp": 0.01131066, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 1.00060391, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.78775699666238, + "language_loss": 0.6670177, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.68954551, + "num_input_tokens_seen": 115919255, + "step": 5398, + "time_per_iteration": 2.614546298980713 + }, + { + "auxiliary_loss_clip": 0.01139509, + "auxiliary_loss_mlp": 0.01131232, + "balance_loss_clip": 1.00196362, + "balance_loss_mlp": 1.0005784, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.4827852098324814, + "language_loss": 0.72990441, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75261182, + "num_input_tokens_seen": 115938535, + "step": 5399, + "time_per_iteration": 2.6024646759033203 + }, + { + "auxiliary_loss_clip": 0.01122712, + "auxiliary_loss_mlp": 0.0113138, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.0007267, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.1855886464359093, + "language_loss": 0.71368444, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73622537, + "num_input_tokens_seen": 115955005, + "step": 5400, + "time_per_iteration": 2.662827968597412 + }, + { + "auxiliary_loss_clip": 0.01155695, + "auxiliary_loss_mlp": 0.01131768, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00063753, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 1.885399598931104, + "language_loss": 0.79766017, + "learning_rate": 3.155918489984614e-06, + "loss": 0.82053483, + "num_input_tokens_seen": 115975305, + "step": 5401, + "time_per_iteration": 2.635619878768921 + }, + { + "auxiliary_loss_clip": 0.01140774, + "auxiliary_loss_mlp": 0.01131843, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00080848, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.3949047266396177, + "language_loss": 0.87524533, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89797151, + "num_input_tokens_seen": 115994810, + "step": 5402, + "time_per_iteration": 2.5961151123046875 + }, + { + "auxiliary_loss_clip": 0.01108812, + "auxiliary_loss_mlp": 0.01131317, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00075948, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.8656396249452496, + "language_loss": 0.84469932, + "learning_rate": 3.155282749751332e-06, + "loss": 0.8671006, + "num_input_tokens_seen": 116011095, + "step": 5403, + "time_per_iteration": 2.6441361904144287 + }, + { + "auxiliary_loss_clip": 0.01140875, + "auxiliary_loss_mlp": 0.01130166, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00103879, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 3.4693627454807134, + "language_loss": 0.87070036, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89341074, + "num_input_tokens_seen": 116028805, + "step": 5404, + "time_per_iteration": 2.6047589778900146 + }, + { + "auxiliary_loss_clip": 0.01154421, + "auxiliary_loss_mlp": 0.01131261, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.0007987, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6232201545688867, + "language_loss": 0.72703183, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74988866, + "num_input_tokens_seen": 116047765, + "step": 5405, + "time_per_iteration": 2.5799684524536133 + }, + { + "auxiliary_loss_clip": 0.01121347, + "auxiliary_loss_mlp": 0.01130772, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00069082, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.8188169422525955, + "language_loss": 0.82647747, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.84899867, + "num_input_tokens_seen": 116068385, + "step": 5406, + "time_per_iteration": 2.757300853729248 + }, + { + "auxiliary_loss_clip": 0.01171184, + "auxiliary_loss_mlp": 0.01131363, + "balance_loss_clip": 1.00205207, + "balance_loss_mlp": 1.00061417, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 2.225209131895181, + "language_loss": 0.87772477, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90075028, + "num_input_tokens_seen": 116085350, + "step": 5407, + "time_per_iteration": 2.4887735843658447 + }, + { + "auxiliary_loss_clip": 0.01138731, + "auxiliary_loss_mlp": 0.01130839, + "balance_loss_clip": 1.00188935, + "balance_loss_mlp": 1.00075793, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.4729849579254382, + "language_loss": 0.69201863, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71471435, + "num_input_tokens_seen": 116107560, + "step": 5408, + "time_per_iteration": 2.656353235244751 + }, + { + "auxiliary_loss_clip": 0.01154733, + "auxiliary_loss_mlp": 0.01131456, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.00061202, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 1.7843825673214093, + "language_loss": 0.7746067, + "learning_rate": 3.153374478034841e-06, + "loss": 0.7974686, + "num_input_tokens_seen": 116125980, + "step": 5409, + "time_per_iteration": 2.5333704948425293 + }, + { + "auxiliary_loss_clip": 0.01091237, + "auxiliary_loss_mlp": 0.01131853, + "balance_loss_clip": 1.00161588, + "balance_loss_mlp": 1.00091362, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 1.7270308935914644, + "language_loss": 0.83142769, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85365856, + "num_input_tokens_seen": 116146530, + "step": 5410, + "time_per_iteration": 2.781280517578125 + }, + { + "auxiliary_loss_clip": 0.0110591, + "auxiliary_loss_mlp": 0.01130037, + "balance_loss_clip": 1.00161839, + "balance_loss_mlp": 1.00062382, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.732293136763023, + "language_loss": 0.7129122, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73527163, + "num_input_tokens_seen": 116165695, + "step": 5411, + "time_per_iteration": 2.6707119941711426 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01130927, + "balance_loss_clip": 1.00251424, + "balance_loss_mlp": 1.0007503, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.5519793766273846, + "language_loss": 0.83441466, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85684788, + "num_input_tokens_seen": 116185375, + "step": 5412, + "time_per_iteration": 2.7373836040496826 + }, + { + "auxiliary_loss_clip": 0.01123551, + "auxiliary_loss_mlp": 0.01130998, + "balance_loss_clip": 1.00165868, + "balance_loss_mlp": 1.0006305, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 2.4109532531290245, + "language_loss": 0.8059392, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82848465, + "num_input_tokens_seen": 116204335, + "step": 5413, + "time_per_iteration": 2.664200782775879 + }, + { + "auxiliary_loss_clip": 0.0113795, + "auxiliary_loss_mlp": 0.01131096, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00063288, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 5.527874262871021, + "language_loss": 0.76945293, + "learning_rate": 3.151783048751864e-06, + "loss": 0.7921434, + "num_input_tokens_seen": 116222840, + "step": 5414, + "time_per_iteration": 2.607048511505127 + }, + { + "auxiliary_loss_clip": 0.01120264, + "auxiliary_loss_mlp": 0.01114402, + "balance_loss_clip": 1.00106597, + "balance_loss_mlp": 0.99996102, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9015910164534756, + "language_loss": 0.63993776, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66228449, + "num_input_tokens_seen": 116274940, + "step": 5415, + "time_per_iteration": 3.108705997467041 + }, + { + "auxiliary_loss_clip": 0.01124091, + "auxiliary_loss_mlp": 0.01130757, + "balance_loss_clip": 1.00191331, + "balance_loss_mlp": 1.00058019, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 1.7557585379570166, + "language_loss": 0.73972756, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76227605, + "num_input_tokens_seen": 116297300, + "step": 5416, + "time_per_iteration": 4.048696517944336 + }, + { + "auxiliary_loss_clip": 0.0116676, + "auxiliary_loss_mlp": 0.01114556, + "balance_loss_clip": 1.00116706, + "balance_loss_mlp": 1.0001148, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7757510064379929, + "language_loss": 0.58086431, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.60367745, + "num_input_tokens_seen": 116362370, + "step": 5417, + "time_per_iteration": 3.1513314247131348 + }, + { + "auxiliary_loss_clip": 0.01136711, + "auxiliary_loss_mlp": 0.01114607, + "balance_loss_clip": 1.0011692, + "balance_loss_mlp": 1.00016582, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.805509799043012, + "language_loss": 0.63400543, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65651858, + "num_input_tokens_seen": 116430365, + "step": 5418, + "time_per_iteration": 4.650890588760376 + }, + { + "auxiliary_loss_clip": 0.011444, + "auxiliary_loss_mlp": 0.01130607, + "balance_loss_clip": 1.00216508, + "balance_loss_mlp": 1.00081158, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 4.107236145496078, + "language_loss": 0.69241709, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71516716, + "num_input_tokens_seen": 116447525, + "step": 5419, + "time_per_iteration": 3.9383227825164795 + }, + { + "auxiliary_loss_clip": 0.01160562, + "auxiliary_loss_mlp": 0.01131538, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.0006938, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 1.590395348427777, + "language_loss": 0.77105302, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79397404, + "num_input_tokens_seen": 116466310, + "step": 5420, + "time_per_iteration": 3.9678540229797363 + }, + { + "auxiliary_loss_clip": 0.01155995, + "auxiliary_loss_mlp": 0.00747785, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00018597, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 2.0552156755456545, + "language_loss": 0.79943359, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.81847143, + "num_input_tokens_seen": 116487825, + "step": 5421, + "time_per_iteration": 2.6123297214508057 + }, + { + "auxiliary_loss_clip": 0.01171083, + "auxiliary_loss_mlp": 0.01131215, + "balance_loss_clip": 1.00205064, + "balance_loss_mlp": 1.00065732, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.458911366640346, + "language_loss": 0.75281829, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77584124, + "num_input_tokens_seen": 116509950, + "step": 5422, + "time_per_iteration": 2.570709466934204 + }, + { + "auxiliary_loss_clip": 0.01123869, + "auxiliary_loss_mlp": 0.00747806, + "balance_loss_clip": 1.00190008, + "balance_loss_mlp": 1.00021005, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.313754587248993, + "language_loss": 0.62846339, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.6471802, + "num_input_tokens_seen": 116527695, + "step": 5423, + "time_per_iteration": 2.594020366668701 + }, + { + "auxiliary_loss_clip": 0.01139366, + "auxiliary_loss_mlp": 0.01129975, + "balance_loss_clip": 1.00185871, + "balance_loss_mlp": 1.00065637, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 1.6430427245511374, + "language_loss": 0.74576497, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76845837, + "num_input_tokens_seen": 116547800, + "step": 5424, + "time_per_iteration": 2.629399061203003 + }, + { + "auxiliary_loss_clip": 0.0113798, + "auxiliary_loss_mlp": 0.01130284, + "balance_loss_clip": 1.0018307, + "balance_loss_mlp": 1.00077558, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6104108084831656, + "language_loss": 0.77015328, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79283595, + "num_input_tokens_seen": 116568460, + "step": 5425, + "time_per_iteration": 2.6205637454986572 + }, + { + "auxiliary_loss_clip": 0.01122753, + "auxiliary_loss_mlp": 0.01131332, + "balance_loss_clip": 1.00173879, + "balance_loss_mlp": 1.00067878, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 3.0304285214392914, + "language_loss": 0.78520203, + "learning_rate": 3.147959166423428e-06, + "loss": 0.80774295, + "num_input_tokens_seen": 116588705, + "step": 5426, + "time_per_iteration": 2.697775363922119 + }, + { + "auxiliary_loss_clip": 0.01108086, + "auxiliary_loss_mlp": 0.01130846, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00066972, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.6341402741488233, + "language_loss": 0.73806477, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76045412, + "num_input_tokens_seen": 116608845, + "step": 5427, + "time_per_iteration": 2.6844606399536133 + }, + { + "auxiliary_loss_clip": 0.01123581, + "auxiliary_loss_mlp": 0.01131657, + "balance_loss_clip": 1.00175619, + "balance_loss_mlp": 1.00090814, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.5779109689494473, + "language_loss": 0.79123712, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81378949, + "num_input_tokens_seen": 116628145, + "step": 5428, + "time_per_iteration": 2.710000514984131 + }, + { + "auxiliary_loss_clip": 0.01154404, + "auxiliary_loss_mlp": 0.01130884, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00080276, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.5392691076445715, + "language_loss": 0.71190929, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73476219, + "num_input_tokens_seen": 116646920, + "step": 5429, + "time_per_iteration": 2.5977630615234375 + }, + { + "auxiliary_loss_clip": 0.01121147, + "auxiliary_loss_mlp": 0.01130793, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.00071204, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.6250891429544894, + "language_loss": 0.78484035, + "learning_rate": 3.146683144965881e-06, + "loss": 0.8073597, + "num_input_tokens_seen": 116665100, + "step": 5430, + "time_per_iteration": 2.6771633625030518 + }, + { + "auxiliary_loss_clip": 0.0110589, + "auxiliary_loss_mlp": 0.01131837, + "balance_loss_clip": 1.00168788, + "balance_loss_mlp": 1.00080252, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 1.909241453183741, + "language_loss": 0.84555286, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86793017, + "num_input_tokens_seen": 116682205, + "step": 5431, + "time_per_iteration": 2.6507298946380615 + }, + { + "auxiliary_loss_clip": 0.01154262, + "auxiliary_loss_mlp": 0.01130706, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.00062525, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.6563488261880468, + "language_loss": 0.70351261, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72636229, + "num_input_tokens_seen": 116702575, + "step": 5432, + "time_per_iteration": 2.6205263137817383 + }, + { + "auxiliary_loss_clip": 0.01107459, + "auxiliary_loss_mlp": 0.01130677, + "balance_loss_clip": 1.0016731, + "balance_loss_mlp": 1.00059628, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.5504736281399514, + "language_loss": 0.84121519, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86359656, + "num_input_tokens_seen": 116720885, + "step": 5433, + "time_per_iteration": 2.673168182373047 + }, + { + "auxiliary_loss_clip": 0.01137743, + "auxiliary_loss_mlp": 0.01130029, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00061572, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.442457012485029, + "language_loss": 0.85402846, + "learning_rate": 3.145406427790931e-06, + "loss": 0.87670624, + "num_input_tokens_seen": 116740395, + "step": 5434, + "time_per_iteration": 2.6391565799713135 + }, + { + "auxiliary_loss_clip": 0.01140688, + "auxiliary_loss_mlp": 0.01131341, + "balance_loss_clip": 1.00191033, + "balance_loss_mlp": 1.00068736, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.8994620357966787, + "language_loss": 0.87735426, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.90007454, + "num_input_tokens_seen": 116758870, + "step": 5435, + "time_per_iteration": 2.6288702487945557 + }, + { + "auxiliary_loss_clip": 0.01170882, + "auxiliary_loss_mlp": 0.01130346, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00074196, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.8854432438786333, + "language_loss": 0.76672065, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78973293, + "num_input_tokens_seen": 116773440, + "step": 5436, + "time_per_iteration": 2.4417171478271484 + }, + { + "auxiliary_loss_clip": 0.0117095, + "auxiliary_loss_mlp": 0.01129842, + "balance_loss_clip": 1.00195277, + "balance_loss_mlp": 1.00052381, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.8378306021025599, + "language_loss": 0.71891338, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74192131, + "num_input_tokens_seen": 116794375, + "step": 5437, + "time_per_iteration": 2.5315122604370117 + }, + { + "auxiliary_loss_clip": 0.01128953, + "auxiliary_loss_mlp": 0.01130867, + "balance_loss_clip": 1.00210476, + "balance_loss_mlp": 1.00069022, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.7907811220368721, + "language_loss": 0.63907659, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66167474, + "num_input_tokens_seen": 116815095, + "step": 5438, + "time_per_iteration": 2.6904587745666504 + }, + { + "auxiliary_loss_clip": 0.01154317, + "auxiliary_loss_mlp": 0.01130706, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00072002, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.6850771443915675, + "language_loss": 0.7417208, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76457107, + "num_input_tokens_seen": 116836630, + "step": 5439, + "time_per_iteration": 2.6498560905456543 + }, + { + "auxiliary_loss_clip": 0.01154428, + "auxiliary_loss_mlp": 0.01130993, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00081623, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.1020839454166076, + "language_loss": 0.74781853, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77067268, + "num_input_tokens_seen": 116856880, + "step": 5440, + "time_per_iteration": 2.570232629776001 + }, + { + "auxiliary_loss_clip": 0.01154491, + "auxiliary_loss_mlp": 0.00747918, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.0002352, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 2.0465161299711725, + "language_loss": 0.84735203, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86637616, + "num_input_tokens_seen": 116873770, + "step": 5441, + "time_per_iteration": 2.5357511043548584 + }, + { + "auxiliary_loss_clip": 0.01155808, + "auxiliary_loss_mlp": 0.01130994, + "balance_loss_clip": 1.00195968, + "balance_loss_mlp": 1.00072241, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 2.6327122300973267, + "language_loss": 0.86485773, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88772571, + "num_input_tokens_seen": 116891225, + "step": 5442, + "time_per_iteration": 2.5443356037139893 + }, + { + "auxiliary_loss_clip": 0.01124084, + "auxiliary_loss_mlp": 0.01131027, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00056481, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.6437506003556688, + "language_loss": 0.77444839, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79699945, + "num_input_tokens_seen": 116912300, + "step": 5443, + "time_per_iteration": 2.634281635284424 + }, + { + "auxiliary_loss_clip": 0.01121922, + "auxiliary_loss_mlp": 0.00747771, + "balance_loss_clip": 1.00177777, + "balance_loss_mlp": 1.00016439, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.6890055412871234, + "language_loss": 0.81233114, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83102804, + "num_input_tokens_seen": 116929425, + "step": 5444, + "time_per_iteration": 2.605327606201172 + }, + { + "auxiliary_loss_clip": 0.01106735, + "auxiliary_loss_mlp": 0.01130199, + "balance_loss_clip": 1.00171793, + "balance_loss_mlp": 1.00069022, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.7184815655510666, + "language_loss": 0.59094191, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61331123, + "num_input_tokens_seen": 116948255, + "step": 5445, + "time_per_iteration": 2.655472993850708 + }, + { + "auxiliary_loss_clip": 0.01154452, + "auxiliary_loss_mlp": 0.01131085, + "balance_loss_clip": 1.00206292, + "balance_loss_mlp": 1.00071788, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.243204663420227, + "language_loss": 0.88247919, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90533459, + "num_input_tokens_seen": 116964905, + "step": 5446, + "time_per_iteration": 2.535831928253174 + }, + { + "auxiliary_loss_clip": 0.01141038, + "auxiliary_loss_mlp": 0.01132143, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.0007267, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.689244716951742, + "language_loss": 0.78686017, + "learning_rate": 3.141252301538802e-06, + "loss": 0.80959195, + "num_input_tokens_seen": 116983650, + "step": 5447, + "time_per_iteration": 2.617234706878662 + }, + { + "auxiliary_loss_clip": 0.01139462, + "auxiliary_loss_mlp": 0.0074792, + "balance_loss_clip": 1.00185871, + "balance_loss_mlp": 1.00017571, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 2.0505103720371833, + "language_loss": 0.73053974, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.74941361, + "num_input_tokens_seen": 117003265, + "step": 5448, + "time_per_iteration": 2.5917203426361084 + }, + { + "auxiliary_loss_clip": 0.01170982, + "auxiliary_loss_mlp": 0.01130425, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00072575, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 2.216466911264152, + "language_loss": 0.67057061, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69358468, + "num_input_tokens_seen": 117025370, + "step": 5449, + "time_per_iteration": 2.607419490814209 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.01130546, + "balance_loss_clip": 1.0017643, + "balance_loss_mlp": 1.00056064, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 2.002312845764026, + "language_loss": 0.6560334, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67856461, + "num_input_tokens_seen": 117044350, + "step": 5450, + "time_per_iteration": 2.6741695404052734 + }, + { + "auxiliary_loss_clip": 0.01160477, + "auxiliary_loss_mlp": 0.01130338, + "balance_loss_clip": 1.00219822, + "balance_loss_mlp": 1.00063801, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 3.140021377118004, + "language_loss": 0.77177364, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79468179, + "num_input_tokens_seen": 117064450, + "step": 5451, + "time_per_iteration": 2.5835723876953125 + }, + { + "auxiliary_loss_clip": 0.01154302, + "auxiliary_loss_mlp": 0.01131055, + "balance_loss_clip": 1.00193822, + "balance_loss_mlp": 1.00068784, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.0400023028438166, + "language_loss": 0.69965076, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72250432, + "num_input_tokens_seen": 117083060, + "step": 5452, + "time_per_iteration": 2.568866729736328 + }, + { + "auxiliary_loss_clip": 0.01138885, + "auxiliary_loss_mlp": 0.01129743, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00052047, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.7282703495278604, + "language_loss": 0.78982896, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.8125152, + "num_input_tokens_seen": 117101860, + "step": 5453, + "time_per_iteration": 3.9681899547576904 + }, + { + "auxiliary_loss_clip": 0.01155959, + "auxiliary_loss_mlp": 0.01130242, + "balance_loss_clip": 1.0019815, + "balance_loss_mlp": 1.00054216, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 2.012833405754032, + "language_loss": 0.74561679, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.76847875, + "num_input_tokens_seen": 117123100, + "step": 5454, + "time_per_iteration": 2.6144514083862305 + }, + { + "auxiliary_loss_clip": 0.01108935, + "auxiliary_loss_mlp": 0.01129474, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00072849, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 1.7065249583505324, + "language_loss": 0.76523787, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.78762197, + "num_input_tokens_seen": 117140515, + "step": 5455, + "time_per_iteration": 4.097889184951782 + }, + { + "auxiliary_loss_clip": 0.01160402, + "auxiliary_loss_mlp": 0.01130499, + "balance_loss_clip": 1.00222826, + "balance_loss_mlp": 1.00079942, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.9010966336476802, + "language_loss": 0.74013811, + "learning_rate": 3.138372082016768e-06, + "loss": 0.7630471, + "num_input_tokens_seen": 117161485, + "step": 5456, + "time_per_iteration": 2.5716865062713623 + }, + { + "auxiliary_loss_clip": 0.0117092, + "auxiliary_loss_mlp": 0.01131044, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00096309, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.5137412948203735, + "language_loss": 0.78082061, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80384028, + "num_input_tokens_seen": 117181870, + "step": 5457, + "time_per_iteration": 5.261960744857788 + }, + { + "auxiliary_loss_clip": 0.01122411, + "auxiliary_loss_mlp": 0.01130991, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.00071895, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.005872604003889, + "language_loss": 0.78665107, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.80918503, + "num_input_tokens_seen": 117201380, + "step": 5458, + "time_per_iteration": 2.659684896469116 + }, + { + "auxiliary_loss_clip": 0.01154757, + "auxiliary_loss_mlp": 0.01130064, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00055468, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 2.1101959821589493, + "language_loss": 0.72859716, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75144541, + "num_input_tokens_seen": 117221040, + "step": 5459, + "time_per_iteration": 2.5380425453186035 + }, + { + "auxiliary_loss_clip": 0.01137867, + "auxiliary_loss_mlp": 0.0113088, + "balance_loss_clip": 1.00188172, + "balance_loss_mlp": 1.00079894, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.910641944112915, + "language_loss": 0.83760774, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.8602953, + "num_input_tokens_seen": 117241395, + "step": 5460, + "time_per_iteration": 2.649338960647583 + }, + { + "auxiliary_loss_clip": 0.01170768, + "auxiliary_loss_mlp": 0.01130361, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00085199, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.7481376301346914, + "language_loss": 0.76515496, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78816628, + "num_input_tokens_seen": 117259340, + "step": 5461, + "time_per_iteration": 2.5242109298706055 + }, + { + "auxiliary_loss_clip": 0.01155918, + "auxiliary_loss_mlp": 0.01130234, + "balance_loss_clip": 1.00195515, + "balance_loss_mlp": 1.00072551, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.7863178512704656, + "language_loss": 0.62949789, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65235943, + "num_input_tokens_seen": 117282375, + "step": 5462, + "time_per_iteration": 2.68605637550354 + }, + { + "auxiliary_loss_clip": 0.01170754, + "auxiliary_loss_mlp": 0.00747816, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00017035, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.401361085482092, + "language_loss": 0.78127682, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80046248, + "num_input_tokens_seen": 117303830, + "step": 5463, + "time_per_iteration": 2.5687623023986816 + }, + { + "auxiliary_loss_clip": 0.01123101, + "auxiliary_loss_mlp": 0.0113044, + "balance_loss_clip": 1.001791, + "balance_loss_mlp": 1.00064468, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 3.318630137870749, + "language_loss": 0.69616508, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.71870047, + "num_input_tokens_seen": 117320665, + "step": 5464, + "time_per_iteration": 2.5891387462615967 + }, + { + "auxiliary_loss_clip": 0.01154558, + "auxiliary_loss_mlp": 0.01130325, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.0007205, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 2.6417741664500682, + "language_loss": 0.72221237, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74506116, + "num_input_tokens_seen": 117339795, + "step": 5465, + "time_per_iteration": 2.5714902877807617 + }, + { + "auxiliary_loss_clip": 0.01139386, + "auxiliary_loss_mlp": 0.01129777, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00064993, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.6402752010175774, + "language_loss": 0.829597, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85228866, + "num_input_tokens_seen": 117359525, + "step": 5466, + "time_per_iteration": 2.6029186248779297 + }, + { + "auxiliary_loss_clip": 0.01138483, + "auxiliary_loss_mlp": 0.01130356, + "balance_loss_clip": 1.00179338, + "balance_loss_mlp": 1.0006566, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 9.223638031490996, + "language_loss": 0.7924583, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81514668, + "num_input_tokens_seen": 117380320, + "step": 5467, + "time_per_iteration": 2.624077320098877 + }, + { + "auxiliary_loss_clip": 0.01137735, + "auxiliary_loss_mlp": 0.01130584, + "balance_loss_clip": 1.00179696, + "balance_loss_mlp": 1.00059783, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.7925042344555109, + "language_loss": 0.74494332, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76762646, + "num_input_tokens_seen": 117400695, + "step": 5468, + "time_per_iteration": 2.6319568157196045 + }, + { + "auxiliary_loss_clip": 0.01141042, + "auxiliary_loss_mlp": 0.011311, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00063705, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 3.0140265548286553, + "language_loss": 0.78255284, + "learning_rate": 3.134205594339942e-06, + "loss": 0.80527425, + "num_input_tokens_seen": 117418800, + "step": 5469, + "time_per_iteration": 2.56070613861084 + }, + { + "auxiliary_loss_clip": 0.01127971, + "auxiliary_loss_mlp": 0.01130195, + "balance_loss_clip": 1.00174737, + "balance_loss_mlp": 1.00059044, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.6749407897265982, + "language_loss": 0.81781387, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84039545, + "num_input_tokens_seen": 117438220, + "step": 5470, + "time_per_iteration": 2.621190071105957 + }, + { + "auxiliary_loss_clip": 0.01170963, + "auxiliary_loss_mlp": 0.011302, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00069141, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.7211573909494866, + "language_loss": 0.67708766, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.70009929, + "num_input_tokens_seen": 117462560, + "step": 5471, + "time_per_iteration": 2.7748641967773438 + }, + { + "auxiliary_loss_clip": 0.01171006, + "auxiliary_loss_mlp": 0.01131417, + "balance_loss_clip": 1.00193679, + "balance_loss_mlp": 1.00076318, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 3.131261365757526, + "language_loss": 0.64610636, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.66913056, + "num_input_tokens_seen": 117483665, + "step": 5472, + "time_per_iteration": 2.5937998294830322 + }, + { + "auxiliary_loss_clip": 0.01154421, + "auxiliary_loss_mlp": 0.01131496, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00074685, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.744277420614238, + "language_loss": 0.88231897, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90517807, + "num_input_tokens_seen": 117503565, + "step": 5473, + "time_per_iteration": 2.559906244277954 + }, + { + "auxiliary_loss_clip": 0.01123311, + "auxiliary_loss_mlp": 0.01130899, + "balance_loss_clip": 1.00188673, + "balance_loss_mlp": 1.00062716, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 1.700407493597233, + "language_loss": 0.77740705, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.79994917, + "num_input_tokens_seen": 117521460, + "step": 5474, + "time_per_iteration": 2.6535909175872803 + }, + { + "auxiliary_loss_clip": 0.01133278, + "auxiliary_loss_mlp": 0.0111285, + "balance_loss_clip": 1.00100827, + "balance_loss_mlp": 0.99993479, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.7992077333511811, + "language_loss": 0.60149872, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62395996, + "num_input_tokens_seen": 117580550, + "step": 5475, + "time_per_iteration": 3.1258513927459717 + }, + { + "auxiliary_loss_clip": 0.01123986, + "auxiliary_loss_mlp": 0.01131384, + "balance_loss_clip": 1.00188553, + "balance_loss_mlp": 1.00082552, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.5205052092281606, + "language_loss": 0.76163816, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78419191, + "num_input_tokens_seen": 117600645, + "step": 5476, + "time_per_iteration": 2.706514596939087 + }, + { + "auxiliary_loss_clip": 0.01123456, + "auxiliary_loss_mlp": 0.01131242, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00077915, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.7026654275841877, + "language_loss": 0.74257857, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76512557, + "num_input_tokens_seen": 117618880, + "step": 5477, + "time_per_iteration": 2.6370527744293213 + }, + { + "auxiliary_loss_clip": 0.01170832, + "auxiliary_loss_mlp": 0.01129933, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00080597, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 2.4379677284304258, + "language_loss": 0.75068259, + "learning_rate": 3.131316843357713e-06, + "loss": 0.77369022, + "num_input_tokens_seen": 117636445, + "step": 5478, + "time_per_iteration": 2.5069379806518555 + }, + { + "auxiliary_loss_clip": 0.01155806, + "auxiliary_loss_mlp": 0.01130383, + "balance_loss_clip": 1.00208282, + "balance_loss_mlp": 1.00068307, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 2.0910644427313523, + "language_loss": 0.80376869, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82663059, + "num_input_tokens_seen": 117653105, + "step": 5479, + "time_per_iteration": 2.532935857772827 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01112831, + "balance_loss_clip": 1.00130475, + "balance_loss_mlp": 0.99991554, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.74145832801176, + "language_loss": 0.56502748, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58751005, + "num_input_tokens_seen": 117719225, + "step": 5480, + "time_per_iteration": 3.2019190788269043 + }, + { + "auxiliary_loss_clip": 0.01155726, + "auxiliary_loss_mlp": 0.00747812, + "balance_loss_clip": 1.0019418, + "balance_loss_mlp": 1.00014675, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 2.013262608756494, + "language_loss": 0.77201235, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79104769, + "num_input_tokens_seen": 117738725, + "step": 5481, + "time_per_iteration": 2.5881502628326416 + }, + { + "auxiliary_loss_clip": 0.01137843, + "auxiliary_loss_mlp": 0.0113027, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00066566, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.489236288850638, + "language_loss": 0.78523356, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80791473, + "num_input_tokens_seen": 117757765, + "step": 5482, + "time_per_iteration": 2.6293435096740723 + }, + { + "auxiliary_loss_clip": 0.01155405, + "auxiliary_loss_mlp": 0.0113118, + "balance_loss_clip": 1.00191963, + "balance_loss_mlp": 1.00081301, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 2.8346180797352307, + "language_loss": 0.73910016, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76196599, + "num_input_tokens_seen": 117776810, + "step": 5483, + "time_per_iteration": 2.6030240058898926 + }, + { + "auxiliary_loss_clip": 0.01155565, + "auxiliary_loss_mlp": 0.0113066, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.00067449, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.7973103306887774, + "language_loss": 0.75529855, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77816081, + "num_input_tokens_seen": 117797730, + "step": 5484, + "time_per_iteration": 2.6353116035461426 + }, + { + "auxiliary_loss_clip": 0.01170972, + "auxiliary_loss_mlp": 0.01130128, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.0007143, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 4.008788357234855, + "language_loss": 0.71718967, + "learning_rate": 3.129067634203742e-06, + "loss": 0.7402007, + "num_input_tokens_seen": 117815365, + "step": 5485, + "time_per_iteration": 2.5091235637664795 + }, + { + "auxiliary_loss_clip": 0.01088994, + "auxiliary_loss_mlp": 0.01130598, + "balance_loss_clip": 1.00163651, + "balance_loss_mlp": 1.00080335, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.9356454134101575, + "language_loss": 0.80119526, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82339114, + "num_input_tokens_seen": 117836095, + "step": 5486, + "time_per_iteration": 2.7667243480682373 + }, + { + "auxiliary_loss_clip": 0.01138897, + "auxiliary_loss_mlp": 0.01130723, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00073743, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 3.992076195665645, + "language_loss": 0.84052455, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86322081, + "num_input_tokens_seen": 117854655, + "step": 5487, + "time_per_iteration": 2.594087600708008 + }, + { + "auxiliary_loss_clip": 0.01108143, + "auxiliary_loss_mlp": 0.01131213, + "balance_loss_clip": 1.00172818, + "balance_loss_mlp": 1.00055945, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.2623044056051818, + "language_loss": 0.74019182, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.7625854, + "num_input_tokens_seen": 117873300, + "step": 5488, + "time_per_iteration": 2.6477456092834473 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01131298, + "balance_loss_clip": 1.00199795, + "balance_loss_mlp": 1.00073993, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.131298533382093, + "language_loss": 0.72163165, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74465388, + "num_input_tokens_seen": 117891540, + "step": 5489, + "time_per_iteration": 2.5077261924743652 + }, + { + "auxiliary_loss_clip": 0.01170861, + "auxiliary_loss_mlp": 0.01130684, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00050724, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.4704255653042204, + "language_loss": 0.88618737, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90920281, + "num_input_tokens_seen": 117907690, + "step": 5490, + "time_per_iteration": 2.528050422668457 + }, + { + "auxiliary_loss_clip": 0.01154949, + "auxiliary_loss_mlp": 0.01129572, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00054026, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 5.627308398755366, + "language_loss": 0.83067012, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85351539, + "num_input_tokens_seen": 117925640, + "step": 5491, + "time_per_iteration": 3.928281545639038 + }, + { + "auxiliary_loss_clip": 0.01138073, + "auxiliary_loss_mlp": 0.01129819, + "balance_loss_clip": 1.00172758, + "balance_loss_mlp": 1.00088191, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 1.8230444724712063, + "language_loss": 0.77083111, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79351008, + "num_input_tokens_seen": 117944525, + "step": 5492, + "time_per_iteration": 2.653898000717163 + }, + { + "auxiliary_loss_clip": 0.01171045, + "auxiliary_loss_mlp": 0.01131259, + "balance_loss_clip": 1.0019778, + "balance_loss_mlp": 1.00089169, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.4148382136240443, + "language_loss": 0.74977267, + "learning_rate": 3.12649454083913e-06, + "loss": 0.77279568, + "num_input_tokens_seen": 117962515, + "step": 5493, + "time_per_iteration": 3.907388210296631 + }, + { + "auxiliary_loss_clip": 0.01103498, + "auxiliary_loss_mlp": 0.01112293, + "balance_loss_clip": 1.00087571, + "balance_loss_mlp": 1.00014043, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7866817585922105, + "language_loss": 0.53873128, + "learning_rate": 3.12617271181492e-06, + "loss": 0.56088918, + "num_input_tokens_seen": 118018780, + "step": 5494, + "time_per_iteration": 3.1838128566741943 + }, + { + "auxiliary_loss_clip": 0.01140533, + "auxiliary_loss_mlp": 0.01131001, + "balance_loss_clip": 1.00186765, + "balance_loss_mlp": 1.00082397, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.5131510936364978, + "language_loss": 0.87014735, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89286268, + "num_input_tokens_seen": 118038610, + "step": 5495, + "time_per_iteration": 5.3890221118927 + }, + { + "auxiliary_loss_clip": 0.01121819, + "auxiliary_loss_mlp": 0.01130927, + "balance_loss_clip": 1.00168538, + "balance_loss_mlp": 1.00084615, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 2.813476832500453, + "language_loss": 0.73476183, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75728929, + "num_input_tokens_seen": 118055905, + "step": 5496, + "time_per_iteration": 2.7214622497558594 + }, + { + "auxiliary_loss_clip": 0.01139211, + "auxiliary_loss_mlp": 0.01130261, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00056076, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.415649796390603, + "language_loss": 0.71758121, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74027592, + "num_input_tokens_seen": 118073695, + "step": 5497, + "time_per_iteration": 2.626971483230591 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.01130714, + "balance_loss_clip": 1.0017345, + "balance_loss_mlp": 1.00072813, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 1.9741232127737307, + "language_loss": 0.80213463, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82483244, + "num_input_tokens_seen": 118094030, + "step": 5498, + "time_per_iteration": 2.6574039459228516 + }, + { + "auxiliary_loss_clip": 0.01155665, + "auxiliary_loss_mlp": 0.01130889, + "balance_loss_clip": 1.00185251, + "balance_loss_mlp": 1.00071239, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 1.96420226433054, + "language_loss": 0.75704223, + "learning_rate": 3.12456292636927e-06, + "loss": 0.77990776, + "num_input_tokens_seen": 118111665, + "step": 5499, + "time_per_iteration": 2.560863494873047 + }, + { + "auxiliary_loss_clip": 0.01138785, + "auxiliary_loss_mlp": 0.01130284, + "balance_loss_clip": 1.00174904, + "balance_loss_mlp": 1.00067997, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.5822628852745844, + "language_loss": 0.78995872, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81264943, + "num_input_tokens_seen": 118132435, + "step": 5500, + "time_per_iteration": 2.6268467903137207 + }, + { + "auxiliary_loss_clip": 0.01153976, + "auxiliary_loss_mlp": 0.01131153, + "balance_loss_clip": 1.0018518, + "balance_loss_mlp": 1.00059485, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.040167467010152, + "language_loss": 0.6618551, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68470639, + "num_input_tokens_seen": 118155255, + "step": 5501, + "time_per_iteration": 2.6603002548217773 + }, + { + "auxiliary_loss_clip": 0.01154238, + "auxiliary_loss_mlp": 0.01131432, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00068355, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.3032516318769285, + "language_loss": 0.77248156, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79533827, + "num_input_tokens_seen": 118169865, + "step": 5502, + "time_per_iteration": 2.5178394317626953 + }, + { + "auxiliary_loss_clip": 0.01138168, + "auxiliary_loss_mlp": 0.01131765, + "balance_loss_clip": 1.00192964, + "balance_loss_mlp": 1.00072968, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 1.6184956067539713, + "language_loss": 0.72317255, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74587178, + "num_input_tokens_seen": 118190760, + "step": 5503, + "time_per_iteration": 2.6111974716186523 + }, + { + "auxiliary_loss_clip": 0.0113886, + "auxiliary_loss_mlp": 0.01130864, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00068736, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.5107288429772512, + "language_loss": 0.75059122, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77328849, + "num_input_tokens_seen": 118213620, + "step": 5504, + "time_per_iteration": 2.6559560298919678 + }, + { + "auxiliary_loss_clip": 0.01143637, + "auxiliary_loss_mlp": 0.01130532, + "balance_loss_clip": 1.00203061, + "balance_loss_mlp": 1.00073719, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.701530373802771, + "language_loss": 0.698048, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72078967, + "num_input_tokens_seen": 118235010, + "step": 5505, + "time_per_iteration": 2.63492488861084 + }, + { + "auxiliary_loss_clip": 0.0115584, + "auxiliary_loss_mlp": 0.01131106, + "balance_loss_clip": 1.0020839, + "balance_loss_mlp": 1.0010252, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.866203964389777, + "language_loss": 0.8228364, + "learning_rate": 3.122307436058899e-06, + "loss": 0.84570587, + "num_input_tokens_seen": 118255820, + "step": 5506, + "time_per_iteration": 2.5948381423950195 + }, + { + "auxiliary_loss_clip": 0.01155947, + "auxiliary_loss_mlp": 0.01131418, + "balance_loss_clip": 1.00201058, + "balance_loss_mlp": 1.00076485, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.6884674427805884, + "language_loss": 0.79036909, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81324273, + "num_input_tokens_seen": 118274160, + "step": 5507, + "time_per_iteration": 2.634312152862549 + }, + { + "auxiliary_loss_clip": 0.0113776, + "auxiliary_loss_mlp": 0.01130786, + "balance_loss_clip": 1.00180614, + "balance_loss_mlp": 1.00099039, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.4530402844964143, + "language_loss": 0.71354002, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73622549, + "num_input_tokens_seen": 118294385, + "step": 5508, + "time_per_iteration": 2.640955924987793 + }, + { + "auxiliary_loss_clip": 0.0113935, + "auxiliary_loss_mlp": 0.0113036, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.00094604, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 1.8257421967628311, + "language_loss": 0.71502864, + "learning_rate": 3.12134015873989e-06, + "loss": 0.73772573, + "num_input_tokens_seen": 118313105, + "step": 5509, + "time_per_iteration": 2.6761131286621094 + }, + { + "auxiliary_loss_clip": 0.01154747, + "auxiliary_loss_mlp": 0.01131601, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00066113, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.648644836056557, + "language_loss": 0.73171151, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75457501, + "num_input_tokens_seen": 118335250, + "step": 5510, + "time_per_iteration": 2.6347479820251465 + }, + { + "auxiliary_loss_clip": 0.01130311, + "auxiliary_loss_mlp": 0.01130669, + "balance_loss_clip": 1.00223768, + "balance_loss_mlp": 1.00096977, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.1999667165064465, + "language_loss": 0.87980717, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.902417, + "num_input_tokens_seen": 118351470, + "step": 5511, + "time_per_iteration": 2.6569578647613525 + }, + { + "auxiliary_loss_clip": 0.01106741, + "auxiliary_loss_mlp": 0.01129835, + "balance_loss_clip": 1.00172234, + "balance_loss_mlp": 1.0007081, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.7766226270834986, + "language_loss": 0.73190475, + "learning_rate": 3.12037249872891e-06, + "loss": 0.75427055, + "num_input_tokens_seen": 118370970, + "step": 5512, + "time_per_iteration": 2.6859488487243652 + }, + { + "auxiliary_loss_clip": 0.01122573, + "auxiliary_loss_mlp": 0.01130624, + "balance_loss_clip": 1.00171876, + "balance_loss_mlp": 1.00082922, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.7643772125879273, + "language_loss": 0.72286224, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.74539423, + "num_input_tokens_seen": 118393125, + "step": 5513, + "time_per_iteration": 2.770465135574341 + }, + { + "auxiliary_loss_clip": 0.01122034, + "auxiliary_loss_mlp": 0.01130869, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.00059712, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 2.542413388576087, + "language_loss": 0.68032295, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70285201, + "num_input_tokens_seen": 118410860, + "step": 5514, + "time_per_iteration": 2.5874483585357666 + }, + { + "auxiliary_loss_clip": 0.01139081, + "auxiliary_loss_mlp": 0.01131607, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00085819, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 1.956893478006871, + "language_loss": 0.65968287, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.68238974, + "num_input_tokens_seen": 118429570, + "step": 5515, + "time_per_iteration": 2.6172876358032227 + }, + { + "auxiliary_loss_clip": 0.01154385, + "auxiliary_loss_mlp": 0.0113101, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00073767, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.7010573029390161, + "language_loss": 0.68982124, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71267521, + "num_input_tokens_seen": 118450285, + "step": 5516, + "time_per_iteration": 2.653965711593628 + }, + { + "auxiliary_loss_clip": 0.01154472, + "auxiliary_loss_mlp": 0.01131698, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00085354, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.4213491445309065, + "language_loss": 0.80805457, + "learning_rate": 3.118758882514359e-06, + "loss": 0.83091629, + "num_input_tokens_seen": 118468270, + "step": 5517, + "time_per_iteration": 2.547480344772339 + }, + { + "auxiliary_loss_clip": 0.01155367, + "auxiliary_loss_mlp": 0.01130434, + "balance_loss_clip": 1.00185668, + "balance_loss_mlp": 1.00073457, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 1.9379828063138502, + "language_loss": 0.74580729, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76866531, + "num_input_tokens_seen": 118486615, + "step": 5518, + "time_per_iteration": 2.530379056930542 + }, + { + "auxiliary_loss_clip": 0.01135462, + "auxiliary_loss_mlp": 0.01112138, + "balance_loss_clip": 1.00139427, + "balance_loss_mlp": 0.99998587, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6099267861120208, + "language_loss": 0.54367959, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56615555, + "num_input_tokens_seen": 118553580, + "step": 5519, + "time_per_iteration": 3.252692699432373 + }, + { + "auxiliary_loss_clip": 0.01154559, + "auxiliary_loss_mlp": 0.01130847, + "balance_loss_clip": 1.00190639, + "balance_loss_mlp": 1.00076556, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 2.2397967642762406, + "language_loss": 0.7825945, + "learning_rate": 3.117790203606336e-06, + "loss": 0.80544853, + "num_input_tokens_seen": 118570280, + "step": 5520, + "time_per_iteration": 2.536454439163208 + }, + { + "auxiliary_loss_clip": 0.011388, + "auxiliary_loss_mlp": 0.01130379, + "balance_loss_clip": 1.00193834, + "balance_loss_mlp": 1.00077415, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 2.4110343378800265, + "language_loss": 0.76197559, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78466737, + "num_input_tokens_seen": 118590455, + "step": 5521, + "time_per_iteration": 2.658661127090454 + }, + { + "auxiliary_loss_clip": 0.0115429, + "auxiliary_loss_mlp": 0.01130751, + "balance_loss_clip": 1.00182343, + "balance_loss_mlp": 1.00095606, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 2.8634502074125474, + "language_loss": 0.70161736, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72446781, + "num_input_tokens_seen": 118609495, + "step": 5522, + "time_per_iteration": 2.6009902954101562 + }, + { + "auxiliary_loss_clip": 0.01138873, + "auxiliary_loss_mlp": 0.01130405, + "balance_loss_clip": 1.00177264, + "balance_loss_mlp": 1.00080097, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.6484956214568505, + "language_loss": 0.73773301, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76042575, + "num_input_tokens_seen": 118628720, + "step": 5523, + "time_per_iteration": 2.6397266387939453 + }, + { + "auxiliary_loss_clip": 0.01138934, + "auxiliary_loss_mlp": 0.01130313, + "balance_loss_clip": 1.00179183, + "balance_loss_mlp": 1.00070822, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.80617371859818, + "language_loss": 0.816975, + "learning_rate": 3.116498038372114e-06, + "loss": 0.83966744, + "num_input_tokens_seen": 118645955, + "step": 5524, + "time_per_iteration": 2.6256654262542725 + }, + { + "auxiliary_loss_clip": 0.01120847, + "auxiliary_loss_mlp": 0.00747856, + "balance_loss_clip": 1.00162053, + "balance_loss_mlp": 1.0001843, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.616681217979786, + "language_loss": 0.82527626, + "learning_rate": 3.116174891188636e-06, + "loss": 0.84396327, + "num_input_tokens_seen": 118665605, + "step": 5525, + "time_per_iteration": 2.6432597637176514 + }, + { + "auxiliary_loss_clip": 0.01166723, + "auxiliary_loss_mlp": 0.0111215, + "balance_loss_clip": 1.00130689, + "balance_loss_mlp": 0.99999756, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7673820213260326, + "language_loss": 0.52630526, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54909402, + "num_input_tokens_seen": 118728155, + "step": 5526, + "time_per_iteration": 3.0729660987854004 + }, + { + "auxiliary_loss_clip": 0.01124463, + "auxiliary_loss_mlp": 0.00747841, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00013983, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.685353296806445, + "language_loss": 0.77466631, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79338932, + "num_input_tokens_seen": 118743955, + "step": 5527, + "time_per_iteration": 2.606330394744873 + }, + { + "auxiliary_loss_clip": 0.01106433, + "auxiliary_loss_mlp": 0.01129813, + "balance_loss_clip": 1.00170147, + "balance_loss_mlp": 1.00087643, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 1.6854544481178453, + "language_loss": 0.71840668, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74076909, + "num_input_tokens_seen": 118763275, + "step": 5528, + "time_per_iteration": 2.7167139053344727 + }, + { + "auxiliary_loss_clip": 0.01138601, + "auxiliary_loss_mlp": 0.01129838, + "balance_loss_clip": 1.00181174, + "balance_loss_mlp": 1.00061536, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 3.992923154465842, + "language_loss": 0.82423842, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84692281, + "num_input_tokens_seen": 118781110, + "step": 5529, + "time_per_iteration": 3.9489896297454834 + }, + { + "auxiliary_loss_clip": 0.01139186, + "auxiliary_loss_mlp": 0.00747872, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00013566, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 2.078329941290725, + "language_loss": 0.69871062, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71758115, + "num_input_tokens_seen": 118800620, + "step": 5530, + "time_per_iteration": 2.6257362365722656 + }, + { + "auxiliary_loss_clip": 0.01155903, + "auxiliary_loss_mlp": 0.01130724, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00092888, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.9176213987475244, + "language_loss": 0.76310956, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78597581, + "num_input_tokens_seen": 118818725, + "step": 5531, + "time_per_iteration": 3.9978349208831787 + }, + { + "auxiliary_loss_clip": 0.0113773, + "auxiliary_loss_mlp": 0.01131526, + "balance_loss_clip": 1.00182617, + "balance_loss_mlp": 1.00077736, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 2.003564363317506, + "language_loss": 0.73289466, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75558722, + "num_input_tokens_seen": 118839390, + "step": 5532, + "time_per_iteration": 4.0488526821136475 + }, + { + "auxiliary_loss_clip": 0.01137004, + "auxiliary_loss_mlp": 0.01130348, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00074351, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.1602018203916513, + "language_loss": 0.65787899, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.68055248, + "num_input_tokens_seen": 118856275, + "step": 5533, + "time_per_iteration": 3.998033046722412 + }, + { + "auxiliary_loss_clip": 0.01090173, + "auxiliary_loss_mlp": 0.01130227, + "balance_loss_clip": 1.00162375, + "balance_loss_mlp": 1.00071776, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.6475550102166852, + "language_loss": 0.71268284, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73488683, + "num_input_tokens_seen": 118873830, + "step": 5534, + "time_per_iteration": 2.725531578063965 + }, + { + "auxiliary_loss_clip": 0.01129159, + "auxiliary_loss_mlp": 0.01130276, + "balance_loss_clip": 1.00218081, + "balance_loss_mlp": 1.00086236, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.6107131633483776, + "language_loss": 0.67080808, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69340241, + "num_input_tokens_seen": 118891560, + "step": 5535, + "time_per_iteration": 2.6787726879119873 + }, + { + "auxiliary_loss_clip": 0.01155692, + "auxiliary_loss_mlp": 0.00747864, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.0001514, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.255645932425731, + "language_loss": 0.72849631, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.74753183, + "num_input_tokens_seen": 118910260, + "step": 5536, + "time_per_iteration": 2.6011555194854736 + }, + { + "auxiliary_loss_clip": 0.01154337, + "auxiliary_loss_mlp": 0.01129976, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00084901, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.8431018254865268, + "language_loss": 0.816589, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83943218, + "num_input_tokens_seen": 118929985, + "step": 5537, + "time_per_iteration": 2.602177619934082 + }, + { + "auxiliary_loss_clip": 0.01154173, + "auxiliary_loss_mlp": 0.01131271, + "balance_loss_clip": 1.00187826, + "balance_loss_mlp": 1.00071335, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 1.994717506362059, + "language_loss": 0.71347612, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73633051, + "num_input_tokens_seen": 118951355, + "step": 5538, + "time_per_iteration": 2.6151819229125977 + }, + { + "auxiliary_loss_clip": 0.01154152, + "auxiliary_loss_mlp": 0.01129691, + "balance_loss_clip": 1.0018841, + "balance_loss_mlp": 1.00065863, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 2.361667014805911, + "language_loss": 0.74472624, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76756471, + "num_input_tokens_seen": 118970910, + "step": 5539, + "time_per_iteration": 2.56550931930542 + }, + { + "auxiliary_loss_clip": 0.01170931, + "auxiliary_loss_mlp": 0.01131079, + "balance_loss_clip": 1.00192297, + "balance_loss_mlp": 1.00080705, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.801161064316014, + "language_loss": 0.7110765, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73409665, + "num_input_tokens_seen": 118989200, + "step": 5540, + "time_per_iteration": 2.469229221343994 + }, + { + "auxiliary_loss_clip": 0.01154031, + "auxiliary_loss_mlp": 0.01130315, + "balance_loss_clip": 1.00180352, + "balance_loss_mlp": 1.00052023, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 1.8978192442036548, + "language_loss": 0.60378253, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62662601, + "num_input_tokens_seen": 119011030, + "step": 5541, + "time_per_iteration": 2.7064054012298584 + }, + { + "auxiliary_loss_clip": 0.01138266, + "auxiliary_loss_mlp": 0.01130553, + "balance_loss_clip": 1.00171506, + "balance_loss_mlp": 1.00075793, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.6762636901217105, + "language_loss": 0.68934733, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.71203554, + "num_input_tokens_seen": 119030620, + "step": 5542, + "time_per_iteration": 2.599820852279663 + }, + { + "auxiliary_loss_clip": 0.01155334, + "auxiliary_loss_mlp": 0.01130526, + "balance_loss_clip": 1.00188553, + "balance_loss_mlp": 1.00073123, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.6018905922996771, + "language_loss": 0.74929219, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77215075, + "num_input_tokens_seen": 119048015, + "step": 5543, + "time_per_iteration": 2.5246150493621826 + }, + { + "auxiliary_loss_clip": 0.01074274, + "auxiliary_loss_mlp": 0.01130948, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00077188, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 2.0662212041640755, + "language_loss": 0.75034702, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77239919, + "num_input_tokens_seen": 119066280, + "step": 5544, + "time_per_iteration": 2.782520294189453 + }, + { + "auxiliary_loss_clip": 0.01170825, + "auxiliary_loss_mlp": 0.01130498, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.0007031, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 4.472860361019884, + "language_loss": 0.7055788, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.72859198, + "num_input_tokens_seen": 119087680, + "step": 5545, + "time_per_iteration": 2.5921387672424316 + }, + { + "auxiliary_loss_clip": 0.01127009, + "auxiliary_loss_mlp": 0.0113024, + "balance_loss_clip": 1.0017215, + "balance_loss_mlp": 1.00073135, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.6879965174240887, + "language_loss": 0.69022596, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71279848, + "num_input_tokens_seen": 119105820, + "step": 5546, + "time_per_iteration": 2.6266229152679443 + }, + { + "auxiliary_loss_clip": 0.0112242, + "auxiliary_loss_mlp": 0.01131089, + "balance_loss_clip": 1.00168681, + "balance_loss_mlp": 1.00081682, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.539372972784882, + "language_loss": 0.64635754, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66889262, + "num_input_tokens_seen": 119126630, + "step": 5547, + "time_per_iteration": 2.7090322971343994 + }, + { + "auxiliary_loss_clip": 0.0113998, + "auxiliary_loss_mlp": 0.01130056, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00064254, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.082024592482928, + "language_loss": 0.85141885, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87411922, + "num_input_tokens_seen": 119143375, + "step": 5548, + "time_per_iteration": 2.615539073944092 + }, + { + "auxiliary_loss_clip": 0.01153758, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00076652, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.1061150197485055, + "language_loss": 0.74784458, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.77068776, + "num_input_tokens_seen": 119166450, + "step": 5549, + "time_per_iteration": 2.761228561401367 + }, + { + "auxiliary_loss_clip": 0.01155649, + "auxiliary_loss_mlp": 0.01130206, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00079203, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.8223527355718678, + "language_loss": 0.68372613, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70658469, + "num_input_tokens_seen": 119189645, + "step": 5550, + "time_per_iteration": 2.7533602714538574 + }, + { + "auxiliary_loss_clip": 0.01124107, + "auxiliary_loss_mlp": 0.01130554, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00085449, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.8065751097100113, + "language_loss": 0.60222781, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6247744, + "num_input_tokens_seen": 119208045, + "step": 5551, + "time_per_iteration": 2.6396067142486572 + }, + { + "auxiliary_loss_clip": 0.01122749, + "auxiliary_loss_mlp": 0.01130314, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00070953, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 2.0560435801231165, + "language_loss": 0.70797968, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.73051035, + "num_input_tokens_seen": 119224910, + "step": 5552, + "time_per_iteration": 2.6467843055725098 + }, + { + "auxiliary_loss_clip": 0.01123672, + "auxiliary_loss_mlp": 0.01129806, + "balance_loss_clip": 1.00168681, + "balance_loss_mlp": 1.00058341, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 1.9737504428460815, + "language_loss": 0.83200455, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85453933, + "num_input_tokens_seen": 119243290, + "step": 5553, + "time_per_iteration": 2.62001895904541 + }, + { + "auxiliary_loss_clip": 0.01137276, + "auxiliary_loss_mlp": 0.00747804, + "balance_loss_clip": 1.00178826, + "balance_loss_mlp": 1.00020432, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 4.112899460342006, + "language_loss": 0.81123221, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.83008301, + "num_input_tokens_seen": 119261195, + "step": 5554, + "time_per_iteration": 2.58904767036438 + }, + { + "auxiliary_loss_clip": 0.01154141, + "auxiliary_loss_mlp": 0.01130367, + "balance_loss_clip": 1.00184977, + "balance_loss_mlp": 1.00085831, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 3.7178309742804587, + "language_loss": 0.81284565, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83569074, + "num_input_tokens_seen": 119282845, + "step": 5555, + "time_per_iteration": 2.6237781047821045 + }, + { + "auxiliary_loss_clip": 0.01155633, + "auxiliary_loss_mlp": 0.01129552, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.00061488, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 2.0667181629322378, + "language_loss": 0.74083173, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76368356, + "num_input_tokens_seen": 119304430, + "step": 5556, + "time_per_iteration": 2.6466593742370605 + }, + { + "auxiliary_loss_clip": 0.0115425, + "auxiliary_loss_mlp": 0.01129868, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00083566, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.4965278330447624, + "language_loss": 0.82309449, + "learning_rate": 3.105811900403391e-06, + "loss": 0.8459357, + "num_input_tokens_seen": 119323830, + "step": 5557, + "time_per_iteration": 2.6055352687835693 + }, + { + "auxiliary_loss_clip": 0.01137466, + "auxiliary_loss_mlp": 0.0113004, + "balance_loss_clip": 1.00182295, + "balance_loss_mlp": 1.00062609, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 2.1986873545543903, + "language_loss": 0.79996765, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82264268, + "num_input_tokens_seen": 119346340, + "step": 5558, + "time_per_iteration": 2.6572611331939697 + }, + { + "auxiliary_loss_clip": 0.01139174, + "auxiliary_loss_mlp": 0.0112951, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00057352, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.6909999691349868, + "language_loss": 0.81115198, + "learning_rate": 3.105162783594788e-06, + "loss": 0.83383882, + "num_input_tokens_seen": 119367285, + "step": 5559, + "time_per_iteration": 2.642930746078491 + }, + { + "auxiliary_loss_clip": 0.01121937, + "auxiliary_loss_mlp": 0.01129995, + "balance_loss_clip": 1.00180757, + "balance_loss_mlp": 1.00086701, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 1.9146980279702956, + "language_loss": 0.71648824, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.73900753, + "num_input_tokens_seen": 119385370, + "step": 5560, + "time_per_iteration": 2.6287853717803955 + }, + { + "auxiliary_loss_clip": 0.01138808, + "auxiliary_loss_mlp": 0.01131045, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.0008682, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 1.410532755227845, + "language_loss": 0.75218737, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77488589, + "num_input_tokens_seen": 119409150, + "step": 5561, + "time_per_iteration": 2.6738555431365967 + }, + { + "auxiliary_loss_clip": 0.01138991, + "auxiliary_loss_mlp": 0.01130116, + "balance_loss_clip": 1.00190771, + "balance_loss_mlp": 1.00070226, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 1.8825158199688312, + "language_loss": 0.69413197, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71682304, + "num_input_tokens_seen": 119426475, + "step": 5562, + "time_per_iteration": 2.581359386444092 + }, + { + "auxiliary_loss_clip": 0.01154039, + "auxiliary_loss_mlp": 0.01129388, + "balance_loss_clip": 1.00188673, + "balance_loss_mlp": 1.00064158, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.910263275435402, + "language_loss": 0.65052897, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67336321, + "num_input_tokens_seen": 119446900, + "step": 5563, + "time_per_iteration": 2.556666612625122 + }, + { + "auxiliary_loss_clip": 0.01089172, + "auxiliary_loss_mlp": 0.01130307, + "balance_loss_clip": 1.00163364, + "balance_loss_mlp": 1.00089335, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 7.515583716300535, + "language_loss": 0.73804462, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76023942, + "num_input_tokens_seen": 119470945, + "step": 5564, + "time_per_iteration": 2.9832658767700195 + }, + { + "auxiliary_loss_clip": 0.01119733, + "auxiliary_loss_mlp": 0.01111736, + "balance_loss_clip": 1.00126219, + "balance_loss_mlp": 1.00034678, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7757478295219219, + "language_loss": 0.55544019, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57775486, + "num_input_tokens_seen": 119529925, + "step": 5565, + "time_per_iteration": 3.13969349861145 + }, + { + "auxiliary_loss_clip": 0.01170614, + "auxiliary_loss_mlp": 0.01129807, + "balance_loss_clip": 1.00195408, + "balance_loss_mlp": 1.00067997, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.871180221121766, + "language_loss": 0.64905596, + "learning_rate": 3.102889555312721e-06, + "loss": 0.67206019, + "num_input_tokens_seen": 119550700, + "step": 5566, + "time_per_iteration": 4.035922050476074 + }, + { + "auxiliary_loss_clip": 0.01138975, + "auxiliary_loss_mlp": 0.01130344, + "balance_loss_clip": 1.00177205, + "balance_loss_mlp": 1.00073993, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.7913027966696067, + "language_loss": 0.77399075, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79668397, + "num_input_tokens_seen": 119569295, + "step": 5567, + "time_per_iteration": 2.650750160217285 + }, + { + "auxiliary_loss_clip": 0.01137225, + "auxiliary_loss_mlp": 0.0113091, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00063789, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.9905556344990059, + "language_loss": 0.76153064, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78421199, + "num_input_tokens_seen": 119587375, + "step": 5568, + "time_per_iteration": 2.55433988571167 + }, + { + "auxiliary_loss_clip": 0.0112143, + "auxiliary_loss_mlp": 0.01130231, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.0009129, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.0309257294164955, + "language_loss": 0.70761973, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73013639, + "num_input_tokens_seen": 119604530, + "step": 5569, + "time_per_iteration": 4.197608232498169 + }, + { + "auxiliary_loss_clip": 0.01123648, + "auxiliary_loss_mlp": 0.01130951, + "balance_loss_clip": 1.00169623, + "balance_loss_mlp": 1.00067925, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 2.6736282097121515, + "language_loss": 0.89478171, + "learning_rate": 3.10158964737502e-06, + "loss": 0.9173277, + "num_input_tokens_seen": 119621025, + "step": 5570, + "time_per_iteration": 4.086466550827026 + }, + { + "auxiliary_loss_clip": 0.01122067, + "auxiliary_loss_mlp": 0.01130258, + "balance_loss_clip": 1.00191224, + "balance_loss_mlp": 1.00065339, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.4763205952638825, + "language_loss": 0.79832518, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82084846, + "num_input_tokens_seen": 119641725, + "step": 5571, + "time_per_iteration": 4.104155540466309 + }, + { + "auxiliary_loss_clip": 0.0116661, + "auxiliary_loss_mlp": 0.00747353, + "balance_loss_clip": 1.00132799, + "balance_loss_mlp": 1.00048494, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.8897613192083998, + "language_loss": 0.55929077, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.57843041, + "num_input_tokens_seen": 119693560, + "step": 5572, + "time_per_iteration": 3.060455560684204 + }, + { + "auxiliary_loss_clip": 0.01170766, + "auxiliary_loss_mlp": 0.01130062, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.00093412, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 1.9073305452570866, + "language_loss": 0.78116298, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.8041712, + "num_input_tokens_seen": 119712935, + "step": 5573, + "time_per_iteration": 2.558769941329956 + }, + { + "auxiliary_loss_clip": 0.01121307, + "auxiliary_loss_mlp": 0.01129874, + "balance_loss_clip": 1.00181067, + "balance_loss_mlp": 1.00093734, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 3.2274298162603126, + "language_loss": 0.72477412, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.7472859, + "num_input_tokens_seen": 119731680, + "step": 5574, + "time_per_iteration": 2.721306085586548 + }, + { + "auxiliary_loss_clip": 0.01153863, + "auxiliary_loss_mlp": 0.01129488, + "balance_loss_clip": 1.00177872, + "balance_loss_mlp": 1.0007422, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 1.7943071192546847, + "language_loss": 0.88297093, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.9058044, + "num_input_tokens_seen": 119752155, + "step": 5575, + "time_per_iteration": 2.5901780128479004 + }, + { + "auxiliary_loss_clip": 0.01137479, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_clip": 1.00180781, + "balance_loss_mlp": 1.00094724, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 5.753892533320764, + "language_loss": 0.8267504, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84943831, + "num_input_tokens_seen": 119769195, + "step": 5576, + "time_per_iteration": 2.6187903881073 + }, + { + "auxiliary_loss_clip": 0.01153943, + "auxiliary_loss_mlp": 0.01129937, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00080943, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 2.1630228471502693, + "language_loss": 0.72325593, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.7460947, + "num_input_tokens_seen": 119786810, + "step": 5577, + "time_per_iteration": 2.5503716468811035 + }, + { + "auxiliary_loss_clip": 0.01127713, + "auxiliary_loss_mlp": 0.01130313, + "balance_loss_clip": 1.00224566, + "balance_loss_mlp": 1.00080454, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.7539857923207247, + "language_loss": 0.81777036, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.84035063, + "num_input_tokens_seen": 119805395, + "step": 5578, + "time_per_iteration": 2.6050100326538086 + }, + { + "auxiliary_loss_clip": 0.01095282, + "auxiliary_loss_mlp": 0.00747734, + "balance_loss_clip": 1.00199401, + "balance_loss_mlp": 1.00004411, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.9766357378784936, + "language_loss": 0.71671891, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73514915, + "num_input_tokens_seen": 119823135, + "step": 5579, + "time_per_iteration": 2.685858726501465 + }, + { + "auxiliary_loss_clip": 0.011054, + "auxiliary_loss_mlp": 0.0113042, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00100613, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 2.2450256551085936, + "language_loss": 0.80888945, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83124769, + "num_input_tokens_seen": 119842265, + "step": 5580, + "time_per_iteration": 2.642610549926758 + }, + { + "auxiliary_loss_clip": 0.01143725, + "auxiliary_loss_mlp": 0.01130553, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.00066233, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.6303753983086897, + "language_loss": 0.77822924, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80097198, + "num_input_tokens_seen": 119862500, + "step": 5581, + "time_per_iteration": 2.6092560291290283 + }, + { + "auxiliary_loss_clip": 0.01122318, + "auxiliary_loss_mlp": 0.01130936, + "balance_loss_clip": 1.00179219, + "balance_loss_mlp": 1.00075936, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 2.0789186347277435, + "language_loss": 0.74534261, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76787508, + "num_input_tokens_seen": 119880160, + "step": 5582, + "time_per_iteration": 2.5772809982299805 + }, + { + "auxiliary_loss_clip": 0.01139057, + "auxiliary_loss_mlp": 0.01130902, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.0009166, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.4671032439934453, + "language_loss": 0.81931424, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84201384, + "num_input_tokens_seen": 119899040, + "step": 5583, + "time_per_iteration": 2.578418493270874 + }, + { + "auxiliary_loss_clip": 0.0113764, + "auxiliary_loss_mlp": 0.01130497, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00108314, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 6.986122502098084, + "language_loss": 0.77453721, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79721862, + "num_input_tokens_seen": 119921120, + "step": 5584, + "time_per_iteration": 2.712771415710449 + }, + { + "auxiliary_loss_clip": 0.01137649, + "auxiliary_loss_mlp": 0.01130177, + "balance_loss_clip": 1.00181758, + "balance_loss_mlp": 1.00066757, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 3.919950321440661, + "language_loss": 0.75908315, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78176141, + "num_input_tokens_seen": 119940165, + "step": 5585, + "time_per_iteration": 2.596846103668213 + }, + { + "auxiliary_loss_clip": 0.01155, + "auxiliary_loss_mlp": 0.01129456, + "balance_loss_clip": 1.00181913, + "balance_loss_mlp": 1.00071037, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.5403511964221994, + "language_loss": 0.77416968, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79701424, + "num_input_tokens_seen": 119959730, + "step": 5586, + "time_per_iteration": 2.5823097229003906 + }, + { + "auxiliary_loss_clip": 0.01122061, + "auxiliary_loss_mlp": 0.01131264, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00080132, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.7062764686040561, + "language_loss": 0.80537009, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.82790339, + "num_input_tokens_seen": 119979315, + "step": 5587, + "time_per_iteration": 2.6148784160614014 + }, + { + "auxiliary_loss_clip": 0.01170735, + "auxiliary_loss_mlp": 0.01129728, + "balance_loss_clip": 1.00209486, + "balance_loss_mlp": 1.00079107, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.889867276578839, + "language_loss": 0.66966438, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69266897, + "num_input_tokens_seen": 119996140, + "step": 5588, + "time_per_iteration": 2.496049165725708 + }, + { + "auxiliary_loss_clip": 0.01140471, + "auxiliary_loss_mlp": 0.00747785, + "balance_loss_clip": 1.00196683, + "balance_loss_mlp": 1.00006211, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 2.37493103835883, + "language_loss": 0.69812155, + "learning_rate": 3.095405970878919e-06, + "loss": 0.71700406, + "num_input_tokens_seen": 120017720, + "step": 5589, + "time_per_iteration": 2.6671531200408936 + }, + { + "auxiliary_loss_clip": 0.01138982, + "auxiliary_loss_mlp": 0.01130459, + "balance_loss_clip": 1.00182629, + "balance_loss_mlp": 1.00085485, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 1.7028847539484908, + "language_loss": 0.66921735, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69191182, + "num_input_tokens_seen": 120036335, + "step": 5590, + "time_per_iteration": 2.6072700023651123 + }, + { + "auxiliary_loss_clip": 0.01123229, + "auxiliary_loss_mlp": 0.01129225, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.00076485, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 2.147407989122613, + "language_loss": 0.73057473, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75309926, + "num_input_tokens_seen": 120056120, + "step": 5591, + "time_per_iteration": 2.6576149463653564 + }, + { + "auxiliary_loss_clip": 0.01170767, + "auxiliary_loss_mlp": 0.01129365, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00080991, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 1.9334358071148767, + "language_loss": 0.70266807, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72566938, + "num_input_tokens_seen": 120073650, + "step": 5592, + "time_per_iteration": 2.5203258991241455 + }, + { + "auxiliary_loss_clip": 0.01137277, + "auxiliary_loss_mlp": 0.01129686, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.0006541, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 5.58759290836334, + "language_loss": 0.76411474, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78678441, + "num_input_tokens_seen": 120093260, + "step": 5593, + "time_per_iteration": 2.631463050842285 + }, + { + "auxiliary_loss_clip": 0.01123191, + "auxiliary_loss_mlp": 0.00747863, + "balance_loss_clip": 1.00178409, + "balance_loss_mlp": 1.00006366, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.645787949388731, + "language_loss": 0.72037572, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73908627, + "num_input_tokens_seen": 120111830, + "step": 5594, + "time_per_iteration": 2.6311466693878174 + }, + { + "auxiliary_loss_clip": 0.01107196, + "auxiliary_loss_mlp": 0.00747972, + "balance_loss_clip": 1.00175512, + "balance_loss_mlp": 1.00013852, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.9338931210507175, + "language_loss": 0.80076933, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.81932104, + "num_input_tokens_seen": 120130470, + "step": 5595, + "time_per_iteration": 2.694070339202881 + }, + { + "auxiliary_loss_clip": 0.0113764, + "auxiliary_loss_mlp": 0.01129384, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00073338, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.7492486750913052, + "language_loss": 0.81010675, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.8327769, + "num_input_tokens_seen": 120150735, + "step": 5596, + "time_per_iteration": 2.606328248977661 + }, + { + "auxiliary_loss_clip": 0.01137527, + "auxiliary_loss_mlp": 0.01129864, + "balance_loss_clip": 1.00192964, + "balance_loss_mlp": 1.00073695, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.5874657337811482, + "language_loss": 0.75706762, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.77974153, + "num_input_tokens_seen": 120173230, + "step": 5597, + "time_per_iteration": 2.654001474380493 + }, + { + "auxiliary_loss_clip": 0.01155258, + "auxiliary_loss_mlp": 0.01129716, + "balance_loss_clip": 1.0020684, + "balance_loss_mlp": 1.00058913, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 2.132160462241481, + "language_loss": 0.78895319, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.81180298, + "num_input_tokens_seen": 120191860, + "step": 5598, + "time_per_iteration": 2.6390535831451416 + }, + { + "auxiliary_loss_clip": 0.01170954, + "auxiliary_loss_mlp": 0.01130821, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00064492, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.6839403812109366, + "language_loss": 0.64407665, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66709441, + "num_input_tokens_seen": 120219195, + "step": 5599, + "time_per_iteration": 2.706707715988159 + }, + { + "auxiliary_loss_clip": 0.01123967, + "auxiliary_loss_mlp": 0.01131016, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00074458, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.3462068090963295, + "language_loss": 0.82176852, + "learning_rate": 3.091819088459249e-06, + "loss": 0.84431833, + "num_input_tokens_seen": 120232950, + "step": 5600, + "time_per_iteration": 2.559157371520996 + }, + { + "auxiliary_loss_clip": 0.01155107, + "auxiliary_loss_mlp": 0.0113073, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00083911, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 4.181630245931634, + "language_loss": 0.83071333, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.85357165, + "num_input_tokens_seen": 120248865, + "step": 5601, + "time_per_iteration": 2.531583786010742 + }, + { + "auxiliary_loss_clip": 0.01154974, + "auxiliary_loss_mlp": 0.01129128, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.0007633, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.6021699797566795, + "language_loss": 0.83337879, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85621983, + "num_input_tokens_seen": 120267820, + "step": 5602, + "time_per_iteration": 2.512354850769043 + }, + { + "auxiliary_loss_clip": 0.01170846, + "auxiliary_loss_mlp": 0.01130439, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.00102568, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.777294452239133, + "language_loss": 0.69399607, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.71700895, + "num_input_tokens_seen": 120286540, + "step": 5603, + "time_per_iteration": 2.5094950199127197 + }, + { + "auxiliary_loss_clip": 0.01138758, + "auxiliary_loss_mlp": 0.01131334, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.00096655, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.434472444520496, + "language_loss": 0.82933903, + "learning_rate": 3.090513524656898e-06, + "loss": 0.85203993, + "num_input_tokens_seen": 120307305, + "step": 5604, + "time_per_iteration": 2.5936334133148193 + }, + { + "auxiliary_loss_clip": 0.01122899, + "auxiliary_loss_mlp": 0.01130237, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00101459, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 2.0858437844940085, + "language_loss": 0.73706555, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75959694, + "num_input_tokens_seen": 120327845, + "step": 5605, + "time_per_iteration": 4.0425825119018555 + }, + { + "auxiliary_loss_clip": 0.01138437, + "auxiliary_loss_mlp": 0.01131075, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00080287, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.4774423084954322, + "language_loss": 0.83540267, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85809773, + "num_input_tokens_seen": 120343255, + "step": 5606, + "time_per_iteration": 2.5948379039764404 + }, + { + "auxiliary_loss_clip": 0.01138524, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.0007726, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.4267505517972299, + "language_loss": 0.67521399, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69790202, + "num_input_tokens_seen": 120361745, + "step": 5607, + "time_per_iteration": 3.98152494430542 + }, + { + "auxiliary_loss_clip": 0.01155705, + "auxiliary_loss_mlp": 0.01131165, + "balance_loss_clip": 1.0020051, + "balance_loss_mlp": 1.00079811, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 2.176155582335374, + "language_loss": 0.70141244, + "learning_rate": 3.089207299216464e-06, + "loss": 0.72428107, + "num_input_tokens_seen": 120380565, + "step": 5608, + "time_per_iteration": 3.949036121368408 + }, + { + "auxiliary_loss_clip": 0.01075191, + "auxiliary_loss_mlp": 0.01130297, + "balance_loss_clip": 1.00178123, + "balance_loss_mlp": 1.00097847, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 2.9863185791820617, + "language_loss": 0.79125702, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81331193, + "num_input_tokens_seen": 120399235, + "step": 5609, + "time_per_iteration": 4.334996461868286 + }, + { + "auxiliary_loss_clip": 0.0115435, + "auxiliary_loss_mlp": 0.01130932, + "balance_loss_clip": 1.00200593, + "balance_loss_mlp": 1.00094604, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 1.5752011423109749, + "language_loss": 0.82438213, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84723496, + "num_input_tokens_seen": 120420095, + "step": 5610, + "time_per_iteration": 2.931708812713623 + }, + { + "auxiliary_loss_clip": 0.01154123, + "auxiliary_loss_mlp": 0.01130113, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00089014, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.974390477970485, + "language_loss": 0.82511681, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84795922, + "num_input_tokens_seen": 120437690, + "step": 5611, + "time_per_iteration": 2.53967022895813 + }, + { + "auxiliary_loss_clip": 0.01138565, + "auxiliary_loss_mlp": 0.01130931, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00084949, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.7028410353087906, + "language_loss": 0.79381561, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81651062, + "num_input_tokens_seen": 120459240, + "step": 5612, + "time_per_iteration": 2.65625262260437 + }, + { + "auxiliary_loss_clip": 0.01088745, + "auxiliary_loss_mlp": 0.01130258, + "balance_loss_clip": 1.00158882, + "balance_loss_mlp": 1.00084472, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.6904223875230846, + "language_loss": 0.70408583, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7262758, + "num_input_tokens_seen": 120481090, + "step": 5613, + "time_per_iteration": 2.8279168605804443 + }, + { + "auxiliary_loss_clip": 0.01139304, + "auxiliary_loss_mlp": 0.01130578, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00097394, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.7014561736390061, + "language_loss": 0.79458725, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81728607, + "num_input_tokens_seen": 120500045, + "step": 5614, + "time_per_iteration": 2.588599920272827 + }, + { + "auxiliary_loss_clip": 0.0113936, + "auxiliary_loss_mlp": 0.01130522, + "balance_loss_clip": 1.00195074, + "balance_loss_mlp": 1.00082207, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.7597191168753015, + "language_loss": 0.90957242, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93227118, + "num_input_tokens_seen": 120521125, + "step": 5615, + "time_per_iteration": 2.6431877613067627 + }, + { + "auxiliary_loss_clip": 0.01154165, + "auxiliary_loss_mlp": 0.01130459, + "balance_loss_clip": 1.00192726, + "balance_loss_mlp": 1.00095046, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.8829746405670633, + "language_loss": 0.80652165, + "learning_rate": 3.086592866591809e-06, + "loss": 0.82936788, + "num_input_tokens_seen": 120539180, + "step": 5616, + "time_per_iteration": 2.6806349754333496 + }, + { + "auxiliary_loss_clip": 0.01155127, + "auxiliary_loss_mlp": 0.00747993, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00017571, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 1.9067528117073953, + "language_loss": 0.83682835, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.85585952, + "num_input_tokens_seen": 120556280, + "step": 5617, + "time_per_iteration": 2.6074891090393066 + }, + { + "auxiliary_loss_clip": 0.0108155, + "auxiliary_loss_mlp": 0.01130094, + "balance_loss_clip": 1.00227785, + "balance_loss_mlp": 1.00096667, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 2.306906856009842, + "language_loss": 0.79737282, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.81948924, + "num_input_tokens_seen": 120575395, + "step": 5618, + "time_per_iteration": 2.7360002994537354 + }, + { + "auxiliary_loss_clip": 0.0110708, + "auxiliary_loss_mlp": 0.01130657, + "balance_loss_clip": 1.00169206, + "balance_loss_mlp": 1.00086212, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 2.1170802000932567, + "language_loss": 0.70708203, + "learning_rate": 3.085611774155481e-06, + "loss": 0.7294594, + "num_input_tokens_seen": 120596075, + "step": 5619, + "time_per_iteration": 2.6873679161071777 + }, + { + "auxiliary_loss_clip": 0.0113892, + "auxiliary_loss_mlp": 0.01130419, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00110114, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 2.4284286389304195, + "language_loss": 0.70520937, + "learning_rate": 3.085284660993821e-06, + "loss": 0.72790277, + "num_input_tokens_seen": 120614195, + "step": 5620, + "time_per_iteration": 2.5893137454986572 + }, + { + "auxiliary_loss_clip": 0.01170895, + "auxiliary_loss_mlp": 0.01130521, + "balance_loss_clip": 1.0020802, + "balance_loss_mlp": 1.00101233, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 2.2652596363977167, + "language_loss": 0.67746782, + "learning_rate": 3.084957506678058e-06, + "loss": 0.70048201, + "num_input_tokens_seen": 120634475, + "step": 5621, + "time_per_iteration": 2.551722288131714 + }, + { + "auxiliary_loss_clip": 0.01137328, + "auxiliary_loss_mlp": 0.0113004, + "balance_loss_clip": 1.00179875, + "balance_loss_mlp": 1.0010078, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 4.373538060039747, + "language_loss": 0.82809627, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.85076994, + "num_input_tokens_seen": 120654980, + "step": 5622, + "time_per_iteration": 2.654754161834717 + }, + { + "auxiliary_loss_clip": 0.01121626, + "auxiliary_loss_mlp": 0.01130362, + "balance_loss_clip": 1.00179052, + "balance_loss_mlp": 1.00075805, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.3945033889231464, + "language_loss": 0.73420417, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.756724, + "num_input_tokens_seen": 120676245, + "step": 5623, + "time_per_iteration": 2.6844873428344727 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01113431, + "balance_loss_clip": 1.00147569, + "balance_loss_mlp": 1.00051618, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7457854529029846, + "language_loss": 0.54960424, + "learning_rate": 3.083975796930215e-06, + "loss": 0.57210338, + "num_input_tokens_seen": 120741965, + "step": 5624, + "time_per_iteration": 3.295808792114258 + }, + { + "auxiliary_loss_clip": 0.01122353, + "auxiliary_loss_mlp": 0.01130322, + "balance_loss_clip": 1.00181508, + "balance_loss_mlp": 1.00100422, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 2.1605807729141158, + "language_loss": 0.73211497, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75464177, + "num_input_tokens_seen": 120760410, + "step": 5625, + "time_per_iteration": 2.6263601779937744 + }, + { + "auxiliary_loss_clip": 0.01154411, + "auxiliary_loss_mlp": 0.01130883, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00108814, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 2.1736037840494675, + "language_loss": 0.70310068, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72595358, + "num_input_tokens_seen": 120777705, + "step": 5626, + "time_per_iteration": 2.5791375637054443 + }, + { + "auxiliary_loss_clip": 0.01139019, + "auxiliary_loss_mlp": 0.01129709, + "balance_loss_clip": 1.00189006, + "balance_loss_mlp": 1.00077248, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.6138539322012848, + "language_loss": 0.80964446, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83233166, + "num_input_tokens_seen": 120798660, + "step": 5627, + "time_per_iteration": 2.6358530521392822 + }, + { + "auxiliary_loss_clip": 0.01154337, + "auxiliary_loss_mlp": 0.00748054, + "balance_loss_clip": 1.00196493, + "balance_loss_mlp": 1.00016737, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 1.8517935126151102, + "language_loss": 0.8105222, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82954609, + "num_input_tokens_seen": 120816705, + "step": 5628, + "time_per_iteration": 2.5497512817382812 + }, + { + "auxiliary_loss_clip": 0.01105609, + "auxiliary_loss_mlp": 0.01130988, + "balance_loss_clip": 1.00168979, + "balance_loss_mlp": 1.00090635, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.9581081773888354, + "language_loss": 0.77340388, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79576993, + "num_input_tokens_seen": 120835375, + "step": 5629, + "time_per_iteration": 2.695798635482788 + }, + { + "auxiliary_loss_clip": 0.01139176, + "auxiliary_loss_mlp": 0.01131094, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00091767, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.8460219875686885, + "language_loss": 0.84731179, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87001449, + "num_input_tokens_seen": 120854260, + "step": 5630, + "time_per_iteration": 2.5799601078033447 + }, + { + "auxiliary_loss_clip": 0.0110581, + "auxiliary_loss_mlp": 0.01130908, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00120807, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 1.9891122819185063, + "language_loss": 0.71697247, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73933971, + "num_input_tokens_seen": 120871590, + "step": 5631, + "time_per_iteration": 2.6686530113220215 + }, + { + "auxiliary_loss_clip": 0.01134276, + "auxiliary_loss_mlp": 0.01113208, + "balance_loss_clip": 1.00152564, + "balance_loss_mlp": 1.00105631, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.8495229152563288, + "language_loss": 0.56185329, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58432817, + "num_input_tokens_seen": 120925550, + "step": 5632, + "time_per_iteration": 3.2133378982543945 + }, + { + "auxiliary_loss_clip": 0.01154567, + "auxiliary_loss_mlp": 0.01130133, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.0008148, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.7300366785576065, + "language_loss": 0.80446255, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82730955, + "num_input_tokens_seen": 120947620, + "step": 5633, + "time_per_iteration": 2.594576835632324 + }, + { + "auxiliary_loss_clip": 0.01124034, + "auxiliary_loss_mlp": 0.01130649, + "balance_loss_clip": 1.00185478, + "balance_loss_mlp": 1.00104523, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.411843295487616, + "language_loss": 0.58970451, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61225128, + "num_input_tokens_seen": 120965205, + "step": 5634, + "time_per_iteration": 2.6538126468658447 + }, + { + "auxiliary_loss_clip": 0.01137995, + "auxiliary_loss_mlp": 0.01129962, + "balance_loss_clip": 1.00185716, + "balance_loss_mlp": 1.00083447, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.6918130474958695, + "language_loss": 0.92434525, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94702476, + "num_input_tokens_seen": 120983560, + "step": 5635, + "time_per_iteration": 2.5882294178009033 + }, + { + "auxiliary_loss_clip": 0.01122969, + "auxiliary_loss_mlp": 0.0112974, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00070834, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 2.803072132462512, + "language_loss": 0.7511096, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.7736367, + "num_input_tokens_seen": 121001400, + "step": 5636, + "time_per_iteration": 2.662416696548462 + }, + { + "auxiliary_loss_clip": 0.01155374, + "auxiliary_loss_mlp": 0.01130083, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00095594, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.5689861042718654, + "language_loss": 0.83065122, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85350579, + "num_input_tokens_seen": 121021760, + "step": 5637, + "time_per_iteration": 2.610825300216675 + }, + { + "auxiliary_loss_clip": 0.01105741, + "auxiliary_loss_mlp": 0.01131321, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00095415, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.9259871453773247, + "language_loss": 0.69950104, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72187173, + "num_input_tokens_seen": 121041070, + "step": 5638, + "time_per_iteration": 2.6672048568725586 + }, + { + "auxiliary_loss_clip": 0.01139248, + "auxiliary_loss_mlp": 0.01130439, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00102484, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.8212291594508856, + "language_loss": 0.81004524, + "learning_rate": 3.079061705792765e-06, + "loss": 0.83274209, + "num_input_tokens_seen": 121060890, + "step": 5639, + "time_per_iteration": 2.670241355895996 + }, + { + "auxiliary_loss_clip": 0.01170866, + "auxiliary_loss_mlp": 0.01130601, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00099683, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.0420967733336033, + "language_loss": 0.67396522, + "learning_rate": 3.078733771907907e-06, + "loss": 0.69697988, + "num_input_tokens_seen": 121079135, + "step": 5640, + "time_per_iteration": 2.527130126953125 + }, + { + "auxiliary_loss_clip": 0.01138436, + "auxiliary_loss_mlp": 0.01130913, + "balance_loss_clip": 1.00190735, + "balance_loss_mlp": 1.00073636, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.651434012558289, + "language_loss": 0.7011475, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72384101, + "num_input_tokens_seen": 121097685, + "step": 5641, + "time_per_iteration": 2.573786735534668 + }, + { + "auxiliary_loss_clip": 0.01171087, + "auxiliary_loss_mlp": 0.01131302, + "balance_loss_clip": 1.00216639, + "balance_loss_mlp": 1.00093448, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.6735597846024535, + "language_loss": 0.87381661, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89684045, + "num_input_tokens_seen": 121115640, + "step": 5642, + "time_per_iteration": 4.043104648590088 + }, + { + "auxiliary_loss_clip": 0.01153994, + "auxiliary_loss_mlp": 0.01128853, + "balance_loss_clip": 1.00201416, + "balance_loss_mlp": 1.00058436, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.6792913154982627, + "language_loss": 0.84113657, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86396503, + "num_input_tokens_seen": 121132485, + "step": 5643, + "time_per_iteration": 2.5535106658935547 + }, + { + "auxiliary_loss_clip": 0.01138684, + "auxiliary_loss_mlp": 0.01130068, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00103629, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.6602354798541137, + "language_loss": 0.76830155, + "learning_rate": 3.077421627435922e-06, + "loss": 0.7909891, + "num_input_tokens_seen": 121152935, + "step": 5644, + "time_per_iteration": 4.377078294754028 + }, + { + "auxiliary_loss_clip": 0.01154303, + "auxiliary_loss_mlp": 0.01130061, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.0010289, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 4.031815973221495, + "language_loss": 0.63806331, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.66090697, + "num_input_tokens_seen": 121169835, + "step": 5645, + "time_per_iteration": 3.907320976257324 + }, + { + "auxiliary_loss_clip": 0.01154274, + "auxiliary_loss_mlp": 0.01129383, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.0007323, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 2.8252067265386964, + "language_loss": 0.76684916, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78968573, + "num_input_tokens_seen": 121190290, + "step": 5646, + "time_per_iteration": 4.038933515548706 + }, + { + "auxiliary_loss_clip": 0.01154556, + "auxiliary_loss_mlp": 0.01131039, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00086212, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 2.9468672476869804, + "language_loss": 0.78942013, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81227612, + "num_input_tokens_seen": 121209060, + "step": 5647, + "time_per_iteration": 2.572478771209717 + }, + { + "auxiliary_loss_clip": 0.01137415, + "auxiliary_loss_mlp": 0.00747898, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.00011849, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 2.0098116518849034, + "language_loss": 0.77210492, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79095805, + "num_input_tokens_seen": 121227480, + "step": 5648, + "time_per_iteration": 2.6331634521484375 + }, + { + "auxiliary_loss_clip": 0.01075918, + "auxiliary_loss_mlp": 0.0111309, + "balance_loss_clip": 1.0025506, + "balance_loss_mlp": 1.00093794, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7756543416453328, + "language_loss": 0.56337678, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58526695, + "num_input_tokens_seen": 121291305, + "step": 5649, + "time_per_iteration": 3.5807793140411377 + }, + { + "auxiliary_loss_clip": 0.01138952, + "auxiliary_loss_mlp": 0.00747859, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00015163, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.9469733466940486, + "language_loss": 0.85584235, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87471044, + "num_input_tokens_seen": 121312740, + "step": 5650, + "time_per_iteration": 2.999501943588257 + }, + { + "auxiliary_loss_clip": 0.01155144, + "auxiliary_loss_mlp": 0.0112974, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00070763, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.665445176613533, + "language_loss": 0.71270072, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73554957, + "num_input_tokens_seen": 121334220, + "step": 5651, + "time_per_iteration": 2.696425676345825 + }, + { + "auxiliary_loss_clip": 0.01105538, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_clip": 1.00186646, + "balance_loss_mlp": 1.00101781, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.7839150478082804, + "language_loss": 0.80764377, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83001018, + "num_input_tokens_seen": 121351870, + "step": 5652, + "time_per_iteration": 2.6838698387145996 + }, + { + "auxiliary_loss_clip": 0.01171008, + "auxiliary_loss_mlp": 0.01130959, + "balance_loss_clip": 1.00211322, + "balance_loss_mlp": 1.0008781, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.8960200549790338, + "language_loss": 0.77040398, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79342365, + "num_input_tokens_seen": 121373400, + "step": 5653, + "time_per_iteration": 2.589721918106079 + }, + { + "auxiliary_loss_clip": 0.01155389, + "auxiliary_loss_mlp": 0.01129585, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00064862, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 2.5893268621703815, + "language_loss": 0.85696852, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87981826, + "num_input_tokens_seen": 121385225, + "step": 5654, + "time_per_iteration": 2.5160276889801025 + }, + { + "auxiliary_loss_clip": 0.01154065, + "auxiliary_loss_mlp": 0.01130455, + "balance_loss_clip": 1.00190032, + "balance_loss_mlp": 1.00075555, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 2.185266193677676, + "language_loss": 0.65388155, + "learning_rate": 3.073809861919351e-06, + "loss": 0.6767267, + "num_input_tokens_seen": 121404735, + "step": 5655, + "time_per_iteration": 2.6280055046081543 + }, + { + "auxiliary_loss_clip": 0.01154188, + "auxiliary_loss_mlp": 0.01129692, + "balance_loss_clip": 1.0020026, + "balance_loss_mlp": 1.00085044, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.5637992228732405, + "language_loss": 0.76712787, + "learning_rate": 3.073481275036697e-06, + "loss": 0.7899667, + "num_input_tokens_seen": 121426780, + "step": 5656, + "time_per_iteration": 2.616447925567627 + }, + { + "auxiliary_loss_clip": 0.01123762, + "auxiliary_loss_mlp": 0.0113055, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00075483, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.929183922936203, + "language_loss": 0.83525586, + "learning_rate": 3.073152647447525e-06, + "loss": 0.85779899, + "num_input_tokens_seen": 121447245, + "step": 5657, + "time_per_iteration": 2.6680195331573486 + }, + { + "auxiliary_loss_clip": 0.01137515, + "auxiliary_loss_mlp": 0.01129713, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00087106, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 4.766521897442593, + "language_loss": 0.85093153, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87360382, + "num_input_tokens_seen": 121468165, + "step": 5658, + "time_per_iteration": 2.740563154220581 + }, + { + "auxiliary_loss_clip": 0.01150489, + "auxiliary_loss_mlp": 0.01112148, + "balance_loss_clip": 1.00173426, + "balance_loss_mlp": 0.99999541, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8180650928407558, + "language_loss": 0.60100579, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62363207, + "num_input_tokens_seen": 121523795, + "step": 5659, + "time_per_iteration": 3.1219289302825928 + }, + { + "auxiliary_loss_clip": 0.01170706, + "auxiliary_loss_mlp": 0.01129102, + "balance_loss_clip": 1.00206697, + "balance_loss_mlp": 1.00073743, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.7682674320721057, + "language_loss": 0.67884833, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70184636, + "num_input_tokens_seen": 121542950, + "step": 5660, + "time_per_iteration": 2.5594890117645264 + }, + { + "auxiliary_loss_clip": 0.01170746, + "auxiliary_loss_mlp": 0.01130108, + "balance_loss_clip": 1.00208879, + "balance_loss_mlp": 1.00098014, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.7527724579259119, + "language_loss": 0.67079729, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69380587, + "num_input_tokens_seen": 121562765, + "step": 5661, + "time_per_iteration": 2.5543923377990723 + }, + { + "auxiliary_loss_clip": 0.01138289, + "auxiliary_loss_mlp": 0.01128987, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00090897, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.6808364681965058, + "language_loss": 0.789253, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81192577, + "num_input_tokens_seen": 121581610, + "step": 5662, + "time_per_iteration": 2.6501569747924805 + }, + { + "auxiliary_loss_clip": 0.01122261, + "auxiliary_loss_mlp": 0.01129529, + "balance_loss_clip": 1.00177646, + "balance_loss_mlp": 1.00078297, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 3.213989892374324, + "language_loss": 0.73367602, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.756194, + "num_input_tokens_seen": 121601885, + "step": 5663, + "time_per_iteration": 2.681095600128174 + }, + { + "auxiliary_loss_clip": 0.01122485, + "auxiliary_loss_mlp": 0.01128491, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.0008893, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 1.7514648440417546, + "language_loss": 0.86184317, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88435298, + "num_input_tokens_seen": 121621335, + "step": 5664, + "time_per_iteration": 2.637355089187622 + }, + { + "auxiliary_loss_clip": 0.01171016, + "auxiliary_loss_mlp": 0.01129209, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00074899, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.8241554434021492, + "language_loss": 0.68903285, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71203506, + "num_input_tokens_seen": 121641310, + "step": 5665, + "time_per_iteration": 2.522399425506592 + }, + { + "auxiliary_loss_clip": 0.01170951, + "auxiliary_loss_mlp": 0.011302, + "balance_loss_clip": 1.00210154, + "balance_loss_mlp": 1.00078642, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.6577823908930602, + "language_loss": 0.72989237, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.75290394, + "num_input_tokens_seen": 121659625, + "step": 5666, + "time_per_iteration": 2.5220208168029785 + }, + { + "auxiliary_loss_clip": 0.01155374, + "auxiliary_loss_mlp": 0.01130331, + "balance_loss_clip": 1.00198281, + "balance_loss_mlp": 1.00082183, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.7533041381853864, + "language_loss": 0.73282927, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75568628, + "num_input_tokens_seen": 121679205, + "step": 5667, + "time_per_iteration": 2.562920093536377 + }, + { + "auxiliary_loss_clip": 0.01149971, + "auxiliary_loss_mlp": 0.01112457, + "balance_loss_clip": 1.00167382, + "balance_loss_mlp": 1.00030446, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8486720509604577, + "language_loss": 0.63319373, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65581799, + "num_input_tokens_seen": 121751085, + "step": 5668, + "time_per_iteration": 3.2805142402648926 + }, + { + "auxiliary_loss_clip": 0.01046379, + "auxiliary_loss_mlp": 0.01129919, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00107741, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.153735823056548, + "language_loss": 0.71821022, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.73997319, + "num_input_tokens_seen": 121768565, + "step": 5669, + "time_per_iteration": 2.850292444229126 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.00747742, + "balance_loss_clip": 1.00179398, + "balance_loss_mlp": 1.00009632, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.9531573709281989, + "language_loss": 0.80591071, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82459497, + "num_input_tokens_seen": 121784925, + "step": 5670, + "time_per_iteration": 2.648785352706909 + }, + { + "auxiliary_loss_clip": 0.01108578, + "auxiliary_loss_mlp": 0.01130258, + "balance_loss_clip": 1.00188458, + "balance_loss_mlp": 1.0008446, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.8266447659679443, + "language_loss": 0.77068526, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79307365, + "num_input_tokens_seen": 121804425, + "step": 5671, + "time_per_iteration": 2.724066734313965 + }, + { + "auxiliary_loss_clip": 0.01170924, + "auxiliary_loss_mlp": 0.00747849, + "balance_loss_clip": 1.00208485, + "balance_loss_mlp": 1.00013113, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 1.7827010043480118, + "language_loss": 0.74295646, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.76214415, + "num_input_tokens_seen": 121825145, + "step": 5672, + "time_per_iteration": 2.5831210613250732 + }, + { + "auxiliary_loss_clip": 0.01155577, + "auxiliary_loss_mlp": 0.0112949, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00083911, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.7425179279504814, + "language_loss": 0.73481888, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75766951, + "num_input_tokens_seen": 121842185, + "step": 5673, + "time_per_iteration": 2.571214437484741 + }, + { + "auxiliary_loss_clip": 0.01155606, + "auxiliary_loss_mlp": 0.01129341, + "balance_loss_clip": 1.00197887, + "balance_loss_mlp": 1.00078535, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.7677444061459786, + "language_loss": 0.7985422, + "learning_rate": 3.067559762415682e-06, + "loss": 0.82139164, + "num_input_tokens_seen": 121862260, + "step": 5674, + "time_per_iteration": 2.5691115856170654 + }, + { + "auxiliary_loss_clip": 0.01167215, + "auxiliary_loss_mlp": 0.01111413, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.00002372, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7953845124581618, + "language_loss": 0.5613091, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58409536, + "num_input_tokens_seen": 121923560, + "step": 5675, + "time_per_iteration": 3.6627228260040283 + }, + { + "auxiliary_loss_clip": 0.01138928, + "auxiliary_loss_mlp": 0.00747718, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00008535, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.8047647861588103, + "language_loss": 0.7890479, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.80791432, + "num_input_tokens_seen": 121943515, + "step": 5676, + "time_per_iteration": 2.6859967708587646 + }, + { + "auxiliary_loss_clip": 0.01155584, + "auxiliary_loss_mlp": 0.01129504, + "balance_loss_clip": 1.00192595, + "balance_loss_mlp": 1.00066257, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8550313238776268, + "language_loss": 0.85547817, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87832904, + "num_input_tokens_seen": 121962540, + "step": 5677, + "time_per_iteration": 2.583725929260254 + }, + { + "auxiliary_loss_clip": 0.01137337, + "auxiliary_loss_mlp": 0.01129707, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00077033, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 2.1632540449021427, + "language_loss": 0.79748172, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.82015216, + "num_input_tokens_seen": 121979830, + "step": 5678, + "time_per_iteration": 2.622542381286621 + }, + { + "auxiliary_loss_clip": 0.01154236, + "auxiliary_loss_mlp": 0.01129628, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00078607, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.6002738243254466, + "language_loss": 0.75098151, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.77382016, + "num_input_tokens_seen": 121999055, + "step": 5679, + "time_per_iteration": 2.6226296424865723 + }, + { + "auxiliary_loss_clip": 0.01150164, + "auxiliary_loss_mlp": 0.01111404, + "balance_loss_clip": 1.00173235, + "balance_loss_mlp": 1.0000149, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7213771100551196, + "language_loss": 0.59439707, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61701274, + "num_input_tokens_seen": 122067015, + "step": 5680, + "time_per_iteration": 4.608680486679077 + }, + { + "auxiliary_loss_clip": 0.01138421, + "auxiliary_loss_mlp": 0.01129017, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00055707, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 1.8114433883052594, + "language_loss": 0.72208583, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74476016, + "num_input_tokens_seen": 122085295, + "step": 5681, + "time_per_iteration": 2.5868399143218994 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01129263, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00089812, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.12062995131143, + "language_loss": 0.71548152, + "learning_rate": 3.064923764577233e-06, + "loss": 0.73815614, + "num_input_tokens_seen": 122104020, + "step": 5682, + "time_per_iteration": 4.31843638420105 + }, + { + "auxiliary_loss_clip": 0.01170743, + "auxiliary_loss_mlp": 0.01129866, + "balance_loss_clip": 1.00195277, + "balance_loss_mlp": 1.00083435, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.5111197834413255, + "language_loss": 0.83790672, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.8609128, + "num_input_tokens_seen": 122125080, + "step": 5683, + "time_per_iteration": 3.9978010654449463 + }, + { + "auxiliary_loss_clip": 0.01137499, + "auxiliary_loss_mlp": 0.01130171, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00094795, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 2.5386672047186662, + "language_loss": 0.70633161, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72900832, + "num_input_tokens_seen": 122146350, + "step": 5684, + "time_per_iteration": 4.086174249649048 + }, + { + "auxiliary_loss_clip": 0.01170862, + "auxiliary_loss_mlp": 0.0112916, + "balance_loss_clip": 1.00211394, + "balance_loss_mlp": 1.00079596, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.4879713319887269, + "language_loss": 0.74829674, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77129698, + "num_input_tokens_seen": 122168085, + "step": 5685, + "time_per_iteration": 2.5531985759735107 + }, + { + "auxiliary_loss_clip": 0.01155416, + "auxiliary_loss_mlp": 0.01128753, + "balance_loss_clip": 1.00201178, + "balance_loss_mlp": 1.00067401, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.8109673798974566, + "language_loss": 0.70725554, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.73009723, + "num_input_tokens_seen": 122191040, + "step": 5686, + "time_per_iteration": 2.6165666580200195 + }, + { + "auxiliary_loss_clip": 0.01154208, + "auxiliary_loss_mlp": 0.01130072, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00065827, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.060662223052505, + "language_loss": 0.77478588, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79762864, + "num_input_tokens_seen": 122209225, + "step": 5687, + "time_per_iteration": 2.5309641361236572 + }, + { + "auxiliary_loss_clip": 0.01139088, + "auxiliary_loss_mlp": 0.01129734, + "balance_loss_clip": 1.00197947, + "balance_loss_mlp": 1.00079775, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 2.688354520479354, + "language_loss": 0.86595732, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88864553, + "num_input_tokens_seen": 122226160, + "step": 5688, + "time_per_iteration": 2.5759687423706055 + }, + { + "auxiliary_loss_clip": 0.01144828, + "auxiliary_loss_mlp": 0.01129628, + "balance_loss_clip": 1.00250208, + "balance_loss_mlp": 1.00059628, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.6356492985948194, + "language_loss": 0.79777312, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82051766, + "num_input_tokens_seen": 122243115, + "step": 5689, + "time_per_iteration": 2.621330738067627 + }, + { + "auxiliary_loss_clip": 0.01155059, + "auxiliary_loss_mlp": 0.01129761, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00072908, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 1.7975204315637774, + "language_loss": 0.73105109, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75389934, + "num_input_tokens_seen": 122261105, + "step": 5690, + "time_per_iteration": 2.5481038093566895 + }, + { + "auxiliary_loss_clip": 0.0115549, + "auxiliary_loss_mlp": 0.01129327, + "balance_loss_clip": 1.00191844, + "balance_loss_mlp": 1.00067639, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 1.764424929872021, + "language_loss": 0.76146173, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78430986, + "num_input_tokens_seen": 122279995, + "step": 5691, + "time_per_iteration": 2.604738235473633 + }, + { + "auxiliary_loss_clip": 0.0115408, + "auxiliary_loss_mlp": 0.01128698, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.0007149, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.682992573659739, + "language_loss": 0.67583203, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.69865978, + "num_input_tokens_seen": 122299070, + "step": 5692, + "time_per_iteration": 2.5574951171875 + }, + { + "auxiliary_loss_clip": 0.01154235, + "auxiliary_loss_mlp": 0.0112954, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00069833, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.481239514239482, + "language_loss": 0.72509128, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74792898, + "num_input_tokens_seen": 122316800, + "step": 5693, + "time_per_iteration": 2.578493118286133 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.01128997, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00082338, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.7662221303206371, + "language_loss": 0.75281286, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.77533388, + "num_input_tokens_seen": 122335275, + "step": 5694, + "time_per_iteration": 2.637214422225952 + }, + { + "auxiliary_loss_clip": 0.01121001, + "auxiliary_loss_mlp": 0.01128228, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00072193, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.7307183148497014, + "language_loss": 0.79353189, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81602418, + "num_input_tokens_seen": 122353215, + "step": 5695, + "time_per_iteration": 2.648287296295166 + }, + { + "auxiliary_loss_clip": 0.01106513, + "auxiliary_loss_mlp": 0.01128329, + "balance_loss_clip": 1.00182235, + "balance_loss_mlp": 1.00072718, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.827123202592001, + "language_loss": 0.73370445, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75605273, + "num_input_tokens_seen": 122372495, + "step": 5696, + "time_per_iteration": 2.749891996383667 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01128751, + "balance_loss_clip": 1.00243592, + "balance_loss_mlp": 1.00076818, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.613836250389469, + "language_loss": 0.70616764, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.72875375, + "num_input_tokens_seen": 122394600, + "step": 5697, + "time_per_iteration": 2.7432048320770264 + }, + { + "auxiliary_loss_clip": 0.01143296, + "auxiliary_loss_mlp": 0.01128411, + "balance_loss_clip": 1.00240457, + "balance_loss_mlp": 1.00042796, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 2.3370371787821838, + "language_loss": 0.82184398, + "learning_rate": 3.05964402195837e-06, + "loss": 0.8445611, + "num_input_tokens_seen": 122414700, + "step": 5698, + "time_per_iteration": 2.6105268001556396 + }, + { + "auxiliary_loss_clip": 0.01097594, + "auxiliary_loss_mlp": 0.01130014, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.0008868, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 3.656631983603173, + "language_loss": 0.69301718, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71529329, + "num_input_tokens_seen": 122432760, + "step": 5699, + "time_per_iteration": 2.8113131523132324 + }, + { + "auxiliary_loss_clip": 0.01137597, + "auxiliary_loss_mlp": 0.01129038, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00067306, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.1801130114254548, + "language_loss": 0.72504342, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74770975, + "num_input_tokens_seen": 122449105, + "step": 5700, + "time_per_iteration": 2.601614475250244 + }, + { + "auxiliary_loss_clip": 0.01137366, + "auxiliary_loss_mlp": 0.01129098, + "balance_loss_clip": 1.00186265, + "balance_loss_mlp": 1.00063813, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 2.7568744979679662, + "language_loss": 0.81631219, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83897686, + "num_input_tokens_seen": 122468700, + "step": 5701, + "time_per_iteration": 2.605530261993408 + }, + { + "auxiliary_loss_clip": 0.01154641, + "auxiliary_loss_mlp": 0.0112909, + "balance_loss_clip": 1.0019815, + "balance_loss_mlp": 1.00082088, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.7735290283113716, + "language_loss": 0.7174533, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.74029064, + "num_input_tokens_seen": 122488160, + "step": 5702, + "time_per_iteration": 2.614741802215576 + }, + { + "auxiliary_loss_clip": 0.01133696, + "auxiliary_loss_mlp": 0.01112218, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.00006568, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.7764035972335083, + "language_loss": 0.57398176, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59644091, + "num_input_tokens_seen": 122542890, + "step": 5703, + "time_per_iteration": 3.081918954849243 + }, + { + "auxiliary_loss_clip": 0.01155689, + "auxiliary_loss_mlp": 0.01129904, + "balance_loss_clip": 1.00206184, + "balance_loss_mlp": 1.00058627, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 2.0820028644668245, + "language_loss": 0.75028968, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77314556, + "num_input_tokens_seen": 122561770, + "step": 5704, + "time_per_iteration": 2.5364816188812256 + }, + { + "auxiliary_loss_clip": 0.01127115, + "auxiliary_loss_mlp": 0.01128736, + "balance_loss_clip": 1.00219214, + "balance_loss_mlp": 1.00084782, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.140553158460437, + "language_loss": 0.72487867, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.74743712, + "num_input_tokens_seen": 122580580, + "step": 5705, + "time_per_iteration": 2.632791042327881 + }, + { + "auxiliary_loss_clip": 0.01107745, + "auxiliary_loss_mlp": 0.01128606, + "balance_loss_clip": 1.0018369, + "balance_loss_mlp": 1.00052738, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 2.195320597605174, + "language_loss": 0.79906857, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82143211, + "num_input_tokens_seen": 122599810, + "step": 5706, + "time_per_iteration": 2.670888662338257 + }, + { + "auxiliary_loss_clip": 0.01139203, + "auxiliary_loss_mlp": 0.01129203, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00064778, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 2.187164723092971, + "language_loss": 0.82738447, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85006857, + "num_input_tokens_seen": 122616035, + "step": 5707, + "time_per_iteration": 2.5666019916534424 + }, + { + "auxiliary_loss_clip": 0.01154096, + "auxiliary_loss_mlp": 0.01128997, + "balance_loss_clip": 1.00201523, + "balance_loss_mlp": 1.00063288, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.8460389419171161, + "language_loss": 0.75252974, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77536064, + "num_input_tokens_seen": 122633785, + "step": 5708, + "time_per_iteration": 2.5308644771575928 + }, + { + "auxiliary_loss_clip": 0.01139267, + "auxiliary_loss_mlp": 0.01128652, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00076449, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.547066636280133, + "language_loss": 0.8138839, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83656311, + "num_input_tokens_seen": 122652100, + "step": 5709, + "time_per_iteration": 2.7339229583740234 + }, + { + "auxiliary_loss_clip": 0.01137397, + "auxiliary_loss_mlp": 0.01129039, + "balance_loss_clip": 1.00188303, + "balance_loss_mlp": 1.00077009, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.302629002972122, + "language_loss": 0.79051679, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81318116, + "num_input_tokens_seen": 122669720, + "step": 5710, + "time_per_iteration": 2.5980474948883057 + }, + { + "auxiliary_loss_clip": 0.01154153, + "auxiliary_loss_mlp": 0.01129223, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00066805, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.963062916024686, + "language_loss": 0.70045638, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72329015, + "num_input_tokens_seen": 122688715, + "step": 5711, + "time_per_iteration": 2.5535833835601807 + }, + { + "auxiliary_loss_clip": 0.01124012, + "auxiliary_loss_mlp": 0.00747876, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00009847, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.869141984517476, + "language_loss": 0.67844981, + "learning_rate": 3.055015807239812e-06, + "loss": 0.69716871, + "num_input_tokens_seen": 122706970, + "step": 5712, + "time_per_iteration": 2.6289002895355225 + }, + { + "auxiliary_loss_clip": 0.01118881, + "auxiliary_loss_mlp": 0.01111597, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.0002079, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8421797243820192, + "language_loss": 0.58098948, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60329425, + "num_input_tokens_seen": 122758095, + "step": 5713, + "time_per_iteration": 3.1796605587005615 + }, + { + "auxiliary_loss_clip": 0.01170726, + "auxiliary_loss_mlp": 0.01129408, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.00075769, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.9685150032092704, + "language_loss": 0.80816865, + "learning_rate": 3.054353992805076e-06, + "loss": 0.83116996, + "num_input_tokens_seen": 122777815, + "step": 5714, + "time_per_iteration": 2.5618185997009277 + }, + { + "auxiliary_loss_clip": 0.01170875, + "auxiliary_loss_mlp": 0.01128604, + "balance_loss_clip": 1.00215983, + "balance_loss_mlp": 1.00071585, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 2.115987825342512, + "language_loss": 0.71724987, + "learning_rate": 3.05402302560962e-06, + "loss": 0.74024469, + "num_input_tokens_seen": 122797555, + "step": 5715, + "time_per_iteration": 2.5252201557159424 + }, + { + "auxiliary_loss_clip": 0.01151265, + "auxiliary_loss_mlp": 0.01112149, + "balance_loss_clip": 1.00209498, + "balance_loss_mlp": 1.00075936, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.8938655327658723, + "language_loss": 0.65861529, + "learning_rate": 3.053692018445505e-06, + "loss": 0.68124938, + "num_input_tokens_seen": 122863955, + "step": 5716, + "time_per_iteration": 3.18404221534729 + }, + { + "auxiliary_loss_clip": 0.01155361, + "auxiliary_loss_mlp": 0.01128243, + "balance_loss_clip": 1.00198853, + "balance_loss_mlp": 1.00073647, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 3.124333391799773, + "language_loss": 0.73939788, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76223391, + "num_input_tokens_seen": 122883000, + "step": 5717, + "time_per_iteration": 3.9491665363311768 + }, + { + "auxiliary_loss_clip": 0.01105082, + "auxiliary_loss_mlp": 0.01128146, + "balance_loss_clip": 1.00169396, + "balance_loss_mlp": 1.00073504, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 2.07838455633991, + "language_loss": 0.75038528, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.7727176, + "num_input_tokens_seen": 122903265, + "step": 5718, + "time_per_iteration": 2.7186520099639893 + }, + { + "auxiliary_loss_clip": 0.01120393, + "auxiliary_loss_mlp": 0.0112876, + "balance_loss_clip": 1.00172722, + "balance_loss_mlp": 1.00068164, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.9479121247259206, + "language_loss": 0.63979125, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66228282, + "num_input_tokens_seen": 122923860, + "step": 5719, + "time_per_iteration": 4.228410959243774 + }, + { + "auxiliary_loss_clip": 0.01107131, + "auxiliary_loss_mlp": 0.01129213, + "balance_loss_clip": 1.00172162, + "balance_loss_mlp": 1.0006578, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.6340947230529124, + "language_loss": 0.73868501, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.76104844, + "num_input_tokens_seen": 122945305, + "step": 5720, + "time_per_iteration": 4.093812465667725 + }, + { + "auxiliary_loss_clip": 0.0115545, + "auxiliary_loss_mlp": 0.01129354, + "balance_loss_clip": 1.00204229, + "balance_loss_mlp": 1.0007031, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.7205530622444372, + "language_loss": 0.74310136, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76594937, + "num_input_tokens_seen": 122962535, + "step": 5721, + "time_per_iteration": 2.5270721912384033 + }, + { + "auxiliary_loss_clip": 0.01138935, + "auxiliary_loss_mlp": 0.00747837, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00013423, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 1.8356725898216841, + "language_loss": 0.79628587, + "learning_rate": 3.051705136821992e-06, + "loss": 0.8151536, + "num_input_tokens_seen": 122979750, + "step": 5722, + "time_per_iteration": 4.032835245132446 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01127545, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.0005157, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.5657747650277654, + "language_loss": 0.81352752, + "learning_rate": 3.051373850228801e-06, + "loss": 0.83586049, + "num_input_tokens_seen": 122998955, + "step": 5723, + "time_per_iteration": 2.6777074337005615 + }, + { + "auxiliary_loss_clip": 0.01123624, + "auxiliary_loss_mlp": 0.01128456, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00085473, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.9107773493253781, + "language_loss": 0.81172681, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83424771, + "num_input_tokens_seen": 123016165, + "step": 5724, + "time_per_iteration": 2.5999011993408203 + }, + { + "auxiliary_loss_clip": 0.01138309, + "auxiliary_loss_mlp": 0.01128699, + "balance_loss_clip": 1.00180376, + "balance_loss_mlp": 1.00071549, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 2.6647813186049647, + "language_loss": 0.68796575, + "learning_rate": 3.05071115745038e-06, + "loss": 0.7106359, + "num_input_tokens_seen": 123036900, + "step": 5725, + "time_per_iteration": 2.6565847396850586 + }, + { + "auxiliary_loss_clip": 0.01154474, + "auxiliary_loss_mlp": 0.01129594, + "balance_loss_clip": 1.00180721, + "balance_loss_mlp": 1.00084758, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.4782723866948924, + "language_loss": 0.69424897, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71708965, + "num_input_tokens_seen": 123057480, + "step": 5726, + "time_per_iteration": 2.5467898845672607 + }, + { + "auxiliary_loss_clip": 0.01122072, + "auxiliary_loss_mlp": 0.01129101, + "balance_loss_clip": 1.00183845, + "balance_loss_mlp": 1.00083148, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.7229767603658626, + "language_loss": 0.73331153, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75582331, + "num_input_tokens_seen": 123076890, + "step": 5727, + "time_per_iteration": 2.649414539337158 + }, + { + "auxiliary_loss_clip": 0.01122899, + "auxiliary_loss_mlp": 0.01128146, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00064015, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.9251745336618558, + "language_loss": 0.87984025, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90235072, + "num_input_tokens_seen": 123092530, + "step": 5728, + "time_per_iteration": 2.5991594791412354 + }, + { + "auxiliary_loss_clip": 0.01106537, + "auxiliary_loss_mlp": 0.01129026, + "balance_loss_clip": 1.00186229, + "balance_loss_mlp": 1.00056612, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.2010320413024376, + "language_loss": 0.70158219, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72393787, + "num_input_tokens_seen": 123110560, + "step": 5729, + "time_per_iteration": 2.6940152645111084 + }, + { + "auxiliary_loss_clip": 0.01153828, + "auxiliary_loss_mlp": 0.01128892, + "balance_loss_clip": 1.0019815, + "balance_loss_mlp": 1.00081372, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.989086916437192, + "language_loss": 0.73460978, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.75743693, + "num_input_tokens_seen": 123128655, + "step": 5730, + "time_per_iteration": 2.5449001789093018 + }, + { + "auxiliary_loss_clip": 0.01123198, + "auxiliary_loss_mlp": 0.01128838, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 1.000664, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 5.570086549007162, + "language_loss": 0.7961756, + "learning_rate": 3.048722123283578e-06, + "loss": 0.81869596, + "num_input_tokens_seen": 123145130, + "step": 5731, + "time_per_iteration": 2.6694369316101074 + }, + { + "auxiliary_loss_clip": 0.01154141, + "auxiliary_loss_mlp": 0.0112828, + "balance_loss_clip": 1.00195837, + "balance_loss_mlp": 1.00077367, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 1.9030346083116487, + "language_loss": 0.78211904, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.8049432, + "num_input_tokens_seen": 123162265, + "step": 5732, + "time_per_iteration": 2.568373203277588 + }, + { + "auxiliary_loss_clip": 0.01119148, + "auxiliary_loss_mlp": 0.01112243, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.00009108, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.9211668368111402, + "language_loss": 0.53508383, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55739772, + "num_input_tokens_seen": 123218620, + "step": 5733, + "time_per_iteration": 3.2448718547821045 + }, + { + "auxiliary_loss_clip": 0.01138419, + "auxiliary_loss_mlp": 0.01128591, + "balance_loss_clip": 1.00185978, + "balance_loss_mlp": 1.00079858, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.5599711008699138, + "language_loss": 0.83451068, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85718083, + "num_input_tokens_seen": 123237325, + "step": 5734, + "time_per_iteration": 2.6405022144317627 + }, + { + "auxiliary_loss_clip": 0.01137589, + "auxiliary_loss_mlp": 0.01129387, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00054574, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 2.028751102004045, + "language_loss": 0.92936426, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.952034, + "num_input_tokens_seen": 123258650, + "step": 5735, + "time_per_iteration": 2.6736631393432617 + }, + { + "auxiliary_loss_clip": 0.0110553, + "auxiliary_loss_mlp": 0.01129184, + "balance_loss_clip": 1.00189519, + "balance_loss_mlp": 1.0007236, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.6644526395494958, + "language_loss": 0.7653445, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78769171, + "num_input_tokens_seen": 123277155, + "step": 5736, + "time_per_iteration": 2.699061870574951 + }, + { + "auxiliary_loss_clip": 0.01138417, + "auxiliary_loss_mlp": 0.01129003, + "balance_loss_clip": 1.00196433, + "balance_loss_mlp": 1.00073409, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.6552290287754134, + "language_loss": 0.7904619, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.8131361, + "num_input_tokens_seen": 123297640, + "step": 5737, + "time_per_iteration": 2.6248013973236084 + }, + { + "auxiliary_loss_clip": 0.01113374, + "auxiliary_loss_mlp": 0.01129962, + "balance_loss_clip": 1.00250328, + "balance_loss_mlp": 1.00083482, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.4424907664405673, + "language_loss": 0.71317422, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73560756, + "num_input_tokens_seen": 123314370, + "step": 5738, + "time_per_iteration": 2.676408529281616 + }, + { + "auxiliary_loss_clip": 0.01123686, + "auxiliary_loss_mlp": 0.01129135, + "balance_loss_clip": 1.00190079, + "balance_loss_mlp": 1.00086617, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 2.924997951835308, + "language_loss": 0.8191883, + "learning_rate": 3.046067851209389e-06, + "loss": 0.84171653, + "num_input_tokens_seen": 123336085, + "step": 5739, + "time_per_iteration": 2.700028419494629 + }, + { + "auxiliary_loss_clip": 0.01122371, + "auxiliary_loss_mlp": 0.01129236, + "balance_loss_clip": 1.00196242, + "balance_loss_mlp": 1.00077605, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 1.9564636877658135, + "language_loss": 0.82918954, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85170567, + "num_input_tokens_seen": 123354460, + "step": 5740, + "time_per_iteration": 2.6424920558929443 + }, + { + "auxiliary_loss_clip": 0.01153798, + "auxiliary_loss_mlp": 0.01129858, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00073087, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.086481047215917, + "language_loss": 0.76876891, + "learning_rate": 3.045403886269181e-06, + "loss": 0.79160547, + "num_input_tokens_seen": 123373420, + "step": 5741, + "time_per_iteration": 2.5335991382598877 + }, + { + "auxiliary_loss_clip": 0.01138731, + "auxiliary_loss_mlp": 0.01129109, + "balance_loss_clip": 1.00177813, + "balance_loss_mlp": 1.00074434, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.4673832698409504, + "language_loss": 0.77103436, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79371274, + "num_input_tokens_seen": 123394730, + "step": 5742, + "time_per_iteration": 2.638327121734619 + }, + { + "auxiliary_loss_clip": 0.0115567, + "auxiliary_loss_mlp": 0.01128941, + "balance_loss_clip": 1.0020417, + "balance_loss_mlp": 1.00086236, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.9821909816799357, + "language_loss": 0.76174998, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78459609, + "num_input_tokens_seen": 123412895, + "step": 5743, + "time_per_iteration": 2.526524782180786 + }, + { + "auxiliary_loss_clip": 0.01154133, + "auxiliary_loss_mlp": 0.01128622, + "balance_loss_clip": 1.002092, + "balance_loss_mlp": 1.00073469, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.6156619280030127, + "language_loss": 0.70444524, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72727287, + "num_input_tokens_seen": 123432320, + "step": 5744, + "time_per_iteration": 2.6012425422668457 + }, + { + "auxiliary_loss_clip": 0.01170559, + "auxiliary_loss_mlp": 0.01128337, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00054491, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.570620342013245, + "language_loss": 0.79235339, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81534237, + "num_input_tokens_seen": 123450980, + "step": 5745, + "time_per_iteration": 2.495596408843994 + }, + { + "auxiliary_loss_clip": 0.01104143, + "auxiliary_loss_mlp": 0.01129605, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00076401, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.8025539875531689, + "language_loss": 0.89162755, + "learning_rate": 3.043743280407182e-06, + "loss": 0.91396511, + "num_input_tokens_seen": 123469365, + "step": 5746, + "time_per_iteration": 2.6652393341064453 + }, + { + "auxiliary_loss_clip": 0.01154144, + "auxiliary_loss_mlp": 0.01129564, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.0006274, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 2.341315828814529, + "language_loss": 0.64759684, + "learning_rate": 3.043411040447849e-06, + "loss": 0.67043388, + "num_input_tokens_seen": 123489425, + "step": 5747, + "time_per_iteration": 2.5944793224334717 + }, + { + "auxiliary_loss_clip": 0.01140079, + "auxiliary_loss_mlp": 0.0112837, + "balance_loss_clip": 1.00198162, + "balance_loss_mlp": 1.00076818, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5535104008235712, + "language_loss": 0.72973204, + "learning_rate": 3.043078760922264e-06, + "loss": 0.75241649, + "num_input_tokens_seen": 123509970, + "step": 5748, + "time_per_iteration": 2.713894844055176 + }, + { + "auxiliary_loss_clip": 0.01105741, + "auxiliary_loss_mlp": 0.01128481, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00078416, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6889455157782622, + "language_loss": 0.75376409, + "learning_rate": 3.042746441843029e-06, + "loss": 0.7761063, + "num_input_tokens_seen": 123531055, + "step": 5749, + "time_per_iteration": 2.6978211402893066 + }, + { + "auxiliary_loss_clip": 0.01136795, + "auxiliary_loss_mlp": 0.01111336, + "balance_loss_clip": 1.00219738, + "balance_loss_mlp": 0.99994701, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8907509171643289, + "language_loss": 0.62784827, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.65032959, + "num_input_tokens_seen": 123584720, + "step": 5750, + "time_per_iteration": 3.0454206466674805 + }, + { + "auxiliary_loss_clip": 0.01137167, + "auxiliary_loss_mlp": 0.01127929, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00061393, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 1.687775051252356, + "language_loss": 0.80146313, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82411414, + "num_input_tokens_seen": 123604465, + "step": 5751, + "time_per_iteration": 2.7715911865234375 + }, + { + "auxiliary_loss_clip": 0.01170541, + "auxiliary_loss_mlp": 0.01128444, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.00103319, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 2.1508285270348524, + "language_loss": 0.84093499, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86392474, + "num_input_tokens_seen": 123622320, + "step": 5752, + "time_per_iteration": 2.5127053260803223 + }, + { + "auxiliary_loss_clip": 0.01133506, + "auxiliary_loss_mlp": 0.0074757, + "balance_loss_clip": 1.00148511, + "balance_loss_mlp": 1.00072122, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7300723927130577, + "language_loss": 0.63103533, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.64984608, + "num_input_tokens_seen": 123678010, + "step": 5753, + "time_per_iteration": 3.0708189010620117 + }, + { + "auxiliary_loss_clip": 0.01139088, + "auxiliary_loss_mlp": 0.0112831, + "balance_loss_clip": 1.00190914, + "balance_loss_mlp": 1.00080347, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 2.017857264305864, + "language_loss": 0.70826399, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73093796, + "num_input_tokens_seen": 123696830, + "step": 5754, + "time_per_iteration": 2.5783331394195557 + }, + { + "auxiliary_loss_clip": 0.011542, + "auxiliary_loss_mlp": 0.01129109, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00064909, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 1.9189508720760273, + "language_loss": 0.72627831, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.74911141, + "num_input_tokens_seen": 123714360, + "step": 5755, + "time_per_iteration": 4.065045356750488 + }, + { + "auxiliary_loss_clip": 0.01155057, + "auxiliary_loss_mlp": 0.0112817, + "balance_loss_clip": 1.00205052, + "balance_loss_mlp": 1.00056791, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.6479332892559042, + "language_loss": 0.72197163, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74480391, + "num_input_tokens_seen": 123739250, + "step": 5756, + "time_per_iteration": 2.742417335510254 + }, + { + "auxiliary_loss_clip": 0.0115098, + "auxiliary_loss_mlp": 0.01111391, + "balance_loss_clip": 1.00218666, + "balance_loss_mlp": 1.00000191, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7199298147103765, + "language_loss": 0.62620205, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64882571, + "num_input_tokens_seen": 123802845, + "step": 5757, + "time_per_iteration": 4.540299415588379 + }, + { + "auxiliary_loss_clip": 0.01117381, + "auxiliary_loss_mlp": 0.00747541, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00059474, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.9432997426263454, + "language_loss": 0.59227431, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61092353, + "num_input_tokens_seen": 123861805, + "step": 5758, + "time_per_iteration": 4.602906942367554 + }, + { + "auxiliary_loss_clip": 0.01138476, + "auxiliary_loss_mlp": 0.01127981, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00076056, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.6177310007253092, + "language_loss": 0.71975255, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.74241704, + "num_input_tokens_seen": 123881820, + "step": 5759, + "time_per_iteration": 2.646090269088745 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01128686, + "balance_loss_clip": 1.00175035, + "balance_loss_mlp": 1.00098956, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.6820840988461723, + "language_loss": 0.83170104, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85404283, + "num_input_tokens_seen": 123903700, + "step": 5760, + "time_per_iteration": 4.096797466278076 + }, + { + "auxiliary_loss_clip": 0.0112038, + "auxiliary_loss_mlp": 0.0111061, + "balance_loss_clip": 1.0018084, + "balance_loss_mlp": 0.99998397, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.831052395768898, + "language_loss": 0.56574208, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58805197, + "num_input_tokens_seen": 123960075, + "step": 5761, + "time_per_iteration": 3.2371761798858643 + }, + { + "auxiliary_loss_clip": 0.01154981, + "auxiliary_loss_mlp": 0.00747841, + "balance_loss_clip": 1.00201654, + "balance_loss_mlp": 1.00018001, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.561439045657558, + "language_loss": 0.95501685, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97404504, + "num_input_tokens_seen": 123975805, + "step": 5762, + "time_per_iteration": 2.5361812114715576 + }, + { + "auxiliary_loss_clip": 0.0112431, + "auxiliary_loss_mlp": 0.01127992, + "balance_loss_clip": 1.00185609, + "balance_loss_mlp": 1.0004859, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 1.6327369421397628, + "language_loss": 0.69685662, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.7193796, + "num_input_tokens_seen": 123997530, + "step": 5763, + "time_per_iteration": 2.717554807662964 + }, + { + "auxiliary_loss_clip": 0.01155686, + "auxiliary_loss_mlp": 0.01129552, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00090122, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 2.084079718542755, + "language_loss": 0.8371917, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.86004412, + "num_input_tokens_seen": 124016375, + "step": 5764, + "time_per_iteration": 2.5571413040161133 + }, + { + "auxiliary_loss_clip": 0.01139323, + "auxiliary_loss_mlp": 0.01128097, + "balance_loss_clip": 1.00200331, + "balance_loss_mlp": 1.00078166, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.3127286910156797, + "language_loss": 0.67429739, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.6969716, + "num_input_tokens_seen": 124033975, + "step": 5765, + "time_per_iteration": 2.576564073562622 + }, + { + "auxiliary_loss_clip": 0.0113875, + "auxiliary_loss_mlp": 0.01129007, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.0008328, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 1.9471889922985428, + "language_loss": 0.76877624, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79145384, + "num_input_tokens_seen": 124051930, + "step": 5766, + "time_per_iteration": 2.5814478397369385 + }, + { + "auxiliary_loss_clip": 0.01105794, + "auxiliary_loss_mlp": 0.01127861, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.000736, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.7402076422058672, + "language_loss": 0.7366811, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75901771, + "num_input_tokens_seen": 124071220, + "step": 5767, + "time_per_iteration": 2.679687976837158 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01128844, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.00086093, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.119003699997906, + "language_loss": 0.77641964, + "learning_rate": 3.036424880912893e-06, + "loss": 0.79909873, + "num_input_tokens_seen": 124090140, + "step": 5768, + "time_per_iteration": 2.615878105163574 + }, + { + "auxiliary_loss_clip": 0.01150104, + "auxiliary_loss_mlp": 0.01111426, + "balance_loss_clip": 1.00207949, + "balance_loss_mlp": 1.00003707, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7675517053188808, + "language_loss": 0.57499373, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59760898, + "num_input_tokens_seen": 124152025, + "step": 5769, + "time_per_iteration": 3.162553548812866 + }, + { + "auxiliary_loss_clip": 0.01121859, + "auxiliary_loss_mlp": 0.01130093, + "balance_loss_clip": 1.00191498, + "balance_loss_mlp": 1.00067973, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.9351791976638997, + "language_loss": 0.86165112, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.88417065, + "num_input_tokens_seen": 124165795, + "step": 5770, + "time_per_iteration": 2.5580570697784424 + }, + { + "auxiliary_loss_clip": 0.01135871, + "auxiliary_loss_mlp": 0.01110544, + "balance_loss_clip": 1.00215268, + "balance_loss_mlp": 0.99991769, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7742438237118314, + "language_loss": 0.59804451, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.62050867, + "num_input_tokens_seen": 124222925, + "step": 5771, + "time_per_iteration": 2.977423906326294 + }, + { + "auxiliary_loss_clip": 0.01155516, + "auxiliary_loss_mlp": 0.01129094, + "balance_loss_clip": 1.00207126, + "balance_loss_mlp": 1.00082469, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 1.8013236383277995, + "language_loss": 0.71792305, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.74076915, + "num_input_tokens_seen": 124240915, + "step": 5772, + "time_per_iteration": 2.6840052604675293 + }, + { + "auxiliary_loss_clip": 0.01123642, + "auxiliary_loss_mlp": 0.00747866, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00016296, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.4210640175483211, + "language_loss": 0.76464343, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78335851, + "num_input_tokens_seen": 124262770, + "step": 5773, + "time_per_iteration": 2.704113006591797 + }, + { + "auxiliary_loss_clip": 0.01155526, + "auxiliary_loss_mlp": 0.01128916, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00074244, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.084588674232874, + "language_loss": 0.70998794, + "learning_rate": 3.034425646811396e-06, + "loss": 0.73283231, + "num_input_tokens_seen": 124280950, + "step": 5774, + "time_per_iteration": 2.5668013095855713 + }, + { + "auxiliary_loss_clip": 0.01138438, + "auxiliary_loss_mlp": 0.00747817, + "balance_loss_clip": 1.00203478, + "balance_loss_mlp": 1.00016809, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 2.914790340757534, + "language_loss": 0.75946128, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.77832377, + "num_input_tokens_seen": 124299540, + "step": 5775, + "time_per_iteration": 2.6401641368865967 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.01129179, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00081432, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.0586671124288896, + "language_loss": 0.7744838, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.79716456, + "num_input_tokens_seen": 124316285, + "step": 5776, + "time_per_iteration": 2.5460398197174072 + }, + { + "auxiliary_loss_clip": 0.01133597, + "auxiliary_loss_mlp": 0.01110555, + "balance_loss_clip": 1.00175261, + "balance_loss_mlp": 0.99992859, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8405622765384837, + "language_loss": 0.63392973, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65637124, + "num_input_tokens_seen": 124376650, + "step": 5777, + "time_per_iteration": 3.20949649810791 + }, + { + "auxiliary_loss_clip": 0.01121403, + "auxiliary_loss_mlp": 0.0112995, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00082302, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 1.927853523628842, + "language_loss": 0.64869678, + "learning_rate": 3.033092039398119e-06, + "loss": 0.67121029, + "num_input_tokens_seen": 124396475, + "step": 5778, + "time_per_iteration": 2.7213237285614014 + }, + { + "auxiliary_loss_clip": 0.01138751, + "auxiliary_loss_mlp": 0.0112905, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00106657, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 2.701685658607906, + "language_loss": 0.70846385, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73114187, + "num_input_tokens_seen": 124416480, + "step": 5779, + "time_per_iteration": 2.7835962772369385 + }, + { + "auxiliary_loss_clip": 0.01170878, + "auxiliary_loss_mlp": 0.01129588, + "balance_loss_clip": 1.002123, + "balance_loss_mlp": 1.00103307, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.9591218689954133, + "language_loss": 0.61904895, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64205366, + "num_input_tokens_seen": 124435950, + "step": 5780, + "time_per_iteration": 2.546807050704956 + }, + { + "auxiliary_loss_clip": 0.01123546, + "auxiliary_loss_mlp": 0.01128993, + "balance_loss_clip": 1.00200438, + "balance_loss_mlp": 1.00081909, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.7448354362738836, + "language_loss": 0.71768641, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74021184, + "num_input_tokens_seen": 124455410, + "step": 5781, + "time_per_iteration": 2.637986898422241 + }, + { + "auxiliary_loss_clip": 0.01091215, + "auxiliary_loss_mlp": 0.01129337, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00087678, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 1.8710584078493315, + "language_loss": 0.76708347, + "learning_rate": 3.031757805185612e-06, + "loss": 0.789289, + "num_input_tokens_seen": 124474870, + "step": 5782, + "time_per_iteration": 2.7118756771087646 + }, + { + "auxiliary_loss_clip": 0.01137386, + "auxiliary_loss_mlp": 0.01127997, + "balance_loss_clip": 1.00192773, + "balance_loss_mlp": 1.00058651, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.9367601615938017, + "language_loss": 0.62382758, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64648139, + "num_input_tokens_seen": 124494105, + "step": 5783, + "time_per_iteration": 2.5729331970214844 + }, + { + "auxiliary_loss_clip": 0.0110654, + "auxiliary_loss_mlp": 0.01127336, + "balance_loss_clip": 1.00175548, + "balance_loss_mlp": 1.00059283, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.773353893510282, + "language_loss": 0.88178277, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90412152, + "num_input_tokens_seen": 124512030, + "step": 5784, + "time_per_iteration": 2.681525707244873 + }, + { + "auxiliary_loss_clip": 0.01105356, + "auxiliary_loss_mlp": 0.01128234, + "balance_loss_clip": 1.00172079, + "balance_loss_mlp": 1.00063217, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.851711747956651, + "language_loss": 0.81525683, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83759266, + "num_input_tokens_seen": 124530980, + "step": 5785, + "time_per_iteration": 2.685649871826172 + }, + { + "auxiliary_loss_clip": 0.01140343, + "auxiliary_loss_mlp": 0.01128064, + "balance_loss_clip": 1.00214243, + "balance_loss_mlp": 1.00093961, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.8383476543518542, + "language_loss": 0.80334675, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82603073, + "num_input_tokens_seen": 124549330, + "step": 5786, + "time_per_iteration": 2.6148681640625 + }, + { + "auxiliary_loss_clip": 0.01170606, + "auxiliary_loss_mlp": 0.00747851, + "balance_loss_clip": 1.00210071, + "balance_loss_mlp": 1.00014997, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.5248622031178225, + "language_loss": 0.74991763, + "learning_rate": 3.030089132216836e-06, + "loss": 0.7691021, + "num_input_tokens_seen": 124567200, + "step": 5787, + "time_per_iteration": 2.5134334564208984 + }, + { + "auxiliary_loss_clip": 0.01140301, + "auxiliary_loss_mlp": 0.00747949, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00021052, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.5990514687458213, + "language_loss": 0.81235182, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83123434, + "num_input_tokens_seen": 124587025, + "step": 5788, + "time_per_iteration": 2.662445068359375 + }, + { + "auxiliary_loss_clip": 0.0117082, + "auxiliary_loss_mlp": 0.0112963, + "balance_loss_clip": 1.00211048, + "balance_loss_mlp": 1.00078893, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.8031735886360876, + "language_loss": 0.85664678, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87965131, + "num_input_tokens_seen": 124605860, + "step": 5789, + "time_per_iteration": 2.5237550735473633 + }, + { + "auxiliary_loss_clip": 0.01154063, + "auxiliary_loss_mlp": 0.0112936, + "balance_loss_clip": 1.00206554, + "balance_loss_mlp": 1.00109088, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.9396891087953045, + "language_loss": 0.84913552, + "learning_rate": 3.029087459601328e-06, + "loss": 0.8719697, + "num_input_tokens_seen": 124624270, + "step": 5790, + "time_per_iteration": 2.558485507965088 + }, + { + "auxiliary_loss_clip": 0.01155634, + "auxiliary_loss_mlp": 0.01129133, + "balance_loss_clip": 1.0021981, + "balance_loss_mlp": 1.00086331, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 2.885443895141719, + "language_loss": 0.81606054, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.8389082, + "num_input_tokens_seen": 124644005, + "step": 5791, + "time_per_iteration": 2.636258840560913 + }, + { + "auxiliary_loss_clip": 0.01153918, + "auxiliary_loss_mlp": 0.01128598, + "balance_loss_clip": 1.00195837, + "balance_loss_mlp": 1.00061512, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 2.051644370941735, + "language_loss": 0.77672458, + "learning_rate": 3.028419482721056e-06, + "loss": 0.7995497, + "num_input_tokens_seen": 124663020, + "step": 5792, + "time_per_iteration": 3.9969139099121094 + }, + { + "auxiliary_loss_clip": 0.01140385, + "auxiliary_loss_mlp": 0.01127694, + "balance_loss_clip": 1.00196028, + "balance_loss_mlp": 1.00066471, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.5502694485740922, + "language_loss": 0.81838107, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.84106195, + "num_input_tokens_seen": 124682975, + "step": 5793, + "time_per_iteration": 2.6306533813476562 + }, + { + "auxiliary_loss_clip": 0.0115388, + "auxiliary_loss_mlp": 0.01128333, + "balance_loss_clip": 1.00207949, + "balance_loss_mlp": 1.00101781, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 2.116115111794787, + "language_loss": 0.75863612, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78145826, + "num_input_tokens_seen": 124701340, + "step": 5794, + "time_per_iteration": 4.000741720199585 + }, + { + "auxiliary_loss_clip": 0.01155444, + "auxiliary_loss_mlp": 0.01128068, + "balance_loss_clip": 1.0021497, + "balance_loss_mlp": 1.00065708, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.1143992621295995, + "language_loss": 0.57548374, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59831887, + "num_input_tokens_seen": 124719165, + "step": 5795, + "time_per_iteration": 2.589724063873291 + }, + { + "auxiliary_loss_clip": 0.01139074, + "auxiliary_loss_mlp": 0.01128033, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.0007174, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 1.9804343198389873, + "language_loss": 0.82603037, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84870142, + "num_input_tokens_seen": 124738670, + "step": 5796, + "time_per_iteration": 4.09458065032959 + }, + { + "auxiliary_loss_clip": 0.01153857, + "auxiliary_loss_mlp": 0.01127736, + "balance_loss_clip": 1.00204158, + "balance_loss_mlp": 1.00080156, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.7902791693959315, + "language_loss": 0.83399689, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85681283, + "num_input_tokens_seen": 124758760, + "step": 5797, + "time_per_iteration": 4.139038801193237 + }, + { + "auxiliary_loss_clip": 0.01170556, + "auxiliary_loss_mlp": 0.01127758, + "balance_loss_clip": 1.00207067, + "balance_loss_mlp": 1.00072908, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.5587205772028616, + "language_loss": 0.7335816, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75656474, + "num_input_tokens_seen": 124777765, + "step": 5798, + "time_per_iteration": 2.5627167224884033 + }, + { + "auxiliary_loss_clip": 0.01170795, + "auxiliary_loss_mlp": 0.01128729, + "balance_loss_clip": 1.00208747, + "balance_loss_mlp": 1.00074625, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 1.8478436151054412, + "language_loss": 0.75821531, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78121048, + "num_input_tokens_seen": 124796775, + "step": 5799, + "time_per_iteration": 2.5411794185638428 + }, + { + "auxiliary_loss_clip": 0.01080862, + "auxiliary_loss_mlp": 0.01127987, + "balance_loss_clip": 1.00220513, + "balance_loss_mlp": 1.0005765, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.6619861072530813, + "language_loss": 0.753196, + "learning_rate": 3.025746016302734e-06, + "loss": 0.77528453, + "num_input_tokens_seen": 124815825, + "step": 5800, + "time_per_iteration": 2.8786938190460205 + }, + { + "auxiliary_loss_clip": 0.01139125, + "auxiliary_loss_mlp": 0.00747726, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.00012302, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 1.8453162847327353, + "language_loss": 0.67394894, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69281745, + "num_input_tokens_seen": 124838420, + "step": 5801, + "time_per_iteration": 2.8260021209716797 + }, + { + "auxiliary_loss_clip": 0.01140258, + "auxiliary_loss_mlp": 0.01128095, + "balance_loss_clip": 1.00210774, + "balance_loss_mlp": 1.0006845, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.7367009613607318, + "language_loss": 0.76730633, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78998983, + "num_input_tokens_seen": 124857320, + "step": 5802, + "time_per_iteration": 2.6040143966674805 + }, + { + "auxiliary_loss_clip": 0.01079571, + "auxiliary_loss_mlp": 0.01127632, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00079322, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.8028119651369574, + "language_loss": 0.78848529, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81055731, + "num_input_tokens_seen": 124875685, + "step": 5803, + "time_per_iteration": 2.7291839122772217 + }, + { + "auxiliary_loss_clip": 0.01138818, + "auxiliary_loss_mlp": 0.00747876, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00013804, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 2.166290225382614, + "language_loss": 0.67399931, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69286627, + "num_input_tokens_seen": 124895960, + "step": 5804, + "time_per_iteration": 2.6720688343048096 + }, + { + "auxiliary_loss_clip": 0.01136762, + "auxiliary_loss_mlp": 0.01128352, + "balance_loss_clip": 1.00185394, + "balance_loss_mlp": 1.00075066, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 2.5646158758761497, + "language_loss": 0.76184011, + "learning_rate": 3.024073835246702e-06, + "loss": 0.7844913, + "num_input_tokens_seen": 124914140, + "step": 5805, + "time_per_iteration": 2.562631607055664 + }, + { + "auxiliary_loss_clip": 0.0112338, + "auxiliary_loss_mlp": 0.01128829, + "balance_loss_clip": 1.00190854, + "balance_loss_mlp": 1.0007509, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 2.2162968394419966, + "language_loss": 0.67768216, + "learning_rate": 3.023739282485814e-06, + "loss": 0.70020425, + "num_input_tokens_seen": 124934180, + "step": 5806, + "time_per_iteration": 2.7029364109039307 + }, + { + "auxiliary_loss_clip": 0.01154273, + "auxiliary_loss_mlp": 0.01128458, + "balance_loss_clip": 1.00205171, + "balance_loss_mlp": 1.00076127, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.6308338657124288, + "language_loss": 0.72306633, + "learning_rate": 3.023404690904629e-06, + "loss": 0.7458936, + "num_input_tokens_seen": 124956060, + "step": 5807, + "time_per_iteration": 2.6377527713775635 + }, + { + "auxiliary_loss_clip": 0.01170519, + "auxiliary_loss_mlp": 0.01129013, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00064874, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.8875552878735018, + "language_loss": 0.73367524, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.75667059, + "num_input_tokens_seen": 124976070, + "step": 5808, + "time_per_iteration": 2.5726442337036133 + }, + { + "auxiliary_loss_clip": 0.0117055, + "auxiliary_loss_mlp": 0.01128198, + "balance_loss_clip": 1.00214446, + "balance_loss_mlp": 1.00097823, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.649969937800022, + "language_loss": 0.84433138, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86731887, + "num_input_tokens_seen": 124996995, + "step": 5809, + "time_per_iteration": 2.538139581680298 + }, + { + "auxiliary_loss_clip": 0.01138604, + "auxiliary_loss_mlp": 0.01127455, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00080657, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 2.168193067809963, + "language_loss": 0.8040235, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82668412, + "num_input_tokens_seen": 125015600, + "step": 5810, + "time_per_iteration": 2.6461963653564453 + }, + { + "auxiliary_loss_clip": 0.01170471, + "auxiliary_loss_mlp": 0.01128137, + "balance_loss_clip": 1.00205529, + "balance_loss_mlp": 1.00072622, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.7748946189002512, + "language_loss": 0.75235474, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.7753408, + "num_input_tokens_seen": 125035290, + "step": 5811, + "time_per_iteration": 2.5772507190704346 + }, + { + "auxiliary_loss_clip": 0.01139105, + "auxiliary_loss_mlp": 0.01127956, + "balance_loss_clip": 1.00189984, + "balance_loss_mlp": 1.000736, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.6215962029843385, + "language_loss": 0.80001497, + "learning_rate": 3.021731151138386e-06, + "loss": 0.8226856, + "num_input_tokens_seen": 125057130, + "step": 5812, + "time_per_iteration": 2.6453442573547363 + }, + { + "auxiliary_loss_clip": 0.01090541, + "auxiliary_loss_mlp": 0.01128037, + "balance_loss_clip": 1.00157666, + "balance_loss_mlp": 1.00072169, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.881944431140873, + "language_loss": 0.6908657, + "learning_rate": 3.021396326901918e-06, + "loss": 0.7130515, + "num_input_tokens_seen": 125073720, + "step": 5813, + "time_per_iteration": 2.698204517364502 + }, + { + "auxiliary_loss_clip": 0.01140107, + "auxiliary_loss_mlp": 0.00747843, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00007963, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 2.790209071672002, + "language_loss": 0.76761049, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.78648996, + "num_input_tokens_seen": 125090635, + "step": 5814, + "time_per_iteration": 2.6098992824554443 + }, + { + "auxiliary_loss_clip": 0.01144396, + "auxiliary_loss_mlp": 0.00747896, + "balance_loss_clip": 1.00235915, + "balance_loss_mlp": 1.00012684, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.5689058629559134, + "language_loss": 0.84501052, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86393344, + "num_input_tokens_seen": 125110070, + "step": 5815, + "time_per_iteration": 2.6452643871307373 + }, + { + "auxiliary_loss_clip": 0.011542, + "auxiliary_loss_mlp": 0.01128018, + "balance_loss_clip": 1.00184524, + "balance_loss_mlp": 1.00070238, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.151618963787382, + "language_loss": 0.7728768, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.795699, + "num_input_tokens_seen": 125125730, + "step": 5816, + "time_per_iteration": 2.530853509902954 + }, + { + "auxiliary_loss_clip": 0.01154308, + "auxiliary_loss_mlp": 0.01128582, + "balance_loss_clip": 1.00199091, + "balance_loss_mlp": 1.00078988, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 1.8223894503578975, + "language_loss": 0.59262145, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.61545032, + "num_input_tokens_seen": 125146195, + "step": 5817, + "time_per_iteration": 2.5843582153320312 + }, + { + "auxiliary_loss_clip": 0.01167032, + "auxiliary_loss_mlp": 0.0111053, + "balance_loss_clip": 1.0022583, + "balance_loss_mlp": 0.9999035, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8676048428007024, + "language_loss": 0.59879816, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.62157381, + "num_input_tokens_seen": 125207790, + "step": 5818, + "time_per_iteration": 3.153059244155884 + }, + { + "auxiliary_loss_clip": 0.01123642, + "auxiliary_loss_mlp": 0.01128517, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.00072455, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 2.9642312877408594, + "language_loss": 0.8343699, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85689151, + "num_input_tokens_seen": 125226220, + "step": 5819, + "time_per_iteration": 2.626492738723755 + }, + { + "auxiliary_loss_clip": 0.01138863, + "auxiliary_loss_mlp": 0.01128563, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00058031, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 1.6922712493843488, + "language_loss": 0.70840877, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.73108304, + "num_input_tokens_seen": 125247485, + "step": 5820, + "time_per_iteration": 2.6344525814056396 + }, + { + "auxiliary_loss_clip": 0.01155064, + "auxiliary_loss_mlp": 0.01128766, + "balance_loss_clip": 1.00200307, + "balance_loss_mlp": 1.00078321, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 1.6396819252127584, + "language_loss": 0.70513248, + "learning_rate": 3.018716339744759e-06, + "loss": 0.72797078, + "num_input_tokens_seen": 125268625, + "step": 5821, + "time_per_iteration": 2.713393211364746 + }, + { + "auxiliary_loss_clip": 0.01154056, + "auxiliary_loss_mlp": 0.01128845, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.00076604, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.0487426519086562, + "language_loss": 0.73766565, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.76049471, + "num_input_tokens_seen": 125287530, + "step": 5822, + "time_per_iteration": 2.6102728843688965 + }, + { + "auxiliary_loss_clip": 0.01137083, + "auxiliary_loss_mlp": 0.01128431, + "balance_loss_clip": 1.00192535, + "balance_loss_mlp": 1.00063848, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 2.5997596861008656, + "language_loss": 0.78529787, + "learning_rate": 3.018045956403094e-06, + "loss": 0.807953, + "num_input_tokens_seen": 125307020, + "step": 5823, + "time_per_iteration": 2.6005160808563232 + }, + { + "auxiliary_loss_clip": 0.0115038, + "auxiliary_loss_mlp": 0.01110547, + "balance_loss_clip": 1.00211, + "balance_loss_mlp": 0.99992102, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7125626273612206, + "language_loss": 0.59263653, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61524576, + "num_input_tokens_seen": 125370445, + "step": 5824, + "time_per_iteration": 3.239635705947876 + }, + { + "auxiliary_loss_clip": 0.01136839, + "auxiliary_loss_mlp": 0.01129132, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00076699, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 2.024224885220318, + "language_loss": 0.84796816, + "learning_rate": 3.017375418643811e-06, + "loss": 0.87062788, + "num_input_tokens_seen": 125388900, + "step": 5825, + "time_per_iteration": 2.605424642562866 + }, + { + "auxiliary_loss_clip": 0.01154304, + "auxiliary_loss_mlp": 0.00747757, + "balance_loss_clip": 1.00205922, + "balance_loss_mlp": 1.00014877, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.5165788264723417, + "language_loss": 0.83292019, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.85194087, + "num_input_tokens_seen": 125402675, + "step": 5826, + "time_per_iteration": 2.514524221420288 + }, + { + "auxiliary_loss_clip": 0.01137259, + "auxiliary_loss_mlp": 0.01129064, + "balance_loss_clip": 1.00193965, + "balance_loss_mlp": 1.00098515, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.4911505150690516, + "language_loss": 0.80970591, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83236909, + "num_input_tokens_seen": 125421360, + "step": 5827, + "time_per_iteration": 2.6008148193359375 + }, + { + "auxiliary_loss_clip": 0.01107086, + "auxiliary_loss_mlp": 0.01128546, + "balance_loss_clip": 1.00188124, + "balance_loss_mlp": 1.00084889, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.0230983416056203, + "language_loss": 0.70091164, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.72326803, + "num_input_tokens_seen": 125440000, + "step": 5828, + "time_per_iteration": 2.663590431213379 + }, + { + "auxiliary_loss_clip": 0.01154082, + "auxiliary_loss_mlp": 0.01129686, + "balance_loss_clip": 1.00210381, + "balance_loss_mlp": 1.00103498, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 2.8635022950275046, + "language_loss": 0.79411221, + "learning_rate": 3.016033880279248e-06, + "loss": 0.8169499, + "num_input_tokens_seen": 125460390, + "step": 5829, + "time_per_iteration": 3.9893312454223633 + }, + { + "auxiliary_loss_clip": 0.01121033, + "auxiliary_loss_mlp": 0.01130261, + "balance_loss_clip": 1.00175452, + "balance_loss_mlp": 1.00103784, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.8584459463775245, + "language_loss": 0.72208047, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74459338, + "num_input_tokens_seen": 125478410, + "step": 5830, + "time_per_iteration": 2.6771152019500732 + }, + { + "auxiliary_loss_clip": 0.01123585, + "auxiliary_loss_mlp": 0.01128428, + "balance_loss_clip": 1.00194287, + "balance_loss_mlp": 1.00073123, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.1258215798394464, + "language_loss": 0.88461494, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90713507, + "num_input_tokens_seen": 125495975, + "step": 5831, + "time_per_iteration": 2.621156930923462 + }, + { + "auxiliary_loss_clip": 0.01106994, + "auxiliary_loss_mlp": 0.01129117, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.00094354, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 4.706396521033044, + "language_loss": 0.78203571, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80439675, + "num_input_tokens_seen": 125515035, + "step": 5832, + "time_per_iteration": 4.110253572463989 + }, + { + "auxiliary_loss_clip": 0.01122301, + "auxiliary_loss_mlp": 0.01129075, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00090086, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 2.5361847219682683, + "language_loss": 0.70947117, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73198491, + "num_input_tokens_seen": 125535555, + "step": 5833, + "time_per_iteration": 4.146428108215332 + }, + { + "auxiliary_loss_clip": 0.01155524, + "auxiliary_loss_mlp": 0.01128542, + "balance_loss_clip": 1.00207031, + "balance_loss_mlp": 1.00084531, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.4158669392827632, + "language_loss": 0.81067985, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83352047, + "num_input_tokens_seen": 125558195, + "step": 5834, + "time_per_iteration": 2.6623754501342773 + }, + { + "auxiliary_loss_clip": 0.01105799, + "auxiliary_loss_mlp": 0.01129311, + "balance_loss_clip": 1.00184882, + "balance_loss_mlp": 1.00075603, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.640890254576432, + "language_loss": 0.83899534, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86134636, + "num_input_tokens_seen": 125575375, + "step": 5835, + "time_per_iteration": 4.0935564041137695 + }, + { + "auxiliary_loss_clip": 0.0111251, + "auxiliary_loss_mlp": 0.01128834, + "balance_loss_clip": 1.00280666, + "balance_loss_mlp": 1.00085068, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 2.504668594455827, + "language_loss": 0.76847172, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.79088509, + "num_input_tokens_seen": 125596745, + "step": 5836, + "time_per_iteration": 2.715268850326538 + }, + { + "auxiliary_loss_clip": 0.01122849, + "auxiliary_loss_mlp": 0.01128889, + "balance_loss_clip": 1.00198674, + "balance_loss_mlp": 1.00090539, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 1.9473950125804873, + "language_loss": 0.77351928, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79603666, + "num_input_tokens_seen": 125613980, + "step": 5837, + "time_per_iteration": 2.6167328357696533 + }, + { + "auxiliary_loss_clip": 0.01153972, + "auxiliary_loss_mlp": 0.01129183, + "balance_loss_clip": 1.00196469, + "balance_loss_mlp": 1.00100946, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.7040548699223486, + "language_loss": 0.68180603, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70463759, + "num_input_tokens_seen": 125632100, + "step": 5838, + "time_per_iteration": 2.564757823944092 + }, + { + "auxiliary_loss_clip": 0.01170481, + "auxiliary_loss_mlp": 0.0112841, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.00071263, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.000112971592543, + "language_loss": 0.83003104, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85301995, + "num_input_tokens_seen": 125649190, + "step": 5839, + "time_per_iteration": 2.518333911895752 + }, + { + "auxiliary_loss_clip": 0.0116025, + "auxiliary_loss_mlp": 0.01129459, + "balance_loss_clip": 1.00256777, + "balance_loss_mlp": 1.00071275, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 2.4396195340770968, + "language_loss": 0.58564037, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60853744, + "num_input_tokens_seen": 125668680, + "step": 5840, + "time_per_iteration": 2.5856285095214844 + }, + { + "auxiliary_loss_clip": 0.01122641, + "auxiliary_loss_mlp": 0.01128811, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.0009228, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.6533250657309937, + "language_loss": 0.87156647, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.894081, + "num_input_tokens_seen": 125686935, + "step": 5841, + "time_per_iteration": 2.670452117919922 + }, + { + "auxiliary_loss_clip": 0.01143827, + "auxiliary_loss_mlp": 0.01129311, + "balance_loss_clip": 1.00247586, + "balance_loss_mlp": 1.00066042, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 2.004435377040492, + "language_loss": 0.75175381, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77448523, + "num_input_tokens_seen": 125707180, + "step": 5842, + "time_per_iteration": 2.660618782043457 + }, + { + "auxiliary_loss_clip": 0.01160223, + "auxiliary_loss_mlp": 0.01128387, + "balance_loss_clip": 1.0022738, + "balance_loss_mlp": 1.00069046, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 2.0682231353247764, + "language_loss": 0.68815935, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.71104544, + "num_input_tokens_seen": 125722780, + "step": 5843, + "time_per_iteration": 2.5467288494110107 + }, + { + "auxiliary_loss_clip": 0.01170733, + "auxiliary_loss_mlp": 0.01129374, + "balance_loss_clip": 1.00211143, + "balance_loss_mlp": 1.00091434, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 1.8681280873604298, + "language_loss": 0.65508914, + "learning_rate": 3.010997627806655e-06, + "loss": 0.67809021, + "num_input_tokens_seen": 125742110, + "step": 5844, + "time_per_iteration": 2.5614125728607178 + }, + { + "auxiliary_loss_clip": 0.01153949, + "auxiliary_loss_mlp": 0.01129648, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00080657, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.0138795280342254, + "language_loss": 0.75047791, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77331388, + "num_input_tokens_seen": 125759980, + "step": 5845, + "time_per_iteration": 2.527878761291504 + }, + { + "auxiliary_loss_clip": 0.01154877, + "auxiliary_loss_mlp": 0.0112861, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00072205, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 3.1616622436032724, + "language_loss": 0.72398531, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.74682021, + "num_input_tokens_seen": 125772660, + "step": 5846, + "time_per_iteration": 2.5634872913360596 + }, + { + "auxiliary_loss_clip": 0.01128691, + "auxiliary_loss_mlp": 0.011286, + "balance_loss_clip": 1.00235164, + "balance_loss_mlp": 1.00061679, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.5351959796602679, + "language_loss": 0.75376177, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77633464, + "num_input_tokens_seen": 125791935, + "step": 5847, + "time_per_iteration": 2.6882431507110596 + }, + { + "auxiliary_loss_clip": 0.01139121, + "auxiliary_loss_mlp": 0.01128938, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.00076437, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.2878643268595327, + "language_loss": 0.72799289, + "learning_rate": 3.009653168561666e-06, + "loss": 0.75067353, + "num_input_tokens_seen": 125813455, + "step": 5848, + "time_per_iteration": 2.700589418411255 + }, + { + "auxiliary_loss_clip": 0.01139999, + "auxiliary_loss_mlp": 0.01129031, + "balance_loss_clip": 1.00201344, + "balance_loss_mlp": 1.00085735, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.2654403803835113, + "language_loss": 0.89759219, + "learning_rate": 3.009316958003178e-06, + "loss": 0.92028248, + "num_input_tokens_seen": 125827660, + "step": 5849, + "time_per_iteration": 2.5755646228790283 + }, + { + "auxiliary_loss_clip": 0.0113877, + "auxiliary_loss_mlp": 0.0112838, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00058734, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 2.1604044799386273, + "language_loss": 0.74719775, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76986921, + "num_input_tokens_seen": 125846655, + "step": 5850, + "time_per_iteration": 2.6031112670898438 + }, + { + "auxiliary_loss_clip": 0.01154912, + "auxiliary_loss_mlp": 0.01128073, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00066221, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.682230081115526, + "language_loss": 0.75465643, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77748632, + "num_input_tokens_seen": 125866290, + "step": 5851, + "time_per_iteration": 2.574089288711548 + }, + { + "auxiliary_loss_clip": 0.01138778, + "auxiliary_loss_mlp": 0.01129746, + "balance_loss_clip": 1.00209522, + "balance_loss_mlp": 1.00061893, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 4.642850048843165, + "language_loss": 0.87445772, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89714289, + "num_input_tokens_seen": 125884620, + "step": 5852, + "time_per_iteration": 2.5905613899230957 + }, + { + "auxiliary_loss_clip": 0.01170473, + "auxiliary_loss_mlp": 0.01128566, + "balance_loss_clip": 1.00207019, + "balance_loss_mlp": 1.00067878, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.2359465655136934, + "language_loss": 0.67751741, + "learning_rate": 3.007971733162737e-06, + "loss": 0.70050788, + "num_input_tokens_seen": 125902430, + "step": 5853, + "time_per_iteration": 2.5386922359466553 + }, + { + "auxiliary_loss_clip": 0.01140167, + "auxiliary_loss_mlp": 0.01128336, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00063896, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.8604722627395378, + "language_loss": 0.80857772, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83126271, + "num_input_tokens_seen": 125920570, + "step": 5854, + "time_per_iteration": 2.5735433101654053 + }, + { + "auxiliary_loss_clip": 0.01138238, + "auxiliary_loss_mlp": 0.01127531, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00069284, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.5743478448385655, + "language_loss": 0.7314409, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75409859, + "num_input_tokens_seen": 125939800, + "step": 5855, + "time_per_iteration": 2.5954549312591553 + }, + { + "auxiliary_loss_clip": 0.01170368, + "auxiliary_loss_mlp": 0.01128071, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00085139, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 1.952580782499319, + "language_loss": 0.71039772, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73338211, + "num_input_tokens_seen": 125958720, + "step": 5856, + "time_per_iteration": 2.5639591217041016 + }, + { + "auxiliary_loss_clip": 0.01155523, + "auxiliary_loss_mlp": 0.01129873, + "balance_loss_clip": 1.00215852, + "balance_loss_mlp": 1.00084138, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 2.256250161085112, + "language_loss": 0.60922354, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63207752, + "num_input_tokens_seen": 125984310, + "step": 5857, + "time_per_iteration": 2.768320083618164 + }, + { + "auxiliary_loss_clip": 0.01154111, + "auxiliary_loss_mlp": 0.01128605, + "balance_loss_clip": 1.00193942, + "balance_loss_mlp": 1.00081277, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 3.115571350846837, + "language_loss": 0.73464155, + "learning_rate": 3.006289342204152e-06, + "loss": 0.7574687, + "num_input_tokens_seen": 126002410, + "step": 5858, + "time_per_iteration": 2.5426783561706543 + }, + { + "auxiliary_loss_clip": 0.01170504, + "auxiliary_loss_mlp": 0.01128837, + "balance_loss_clip": 1.00204408, + "balance_loss_mlp": 1.00066328, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 2.6318599246765975, + "language_loss": 0.76508164, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78807509, + "num_input_tokens_seen": 126022490, + "step": 5859, + "time_per_iteration": 2.5655934810638428 + }, + { + "auxiliary_loss_clip": 0.01137593, + "auxiliary_loss_mlp": 0.01129322, + "balance_loss_clip": 1.00198579, + "balance_loss_mlp": 1.00086212, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 2.4560602253491517, + "language_loss": 0.71854085, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.74120998, + "num_input_tokens_seen": 126042895, + "step": 5860, + "time_per_iteration": 2.606156349182129 + }, + { + "auxiliary_loss_clip": 0.01138715, + "auxiliary_loss_mlp": 0.0112967, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00082839, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.167740563843091, + "language_loss": 0.65956873, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68225253, + "num_input_tokens_seen": 126060130, + "step": 5861, + "time_per_iteration": 2.601986885070801 + }, + { + "auxiliary_loss_clip": 0.01138802, + "auxiliary_loss_mlp": 0.01128216, + "balance_loss_clip": 1.00195134, + "balance_loss_mlp": 1.00061393, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.8016231037830366, + "language_loss": 0.66455179, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.687222, + "num_input_tokens_seen": 126077850, + "step": 5862, + "time_per_iteration": 2.6014244556427 + }, + { + "auxiliary_loss_clip": 0.01137013, + "auxiliary_loss_mlp": 0.01129531, + "balance_loss_clip": 1.00180066, + "balance_loss_mlp": 1.00088072, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 1.928263758131793, + "language_loss": 0.76781881, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79048431, + "num_input_tokens_seen": 126095985, + "step": 5863, + "time_per_iteration": 2.595496654510498 + }, + { + "auxiliary_loss_clip": 0.01153775, + "auxiliary_loss_mlp": 0.01128757, + "balance_loss_clip": 1.00190365, + "balance_loss_mlp": 1.00077367, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 1.8394670627630607, + "language_loss": 0.74736214, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77018738, + "num_input_tokens_seen": 126116070, + "step": 5864, + "time_per_iteration": 2.604693651199341 + }, + { + "auxiliary_loss_clip": 0.01154891, + "auxiliary_loss_mlp": 0.01128291, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00068927, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 2.1183137441732405, + "language_loss": 0.78680921, + "learning_rate": 3.003932392558793e-06, + "loss": 0.809641, + "num_input_tokens_seen": 126135205, + "step": 5865, + "time_per_iteration": 2.6185901165008545 + }, + { + "auxiliary_loss_clip": 0.01155484, + "auxiliary_loss_mlp": 0.01129098, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00073314, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 2.053980353394269, + "language_loss": 0.81129235, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83413815, + "num_input_tokens_seen": 126151895, + "step": 5866, + "time_per_iteration": 2.5343143939971924 + }, + { + "auxiliary_loss_clip": 0.01104372, + "auxiliary_loss_mlp": 0.01129496, + "balance_loss_clip": 1.00168514, + "balance_loss_mlp": 1.00065446, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.4780564710499795, + "language_loss": 0.83860862, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.86094737, + "num_input_tokens_seen": 126168515, + "step": 5867, + "time_per_iteration": 4.081524133682251 + }, + { + "auxiliary_loss_clip": 0.0117066, + "auxiliary_loss_mlp": 0.01129076, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00080705, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 2.479773798265999, + "language_loss": 0.74115729, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76415467, + "num_input_tokens_seen": 126186460, + "step": 5868, + "time_per_iteration": 2.5183093547821045 + }, + { + "auxiliary_loss_clip": 0.01153664, + "auxiliary_loss_mlp": 0.0112979, + "balance_loss_clip": 1.00201058, + "balance_loss_mlp": 1.00075817, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 1.8524424598953997, + "language_loss": 0.61531168, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63814622, + "num_input_tokens_seen": 126206170, + "step": 5869, + "time_per_iteration": 2.5427215099334717 + }, + { + "auxiliary_loss_clip": 0.01154107, + "auxiliary_loss_mlp": 0.01129036, + "balance_loss_clip": 1.00188792, + "balance_loss_mlp": 1.00067139, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 2.7544404322687925, + "language_loss": 0.74065882, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.7634902, + "num_input_tokens_seen": 126225605, + "step": 5870, + "time_per_iteration": 3.977724552154541 + }, + { + "auxiliary_loss_clip": 0.01154164, + "auxiliary_loss_mlp": 0.01128439, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00064635, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.4443309384446932, + "language_loss": 0.71838951, + "learning_rate": 3.001910665140316e-06, + "loss": 0.74121553, + "num_input_tokens_seen": 126250230, + "step": 5871, + "time_per_iteration": 4.131188154220581 + }, + { + "auxiliary_loss_clip": 0.01155121, + "auxiliary_loss_mlp": 0.01128285, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00077868, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 1.9215359564248613, + "language_loss": 0.73339534, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.7562294, + "num_input_tokens_seen": 126268315, + "step": 5872, + "time_per_iteration": 2.5362393856048584 + }, + { + "auxiliary_loss_clip": 0.01138388, + "auxiliary_loss_mlp": 0.0074794, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00014281, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.5194698011686585, + "language_loss": 0.82129234, + "learning_rate": 3.001236451924089e-06, + "loss": 0.8401556, + "num_input_tokens_seen": 126288390, + "step": 5873, + "time_per_iteration": 4.022215366363525 + }, + { + "auxiliary_loss_clip": 0.01139893, + "auxiliary_loss_mlp": 0.01130289, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00078046, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 2.0973404864730507, + "language_loss": 0.66104764, + "learning_rate": 3.000899288359104e-06, + "loss": 0.6837495, + "num_input_tokens_seen": 126305750, + "step": 5874, + "time_per_iteration": 2.6395068168640137 + }, + { + "auxiliary_loss_clip": 0.01152166, + "auxiliary_loss_mlp": 0.01110723, + "balance_loss_clip": 1.00228679, + "balance_loss_mlp": 1.00009716, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.7904744596709076, + "language_loss": 0.61576939, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63839817, + "num_input_tokens_seen": 126362495, + "step": 5875, + "time_per_iteration": 3.0566394329071045 + }, + { + "auxiliary_loss_clip": 0.01098347, + "auxiliary_loss_mlp": 0.01128813, + "balance_loss_clip": 1.00289083, + "balance_loss_mlp": 1.00092506, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.836850900924698, + "language_loss": 0.79673672, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.81900835, + "num_input_tokens_seen": 126378320, + "step": 5876, + "time_per_iteration": 2.703619956970215 + }, + { + "auxiliary_loss_clip": 0.01118836, + "auxiliary_loss_mlp": 0.00747497, + "balance_loss_clip": 1.00193679, + "balance_loss_mlp": 1.00058353, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.6699197879429196, + "language_loss": 0.5680117, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58667499, + "num_input_tokens_seen": 126442735, + "step": 5877, + "time_per_iteration": 3.2960805892944336 + }, + { + "auxiliary_loss_clip": 0.01139033, + "auxiliary_loss_mlp": 0.01129542, + "balance_loss_clip": 1.00202239, + "balance_loss_mlp": 1.0007962, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 2.083338538415366, + "language_loss": 0.72179723, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74448299, + "num_input_tokens_seen": 126463090, + "step": 5878, + "time_per_iteration": 2.653709888458252 + }, + { + "auxiliary_loss_clip": 0.01137341, + "auxiliary_loss_mlp": 0.01128464, + "balance_loss_clip": 1.00172627, + "balance_loss_mlp": 1.00076747, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.7022620248959281, + "language_loss": 0.78120321, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80386126, + "num_input_tokens_seen": 126482105, + "step": 5879, + "time_per_iteration": 2.6357004642486572 + }, + { + "auxiliary_loss_clip": 0.01121192, + "auxiliary_loss_mlp": 0.0112988, + "balance_loss_clip": 1.00181746, + "balance_loss_mlp": 1.00075209, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 3.0126840507112838, + "language_loss": 0.62716305, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.64967382, + "num_input_tokens_seen": 126502125, + "step": 5880, + "time_per_iteration": 2.6433870792388916 + }, + { + "auxiliary_loss_clip": 0.01137239, + "auxiliary_loss_mlp": 0.0112901, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.0006454, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 2.809413339098749, + "language_loss": 0.65574515, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67840767, + "num_input_tokens_seen": 126521950, + "step": 5881, + "time_per_iteration": 2.6104047298431396 + }, + { + "auxiliary_loss_clip": 0.01154211, + "auxiliary_loss_mlp": 0.01128209, + "balance_loss_clip": 1.0019747, + "balance_loss_mlp": 1.00070238, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.4141732451762175, + "language_loss": 0.7555058, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77832997, + "num_input_tokens_seen": 126542445, + "step": 5882, + "time_per_iteration": 2.5959742069244385 + }, + { + "auxiliary_loss_clip": 0.01138615, + "auxiliary_loss_mlp": 0.01129123, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.0008539, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.0882653580749624, + "language_loss": 0.7075156, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.73019302, + "num_input_tokens_seen": 126560690, + "step": 5883, + "time_per_iteration": 2.6292622089385986 + }, + { + "auxiliary_loss_clip": 0.01121409, + "auxiliary_loss_mlp": 0.0112904, + "balance_loss_clip": 1.00186491, + "balance_loss_mlp": 1.0006752, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.43197254639153, + "language_loss": 0.78244245, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80494702, + "num_input_tokens_seen": 126577620, + "step": 5884, + "time_per_iteration": 2.5946812629699707 + }, + { + "auxiliary_loss_clip": 0.01143229, + "auxiliary_loss_mlp": 0.01128763, + "balance_loss_clip": 1.00226736, + "balance_loss_mlp": 1.00077963, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 1.8392988421372376, + "language_loss": 0.75182575, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77454567, + "num_input_tokens_seen": 126596235, + "step": 5885, + "time_per_iteration": 2.575932264328003 + }, + { + "auxiliary_loss_clip": 0.01106729, + "auxiliary_loss_mlp": 0.01128965, + "balance_loss_clip": 1.00176954, + "balance_loss_mlp": 1.00088692, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.180548886310009, + "language_loss": 0.83742708, + "learning_rate": 2.996850368809606e-06, + "loss": 0.85978401, + "num_input_tokens_seen": 126612830, + "step": 5886, + "time_per_iteration": 2.646082878112793 + }, + { + "auxiliary_loss_clip": 0.01170441, + "auxiliary_loss_mlp": 0.01127497, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00065804, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 7.808050512086843, + "language_loss": 0.77762246, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80060184, + "num_input_tokens_seen": 126630910, + "step": 5887, + "time_per_iteration": 2.545944929122925 + }, + { + "auxiliary_loss_clip": 0.0108816, + "auxiliary_loss_mlp": 0.01127782, + "balance_loss_clip": 1.00155103, + "balance_loss_mlp": 1.00075305, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 1.884893990705803, + "language_loss": 0.65818369, + "learning_rate": 2.996175019078089e-06, + "loss": 0.68034309, + "num_input_tokens_seen": 126648365, + "step": 5888, + "time_per_iteration": 2.688762664794922 + }, + { + "auxiliary_loss_clip": 0.01138979, + "auxiliary_loss_mlp": 0.01128814, + "balance_loss_clip": 1.0020771, + "balance_loss_mlp": 1.00083077, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 4.148406450885723, + "language_loss": 0.7703259, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.7930038, + "num_input_tokens_seen": 126667500, + "step": 5889, + "time_per_iteration": 2.6481080055236816 + }, + { + "auxiliary_loss_clip": 0.0112252, + "auxiliary_loss_mlp": 0.01129002, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00073326, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.8131485601663704, + "language_loss": 0.81057662, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.83309186, + "num_input_tokens_seen": 126686820, + "step": 5890, + "time_per_iteration": 2.618302345275879 + }, + { + "auxiliary_loss_clip": 0.01143337, + "auxiliary_loss_mlp": 0.01127923, + "balance_loss_clip": 1.00238526, + "balance_loss_mlp": 1.00060761, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.5806550563886437, + "language_loss": 0.79332191, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81603456, + "num_input_tokens_seen": 126706965, + "step": 5891, + "time_per_iteration": 2.609490156173706 + }, + { + "auxiliary_loss_clip": 0.01121994, + "auxiliary_loss_mlp": 0.01129141, + "balance_loss_clip": 1.0018903, + "balance_loss_mlp": 1.00077629, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 1.9569647112474198, + "language_loss": 0.73449159, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75700295, + "num_input_tokens_seen": 126724015, + "step": 5892, + "time_per_iteration": 2.6020662784576416 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01128264, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.0008533, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 2.759986444025618, + "language_loss": 0.66903877, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.6917094, + "num_input_tokens_seen": 126737565, + "step": 5893, + "time_per_iteration": 2.5780115127563477 + }, + { + "auxiliary_loss_clip": 0.01106692, + "auxiliary_loss_mlp": 0.01128249, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00074255, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 2.071806439359335, + "language_loss": 0.69783878, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.72018826, + "num_input_tokens_seen": 126756095, + "step": 5894, + "time_per_iteration": 2.7006542682647705 + }, + { + "auxiliary_loss_clip": 0.01144952, + "auxiliary_loss_mlp": 0.00747867, + "balance_loss_clip": 1.00220156, + "balance_loss_mlp": 1.00015128, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.6525616693162268, + "language_loss": 0.74969041, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.76861858, + "num_input_tokens_seen": 126775455, + "step": 5895, + "time_per_iteration": 2.818506956100464 + }, + { + "auxiliary_loss_clip": 0.01138564, + "auxiliary_loss_mlp": 0.01128635, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.00065231, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 2.0132294616074122, + "language_loss": 0.83667231, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85934436, + "num_input_tokens_seen": 126792320, + "step": 5896, + "time_per_iteration": 2.576197385787964 + }, + { + "auxiliary_loss_clip": 0.01138303, + "auxiliary_loss_mlp": 0.00747858, + "balance_loss_clip": 1.00195193, + "balance_loss_mlp": 1.00011349, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.7433502615005718, + "language_loss": 0.70287943, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.72174096, + "num_input_tokens_seen": 126813680, + "step": 5897, + "time_per_iteration": 2.6544878482818604 + }, + { + "auxiliary_loss_clip": 0.01139365, + "auxiliary_loss_mlp": 0.01128036, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00072062, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 2.040616291724205, + "language_loss": 0.81506485, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.83773887, + "num_input_tokens_seen": 126834395, + "step": 5898, + "time_per_iteration": 2.728667736053467 + }, + { + "auxiliary_loss_clip": 0.01170361, + "auxiliary_loss_mlp": 0.01127573, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00092459, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.4266800295743172, + "language_loss": 0.74375886, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76673818, + "num_input_tokens_seen": 126855145, + "step": 5899, + "time_per_iteration": 2.5590264797210693 + }, + { + "auxiliary_loss_clip": 0.01170463, + "auxiliary_loss_mlp": 0.00747913, + "balance_loss_clip": 1.00193775, + "balance_loss_mlp": 1.00013256, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.8128617012392734, + "language_loss": 0.79969656, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81888032, + "num_input_tokens_seen": 126873790, + "step": 5900, + "time_per_iteration": 2.6027917861938477 + }, + { + "auxiliary_loss_clip": 0.01140181, + "auxiliary_loss_mlp": 0.0112799, + "balance_loss_clip": 1.00194466, + "balance_loss_mlp": 1.00076985, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 2.1382024823165247, + "language_loss": 0.81167907, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83436078, + "num_input_tokens_seen": 126892865, + "step": 5901, + "time_per_iteration": 2.6101150512695312 + }, + { + "auxiliary_loss_clip": 0.01154372, + "auxiliary_loss_mlp": 0.00747906, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00013828, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.8770796890419688, + "language_loss": 0.75700217, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77602494, + "num_input_tokens_seen": 126911935, + "step": 5902, + "time_per_iteration": 2.5590014457702637 + }, + { + "auxiliary_loss_clip": 0.01153788, + "auxiliary_loss_mlp": 0.011277, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00067067, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.6371706216811646, + "language_loss": 0.7021969, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72501183, + "num_input_tokens_seen": 126930040, + "step": 5903, + "time_per_iteration": 2.556396245956421 + }, + { + "auxiliary_loss_clip": 0.01153992, + "auxiliary_loss_mlp": 0.01128695, + "balance_loss_clip": 1.00191271, + "balance_loss_mlp": 1.00061655, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 3.2100908892095803, + "language_loss": 0.75183338, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.77466023, + "num_input_tokens_seen": 126948390, + "step": 5904, + "time_per_iteration": 2.571669340133667 + }, + { + "auxiliary_loss_clip": 0.01137296, + "auxiliary_loss_mlp": 0.00747865, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.00014138, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.6649961120730263, + "language_loss": 0.79105711, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.80990875, + "num_input_tokens_seen": 126964905, + "step": 5905, + "time_per_iteration": 3.9519431591033936 + }, + { + "auxiliary_loss_clip": 0.01122722, + "auxiliary_loss_mlp": 0.01126821, + "balance_loss_clip": 1.0017823, + "balance_loss_mlp": 1.00074542, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 1.8458925785561433, + "language_loss": 0.72482705, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74732244, + "num_input_tokens_seen": 126982000, + "step": 5906, + "time_per_iteration": 2.6049656867980957 + }, + { + "auxiliary_loss_clip": 0.01138785, + "auxiliary_loss_mlp": 0.01128059, + "balance_loss_clip": 1.0019294, + "balance_loss_mlp": 1.00055265, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 4.733552237491389, + "language_loss": 0.74946487, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.77213329, + "num_input_tokens_seen": 126998390, + "step": 5907, + "time_per_iteration": 2.5849039554595947 + }, + { + "auxiliary_loss_clip": 0.01092988, + "auxiliary_loss_mlp": 0.01127785, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.00066006, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.7492364940238163, + "language_loss": 0.75526845, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77747619, + "num_input_tokens_seen": 127020220, + "step": 5908, + "time_per_iteration": 5.559142827987671 + }, + { + "auxiliary_loss_clip": 0.01136966, + "auxiliary_loss_mlp": 0.01127462, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00071859, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.812518204014732, + "language_loss": 0.6778245, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70046878, + "num_input_tokens_seen": 127038585, + "step": 5909, + "time_per_iteration": 2.6239516735076904 + }, + { + "auxiliary_loss_clip": 0.01153515, + "auxiliary_loss_mlp": 0.01127107, + "balance_loss_clip": 1.0020833, + "balance_loss_mlp": 1.00084078, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 2.4645851413657973, + "language_loss": 0.78969926, + "learning_rate": 2.988736221969144e-06, + "loss": 0.81250548, + "num_input_tokens_seen": 127056215, + "step": 5910, + "time_per_iteration": 2.560698986053467 + }, + { + "auxiliary_loss_clip": 0.01140034, + "auxiliary_loss_mlp": 0.01128838, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.0007596, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.5951128401256343, + "language_loss": 0.70697814, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72966683, + "num_input_tokens_seen": 127075825, + "step": 5911, + "time_per_iteration": 2.6199305057525635 + }, + { + "auxiliary_loss_clip": 0.01155096, + "auxiliary_loss_mlp": 0.01127323, + "balance_loss_clip": 1.00194466, + "balance_loss_mlp": 1.00077033, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 2.803904030884412, + "language_loss": 0.87392485, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89674902, + "num_input_tokens_seen": 127091205, + "step": 5912, + "time_per_iteration": 3.941387891769409 + }, + { + "auxiliary_loss_clip": 0.0113853, + "auxiliary_loss_mlp": 0.01128024, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00070834, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 2.303265293278575, + "language_loss": 0.77175152, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79441708, + "num_input_tokens_seen": 127109210, + "step": 5913, + "time_per_iteration": 2.638164520263672 + }, + { + "auxiliary_loss_clip": 0.01123441, + "auxiliary_loss_mlp": 0.01127602, + "balance_loss_clip": 1.00192845, + "balance_loss_mlp": 1.00066757, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 3.1838608778039403, + "language_loss": 0.82547247, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84798288, + "num_input_tokens_seen": 127128400, + "step": 5914, + "time_per_iteration": 2.645585536956787 + }, + { + "auxiliary_loss_clip": 0.011704, + "auxiliary_loss_mlp": 0.01127982, + "balance_loss_clip": 1.00206101, + "balance_loss_mlp": 1.00057089, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 1.986806539985008, + "language_loss": 0.70320904, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72619289, + "num_input_tokens_seen": 127149965, + "step": 5915, + "time_per_iteration": 2.623507261276245 + }, + { + "auxiliary_loss_clip": 0.0115323, + "auxiliary_loss_mlp": 0.01126937, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00076556, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.9994890747237408, + "language_loss": 0.75851774, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78131938, + "num_input_tokens_seen": 127169865, + "step": 5916, + "time_per_iteration": 2.5898523330688477 + }, + { + "auxiliary_loss_clip": 0.01124423, + "auxiliary_loss_mlp": 0.01127954, + "balance_loss_clip": 1.00191391, + "balance_loss_mlp": 1.00063884, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 2.439362252839281, + "language_loss": 0.88341916, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90594304, + "num_input_tokens_seen": 127188075, + "step": 5917, + "time_per_iteration": 2.6873581409454346 + }, + { + "auxiliary_loss_clip": 0.01091513, + "auxiliary_loss_mlp": 0.01127163, + "balance_loss_clip": 1.00185847, + "balance_loss_mlp": 1.00061011, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.097100114492556, + "language_loss": 0.74213952, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76432627, + "num_input_tokens_seen": 127206065, + "step": 5918, + "time_per_iteration": 2.7801506519317627 + }, + { + "auxiliary_loss_clip": 0.01151001, + "auxiliary_loss_mlp": 0.01110575, + "balance_loss_clip": 1.00230813, + "balance_loss_mlp": 1.00071168, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 0.9171193291929821, + "language_loss": 0.63785309, + "learning_rate": 2.985687839672857e-06, + "loss": 0.66046888, + "num_input_tokens_seen": 127257885, + "step": 5919, + "time_per_iteration": 2.919341564178467 + }, + { + "auxiliary_loss_clip": 0.01154102, + "auxiliary_loss_mlp": 0.01128133, + "balance_loss_clip": 1.00193834, + "balance_loss_mlp": 1.00072205, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 1.9567381084161988, + "language_loss": 0.73385179, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.75667417, + "num_input_tokens_seen": 127275550, + "step": 5920, + "time_per_iteration": 2.5843331813812256 + }, + { + "auxiliary_loss_clip": 0.01122499, + "auxiliary_loss_mlp": 0.01128143, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00073254, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 2.297309082873241, + "language_loss": 0.77232432, + "learning_rate": 2.985010009903857e-06, + "loss": 0.7948308, + "num_input_tokens_seen": 127295110, + "step": 5921, + "time_per_iteration": 2.6531481742858887 + }, + { + "auxiliary_loss_clip": 0.01138395, + "auxiliary_loss_mlp": 0.01127435, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00059664, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 1.8755572262222906, + "language_loss": 0.67775619, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.70041454, + "num_input_tokens_seen": 127312865, + "step": 5922, + "time_per_iteration": 2.602949619293213 + }, + { + "auxiliary_loss_clip": 0.01155329, + "auxiliary_loss_mlp": 0.01127781, + "balance_loss_clip": 1.00219297, + "balance_loss_mlp": 1.00065613, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 2.0668995368654306, + "language_loss": 0.79194272, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.8147738, + "num_input_tokens_seen": 127331710, + "step": 5923, + "time_per_iteration": 2.598844051361084 + }, + { + "auxiliary_loss_clip": 0.01136991, + "auxiliary_loss_mlp": 0.01127202, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00084007, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.8007347140396248, + "language_loss": 0.85365528, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87629724, + "num_input_tokens_seen": 127350950, + "step": 5924, + "time_per_iteration": 2.592916488647461 + }, + { + "auxiliary_loss_clip": 0.01140038, + "auxiliary_loss_mlp": 0.01128175, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.00076365, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 1.9599086459070694, + "language_loss": 0.77537429, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79805648, + "num_input_tokens_seen": 127369385, + "step": 5925, + "time_per_iteration": 2.666098117828369 + }, + { + "auxiliary_loss_clip": 0.01089652, + "auxiliary_loss_mlp": 0.01126849, + "balance_loss_clip": 1.00167298, + "balance_loss_mlp": 1.00067818, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.7669347267909008, + "language_loss": 0.75955212, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78171718, + "num_input_tokens_seen": 127386965, + "step": 5926, + "time_per_iteration": 2.686631679534912 + }, + { + "auxiliary_loss_clip": 0.01122798, + "auxiliary_loss_mlp": 0.00747874, + "balance_loss_clip": 1.00175381, + "balance_loss_mlp": 1.00015783, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.0964117747452504, + "language_loss": 0.6944223, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71312904, + "num_input_tokens_seen": 127406075, + "step": 5927, + "time_per_iteration": 2.6713204383850098 + }, + { + "auxiliary_loss_clip": 0.01170247, + "auxiliary_loss_mlp": 0.01126671, + "balance_loss_clip": 1.00199831, + "balance_loss_mlp": 1.00059581, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 2.2814402780683145, + "language_loss": 0.79557085, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.8185401, + "num_input_tokens_seen": 127425350, + "step": 5928, + "time_per_iteration": 2.5192108154296875 + }, + { + "auxiliary_loss_clip": 0.01170313, + "auxiliary_loss_mlp": 0.01126578, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00078821, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.5741456926200892, + "language_loss": 0.81697333, + "learning_rate": 2.982297197789215e-06, + "loss": 0.83994222, + "num_input_tokens_seen": 127446335, + "step": 5929, + "time_per_iteration": 2.5437521934509277 + }, + { + "auxiliary_loss_clip": 0.01153545, + "auxiliary_loss_mlp": 0.01127566, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00072694, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.7105213232206904, + "language_loss": 0.69887507, + "learning_rate": 2.981957928520201e-06, + "loss": 0.72168612, + "num_input_tokens_seen": 127462795, + "step": 5930, + "time_per_iteration": 2.5260188579559326 + }, + { + "auxiliary_loss_clip": 0.01153749, + "auxiliary_loss_mlp": 0.01127891, + "balance_loss_clip": 1.00196433, + "balance_loss_mlp": 1.00086188, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 1.819223919893064, + "language_loss": 0.67332685, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69614327, + "num_input_tokens_seen": 127482675, + "step": 5931, + "time_per_iteration": 2.566133975982666 + }, + { + "auxiliary_loss_clip": 0.01153573, + "auxiliary_loss_mlp": 0.01127125, + "balance_loss_clip": 1.0019691, + "balance_loss_mlp": 1.0007627, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.5811563094063115, + "language_loss": 0.67993748, + "learning_rate": 2.981279278287211e-06, + "loss": 0.70274448, + "num_input_tokens_seen": 127502275, + "step": 5932, + "time_per_iteration": 2.5958235263824463 + }, + { + "auxiliary_loss_clip": 0.01103267, + "auxiliary_loss_mlp": 0.01126955, + "balance_loss_clip": 1.00171793, + "balance_loss_mlp": 1.00059295, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 1.9819050779789185, + "language_loss": 0.78486693, + "learning_rate": 2.980939897348969e-06, + "loss": 0.8071692, + "num_input_tokens_seen": 127520195, + "step": 5933, + "time_per_iteration": 2.662158250808716 + }, + { + "auxiliary_loss_clip": 0.01155112, + "auxiliary_loss_mlp": 0.0112805, + "balance_loss_clip": 1.00198209, + "balance_loss_mlp": 1.00073433, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.4186340949297023, + "language_loss": 0.69573921, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71857083, + "num_input_tokens_seen": 127544495, + "step": 5934, + "time_per_iteration": 2.668166160583496 + }, + { + "auxiliary_loss_clip": 0.0113849, + "auxiliary_loss_mlp": 0.00747896, + "balance_loss_clip": 1.00190163, + "balance_loss_mlp": 1.00007963, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 2.5422459468018452, + "language_loss": 0.70984185, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.72870564, + "num_input_tokens_seen": 127563810, + "step": 5935, + "time_per_iteration": 2.603041172027588 + }, + { + "auxiliary_loss_clip": 0.01122981, + "auxiliary_loss_mlp": 0.01126993, + "balance_loss_clip": 1.00181258, + "balance_loss_mlp": 1.00063074, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.0425246839157905, + "language_loss": 0.78162646, + "learning_rate": 2.979921531401692e-06, + "loss": 0.8041262, + "num_input_tokens_seen": 127579065, + "step": 5936, + "time_per_iteration": 2.6228549480438232 + }, + { + "auxiliary_loss_clip": 0.01154065, + "auxiliary_loss_mlp": 0.00747703, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00006104, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 4.129225809006699, + "language_loss": 0.64565772, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66467535, + "num_input_tokens_seen": 127599105, + "step": 5937, + "time_per_iteration": 2.6007235050201416 + }, + { + "auxiliary_loss_clip": 0.01170298, + "auxiliary_loss_mlp": 0.00747716, + "balance_loss_clip": 1.00201058, + "balance_loss_mlp": 1.00010586, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.926002839878223, + "language_loss": 0.78124857, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80042869, + "num_input_tokens_seen": 127614940, + "step": 5938, + "time_per_iteration": 2.5314226150512695 + }, + { + "auxiliary_loss_clip": 0.01120501, + "auxiliary_loss_mlp": 0.01127989, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00086474, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.5527790868118776, + "language_loss": 0.8022244, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82470924, + "num_input_tokens_seen": 127634960, + "step": 5939, + "time_per_iteration": 2.6698930263519287 + }, + { + "auxiliary_loss_clip": 0.01143612, + "auxiliary_loss_mlp": 0.01128504, + "balance_loss_clip": 1.00220799, + "balance_loss_mlp": 1.00061607, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.6230720409680972, + "language_loss": 0.78847593, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81119704, + "num_input_tokens_seen": 127654545, + "step": 5940, + "time_per_iteration": 2.6284737586975098 + }, + { + "auxiliary_loss_clip": 0.01137856, + "auxiliary_loss_mlp": 0.01127836, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00052106, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.291264695754653, + "language_loss": 0.7249009, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74755782, + "num_input_tokens_seen": 127672320, + "step": 5941, + "time_per_iteration": 2.5895462036132812 + }, + { + "auxiliary_loss_clip": 0.01154818, + "auxiliary_loss_mlp": 0.01128485, + "balance_loss_clip": 1.00216043, + "balance_loss_mlp": 1.0008831, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 1.9573183758409476, + "language_loss": 0.65141344, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.67424643, + "num_input_tokens_seen": 127693315, + "step": 5942, + "time_per_iteration": 2.702497959136963 + }, + { + "auxiliary_loss_clip": 0.01153797, + "auxiliary_loss_mlp": 0.01127611, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00086784, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 2.7817011364927033, + "language_loss": 0.73869801, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76151204, + "num_input_tokens_seen": 127711570, + "step": 5943, + "time_per_iteration": 3.9882736206054688 + }, + { + "auxiliary_loss_clip": 0.01167539, + "auxiliary_loss_mlp": 0.0110921, + "balance_loss_clip": 1.00297701, + "balance_loss_mlp": 1.00011003, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.807551482728132, + "language_loss": 0.60750163, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.63026917, + "num_input_tokens_seen": 127772475, + "step": 5944, + "time_per_iteration": 3.227544069290161 + }, + { + "auxiliary_loss_clip": 0.01137399, + "auxiliary_loss_mlp": 0.01127173, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00062013, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.392355366686566, + "language_loss": 0.72850442, + "learning_rate": 2.976864428379655e-06, + "loss": 0.75115013, + "num_input_tokens_seen": 127790940, + "step": 5945, + "time_per_iteration": 2.5992088317871094 + }, + { + "auxiliary_loss_clip": 0.01137914, + "auxiliary_loss_mlp": 0.00747753, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00010943, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.8062845180220333, + "language_loss": 0.80973315, + "learning_rate": 2.976524564880326e-06, + "loss": 0.82858992, + "num_input_tokens_seen": 127808275, + "step": 5946, + "time_per_iteration": 4.078911542892456 + }, + { + "auxiliary_loss_clip": 0.01170426, + "auxiliary_loss_mlp": 0.01128397, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.0007956, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.349542680569676, + "language_loss": 0.68720126, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71018952, + "num_input_tokens_seen": 127828840, + "step": 5947, + "time_per_iteration": 2.5210306644439697 + }, + { + "auxiliary_loss_clip": 0.0113831, + "auxiliary_loss_mlp": 0.01126631, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.0008409, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 2.25761358527382, + "language_loss": 0.75700045, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77964979, + "num_input_tokens_seen": 127846240, + "step": 5948, + "time_per_iteration": 2.5786843299865723 + }, + { + "auxiliary_loss_clip": 0.01089351, + "auxiliary_loss_mlp": 0.01127286, + "balance_loss_clip": 1.00154996, + "balance_loss_mlp": 1.00082898, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.076569288785012, + "language_loss": 0.70458066, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72674704, + "num_input_tokens_seen": 127866880, + "step": 5949, + "time_per_iteration": 4.191350936889648 + }, + { + "auxiliary_loss_clip": 0.0113741, + "auxiliary_loss_mlp": 0.01127202, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00074506, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.7757051517916909, + "language_loss": 0.76730788, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.78995407, + "num_input_tokens_seen": 127883560, + "step": 5950, + "time_per_iteration": 2.6031858921051025 + }, + { + "auxiliary_loss_clip": 0.01154801, + "auxiliary_loss_mlp": 0.01127144, + "balance_loss_clip": 1.00197184, + "balance_loss_mlp": 1.00078225, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.5889162310538527, + "language_loss": 0.73000818, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.75282764, + "num_input_tokens_seen": 127902330, + "step": 5951, + "time_per_iteration": 2.554457187652588 + }, + { + "auxiliary_loss_clip": 0.01153807, + "auxiliary_loss_mlp": 0.01127725, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.00069523, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 2.2300208921714932, + "language_loss": 0.70196426, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.72477955, + "num_input_tokens_seen": 127922325, + "step": 5952, + "time_per_iteration": 2.5963733196258545 + }, + { + "auxiliary_loss_clip": 0.01105816, + "auxiliary_loss_mlp": 0.01126808, + "balance_loss_clip": 1.00173593, + "balance_loss_mlp": 1.00073242, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 1.6973618638505448, + "language_loss": 0.69529068, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71761692, + "num_input_tokens_seen": 127942635, + "step": 5953, + "time_per_iteration": 2.813406467437744 + }, + { + "auxiliary_loss_clip": 0.0113791, + "auxiliary_loss_mlp": 0.01126932, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00057006, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.7870065065459173, + "language_loss": 0.66651058, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68915904, + "num_input_tokens_seen": 127962520, + "step": 5954, + "time_per_iteration": 2.59889817237854 + }, + { + "auxiliary_loss_clip": 0.01136634, + "auxiliary_loss_mlp": 0.01126854, + "balance_loss_clip": 1.00188172, + "balance_loss_mlp": 1.0007782, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.7317180451900671, + "language_loss": 0.74686182, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76949668, + "num_input_tokens_seen": 127981180, + "step": 5955, + "time_per_iteration": 2.653272867202759 + }, + { + "auxiliary_loss_clip": 0.01154008, + "auxiliary_loss_mlp": 0.01126574, + "balance_loss_clip": 1.00196087, + "balance_loss_mlp": 1.00059319, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.510937528822145, + "language_loss": 0.75865442, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78146029, + "num_input_tokens_seen": 127999725, + "step": 5956, + "time_per_iteration": 2.5839014053344727 + }, + { + "auxiliary_loss_clip": 0.01170112, + "auxiliary_loss_mlp": 0.01126309, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00061512, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.5412126075523829, + "language_loss": 0.73688149, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75984567, + "num_input_tokens_seen": 128018885, + "step": 5957, + "time_per_iteration": 2.5122478008270264 + }, + { + "auxiliary_loss_clip": 0.01136975, + "auxiliary_loss_mlp": 0.01127871, + "balance_loss_clip": 1.00199318, + "balance_loss_mlp": 1.00074601, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 2.385745307346405, + "language_loss": 0.71003807, + "learning_rate": 2.972443318242726e-06, + "loss": 0.73268652, + "num_input_tokens_seen": 128037875, + "step": 5958, + "time_per_iteration": 2.612583875656128 + }, + { + "auxiliary_loss_clip": 0.01119644, + "auxiliary_loss_mlp": 0.01126322, + "balance_loss_clip": 1.00173187, + "balance_loss_mlp": 1.00053251, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.9820435965018908, + "language_loss": 0.88390076, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90636039, + "num_input_tokens_seen": 128056045, + "step": 5959, + "time_per_iteration": 2.690948247909546 + }, + { + "auxiliary_loss_clip": 0.01170258, + "auxiliary_loss_mlp": 0.01126991, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00082004, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.8713533995332505, + "language_loss": 0.58098459, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60395712, + "num_input_tokens_seen": 128077815, + "step": 5960, + "time_per_iteration": 2.6089487075805664 + }, + { + "auxiliary_loss_clip": 0.01170222, + "auxiliary_loss_mlp": 0.01126383, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00059319, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 2.350883379627823, + "language_loss": 0.76302183, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78598797, + "num_input_tokens_seen": 128095460, + "step": 5961, + "time_per_iteration": 2.5246646404266357 + }, + { + "auxiliary_loss_clip": 0.01120542, + "auxiliary_loss_mlp": 0.011268, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00062871, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.8892328462145755, + "language_loss": 0.70188713, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72436053, + "num_input_tokens_seen": 128118605, + "step": 5962, + "time_per_iteration": 2.7626242637634277 + }, + { + "auxiliary_loss_clip": 0.01138804, + "auxiliary_loss_mlp": 0.01126144, + "balance_loss_clip": 1.00192416, + "balance_loss_mlp": 1.00064015, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.9489795807811652, + "language_loss": 0.74323034, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76587987, + "num_input_tokens_seen": 128139205, + "step": 5963, + "time_per_iteration": 2.6063344478607178 + }, + { + "auxiliary_loss_clip": 0.01170311, + "auxiliary_loss_mlp": 0.01127098, + "balance_loss_clip": 1.00218034, + "balance_loss_mlp": 1.00064051, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.8230624656309236, + "language_loss": 0.7821461, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80512023, + "num_input_tokens_seen": 128158765, + "step": 5964, + "time_per_iteration": 2.5433425903320312 + }, + { + "auxiliary_loss_clip": 0.01137501, + "auxiliary_loss_mlp": 0.01127303, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00055957, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 2.1257065178741197, + "language_loss": 0.66277134, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68541932, + "num_input_tokens_seen": 128177850, + "step": 5965, + "time_per_iteration": 2.6249077320098877 + }, + { + "auxiliary_loss_clip": 0.01170284, + "auxiliary_loss_mlp": 0.00747844, + "balance_loss_clip": 1.00211835, + "balance_loss_mlp": 1.00012231, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 1.8984718327798886, + "language_loss": 0.78674924, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.8059305, + "num_input_tokens_seen": 128196925, + "step": 5966, + "time_per_iteration": 2.5842700004577637 + }, + { + "auxiliary_loss_clip": 0.0110713, + "auxiliary_loss_mlp": 0.01126976, + "balance_loss_clip": 1.00182378, + "balance_loss_mlp": 1.00070989, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 2.9334447543787556, + "language_loss": 0.91150868, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93384981, + "num_input_tokens_seen": 128213955, + "step": 5967, + "time_per_iteration": 2.6587741374969482 + }, + { + "auxiliary_loss_clip": 0.0112189, + "auxiliary_loss_mlp": 0.01126858, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00087798, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 2.4488505055802032, + "language_loss": 0.80087924, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82336676, + "num_input_tokens_seen": 128232980, + "step": 5968, + "time_per_iteration": 2.638849973678589 + }, + { + "auxiliary_loss_clip": 0.01138145, + "auxiliary_loss_mlp": 0.01127661, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00082231, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.2001937963160105, + "language_loss": 0.84003723, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.86269528, + "num_input_tokens_seen": 128252795, + "step": 5969, + "time_per_iteration": 2.5904524326324463 + }, + { + "auxiliary_loss_clip": 0.01121763, + "auxiliary_loss_mlp": 0.01126676, + "balance_loss_clip": 1.00192773, + "balance_loss_mlp": 1.00069582, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 2.1633202664262745, + "language_loss": 0.71814543, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74062985, + "num_input_tokens_seen": 128273115, + "step": 5970, + "time_per_iteration": 2.712165594100952 + }, + { + "auxiliary_loss_clip": 0.01137946, + "auxiliary_loss_mlp": 0.01127104, + "balance_loss_clip": 1.00193048, + "balance_loss_mlp": 1.00074196, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 2.6886374462420943, + "language_loss": 0.79547697, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81812745, + "num_input_tokens_seen": 128292220, + "step": 5971, + "time_per_iteration": 2.578429698944092 + }, + { + "auxiliary_loss_clip": 0.01106361, + "auxiliary_loss_mlp": 0.01126365, + "balance_loss_clip": 1.00164771, + "balance_loss_mlp": 1.00057495, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 2.35883726262608, + "language_loss": 0.78437328, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80670047, + "num_input_tokens_seen": 128310305, + "step": 5972, + "time_per_iteration": 2.645905017852783 + }, + { + "auxiliary_loss_clip": 0.01121612, + "auxiliary_loss_mlp": 0.01127035, + "balance_loss_clip": 1.0018369, + "balance_loss_mlp": 1.00067294, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 1.8067206352423935, + "language_loss": 0.81527305, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83775949, + "num_input_tokens_seen": 128328305, + "step": 5973, + "time_per_iteration": 2.6174938678741455 + }, + { + "auxiliary_loss_clip": 0.01137707, + "auxiliary_loss_mlp": 0.01109086, + "balance_loss_clip": 1.00284672, + "balance_loss_mlp": 0.99998587, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9101278502800156, + "language_loss": 0.56759262, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.59006059, + "num_input_tokens_seen": 128378380, + "step": 5974, + "time_per_iteration": 3.0408596992492676 + }, + { + "auxiliary_loss_clip": 0.01154718, + "auxiliary_loss_mlp": 0.01127044, + "balance_loss_clip": 1.00203228, + "balance_loss_mlp": 1.00087345, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.6696223473541962, + "language_loss": 0.68809593, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.71091354, + "num_input_tokens_seen": 128394315, + "step": 5975, + "time_per_iteration": 2.5570292472839355 + }, + { + "auxiliary_loss_clip": 0.01170311, + "auxiliary_loss_mlp": 0.01126424, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.00082541, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.56991764448174, + "language_loss": 0.80211669, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82508403, + "num_input_tokens_seen": 128414515, + "step": 5976, + "time_per_iteration": 2.6282739639282227 + }, + { + "auxiliary_loss_clip": 0.01090936, + "auxiliary_loss_mlp": 0.01126269, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00057459, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.9457884414065483, + "language_loss": 0.7896564, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81182843, + "num_input_tokens_seen": 128430615, + "step": 5977, + "time_per_iteration": 2.724966526031494 + }, + { + "auxiliary_loss_clip": 0.01120316, + "auxiliary_loss_mlp": 0.01125925, + "balance_loss_clip": 1.00167537, + "balance_loss_mlp": 1.00061202, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7562175521871437, + "language_loss": 0.79937065, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82183307, + "num_input_tokens_seen": 128449480, + "step": 5978, + "time_per_iteration": 2.660269021987915 + }, + { + "auxiliary_loss_clip": 0.0117015, + "auxiliary_loss_mlp": 0.00747798, + "balance_loss_clip": 1.00200427, + "balance_loss_mlp": 1.00011516, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.9587040760352623, + "language_loss": 0.67692518, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69610465, + "num_input_tokens_seen": 128471465, + "step": 5979, + "time_per_iteration": 2.59887433052063 + }, + { + "auxiliary_loss_clip": 0.01140217, + "auxiliary_loss_mlp": 0.01127077, + "balance_loss_clip": 1.00200951, + "balance_loss_mlp": 1.0008111, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 1.9223663833096087, + "language_loss": 0.67023361, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.6929065, + "num_input_tokens_seen": 128490645, + "step": 5980, + "time_per_iteration": 2.6075007915496826 + }, + { + "auxiliary_loss_clip": 0.01138173, + "auxiliary_loss_mlp": 0.01127686, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00075245, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 2.15188496948689, + "language_loss": 0.71150303, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73416162, + "num_input_tokens_seen": 128510225, + "step": 5981, + "time_per_iteration": 4.127773761749268 + }, + { + "auxiliary_loss_clip": 0.0113683, + "auxiliary_loss_mlp": 0.01126774, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.00069809, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.649274605124688, + "language_loss": 0.70865417, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73129022, + "num_input_tokens_seen": 128530195, + "step": 5982, + "time_per_iteration": 2.6662380695343018 + }, + { + "auxiliary_loss_clip": 0.01154843, + "auxiliary_loss_mlp": 0.01126521, + "balance_loss_clip": 1.00204682, + "balance_loss_mlp": 1.00082695, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 2.120870931872309, + "language_loss": 0.75996006, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.78277373, + "num_input_tokens_seen": 128549990, + "step": 5983, + "time_per_iteration": 2.57177734375 + }, + { + "auxiliary_loss_clip": 0.01170227, + "auxiliary_loss_mlp": 0.01127443, + "balance_loss_clip": 1.00208497, + "balance_loss_mlp": 1.00079501, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 1.6835684278418077, + "language_loss": 0.76336437, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78634101, + "num_input_tokens_seen": 128567925, + "step": 5984, + "time_per_iteration": 3.9934258460998535 + }, + { + "auxiliary_loss_clip": 0.01153526, + "auxiliary_loss_mlp": 0.00747723, + "balance_loss_clip": 1.00195026, + "balance_loss_mlp": 1.00008392, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.7810150056217446, + "language_loss": 0.86183906, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88085151, + "num_input_tokens_seen": 128585655, + "step": 5985, + "time_per_iteration": 2.6504743099212646 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01126446, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00075221, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.5132126013821299, + "language_loss": 0.72332549, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74612606, + "num_input_tokens_seen": 128604820, + "step": 5986, + "time_per_iteration": 2.530553102493286 + }, + { + "auxiliary_loss_clip": 0.01123413, + "auxiliary_loss_mlp": 0.01127989, + "balance_loss_clip": 1.00182247, + "balance_loss_mlp": 1.00067353, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 1.8019131491526876, + "language_loss": 0.73583823, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75835228, + "num_input_tokens_seen": 128623070, + "step": 5987, + "time_per_iteration": 4.063912391662598 + }, + { + "auxiliary_loss_clip": 0.01170371, + "auxiliary_loss_mlp": 0.01127423, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.00058413, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 2.368505691390363, + "language_loss": 0.69441831, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.7173962, + "num_input_tokens_seen": 128642430, + "step": 5988, + "time_per_iteration": 2.5224392414093018 + }, + { + "auxiliary_loss_clip": 0.01153828, + "auxiliary_loss_mlp": 0.01127005, + "balance_loss_clip": 1.00199318, + "balance_loss_mlp": 1.00064349, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.8133963984236736, + "language_loss": 0.73219585, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75500417, + "num_input_tokens_seen": 128661285, + "step": 5989, + "time_per_iteration": 2.6348438262939453 + }, + { + "auxiliary_loss_clip": 0.0112352, + "auxiliary_loss_mlp": 0.01126223, + "balance_loss_clip": 1.00194061, + "balance_loss_mlp": 1.00071931, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.5249914970001093, + "language_loss": 0.79886961, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82136703, + "num_input_tokens_seen": 128682210, + "step": 5990, + "time_per_iteration": 2.694373607635498 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01127331, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00058746, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.7124502060733229, + "language_loss": 0.84109259, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86390197, + "num_input_tokens_seen": 128700445, + "step": 5991, + "time_per_iteration": 2.539911985397339 + }, + { + "auxiliary_loss_clip": 0.01143935, + "auxiliary_loss_mlp": 0.0112755, + "balance_loss_clip": 1.00268388, + "balance_loss_mlp": 1.00071108, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 4.974296286244461, + "language_loss": 0.75407171, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77678657, + "num_input_tokens_seen": 128716855, + "step": 5992, + "time_per_iteration": 2.563342571258545 + }, + { + "auxiliary_loss_clip": 0.0117019, + "auxiliary_loss_mlp": 0.01127149, + "balance_loss_clip": 1.00206888, + "balance_loss_mlp": 1.00078726, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 2.0200672520714704, + "language_loss": 0.77042508, + "learning_rate": 2.960509433875627e-06, + "loss": 0.7933985, + "num_input_tokens_seen": 128735835, + "step": 5993, + "time_per_iteration": 2.5067200660705566 + }, + { + "auxiliary_loss_clip": 0.01138662, + "auxiliary_loss_mlp": 0.01127172, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00061953, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 2.1451345724170765, + "language_loss": 0.73965329, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.76231164, + "num_input_tokens_seen": 128752465, + "step": 5994, + "time_per_iteration": 2.5664002895355225 + }, + { + "auxiliary_loss_clip": 0.0110353, + "auxiliary_loss_mlp": 0.01127184, + "balance_loss_clip": 1.00172687, + "balance_loss_mlp": 1.00053644, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8824714922423502, + "language_loss": 0.68674791, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.70905507, + "num_input_tokens_seen": 128770865, + "step": 5995, + "time_per_iteration": 2.679669141769409 + }, + { + "auxiliary_loss_clip": 0.01138437, + "auxiliary_loss_mlp": 0.01127024, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00085247, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 2.014596737003908, + "language_loss": 0.8239491, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84660363, + "num_input_tokens_seen": 128789730, + "step": 5996, + "time_per_iteration": 2.644444465637207 + }, + { + "auxiliary_loss_clip": 0.01170317, + "auxiliary_loss_mlp": 0.01126976, + "balance_loss_clip": 1.00213182, + "balance_loss_mlp": 1.00070965, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.6872415322134542, + "language_loss": 0.73782551, + "learning_rate": 2.959142709981763e-06, + "loss": 0.76079845, + "num_input_tokens_seen": 128806610, + "step": 5997, + "time_per_iteration": 2.4955568313598633 + }, + { + "auxiliary_loss_clip": 0.01159805, + "auxiliary_loss_mlp": 0.01126203, + "balance_loss_clip": 1.00262308, + "balance_loss_mlp": 1.00060391, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.625766578717138, + "language_loss": 0.68809724, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.71095729, + "num_input_tokens_seen": 128824830, + "step": 5998, + "time_per_iteration": 2.53680682182312 + }, + { + "auxiliary_loss_clip": 0.01107016, + "auxiliary_loss_mlp": 0.01126065, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00065732, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.2352268812351443, + "language_loss": 0.76882225, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79115307, + "num_input_tokens_seen": 128838170, + "step": 5999, + "time_per_iteration": 2.634275197982788 + }, + { + "auxiliary_loss_clip": 0.01122481, + "auxiliary_loss_mlp": 0.01127717, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00078321, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 4.710911597047363, + "language_loss": 0.77841556, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80091757, + "num_input_tokens_seen": 128855625, + "step": 6000, + "time_per_iteration": 2.774646043777466 + }, + { + "auxiliary_loss_clip": 0.01126893, + "auxiliary_loss_mlp": 0.01126871, + "balance_loss_clip": 1.00220513, + "balance_loss_mlp": 1.00070024, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6744125089697424, + "language_loss": 0.78493011, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.8074677, + "num_input_tokens_seen": 128873540, + "step": 6001, + "time_per_iteration": 2.6507198810577393 + }, + { + "auxiliary_loss_clip": 0.01170086, + "auxiliary_loss_mlp": 0.00747755, + "balance_loss_clip": 1.00201631, + "balance_loss_mlp": 1.00006557, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 2.016047139595555, + "language_loss": 0.83096182, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85014027, + "num_input_tokens_seen": 128889925, + "step": 6002, + "time_per_iteration": 2.5189690589904785 + }, + { + "auxiliary_loss_clip": 0.01137787, + "auxiliary_loss_mlp": 0.0112563, + "balance_loss_clip": 1.00195682, + "balance_loss_mlp": 1.00050783, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 1.9946306089170303, + "language_loss": 0.90766972, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.93030387, + "num_input_tokens_seen": 128906890, + "step": 6003, + "time_per_iteration": 2.6045753955841064 + }, + { + "auxiliary_loss_clip": 0.01117953, + "auxiliary_loss_mlp": 0.01108271, + "balance_loss_clip": 1.00223732, + "balance_loss_mlp": 0.99993384, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8676487487956994, + "language_loss": 0.53339159, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55565381, + "num_input_tokens_seen": 128965940, + "step": 6004, + "time_per_iteration": 3.140104293823242 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.00747813, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00006652, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.9085365594865746, + "language_loss": 0.77618623, + "learning_rate": 2.956407517225883e-06, + "loss": 0.7950362, + "num_input_tokens_seen": 128985835, + "step": 6005, + "time_per_iteration": 2.627267599105835 + }, + { + "auxiliary_loss_clip": 0.01155271, + "auxiliary_loss_mlp": 0.0112683, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00094461, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 1.9714766084456206, + "language_loss": 0.79293263, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81575364, + "num_input_tokens_seen": 129003120, + "step": 6006, + "time_per_iteration": 2.5416221618652344 + }, + { + "auxiliary_loss_clip": 0.01170276, + "auxiliary_loss_mlp": 0.01127328, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00058436, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 4.049543793616366, + "language_loss": 0.84188402, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86486, + "num_input_tokens_seen": 129021645, + "step": 6007, + "time_per_iteration": 2.5198841094970703 + }, + { + "auxiliary_loss_clip": 0.01137423, + "auxiliary_loss_mlp": 0.0112757, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00082648, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 4.795230920973962, + "language_loss": 0.72601914, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74866903, + "num_input_tokens_seen": 129038375, + "step": 6008, + "time_per_iteration": 2.5823187828063965 + }, + { + "auxiliary_loss_clip": 0.01155149, + "auxiliary_loss_mlp": 0.01126523, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00073314, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 1.926005498932256, + "language_loss": 0.8262772, + "learning_rate": 2.955039050023368e-06, + "loss": 0.84909391, + "num_input_tokens_seen": 129056235, + "step": 6009, + "time_per_iteration": 2.5431461334228516 + }, + { + "auxiliary_loss_clip": 0.01122053, + "auxiliary_loss_mlp": 0.01126749, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00057817, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.9810036324573854, + "language_loss": 0.76204252, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78453058, + "num_input_tokens_seen": 129072405, + "step": 6010, + "time_per_iteration": 2.6058921813964844 + }, + { + "auxiliary_loss_clip": 0.01136808, + "auxiliary_loss_mlp": 0.01126915, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00074363, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.551677009383303, + "language_loss": 0.83054376, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.853181, + "num_input_tokens_seen": 129090225, + "step": 6011, + "time_per_iteration": 2.58491587638855 + }, + { + "auxiliary_loss_clip": 0.01170473, + "auxiliary_loss_mlp": 0.01127129, + "balance_loss_clip": 1.00207675, + "balance_loss_mlp": 1.00076747, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.973443719302171, + "language_loss": 0.62645233, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64942831, + "num_input_tokens_seen": 129107685, + "step": 6012, + "time_per_iteration": 2.516197681427002 + }, + { + "auxiliary_loss_clip": 0.0113653, + "auxiliary_loss_mlp": 0.01125619, + "balance_loss_clip": 1.00174809, + "balance_loss_mlp": 1.00078249, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.8242191625500885, + "language_loss": 0.83862698, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86124837, + "num_input_tokens_seen": 129125315, + "step": 6013, + "time_per_iteration": 2.6149003505706787 + }, + { + "auxiliary_loss_clip": 0.01170232, + "auxiliary_loss_mlp": 0.01126853, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00068164, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 2.3622647558495715, + "language_loss": 0.91670907, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93967986, + "num_input_tokens_seen": 129141600, + "step": 6014, + "time_per_iteration": 2.5443718433380127 + }, + { + "auxiliary_loss_clip": 0.01170149, + "auxiliary_loss_mlp": 0.01126629, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.0007441, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.7444424990268832, + "language_loss": 0.73595303, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75892079, + "num_input_tokens_seen": 129160665, + "step": 6015, + "time_per_iteration": 2.526353120803833 + }, + { + "auxiliary_loss_clip": 0.01074738, + "auxiliary_loss_mlp": 0.01127135, + "balance_loss_clip": 1.00171101, + "balance_loss_mlp": 1.00077295, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.8272023774689568, + "language_loss": 0.64851016, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67052889, + "num_input_tokens_seen": 129179220, + "step": 6016, + "time_per_iteration": 2.75317120552063 + }, + { + "auxiliary_loss_clip": 0.01153996, + "auxiliary_loss_mlp": 0.01127221, + "balance_loss_clip": 1.00205743, + "balance_loss_mlp": 1.00085866, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.6372505823616346, + "language_loss": 0.71324962, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.73606181, + "num_input_tokens_seen": 129200385, + "step": 6017, + "time_per_iteration": 2.688866138458252 + }, + { + "auxiliary_loss_clip": 0.01153736, + "auxiliary_loss_mlp": 0.0112801, + "balance_loss_clip": 1.00188911, + "balance_loss_mlp": 1.00059891, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.8197400774181565, + "language_loss": 0.73494506, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75776255, + "num_input_tokens_seen": 129217395, + "step": 6018, + "time_per_iteration": 2.539803981781006 + }, + { + "auxiliary_loss_clip": 0.01120536, + "auxiliary_loss_mlp": 0.01126766, + "balance_loss_clip": 1.0016396, + "balance_loss_mlp": 1.00069022, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.6216987838330934, + "language_loss": 0.69185531, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71432835, + "num_input_tokens_seen": 129238940, + "step": 6019, + "time_per_iteration": 4.049725294113159 + }, + { + "auxiliary_loss_clip": 0.01137658, + "auxiliary_loss_mlp": 0.01127321, + "balance_loss_clip": 1.00178671, + "balance_loss_mlp": 1.00057745, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.9448955202110874, + "language_loss": 0.7644465, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78709626, + "num_input_tokens_seen": 129258240, + "step": 6020, + "time_per_iteration": 2.6106152534484863 + }, + { + "auxiliary_loss_clip": 0.01159844, + "auxiliary_loss_mlp": 0.0112736, + "balance_loss_clip": 1.00213635, + "balance_loss_mlp": 1.00071239, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 1.880536664279529, + "language_loss": 0.73547149, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.75834346, + "num_input_tokens_seen": 129279040, + "step": 6021, + "time_per_iteration": 2.55159068107605 + }, + { + "auxiliary_loss_clip": 0.01122286, + "auxiliary_loss_mlp": 0.01127196, + "balance_loss_clip": 1.00180876, + "balance_loss_mlp": 1.00083423, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 2.1512614641760055, + "language_loss": 0.80674672, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.82924151, + "num_input_tokens_seen": 129295415, + "step": 6022, + "time_per_iteration": 4.0701539516448975 + }, + { + "auxiliary_loss_clip": 0.01153516, + "auxiliary_loss_mlp": 0.01126798, + "balance_loss_clip": 1.00208259, + "balance_loss_mlp": 1.00091338, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 2.10477202858399, + "language_loss": 0.81528425, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83808732, + "num_input_tokens_seen": 129312620, + "step": 6023, + "time_per_iteration": 2.602351665496826 + }, + { + "auxiliary_loss_clip": 0.0113865, + "auxiliary_loss_mlp": 0.0112789, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00057483, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.9824097954517186, + "language_loss": 0.79309183, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81575727, + "num_input_tokens_seen": 129331825, + "step": 6024, + "time_per_iteration": 2.6023714542388916 + }, + { + "auxiliary_loss_clip": 0.01139773, + "auxiliary_loss_mlp": 0.01125638, + "balance_loss_clip": 1.00213492, + "balance_loss_mlp": 1.00051594, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 2.2220554945496533, + "language_loss": 0.75013685, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.77279091, + "num_input_tokens_seen": 129350400, + "step": 6025, + "time_per_iteration": 3.9828310012817383 + }, + { + "auxiliary_loss_clip": 0.01153568, + "auxiliary_loss_mlp": 0.00747779, + "balance_loss_clip": 1.00200629, + "balance_loss_mlp": 1.00009036, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.6069209237585245, + "language_loss": 0.72216177, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74117523, + "num_input_tokens_seen": 129371155, + "step": 6026, + "time_per_iteration": 2.622389554977417 + }, + { + "auxiliary_loss_clip": 0.01153346, + "auxiliary_loss_mlp": 0.01128593, + "balance_loss_clip": 1.0019902, + "balance_loss_mlp": 1.00108695, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.4522889359317275, + "language_loss": 0.79155701, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81437641, + "num_input_tokens_seen": 129391230, + "step": 6027, + "time_per_iteration": 2.594672679901123 + }, + { + "auxiliary_loss_clip": 0.01143036, + "auxiliary_loss_mlp": 0.01127501, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00075817, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.7548513165607866, + "language_loss": 0.67420125, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69690669, + "num_input_tokens_seen": 129410065, + "step": 6028, + "time_per_iteration": 2.6305527687072754 + }, + { + "auxiliary_loss_clip": 0.01122322, + "auxiliary_loss_mlp": 0.01127327, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00077415, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.8768295257390875, + "language_loss": 0.85358459, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.87608111, + "num_input_tokens_seen": 129428655, + "step": 6029, + "time_per_iteration": 2.6051337718963623 + }, + { + "auxiliary_loss_clip": 0.0112122, + "auxiliary_loss_mlp": 0.01126736, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00075603, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.714436694490824, + "language_loss": 0.72574449, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74822402, + "num_input_tokens_seen": 129447845, + "step": 6030, + "time_per_iteration": 2.624751329421997 + }, + { + "auxiliary_loss_clip": 0.01137259, + "auxiliary_loss_mlp": 0.011286, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00061703, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.6892940779389076, + "language_loss": 0.74638844, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76904702, + "num_input_tokens_seen": 129463275, + "step": 6031, + "time_per_iteration": 2.547642230987549 + }, + { + "auxiliary_loss_clip": 0.01122813, + "auxiliary_loss_mlp": 0.01126302, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00079823, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.7838495771607297, + "language_loss": 0.73218036, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75467151, + "num_input_tokens_seen": 129483205, + "step": 6032, + "time_per_iteration": 2.644602060317993 + }, + { + "auxiliary_loss_clip": 0.01105932, + "auxiliary_loss_mlp": 0.01127594, + "balance_loss_clip": 1.00184548, + "balance_loss_mlp": 1.00094581, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 2.3186334162453344, + "language_loss": 0.77425689, + "learning_rate": 2.946816107593884e-06, + "loss": 0.79659212, + "num_input_tokens_seen": 129499885, + "step": 6033, + "time_per_iteration": 2.657057046890259 + }, + { + "auxiliary_loss_clip": 0.01103236, + "auxiliary_loss_mlp": 0.01109279, + "balance_loss_clip": 1.00207877, + "balance_loss_mlp": 1.00017822, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.7696087826648603, + "language_loss": 0.64754492, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66967005, + "num_input_tokens_seen": 129561885, + "step": 6034, + "time_per_iteration": 3.326313018798828 + }, + { + "auxiliary_loss_clip": 0.01154936, + "auxiliary_loss_mlp": 0.01127494, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00065553, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.497782885026774, + "language_loss": 0.89839798, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92122233, + "num_input_tokens_seen": 129582325, + "step": 6035, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.01137133, + "auxiliary_loss_mlp": 0.01127725, + "balance_loss_clip": 1.00165892, + "balance_loss_mlp": 1.00079143, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.9965579475575732, + "language_loss": 0.73981923, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.7624678, + "num_input_tokens_seen": 129600350, + "step": 6036, + "time_per_iteration": 2.573982000350952 + }, + { + "auxiliary_loss_clip": 0.01138337, + "auxiliary_loss_mlp": 0.01127303, + "balance_loss_clip": 1.00175726, + "balance_loss_mlp": 1.00065517, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 2.6247292577968837, + "language_loss": 0.75619209, + "learning_rate": 2.945443601747297e-06, + "loss": 0.77884853, + "num_input_tokens_seen": 129618425, + "step": 6037, + "time_per_iteration": 2.5859837532043457 + }, + { + "auxiliary_loss_clip": 0.01154791, + "auxiliary_loss_mlp": 0.01126591, + "balance_loss_clip": 1.00196242, + "balance_loss_mlp": 1.00089622, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.5718974815979148, + "language_loss": 0.78454328, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80735707, + "num_input_tokens_seen": 129636750, + "step": 6038, + "time_per_iteration": 2.5570244789123535 + }, + { + "auxiliary_loss_clip": 0.011508, + "auxiliary_loss_mlp": 0.01108297, + "balance_loss_clip": 1.00258255, + "balance_loss_mlp": 0.99995989, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.838647759547423, + "language_loss": 0.63474107, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.657332, + "num_input_tokens_seen": 129699030, + "step": 6039, + "time_per_iteration": 3.1836419105529785 + }, + { + "auxiliary_loss_clip": 0.01138286, + "auxiliary_loss_mlp": 0.01126923, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00094283, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 2.0446164261129516, + "language_loss": 0.71689868, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73955071, + "num_input_tokens_seen": 129717135, + "step": 6040, + "time_per_iteration": 2.608157157897949 + }, + { + "auxiliary_loss_clip": 0.01153436, + "auxiliary_loss_mlp": 0.01127517, + "balance_loss_clip": 1.00202155, + "balance_loss_mlp": 1.00077391, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 2.029510019054223, + "language_loss": 0.81162608, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83443558, + "num_input_tokens_seen": 129735940, + "step": 6041, + "time_per_iteration": 2.58309006690979 + }, + { + "auxiliary_loss_clip": 0.01138571, + "auxiliary_loss_mlp": 0.01127684, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00065482, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.25665079244053, + "language_loss": 0.83481717, + "learning_rate": 2.943727162882107e-06, + "loss": 0.85747969, + "num_input_tokens_seen": 129752790, + "step": 6042, + "time_per_iteration": 2.5650575160980225 + }, + { + "auxiliary_loss_clip": 0.01144184, + "auxiliary_loss_mlp": 0.01126705, + "balance_loss_clip": 1.00209355, + "balance_loss_mlp": 1.00081968, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.5895527637051103, + "language_loss": 0.78010154, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80281043, + "num_input_tokens_seen": 129773655, + "step": 6043, + "time_per_iteration": 2.6072914600372314 + }, + { + "auxiliary_loss_clip": 0.01122727, + "auxiliary_loss_mlp": 0.01127122, + "balance_loss_clip": 1.00193191, + "balance_loss_mlp": 1.00076056, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 1.7847596969935182, + "language_loss": 0.65117061, + "learning_rate": 2.943040336741298e-06, + "loss": 0.6736691, + "num_input_tokens_seen": 129791605, + "step": 6044, + "time_per_iteration": 2.6092052459716797 + }, + { + "auxiliary_loss_clip": 0.01138771, + "auxiliary_loss_mlp": 0.01127399, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00065541, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 2.292928459546398, + "language_loss": 0.80771375, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83037543, + "num_input_tokens_seen": 129811075, + "step": 6045, + "time_per_iteration": 2.6258018016815186 + }, + { + "auxiliary_loss_clip": 0.01121804, + "auxiliary_loss_mlp": 0.01126992, + "balance_loss_clip": 1.00184119, + "balance_loss_mlp": 1.00072527, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 1.7114995312405372, + "language_loss": 0.64332688, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66581482, + "num_input_tokens_seen": 129833755, + "step": 6046, + "time_per_iteration": 2.7333734035491943 + }, + { + "auxiliary_loss_clip": 0.01121034, + "auxiliary_loss_mlp": 0.01126859, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00059247, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.7513384836519248, + "language_loss": 0.77809584, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.80057478, + "num_input_tokens_seen": 129854475, + "step": 6047, + "time_per_iteration": 2.6841907501220703 + }, + { + "auxiliary_loss_clip": 0.01155142, + "auxiliary_loss_mlp": 0.01128333, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00063622, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 2.2173594846689064, + "language_loss": 0.79556996, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81840473, + "num_input_tokens_seen": 129873530, + "step": 6048, + "time_per_iteration": 2.6072728633880615 + }, + { + "auxiliary_loss_clip": 0.01150453, + "auxiliary_loss_mlp": 0.01108428, + "balance_loss_clip": 1.00276423, + "balance_loss_mlp": 1.00009084, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7491386196096863, + "language_loss": 0.52570891, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.5482977, + "num_input_tokens_seen": 129940400, + "step": 6049, + "time_per_iteration": 3.2101831436157227 + }, + { + "auxiliary_loss_clip": 0.0112429, + "auxiliary_loss_mlp": 0.01127152, + "balance_loss_clip": 1.00197625, + "balance_loss_mlp": 1.00059962, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 1.8774920366763295, + "language_loss": 0.86146832, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88398278, + "num_input_tokens_seen": 129958635, + "step": 6050, + "time_per_iteration": 2.666163682937622 + }, + { + "auxiliary_loss_clip": 0.01153766, + "auxiliary_loss_mlp": 0.00747729, + "balance_loss_clip": 1.00194132, + "balance_loss_mlp": 1.00008821, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 3.257811721469214, + "language_loss": 0.78395998, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80297494, + "num_input_tokens_seen": 129977685, + "step": 6051, + "time_per_iteration": 2.5469791889190674 + }, + { + "auxiliary_loss_clip": 0.01154765, + "auxiliary_loss_mlp": 0.01126169, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.00076103, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 1.9183819094796783, + "language_loss": 0.82672155, + "learning_rate": 2.940291602812822e-06, + "loss": 0.84953088, + "num_input_tokens_seen": 129997530, + "step": 6052, + "time_per_iteration": 2.6131746768951416 + }, + { + "auxiliary_loss_clip": 0.01120898, + "auxiliary_loss_mlp": 0.01126312, + "balance_loss_clip": 1.00182807, + "balance_loss_mlp": 1.00080824, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 1.9921662872861008, + "language_loss": 0.7220726, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74454463, + "num_input_tokens_seen": 130017955, + "step": 6053, + "time_per_iteration": 2.699225902557373 + }, + { + "auxiliary_loss_clip": 0.01090619, + "auxiliary_loss_mlp": 0.01108213, + "balance_loss_clip": 1.00282407, + "balance_loss_mlp": 0.99987584, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7822218493363277, + "language_loss": 0.61234832, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63433665, + "num_input_tokens_seen": 130074275, + "step": 6054, + "time_per_iteration": 3.2601144313812256 + }, + { + "auxiliary_loss_clip": 0.01138724, + "auxiliary_loss_mlp": 0.01127177, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00071967, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 9.577711839726668, + "language_loss": 0.7573697, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.7800287, + "num_input_tokens_seen": 130091375, + "step": 6055, + "time_per_iteration": 2.6068263053894043 + }, + { + "auxiliary_loss_clip": 0.01170228, + "auxiliary_loss_mlp": 0.01127455, + "balance_loss_clip": 1.00207806, + "balance_loss_mlp": 1.0007118, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.8022708563325247, + "language_loss": 0.75299764, + "learning_rate": 2.938916379688765e-06, + "loss": 0.77597445, + "num_input_tokens_seen": 130111595, + "step": 6056, + "time_per_iteration": 3.9177730083465576 + }, + { + "auxiliary_loss_clip": 0.0113859, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_clip": 1.00199008, + "balance_loss_mlp": 1.00081444, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 1.7372376637983753, + "language_loss": 0.80088556, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82353848, + "num_input_tokens_seen": 130131440, + "step": 6057, + "time_per_iteration": 2.620199203491211 + }, + { + "auxiliary_loss_clip": 0.01144514, + "auxiliary_loss_mlp": 0.01127116, + "balance_loss_clip": 1.0021441, + "balance_loss_mlp": 1.00065851, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 1.8355486772503058, + "language_loss": 0.80051112, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82322741, + "num_input_tokens_seen": 130151375, + "step": 6058, + "time_per_iteration": 2.633826494216919 + }, + { + "auxiliary_loss_clip": 0.01138689, + "auxiliary_loss_mlp": 0.00747634, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00004232, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.7722681943423861, + "language_loss": 0.84781325, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.86667645, + "num_input_tokens_seen": 130169960, + "step": 6059, + "time_per_iteration": 4.06209397315979 + }, + { + "auxiliary_loss_clip": 0.01122957, + "auxiliary_loss_mlp": 0.01126837, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00085616, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.5630565439281037, + "language_loss": 0.88052666, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90302455, + "num_input_tokens_seen": 130189800, + "step": 6060, + "time_per_iteration": 2.6836559772491455 + }, + { + "auxiliary_loss_clip": 0.01153477, + "auxiliary_loss_mlp": 0.01127397, + "balance_loss_clip": 1.00197363, + "balance_loss_mlp": 1.00093973, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 1.900678998525738, + "language_loss": 0.6657052, + "learning_rate": 2.937196549795971e-06, + "loss": 0.68851393, + "num_input_tokens_seen": 130206370, + "step": 6061, + "time_per_iteration": 2.5483713150024414 + }, + { + "auxiliary_loss_clip": 0.0113738, + "auxiliary_loss_mlp": 0.01127315, + "balance_loss_clip": 1.00203323, + "balance_loss_mlp": 1.00066698, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.1508328646254107, + "language_loss": 0.74864799, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77129489, + "num_input_tokens_seen": 130224445, + "step": 6062, + "time_per_iteration": 4.008843183517456 + }, + { + "auxiliary_loss_clip": 0.0113715, + "auxiliary_loss_mlp": 0.01127026, + "balance_loss_clip": 1.00182378, + "balance_loss_mlp": 1.00056911, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 2.2415883201072666, + "language_loss": 0.72397131, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74661309, + "num_input_tokens_seen": 130245380, + "step": 6063, + "time_per_iteration": 2.663295030593872 + }, + { + "auxiliary_loss_clip": 0.01154878, + "auxiliary_loss_mlp": 0.01126024, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00080609, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.8838594416386587, + "language_loss": 0.67605042, + "learning_rate": 2.936164225292901e-06, + "loss": 0.69885939, + "num_input_tokens_seen": 130265575, + "step": 6064, + "time_per_iteration": 2.6335065364837646 + }, + { + "auxiliary_loss_clip": 0.01137679, + "auxiliary_loss_mlp": 0.01127272, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00081491, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.6886295468769474, + "language_loss": 0.74161053, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76426005, + "num_input_tokens_seen": 130286195, + "step": 6065, + "time_per_iteration": 2.6339550018310547 + }, + { + "auxiliary_loss_clip": 0.0113695, + "auxiliary_loss_mlp": 0.01127116, + "balance_loss_clip": 1.00186694, + "balance_loss_mlp": 1.00075459, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.785167778133962, + "language_loss": 0.74745739, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77009803, + "num_input_tokens_seen": 130306095, + "step": 6066, + "time_per_iteration": 2.6833009719848633 + }, + { + "auxiliary_loss_clip": 0.01159709, + "auxiliary_loss_mlp": 0.01126328, + "balance_loss_clip": 1.00211751, + "balance_loss_mlp": 1.00053811, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.296954724927041, + "language_loss": 0.76166129, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.7845217, + "num_input_tokens_seen": 130324685, + "step": 6067, + "time_per_iteration": 2.584007978439331 + }, + { + "auxiliary_loss_clip": 0.01170066, + "auxiliary_loss_mlp": 0.01126215, + "balance_loss_clip": 1.00206125, + "balance_loss_mlp": 1.00061655, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 3.145406418223086, + "language_loss": 0.71016181, + "learning_rate": 2.934787295690886e-06, + "loss": 0.73312461, + "num_input_tokens_seen": 130343855, + "step": 6068, + "time_per_iteration": 2.5289759635925293 + }, + { + "auxiliary_loss_clip": 0.011551, + "auxiliary_loss_mlp": 0.01127241, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00068867, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 2.6313816703088944, + "language_loss": 0.7424944, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.7653178, + "num_input_tokens_seen": 130362320, + "step": 6069, + "time_per_iteration": 2.522742509841919 + }, + { + "auxiliary_loss_clip": 0.01136871, + "auxiliary_loss_mlp": 0.0112631, + "balance_loss_clip": 1.00189412, + "balance_loss_mlp": 1.00071084, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 2.16021502526655, + "language_loss": 0.66381496, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68644679, + "num_input_tokens_seen": 130383165, + "step": 6070, + "time_per_iteration": 2.607288360595703 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01125821, + "balance_loss_clip": 1.00194371, + "balance_loss_mlp": 1.0006038, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.8754335710772856, + "language_loss": 0.74541497, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76821041, + "num_input_tokens_seen": 130402425, + "step": 6071, + "time_per_iteration": 2.557096242904663 + }, + { + "auxiliary_loss_clip": 0.0115459, + "auxiliary_loss_mlp": 0.01126614, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00063324, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 1.7535821434882812, + "language_loss": 0.88234502, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90515709, + "num_input_tokens_seen": 130419440, + "step": 6072, + "time_per_iteration": 2.533306837081909 + }, + { + "auxiliary_loss_clip": 0.01153403, + "auxiliary_loss_mlp": 0.01126702, + "balance_loss_clip": 1.00192773, + "balance_loss_mlp": 1.00081706, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.1183488934130352, + "language_loss": 0.72510254, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74790359, + "num_input_tokens_seen": 130438495, + "step": 6073, + "time_per_iteration": 2.5469279289245605 + }, + { + "auxiliary_loss_clip": 0.01088583, + "auxiliary_loss_mlp": 0.01127473, + "balance_loss_clip": 1.00174379, + "balance_loss_mlp": 1.00073004, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 2.2528771735745026, + "language_loss": 0.66918647, + "learning_rate": 2.932720838132236e-06, + "loss": 0.691347, + "num_input_tokens_seen": 130455575, + "step": 6074, + "time_per_iteration": 2.7062976360321045 + }, + { + "auxiliary_loss_clip": 0.01121712, + "auxiliary_loss_mlp": 0.01126314, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00061929, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.9658658304744432, + "language_loss": 0.73048753, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75296777, + "num_input_tokens_seen": 130476385, + "step": 6075, + "time_per_iteration": 2.681208610534668 + }, + { + "auxiliary_loss_clip": 0.01121789, + "auxiliary_loss_mlp": 0.01127849, + "balance_loss_clip": 1.00178277, + "balance_loss_mlp": 1.00081992, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 3.043825939933636, + "language_loss": 0.8890155, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.9115119, + "num_input_tokens_seen": 130493630, + "step": 6076, + "time_per_iteration": 2.609467029571533 + }, + { + "auxiliary_loss_clip": 0.01153456, + "auxiliary_loss_mlp": 0.01126458, + "balance_loss_clip": 1.00186229, + "balance_loss_mlp": 1.00076354, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.113558334215872, + "language_loss": 0.69234085, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71513999, + "num_input_tokens_seen": 130510735, + "step": 6077, + "time_per_iteration": 2.528447389602661 + }, + { + "auxiliary_loss_clip": 0.01167431, + "auxiliary_loss_mlp": 0.01108327, + "balance_loss_clip": 1.00286198, + "balance_loss_mlp": 0.99998963, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7414860783903156, + "language_loss": 0.61760461, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.64036214, + "num_input_tokens_seen": 130577050, + "step": 6078, + "time_per_iteration": 3.209808349609375 + }, + { + "auxiliary_loss_clip": 0.01139409, + "auxiliary_loss_mlp": 0.0112694, + "balance_loss_clip": 1.00190175, + "balance_loss_mlp": 1.00086403, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 1.8009637119346857, + "language_loss": 0.78285068, + "learning_rate": 2.930997817403173e-06, + "loss": 0.8055141, + "num_input_tokens_seen": 130593780, + "step": 6079, + "time_per_iteration": 2.639453172683716 + }, + { + "auxiliary_loss_clip": 0.01153483, + "auxiliary_loss_mlp": 0.01126889, + "balance_loss_clip": 1.00193703, + "balance_loss_mlp": 1.00071752, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.5197785890230646, + "language_loss": 0.62332642, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.64613008, + "num_input_tokens_seen": 130615510, + "step": 6080, + "time_per_iteration": 2.74104642868042 + }, + { + "auxiliary_loss_clip": 0.01123125, + "auxiliary_loss_mlp": 0.01128089, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.0006783, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.885309370662373, + "language_loss": 0.67134416, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69385624, + "num_input_tokens_seen": 130635410, + "step": 6081, + "time_per_iteration": 2.6635262966156006 + }, + { + "auxiliary_loss_clip": 0.01137407, + "auxiliary_loss_mlp": 0.00747725, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.00009298, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.645984206569436, + "language_loss": 0.74902117, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76787245, + "num_input_tokens_seen": 130657725, + "step": 6082, + "time_per_iteration": 2.6623504161834717 + }, + { + "auxiliary_loss_clip": 0.01090547, + "auxiliary_loss_mlp": 0.00747641, + "balance_loss_clip": 1.00177467, + "balance_loss_mlp": 1.00004816, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.7794122642540913, + "language_loss": 0.82772166, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84610355, + "num_input_tokens_seen": 130678360, + "step": 6083, + "time_per_iteration": 2.7672781944274902 + }, + { + "auxiliary_loss_clip": 0.01135785, + "auxiliary_loss_mlp": 0.01108376, + "balance_loss_clip": 1.00272274, + "balance_loss_mlp": 1.00003815, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.811072967557855, + "language_loss": 0.59370226, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61614388, + "num_input_tokens_seen": 130742110, + "step": 6084, + "time_per_iteration": 3.2560670375823975 + }, + { + "auxiliary_loss_clip": 0.01123779, + "auxiliary_loss_mlp": 0.01126512, + "balance_loss_clip": 1.00191915, + "balance_loss_mlp": 1.0008173, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 1.7706751813973223, + "language_loss": 0.72735775, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.74986064, + "num_input_tokens_seen": 130759870, + "step": 6085, + "time_per_iteration": 2.631556987762451 + }, + { + "auxiliary_loss_clip": 0.01120242, + "auxiliary_loss_mlp": 0.01126752, + "balance_loss_clip": 1.00176799, + "balance_loss_mlp": 1.00077176, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 1.8599723855121961, + "language_loss": 0.78217655, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80464649, + "num_input_tokens_seen": 130778510, + "step": 6086, + "time_per_iteration": 2.668901205062866 + }, + { + "auxiliary_loss_clip": 0.01154936, + "auxiliary_loss_mlp": 0.01126646, + "balance_loss_clip": 1.00208747, + "balance_loss_mlp": 1.00066519, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 2.896501644457079, + "language_loss": 0.76563382, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.78844965, + "num_input_tokens_seen": 130798535, + "step": 6087, + "time_per_iteration": 2.635892868041992 + }, + { + "auxiliary_loss_clip": 0.01105334, + "auxiliary_loss_mlp": 0.01128043, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00072742, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 2.255711627517836, + "language_loss": 0.7094593, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.73179305, + "num_input_tokens_seen": 130816655, + "step": 6088, + "time_per_iteration": 2.6629714965820312 + }, + { + "auxiliary_loss_clip": 0.01153788, + "auxiliary_loss_mlp": 0.01127682, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00055695, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 2.5392253216005347, + "language_loss": 0.79722738, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82004201, + "num_input_tokens_seen": 130841225, + "step": 6089, + "time_per_iteration": 2.745215654373169 + }, + { + "auxiliary_loss_clip": 0.01123083, + "auxiliary_loss_mlp": 0.01125904, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00049591, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.8397535139158423, + "language_loss": 0.71576566, + "learning_rate": 2.927204067389884e-06, + "loss": 0.7382555, + "num_input_tokens_seen": 130861050, + "step": 6090, + "time_per_iteration": 2.6810481548309326 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01127176, + "balance_loss_clip": 1.00192416, + "balance_loss_mlp": 1.00100493, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.8511183592755878, + "language_loss": 0.73845118, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76109308, + "num_input_tokens_seen": 130879775, + "step": 6091, + "time_per_iteration": 2.6084964275360107 + }, + { + "auxiliary_loss_clip": 0.01095611, + "auxiliary_loss_mlp": 0.01126363, + "balance_loss_clip": 1.0022496, + "balance_loss_mlp": 1.00076401, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.9202854895051387, + "language_loss": 0.7274214, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74964118, + "num_input_tokens_seen": 130898070, + "step": 6092, + "time_per_iteration": 2.7020716667175293 + }, + { + "auxiliary_loss_clip": 0.01155194, + "auxiliary_loss_mlp": 0.01127339, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00088131, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 2.1100122798577687, + "language_loss": 0.78108931, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.80391467, + "num_input_tokens_seen": 130915250, + "step": 6093, + "time_per_iteration": 3.944836378097534 + }, + { + "auxiliary_loss_clip": 0.01153552, + "auxiliary_loss_mlp": 0.01127395, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00065207, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.8636500708108872, + "language_loss": 0.74543554, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76824498, + "num_input_tokens_seen": 130936995, + "step": 6094, + "time_per_iteration": 2.64841365814209 + }, + { + "auxiliary_loss_clip": 0.01170594, + "auxiliary_loss_mlp": 0.01127502, + "balance_loss_clip": 1.00224113, + "balance_loss_mlp": 1.00104427, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.6902259611545867, + "language_loss": 0.79335135, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81633234, + "num_input_tokens_seen": 130957970, + "step": 6095, + "time_per_iteration": 2.590247631072998 + }, + { + "auxiliary_loss_clip": 0.01143148, + "auxiliary_loss_mlp": 0.00747781, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.0000906, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.2352274829275256, + "language_loss": 0.74104333, + "learning_rate": 2.925132954945834e-06, + "loss": 0.7599526, + "num_input_tokens_seen": 130974915, + "step": 6096, + "time_per_iteration": 4.007866382598877 + }, + { + "auxiliary_loss_clip": 0.0112143, + "auxiliary_loss_mlp": 0.01127071, + "balance_loss_clip": 1.00181818, + "balance_loss_mlp": 1.00061429, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 1.8764232827462508, + "language_loss": 0.67088842, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69337344, + "num_input_tokens_seen": 130995745, + "step": 6097, + "time_per_iteration": 2.7060723304748535 + }, + { + "auxiliary_loss_clip": 0.01088055, + "auxiliary_loss_mlp": 0.01127374, + "balance_loss_clip": 1.00179827, + "balance_loss_mlp": 1.00082183, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 2.048967784311553, + "language_loss": 0.77690172, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79905605, + "num_input_tokens_seen": 131015545, + "step": 6098, + "time_per_iteration": 2.7732510566711426 + }, + { + "auxiliary_loss_clip": 0.01155022, + "auxiliary_loss_mlp": 0.01126875, + "balance_loss_clip": 1.00209916, + "balance_loss_mlp": 1.0007987, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.7614052441294663, + "language_loss": 0.73298442, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75580335, + "num_input_tokens_seen": 131033990, + "step": 6099, + "time_per_iteration": 2.586853265762329 + }, + { + "auxiliary_loss_clip": 0.01137044, + "auxiliary_loss_mlp": 0.01125512, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00067604, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.7893004467108602, + "language_loss": 0.84638101, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86900651, + "num_input_tokens_seen": 131050710, + "step": 6100, + "time_per_iteration": 3.985517740249634 + }, + { + "auxiliary_loss_clip": 0.01122324, + "auxiliary_loss_mlp": 0.01127359, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00061536, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 1.8282541137940538, + "language_loss": 0.70982438, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.7323212, + "num_input_tokens_seen": 131071435, + "step": 6101, + "time_per_iteration": 2.671576738357544 + }, + { + "auxiliary_loss_clip": 0.011374, + "auxiliary_loss_mlp": 0.01127158, + "balance_loss_clip": 1.00193417, + "balance_loss_mlp": 1.0007962, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.3073133517168825, + "language_loss": 0.76024497, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78289056, + "num_input_tokens_seen": 131088775, + "step": 6102, + "time_per_iteration": 2.572603702545166 + }, + { + "auxiliary_loss_clip": 0.01153973, + "auxiliary_loss_mlp": 0.01127898, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00077331, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.627851972610824, + "language_loss": 0.70297658, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72579527, + "num_input_tokens_seen": 131112800, + "step": 6103, + "time_per_iteration": 2.785593032836914 + }, + { + "auxiliary_loss_clip": 0.01103912, + "auxiliary_loss_mlp": 0.0112638, + "balance_loss_clip": 1.00171959, + "balance_loss_mlp": 1.00078106, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.70823694247524, + "language_loss": 0.71310115, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73540413, + "num_input_tokens_seen": 131131150, + "step": 6104, + "time_per_iteration": 2.660397529602051 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01126839, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.00066817, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 1.7920399553617958, + "language_loss": 0.81689566, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83971649, + "num_input_tokens_seen": 131150365, + "step": 6105, + "time_per_iteration": 2.5652027130126953 + }, + { + "auxiliary_loss_clip": 0.01170312, + "auxiliary_loss_mlp": 0.01127817, + "balance_loss_clip": 1.00200474, + "balance_loss_mlp": 1.0008831, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.76316497984337, + "language_loss": 0.80811977, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83110106, + "num_input_tokens_seen": 131169310, + "step": 6106, + "time_per_iteration": 2.6536026000976562 + }, + { + "auxiliary_loss_clip": 0.01122618, + "auxiliary_loss_mlp": 0.00747007, + "balance_loss_clip": 1.00309587, + "balance_loss_mlp": 1.00003147, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6851084164973588, + "language_loss": 0.59214926, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.61084557, + "num_input_tokens_seen": 131232900, + "step": 6107, + "time_per_iteration": 3.326660394668579 + }, + { + "auxiliary_loss_clip": 0.01136494, + "auxiliary_loss_mlp": 0.01126385, + "balance_loss_clip": 1.00199926, + "balance_loss_mlp": 1.00069022, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.8207048125982301, + "language_loss": 0.74574149, + "learning_rate": 2.92098694412469e-06, + "loss": 0.76837027, + "num_input_tokens_seen": 131250920, + "step": 6108, + "time_per_iteration": 2.6157827377319336 + }, + { + "auxiliary_loss_clip": 0.01154562, + "auxiliary_loss_mlp": 0.01127373, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.00062943, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.033543893437769, + "language_loss": 0.73498762, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.75780696, + "num_input_tokens_seen": 131267910, + "step": 6109, + "time_per_iteration": 2.5401649475097656 + }, + { + "auxiliary_loss_clip": 0.01091761, + "auxiliary_loss_mlp": 0.01126903, + "balance_loss_clip": 1.0017693, + "balance_loss_mlp": 1.0006361, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.759383988179321, + "language_loss": 0.53448147, + "learning_rate": 2.920295452774744e-06, + "loss": 0.5566681, + "num_input_tokens_seen": 131287150, + "step": 6110, + "time_per_iteration": 2.7051570415496826 + }, + { + "auxiliary_loss_clip": 0.01155004, + "auxiliary_loss_mlp": 0.01126191, + "balance_loss_clip": 1.00214672, + "balance_loss_mlp": 1.00078297, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 2.0256206287374967, + "language_loss": 0.80486017, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82767212, + "num_input_tokens_seen": 131308225, + "step": 6111, + "time_per_iteration": 2.5799663066864014 + }, + { + "auxiliary_loss_clip": 0.01105062, + "auxiliary_loss_mlp": 0.01126882, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00071096, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.559986288831912, + "language_loss": 0.72466815, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74698758, + "num_input_tokens_seen": 131332115, + "step": 6112, + "time_per_iteration": 2.74276065826416 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01126488, + "balance_loss_clip": 1.00198114, + "balance_loss_mlp": 1.0007937, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.5942746423161431, + "language_loss": 0.85078299, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87359762, + "num_input_tokens_seen": 131351885, + "step": 6113, + "time_per_iteration": 2.5646069049835205 + }, + { + "auxiliary_loss_clip": 0.01154751, + "auxiliary_loss_mlp": 0.01127151, + "balance_loss_clip": 1.00205433, + "balance_loss_mlp": 1.00069392, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.8441473049742954, + "language_loss": 0.78685153, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80967057, + "num_input_tokens_seen": 131370245, + "step": 6114, + "time_per_iteration": 2.5805368423461914 + }, + { + "auxiliary_loss_clip": 0.01155311, + "auxiliary_loss_mlp": 0.01126838, + "balance_loss_clip": 1.00209236, + "balance_loss_mlp": 1.00085807, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.9568790005377465, + "language_loss": 0.67239493, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69521636, + "num_input_tokens_seen": 131388115, + "step": 6115, + "time_per_iteration": 2.5823493003845215 + }, + { + "auxiliary_loss_clip": 0.0113837, + "auxiliary_loss_mlp": 0.01126539, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00074959, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.600626226844024, + "language_loss": 0.75981998, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78246909, + "num_input_tokens_seen": 131404595, + "step": 6116, + "time_per_iteration": 2.5759694576263428 + }, + { + "auxiliary_loss_clip": 0.01105477, + "auxiliary_loss_mlp": 0.01126744, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00066793, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.839137768121695, + "language_loss": 0.63147688, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65379906, + "num_input_tokens_seen": 131423760, + "step": 6117, + "time_per_iteration": 2.708592414855957 + }, + { + "auxiliary_loss_clip": 0.01139973, + "auxiliary_loss_mlp": 0.01126717, + "balance_loss_clip": 1.00212955, + "balance_loss_mlp": 1.00064158, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.9329497689048125, + "language_loss": 0.734716, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75738287, + "num_input_tokens_seen": 131444955, + "step": 6118, + "time_per_iteration": 2.646211624145508 + }, + { + "auxiliary_loss_clip": 0.0115356, + "auxiliary_loss_mlp": 0.01127647, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00080812, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 1.4849679738640251, + "language_loss": 0.72850025, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.75131232, + "num_input_tokens_seen": 131465720, + "step": 6119, + "time_per_iteration": 2.5756947994232178 + }, + { + "auxiliary_loss_clip": 0.01138184, + "auxiliary_loss_mlp": 0.01126449, + "balance_loss_clip": 1.00194824, + "balance_loss_mlp": 1.00065947, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.9492122002161956, + "language_loss": 0.80259275, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82523906, + "num_input_tokens_seen": 131483080, + "step": 6120, + "time_per_iteration": 2.5735411643981934 + }, + { + "auxiliary_loss_clip": 0.01120359, + "auxiliary_loss_mlp": 0.01126851, + "balance_loss_clip": 1.00178087, + "balance_loss_mlp": 1.00087035, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 4.938115346451769, + "language_loss": 0.64467335, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66714543, + "num_input_tokens_seen": 131502545, + "step": 6121, + "time_per_iteration": 2.659822940826416 + }, + { + "auxiliary_loss_clip": 0.01153546, + "auxiliary_loss_mlp": 0.01126955, + "balance_loss_clip": 1.00207901, + "balance_loss_mlp": 1.00078344, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 2.0014409400127358, + "language_loss": 0.71327645, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73608142, + "num_input_tokens_seen": 131522155, + "step": 6122, + "time_per_iteration": 2.5936357975006104 + }, + { + "auxiliary_loss_clip": 0.01138258, + "auxiliary_loss_mlp": 0.01126602, + "balance_loss_clip": 1.00208402, + "balance_loss_mlp": 1.0007174, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.9292623596102845, + "language_loss": 0.69367528, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71632391, + "num_input_tokens_seen": 131543865, + "step": 6123, + "time_per_iteration": 2.639238119125366 + }, + { + "auxiliary_loss_clip": 0.01155231, + "auxiliary_loss_mlp": 0.01127176, + "balance_loss_clip": 1.00211406, + "balance_loss_mlp": 1.00071859, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 9.681166743369833, + "language_loss": 0.74126804, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.76409209, + "num_input_tokens_seen": 131562155, + "step": 6124, + "time_per_iteration": 2.6086928844451904 + }, + { + "auxiliary_loss_clip": 0.0114302, + "auxiliary_loss_mlp": 0.01126984, + "balance_loss_clip": 1.00203371, + "balance_loss_mlp": 1.00071728, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 2.0107038185759873, + "language_loss": 0.74393886, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76663888, + "num_input_tokens_seen": 131581695, + "step": 6125, + "time_per_iteration": 2.6321921348571777 + }, + { + "auxiliary_loss_clip": 0.01155231, + "auxiliary_loss_mlp": 0.01128198, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.00088286, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 2.1087263009333053, + "language_loss": 0.78162956, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80446386, + "num_input_tokens_seen": 131599465, + "step": 6126, + "time_per_iteration": 2.5169079303741455 + }, + { + "auxiliary_loss_clip": 0.01153334, + "auxiliary_loss_mlp": 0.01127691, + "balance_loss_clip": 1.00202024, + "balance_loss_mlp": 1.00075674, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.4784773325346956, + "language_loss": 0.65832818, + "learning_rate": 2.914412150914888e-06, + "loss": 0.6811384, + "num_input_tokens_seen": 131618330, + "step": 6127, + "time_per_iteration": 2.539313793182373 + }, + { + "auxiliary_loss_clip": 0.01137106, + "auxiliary_loss_mlp": 0.01127028, + "balance_loss_clip": 1.00200748, + "balance_loss_mlp": 1.0008564, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.9640214678305188, + "language_loss": 0.70564044, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.7282818, + "num_input_tokens_seen": 131638960, + "step": 6128, + "time_per_iteration": 2.7860970497131348 + }, + { + "auxiliary_loss_clip": 0.01138733, + "auxiliary_loss_mlp": 0.01126574, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00078452, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 2.9483111843974226, + "language_loss": 0.75499928, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77765238, + "num_input_tokens_seen": 131657440, + "step": 6129, + "time_per_iteration": 2.585103750228882 + }, + { + "auxiliary_loss_clip": 0.01154955, + "auxiliary_loss_mlp": 0.01126685, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00070429, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.5546932612702191, + "language_loss": 0.8502872, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.87310362, + "num_input_tokens_seen": 131678035, + "step": 6130, + "time_per_iteration": 2.6027398109436035 + }, + { + "auxiliary_loss_clip": 0.01134289, + "auxiliary_loss_mlp": 0.01108842, + "balance_loss_clip": 1.00289679, + "balance_loss_mlp": 1.00050449, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8056286538545704, + "language_loss": 0.60348308, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62591434, + "num_input_tokens_seen": 131742470, + "step": 6131, + "time_per_iteration": 4.651205778121948 + }, + { + "auxiliary_loss_clip": 0.01122073, + "auxiliary_loss_mlp": 0.01126441, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00065184, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 2.2539360582843564, + "language_loss": 0.72604328, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.74852842, + "num_input_tokens_seen": 131764570, + "step": 6132, + "time_per_iteration": 2.72306752204895 + }, + { + "auxiliary_loss_clip": 0.01155305, + "auxiliary_loss_mlp": 0.01127629, + "balance_loss_clip": 1.00208437, + "balance_loss_mlp": 1.00078988, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.6882958291246466, + "language_loss": 0.74504125, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76787055, + "num_input_tokens_seen": 131785720, + "step": 6133, + "time_per_iteration": 5.412795066833496 + }, + { + "auxiliary_loss_clip": 0.01093672, + "auxiliary_loss_mlp": 0.01126231, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.00072742, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.6551545301772441, + "language_loss": 0.7149356, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73713464, + "num_input_tokens_seen": 131804430, + "step": 6134, + "time_per_iteration": 2.7479069232940674 + }, + { + "auxiliary_loss_clip": 0.01121685, + "auxiliary_loss_mlp": 0.01127029, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00066757, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.6583484577099914, + "language_loss": 0.75417084, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.776658, + "num_input_tokens_seen": 131822060, + "step": 6135, + "time_per_iteration": 2.6293063163757324 + }, + { + "auxiliary_loss_clip": 0.01119339, + "auxiliary_loss_mlp": 0.0110875, + "balance_loss_clip": 1.00291371, + "balance_loss_mlp": 1.00041282, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8062548766714985, + "language_loss": 0.58762252, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60990345, + "num_input_tokens_seen": 131880715, + "step": 6136, + "time_per_iteration": 3.1737852096557617 + }, + { + "auxiliary_loss_clip": 0.01140133, + "auxiliary_loss_mlp": 0.01126362, + "balance_loss_clip": 1.00228763, + "balance_loss_mlp": 1.00057268, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 1.8366586514707297, + "language_loss": 0.78983229, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.81249726, + "num_input_tokens_seen": 131895850, + "step": 6137, + "time_per_iteration": 4.056914806365967 + }, + { + "auxiliary_loss_clip": 0.01154726, + "auxiliary_loss_mlp": 0.01127116, + "balance_loss_clip": 1.00214195, + "balance_loss_mlp": 1.00084996, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.0995777085424763, + "language_loss": 0.74298167, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76580012, + "num_input_tokens_seen": 131915775, + "step": 6138, + "time_per_iteration": 2.5756828784942627 + }, + { + "auxiliary_loss_clip": 0.01112397, + "auxiliary_loss_mlp": 0.0112713, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.00067258, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 2.3394549738782704, + "language_loss": 0.64810967, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.67050493, + "num_input_tokens_seen": 131935715, + "step": 6139, + "time_per_iteration": 2.782644033432007 + }, + { + "auxiliary_loss_clip": 0.01119804, + "auxiliary_loss_mlp": 0.01126639, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.000754, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.0441977112658933, + "language_loss": 0.70895678, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73142117, + "num_input_tokens_seen": 131954120, + "step": 6140, + "time_per_iteration": 2.6235878467559814 + }, + { + "auxiliary_loss_clip": 0.01117817, + "auxiliary_loss_mlp": 0.01108737, + "balance_loss_clip": 1.00245261, + "balance_loss_mlp": 1.00039995, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7530053617940544, + "language_loss": 0.59357214, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61583769, + "num_input_tokens_seen": 132017485, + "step": 6141, + "time_per_iteration": 3.264554977416992 + }, + { + "auxiliary_loss_clip": 0.01153409, + "auxiliary_loss_mlp": 0.01127033, + "balance_loss_clip": 1.00205731, + "balance_loss_mlp": 1.00067079, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 2.0604128448187073, + "language_loss": 0.75280535, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77560973, + "num_input_tokens_seen": 132036760, + "step": 6142, + "time_per_iteration": 2.5689005851745605 + }, + { + "auxiliary_loss_clip": 0.01153645, + "auxiliary_loss_mlp": 0.01126101, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.00088346, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 1.6512306743554805, + "language_loss": 0.76961088, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79240835, + "num_input_tokens_seen": 132056935, + "step": 6143, + "time_per_iteration": 2.5916733741760254 + }, + { + "auxiliary_loss_clip": 0.0115503, + "auxiliary_loss_mlp": 0.01126608, + "balance_loss_clip": 1.00219262, + "balance_loss_mlp": 1.00062788, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.4538794451301988, + "language_loss": 0.8209818, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84379816, + "num_input_tokens_seen": 132077285, + "step": 6144, + "time_per_iteration": 2.61350417137146 + }, + { + "auxiliary_loss_clip": 0.01154701, + "auxiliary_loss_mlp": 0.01126662, + "balance_loss_clip": 1.00208616, + "balance_loss_mlp": 1.00068128, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 1.9863773544675285, + "language_loss": 0.77133191, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79414552, + "num_input_tokens_seen": 132095520, + "step": 6145, + "time_per_iteration": 2.587203025817871 + }, + { + "auxiliary_loss_clip": 0.01155315, + "auxiliary_loss_mlp": 0.01126296, + "balance_loss_clip": 1.00210726, + "balance_loss_mlp": 1.00069666, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 5.698456382920243, + "language_loss": 0.76788223, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79069829, + "num_input_tokens_seen": 132112810, + "step": 6146, + "time_per_iteration": 2.541449785232544 + }, + { + "auxiliary_loss_clip": 0.01138491, + "auxiliary_loss_mlp": 0.01127361, + "balance_loss_clip": 1.00201178, + "balance_loss_mlp": 1.000808, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.7770081651674878, + "language_loss": 0.80367911, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82633764, + "num_input_tokens_seen": 132131615, + "step": 6147, + "time_per_iteration": 2.5882954597473145 + }, + { + "auxiliary_loss_clip": 0.01122582, + "auxiliary_loss_mlp": 0.00747745, + "balance_loss_clip": 1.00185239, + "balance_loss_mlp": 1.00002217, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 12.855931927314739, + "language_loss": 0.83336502, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85206831, + "num_input_tokens_seen": 132149585, + "step": 6148, + "time_per_iteration": 2.67523193359375 + }, + { + "auxiliary_loss_clip": 0.01153512, + "auxiliary_loss_mlp": 0.01126791, + "balance_loss_clip": 1.00206792, + "balance_loss_mlp": 1.00071537, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.17445096787162, + "language_loss": 0.74767429, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.77047729, + "num_input_tokens_seen": 132165555, + "step": 6149, + "time_per_iteration": 2.570857524871826 + }, + { + "auxiliary_loss_clip": 0.01170411, + "auxiliary_loss_mlp": 0.01127482, + "balance_loss_clip": 1.00222921, + "balance_loss_mlp": 1.00073838, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.8547043425658678, + "language_loss": 0.70745277, + "learning_rate": 2.906436451364054e-06, + "loss": 0.73043168, + "num_input_tokens_seen": 132185100, + "step": 6150, + "time_per_iteration": 2.558600902557373 + }, + { + "auxiliary_loss_clip": 0.01138635, + "auxiliary_loss_mlp": 0.01127076, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.00090456, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.9853105045055923, + "language_loss": 0.81550002, + "learning_rate": 2.906089268194611e-06, + "loss": 0.83815712, + "num_input_tokens_seen": 132203930, + "step": 6151, + "time_per_iteration": 2.599430799484253 + }, + { + "auxiliary_loss_clip": 0.01136006, + "auxiliary_loss_mlp": 0.01107845, + "balance_loss_clip": 1.00266469, + "balance_loss_mlp": 1.00027072, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.7771387824855845, + "language_loss": 0.63201243, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65445095, + "num_input_tokens_seen": 132263845, + "step": 6152, + "time_per_iteration": 3.290874481201172 + }, + { + "auxiliary_loss_clip": 0.01111088, + "auxiliary_loss_mlp": 0.01125931, + "balance_loss_clip": 1.00214887, + "balance_loss_mlp": 1.00080919, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 2.118646147893601, + "language_loss": 0.69707084, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.719441, + "num_input_tokens_seen": 132282350, + "step": 6153, + "time_per_iteration": 2.747823476791382 + }, + { + "auxiliary_loss_clip": 0.01153497, + "auxiliary_loss_mlp": 0.0112711, + "balance_loss_clip": 1.00204873, + "balance_loss_mlp": 1.0007484, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 1.6147227080113686, + "language_loss": 0.7179569, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74076295, + "num_input_tokens_seen": 132301930, + "step": 6154, + "time_per_iteration": 2.769274950027466 + }, + { + "auxiliary_loss_clip": 0.01138526, + "auxiliary_loss_mlp": 0.0112651, + "balance_loss_clip": 1.00198627, + "balance_loss_mlp": 1.00062537, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.7929309305190728, + "language_loss": 0.67980671, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70245707, + "num_input_tokens_seen": 132320915, + "step": 6155, + "time_per_iteration": 2.6193695068359375 + }, + { + "auxiliary_loss_clip": 0.01155123, + "auxiliary_loss_mlp": 0.01126318, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.0005281, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 18.056430446146432, + "language_loss": 0.68026698, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.70308143, + "num_input_tokens_seen": 132340415, + "step": 6156, + "time_per_iteration": 2.597658634185791 + }, + { + "auxiliary_loss_clip": 0.01144425, + "auxiliary_loss_mlp": 0.0112626, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.00066078, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.7797860008786393, + "language_loss": 0.8180421, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84074891, + "num_input_tokens_seen": 132358600, + "step": 6157, + "time_per_iteration": 2.5867080688476562 + }, + { + "auxiliary_loss_clip": 0.01105082, + "auxiliary_loss_mlp": 0.01127602, + "balance_loss_clip": 1.0018357, + "balance_loss_mlp": 1.00076294, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.9151937782843715, + "language_loss": 0.76477492, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78710175, + "num_input_tokens_seen": 132373160, + "step": 6158, + "time_per_iteration": 2.633146047592163 + }, + { + "auxiliary_loss_clip": 0.01170383, + "auxiliary_loss_mlp": 0.01127489, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00065041, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 3.778613797667288, + "language_loss": 0.69335639, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71633512, + "num_input_tokens_seen": 132392345, + "step": 6159, + "time_per_iteration": 2.5170905590057373 + }, + { + "auxiliary_loss_clip": 0.01138047, + "auxiliary_loss_mlp": 0.01126401, + "balance_loss_clip": 1.00194681, + "balance_loss_mlp": 1.00080204, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 2.512121066353191, + "language_loss": 0.70844793, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73109239, + "num_input_tokens_seen": 132412620, + "step": 6160, + "time_per_iteration": 2.647646903991699 + }, + { + "auxiliary_loss_clip": 0.01136547, + "auxiliary_loss_mlp": 0.0112553, + "balance_loss_clip": 1.00204515, + "balance_loss_mlp": 1.00078988, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 2.7374857283949092, + "language_loss": 0.78630495, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.80892569, + "num_input_tokens_seen": 132431570, + "step": 6161, + "time_per_iteration": 2.6147701740264893 + }, + { + "auxiliary_loss_clip": 0.0117028, + "auxiliary_loss_mlp": 0.01126866, + "balance_loss_clip": 1.00218344, + "balance_loss_mlp": 1.00059927, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.990982994925531, + "language_loss": 0.79371768, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81668907, + "num_input_tokens_seen": 132451525, + "step": 6162, + "time_per_iteration": 2.544464588165283 + }, + { + "auxiliary_loss_clip": 0.01137925, + "auxiliary_loss_mlp": 0.00747718, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00008786, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.8074055434613627, + "language_loss": 0.79892242, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81777883, + "num_input_tokens_seen": 132469875, + "step": 6163, + "time_per_iteration": 2.5908095836639404 + }, + { + "auxiliary_loss_clip": 0.01153428, + "auxiliary_loss_mlp": 0.01127128, + "balance_loss_clip": 1.00202954, + "balance_loss_mlp": 1.00057566, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 2.2985535951590506, + "language_loss": 0.68108481, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70389038, + "num_input_tokens_seen": 132488360, + "step": 6164, + "time_per_iteration": 2.54229998588562 + }, + { + "auxiliary_loss_clip": 0.01138687, + "auxiliary_loss_mlp": 0.01126964, + "balance_loss_clip": 1.00217342, + "balance_loss_mlp": 1.00079322, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.0734871096661456, + "language_loss": 0.82963258, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.85228908, + "num_input_tokens_seen": 132508630, + "step": 6165, + "time_per_iteration": 2.654366970062256 + }, + { + "auxiliary_loss_clip": 0.01143124, + "auxiliary_loss_mlp": 0.0112717, + "balance_loss_clip": 1.00211465, + "balance_loss_mlp": 1.00071275, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 3.645453115551879, + "language_loss": 0.68957591, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71227884, + "num_input_tokens_seen": 132527465, + "step": 6166, + "time_per_iteration": 2.6324849128723145 + }, + { + "auxiliary_loss_clip": 0.01117781, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_clip": 1.00206006, + "balance_loss_mlp": 1.00051439, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.7872378531649528, + "language_loss": 0.56915677, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.59142303, + "num_input_tokens_seen": 132579940, + "step": 6167, + "time_per_iteration": 3.0905303955078125 + }, + { + "auxiliary_loss_clip": 0.01138379, + "auxiliary_loss_mlp": 0.01126027, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00080907, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.8865519898227248, + "language_loss": 0.75243032, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77507436, + "num_input_tokens_seen": 132598390, + "step": 6168, + "time_per_iteration": 2.60141658782959 + }, + { + "auxiliary_loss_clip": 0.0115496, + "auxiliary_loss_mlp": 0.00747658, + "balance_loss_clip": 1.00213885, + "balance_loss_mlp": 1.00007999, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 1.8476429784855177, + "language_loss": 0.73976469, + "learning_rate": 2.899834108519755e-06, + "loss": 0.75879085, + "num_input_tokens_seen": 132616920, + "step": 6169, + "time_per_iteration": 3.952260971069336 + }, + { + "auxiliary_loss_clip": 0.01170208, + "auxiliary_loss_mlp": 0.01126226, + "balance_loss_clip": 1.0021826, + "balance_loss_mlp": 1.00072289, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.5611101895413213, + "language_loss": 0.79582119, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81878555, + "num_input_tokens_seen": 132637660, + "step": 6170, + "time_per_iteration": 2.586986780166626 + }, + { + "auxiliary_loss_clip": 0.01153412, + "auxiliary_loss_mlp": 0.01127091, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00072896, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.4955273369994884, + "language_loss": 0.76002562, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.7828306, + "num_input_tokens_seen": 132657635, + "step": 6171, + "time_per_iteration": 3.9785850048065186 + }, + { + "auxiliary_loss_clip": 0.01138201, + "auxiliary_loss_mlp": 0.01126168, + "balance_loss_clip": 1.002105, + "balance_loss_mlp": 1.00056863, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.9319123750314435, + "language_loss": 0.80883837, + "learning_rate": 2.898790504994232e-06, + "loss": 0.83148205, + "num_input_tokens_seen": 132674455, + "step": 6172, + "time_per_iteration": 2.5780792236328125 + }, + { + "auxiliary_loss_clip": 0.01153638, + "auxiliary_loss_mlp": 0.0112719, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00073326, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 1.7330872773979122, + "language_loss": 0.59436572, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61717403, + "num_input_tokens_seen": 132695140, + "step": 6173, + "time_per_iteration": 2.7000010013580322 + }, + { + "auxiliary_loss_clip": 0.01138072, + "auxiliary_loss_mlp": 0.01126777, + "balance_loss_clip": 1.00200522, + "balance_loss_mlp": 1.0008924, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 1.8016624484512604, + "language_loss": 0.80326605, + "learning_rate": 2.898094598877435e-06, + "loss": 0.82591462, + "num_input_tokens_seen": 132712470, + "step": 6174, + "time_per_iteration": 2.5638375282287598 + }, + { + "auxiliary_loss_clip": 0.0117008, + "auxiliary_loss_mlp": 0.0112592, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.00060725, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 2.701564188895004, + "language_loss": 0.80592918, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.82888919, + "num_input_tokens_seen": 132732945, + "step": 6175, + "time_per_iteration": 3.9919941425323486 + }, + { + "auxiliary_loss_clip": 0.0115359, + "auxiliary_loss_mlp": 0.01126669, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00078404, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 1.8899763079424776, + "language_loss": 0.88650525, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90930784, + "num_input_tokens_seen": 132752470, + "step": 6176, + "time_per_iteration": 2.593370199203491 + }, + { + "auxiliary_loss_clip": 0.01153707, + "auxiliary_loss_mlp": 0.01126009, + "balance_loss_clip": 1.00219822, + "balance_loss_mlp": 1.00079131, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.9729056378565986, + "language_loss": 0.73127222, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75406945, + "num_input_tokens_seen": 132771485, + "step": 6177, + "time_per_iteration": 2.5725796222686768 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01127102, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00093126, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 2.0370058306200467, + "language_loss": 0.75674701, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77924228, + "num_input_tokens_seen": 132791465, + "step": 6178, + "time_per_iteration": 2.6410694122314453 + }, + { + "auxiliary_loss_clip": 0.01089867, + "auxiliary_loss_mlp": 0.0112731, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00085282, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 2.15262896190955, + "language_loss": 0.71924669, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74141848, + "num_input_tokens_seen": 132810160, + "step": 6179, + "time_per_iteration": 2.7073915004730225 + }, + { + "auxiliary_loss_clip": 0.01170223, + "auxiliary_loss_mlp": 0.01127489, + "balance_loss_clip": 1.00211167, + "balance_loss_mlp": 1.00074625, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.3063738879576685, + "language_loss": 0.69586384, + "learning_rate": 2.896006063609283e-06, + "loss": 0.71884096, + "num_input_tokens_seen": 132831265, + "step": 6180, + "time_per_iteration": 2.598792791366577 + }, + { + "auxiliary_loss_clip": 0.01138553, + "auxiliary_loss_mlp": 0.01126664, + "balance_loss_clip": 1.00205827, + "balance_loss_mlp": 1.00068402, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.910012291575903, + "language_loss": 0.78208828, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.80474043, + "num_input_tokens_seen": 132850005, + "step": 6181, + "time_per_iteration": 2.5982015132904053 + }, + { + "auxiliary_loss_clip": 0.01153707, + "auxiliary_loss_mlp": 0.01126595, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00090122, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 7.335199823608022, + "language_loss": 0.78669316, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80949616, + "num_input_tokens_seen": 132865790, + "step": 6182, + "time_per_iteration": 2.583967685699463 + }, + { + "auxiliary_loss_clip": 0.01133732, + "auxiliary_loss_mlp": 0.01108659, + "balance_loss_clip": 1.00259185, + "balance_loss_mlp": 1.00032127, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.8451979635402884, + "language_loss": 0.57489681, + "learning_rate": 2.894961337112362e-06, + "loss": 0.5973208, + "num_input_tokens_seen": 132921775, + "step": 6183, + "time_per_iteration": 3.168407917022705 + }, + { + "auxiliary_loss_clip": 0.01155144, + "auxiliary_loss_mlp": 0.00747807, + "balance_loss_clip": 1.00206888, + "balance_loss_mlp": 1.00006723, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.952055311685973, + "language_loss": 0.76928234, + "learning_rate": 2.894613027055066e-06, + "loss": 0.78831184, + "num_input_tokens_seen": 132941060, + "step": 6184, + "time_per_iteration": 2.6220953464508057 + }, + { + "auxiliary_loss_clip": 0.01123161, + "auxiliary_loss_mlp": 0.01126963, + "balance_loss_clip": 1.00206327, + "balance_loss_mlp": 1.00079167, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 1.7453445101487686, + "language_loss": 0.72320145, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74570274, + "num_input_tokens_seen": 132961850, + "step": 6185, + "time_per_iteration": 2.658477306365967 + }, + { + "auxiliary_loss_clip": 0.01107555, + "auxiliary_loss_mlp": 0.01126697, + "balance_loss_clip": 1.0020926, + "balance_loss_mlp": 1.00071692, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 2.576356555031174, + "language_loss": 0.76939678, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79173934, + "num_input_tokens_seen": 132981625, + "step": 6186, + "time_per_iteration": 2.7300097942352295 + }, + { + "auxiliary_loss_clip": 0.01153629, + "auxiliary_loss_mlp": 0.01128021, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00070512, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8164224412041736, + "language_loss": 0.83370852, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.856525, + "num_input_tokens_seen": 133001225, + "step": 6187, + "time_per_iteration": 2.6137940883636475 + }, + { + "auxiliary_loss_clip": 0.01155081, + "auxiliary_loss_mlp": 0.01126055, + "balance_loss_clip": 1.00207126, + "balance_loss_mlp": 1.00064683, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 4.094550489310989, + "language_loss": 0.84335566, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86616695, + "num_input_tokens_seen": 133018820, + "step": 6188, + "time_per_iteration": 2.586315870285034 + }, + { + "auxiliary_loss_clip": 0.01137219, + "auxiliary_loss_mlp": 0.01126101, + "balance_loss_clip": 1.00190318, + "balance_loss_mlp": 1.00069261, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 2.0477259295318517, + "language_loss": 0.65233743, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67497063, + "num_input_tokens_seen": 133040205, + "step": 6189, + "time_per_iteration": 2.638878583908081 + }, + { + "auxiliary_loss_clip": 0.01139511, + "auxiliary_loss_mlp": 0.01127334, + "balance_loss_clip": 1.0020057, + "balance_loss_mlp": 1.00097156, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 1.974524102322705, + "language_loss": 0.84074831, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86341679, + "num_input_tokens_seen": 133058095, + "step": 6190, + "time_per_iteration": 2.6021645069122314 + }, + { + "auxiliary_loss_clip": 0.01139012, + "auxiliary_loss_mlp": 0.01127253, + "balance_loss_clip": 1.00208569, + "balance_loss_mlp": 1.00070071, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 4.275545202398735, + "language_loss": 0.87701696, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.89967966, + "num_input_tokens_seen": 133071530, + "step": 6191, + "time_per_iteration": 2.5585460662841797 + }, + { + "auxiliary_loss_clip": 0.0110669, + "auxiliary_loss_mlp": 0.01127367, + "balance_loss_clip": 1.00179648, + "balance_loss_mlp": 1.00062358, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.986513401070814, + "language_loss": 0.73297447, + "learning_rate": 2.891825326449073e-06, + "loss": 0.75531507, + "num_input_tokens_seen": 133091410, + "step": 6192, + "time_per_iteration": 2.7401599884033203 + }, + { + "auxiliary_loss_clip": 0.01170179, + "auxiliary_loss_mlp": 0.01126765, + "balance_loss_clip": 1.00207043, + "balance_loss_mlp": 1.00068974, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.409059023497658, + "language_loss": 0.79811656, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.82108605, + "num_input_tokens_seen": 133110365, + "step": 6193, + "time_per_iteration": 2.5734076499938965 + }, + { + "auxiliary_loss_clip": 0.01121235, + "auxiliary_loss_mlp": 0.01126285, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00068641, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 2.3931358693462443, + "language_loss": 0.84180713, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86428231, + "num_input_tokens_seen": 133128255, + "step": 6194, + "time_per_iteration": 2.643723487854004 + }, + { + "auxiliary_loss_clip": 0.01137003, + "auxiliary_loss_mlp": 0.01126652, + "balance_loss_clip": 1.0021311, + "balance_loss_mlp": 1.00076699, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.3529249913625696, + "language_loss": 0.77329749, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79593402, + "num_input_tokens_seen": 133143975, + "step": 6195, + "time_per_iteration": 2.6277594566345215 + }, + { + "auxiliary_loss_clip": 0.01138541, + "auxiliary_loss_mlp": 0.01126185, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00077629, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.516817948886804, + "language_loss": 0.79001713, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81266439, + "num_input_tokens_seen": 133162935, + "step": 6196, + "time_per_iteration": 2.597142219543457 + }, + { + "auxiliary_loss_clip": 0.01153531, + "auxiliary_loss_mlp": 0.01126907, + "balance_loss_clip": 1.00218773, + "balance_loss_mlp": 1.00083172, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 3.1575637289014553, + "language_loss": 0.83587599, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85868037, + "num_input_tokens_seen": 133181180, + "step": 6197, + "time_per_iteration": 2.5307259559631348 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01126084, + "balance_loss_clip": 1.00210094, + "balance_loss_mlp": 1.00067568, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 1.7760056786624092, + "language_loss": 0.64917123, + "learning_rate": 2.889733130264237e-06, + "loss": 0.67213237, + "num_input_tokens_seen": 133199615, + "step": 6198, + "time_per_iteration": 2.5257110595703125 + }, + { + "auxiliary_loss_clip": 0.01153633, + "auxiliary_loss_mlp": 0.01126499, + "balance_loss_clip": 1.0020963, + "balance_loss_mlp": 1.00099564, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 2.367677437344899, + "language_loss": 0.73521805, + "learning_rate": 2.889384312737261e-06, + "loss": 0.75801933, + "num_input_tokens_seen": 133219650, + "step": 6199, + "time_per_iteration": 2.6126041412353516 + }, + { + "auxiliary_loss_clip": 0.01136862, + "auxiliary_loss_mlp": 0.01126287, + "balance_loss_clip": 1.00203192, + "balance_loss_mlp": 1.00078344, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 2.1706373356979416, + "language_loss": 0.80571419, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82834572, + "num_input_tokens_seen": 133245675, + "step": 6200, + "time_per_iteration": 2.9898006916046143 + }, + { + "auxiliary_loss_clip": 0.011214, + "auxiliary_loss_mlp": 0.01126247, + "balance_loss_clip": 1.00203133, + "balance_loss_mlp": 1.00083876, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 1.8543499349000268, + "language_loss": 0.60022497, + "learning_rate": 2.88868657651991e-06, + "loss": 0.62270141, + "num_input_tokens_seen": 133266905, + "step": 6201, + "time_per_iteration": 2.91680908203125 + }, + { + "auxiliary_loss_clip": 0.01153214, + "auxiliary_loss_mlp": 0.01127343, + "balance_loss_clip": 1.00209689, + "balance_loss_mlp": 1.00079072, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.6510754776915055, + "language_loss": 0.72895944, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75176501, + "num_input_tokens_seen": 133286865, + "step": 6202, + "time_per_iteration": 2.604105234146118 + }, + { + "auxiliary_loss_clip": 0.01138095, + "auxiliary_loss_mlp": 0.01126367, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00086308, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 1.991883072824871, + "language_loss": 0.740834, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76347864, + "num_input_tokens_seen": 133305295, + "step": 6203, + "time_per_iteration": 2.6283111572265625 + }, + { + "auxiliary_loss_clip": 0.01136423, + "auxiliary_loss_mlp": 0.01126066, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00075305, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.7833516806280982, + "language_loss": 0.81443918, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83706409, + "num_input_tokens_seen": 133324625, + "step": 6204, + "time_per_iteration": 2.6259818077087402 + }, + { + "auxiliary_loss_clip": 0.01153808, + "auxiliary_loss_mlp": 0.01126782, + "balance_loss_clip": 1.00217867, + "balance_loss_mlp": 1.00080204, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 2.6299886775163595, + "language_loss": 0.75163019, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77443612, + "num_input_tokens_seen": 133344625, + "step": 6205, + "time_per_iteration": 2.5936081409454346 + }, + { + "auxiliary_loss_clip": 0.01154951, + "auxiliary_loss_mlp": 0.01126827, + "balance_loss_clip": 1.00218415, + "balance_loss_mlp": 1.00094247, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.944488848947226, + "language_loss": 0.78127551, + "learning_rate": 2.886941646474128e-06, + "loss": 0.8040933, + "num_input_tokens_seen": 133363605, + "step": 6206, + "time_per_iteration": 3.9317080974578857 + }, + { + "auxiliary_loss_clip": 0.01170306, + "auxiliary_loss_mlp": 0.01126574, + "balance_loss_clip": 1.00223923, + "balance_loss_mlp": 1.00068951, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 1.9430992500235365, + "language_loss": 0.92995125, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95292008, + "num_input_tokens_seen": 133379405, + "step": 6207, + "time_per_iteration": 2.4957077503204346 + }, + { + "auxiliary_loss_clip": 0.0112803, + "auxiliary_loss_mlp": 0.01126706, + "balance_loss_clip": 1.0021348, + "balance_loss_mlp": 1.00072598, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 1.911149394113161, + "language_loss": 0.82532787, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84787524, + "num_input_tokens_seen": 133397585, + "step": 6208, + "time_per_iteration": 4.014120817184448 + }, + { + "auxiliary_loss_clip": 0.011552, + "auxiliary_loss_mlp": 0.0112662, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00083017, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 2.7052597038106563, + "language_loss": 0.72975761, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75257587, + "num_input_tokens_seen": 133415365, + "step": 6209, + "time_per_iteration": 4.007421255111694 + }, + { + "auxiliary_loss_clip": 0.01120607, + "auxiliary_loss_mlp": 0.01126752, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00067639, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.5809593877212613, + "language_loss": 0.70019627, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72266984, + "num_input_tokens_seen": 133435700, + "step": 6210, + "time_per_iteration": 2.677945137023926 + }, + { + "auxiliary_loss_clip": 0.01112856, + "auxiliary_loss_mlp": 0.01127151, + "balance_loss_clip": 1.00210547, + "balance_loss_mlp": 1.00069356, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.5885896391776242, + "language_loss": 0.77737105, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79977107, + "num_input_tokens_seen": 133455180, + "step": 6211, + "time_per_iteration": 2.6799604892730713 + }, + { + "auxiliary_loss_clip": 0.01153579, + "auxiliary_loss_mlp": 0.01126999, + "balance_loss_clip": 1.00215948, + "balance_loss_mlp": 1.00063694, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.5669735573911925, + "language_loss": 0.73345566, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75626141, + "num_input_tokens_seen": 133476715, + "step": 6212, + "time_per_iteration": 2.693152666091919 + }, + { + "auxiliary_loss_clip": 0.01153719, + "auxiliary_loss_mlp": 0.01128272, + "balance_loss_clip": 1.00215518, + "balance_loss_mlp": 1.00076532, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 1.9564448511260315, + "language_loss": 0.81706876, + "learning_rate": 2.884497332198356e-06, + "loss": 0.83988869, + "num_input_tokens_seen": 133494550, + "step": 6213, + "time_per_iteration": 3.9428133964538574 + }, + { + "auxiliary_loss_clip": 0.01121397, + "auxiliary_loss_mlp": 0.01126925, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00094461, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.1041660068850767, + "language_loss": 0.78912222, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81160545, + "num_input_tokens_seen": 133512640, + "step": 6214, + "time_per_iteration": 2.6374127864837646 + }, + { + "auxiliary_loss_clip": 0.01137924, + "auxiliary_loss_mlp": 0.01126177, + "balance_loss_clip": 1.00205588, + "balance_loss_mlp": 1.00076878, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.9210140180638866, + "language_loss": 0.84826589, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87090683, + "num_input_tokens_seen": 133535540, + "step": 6215, + "time_per_iteration": 2.7500617504119873 + }, + { + "auxiliary_loss_clip": 0.01121144, + "auxiliary_loss_mlp": 0.01126745, + "balance_loss_clip": 1.00196517, + "balance_loss_mlp": 1.00085986, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 1.9270562327633973, + "language_loss": 0.68400306, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70648193, + "num_input_tokens_seen": 133555795, + "step": 6216, + "time_per_iteration": 2.706188678741455 + }, + { + "auxiliary_loss_clip": 0.01143205, + "auxiliary_loss_mlp": 0.01125674, + "balance_loss_clip": 1.00228751, + "balance_loss_mlp": 1.00074232, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.253640455491417, + "language_loss": 0.65717626, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67986506, + "num_input_tokens_seen": 133575905, + "step": 6217, + "time_per_iteration": 2.604957342147827 + }, + { + "auxiliary_loss_clip": 0.01137319, + "auxiliary_loss_mlp": 0.01125811, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00068939, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 1.8094717638383824, + "language_loss": 0.80357492, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82620621, + "num_input_tokens_seen": 133592585, + "step": 6218, + "time_per_iteration": 2.5696780681610107 + }, + { + "auxiliary_loss_clip": 0.01153351, + "auxiliary_loss_mlp": 0.01126263, + "balance_loss_clip": 1.00210726, + "balance_loss_mlp": 1.00075948, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 2.285573054470486, + "language_loss": 0.78881317, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.81160927, + "num_input_tokens_seen": 133615070, + "step": 6219, + "time_per_iteration": 2.599252462387085 + }, + { + "auxiliary_loss_clip": 0.01136345, + "auxiliary_loss_mlp": 0.01125932, + "balance_loss_clip": 1.00202942, + "balance_loss_mlp": 1.00071418, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 2.3491888159522967, + "language_loss": 0.76503396, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.78765672, + "num_input_tokens_seen": 133633490, + "step": 6220, + "time_per_iteration": 2.6242992877960205 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.01126582, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.00079274, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.6069839223518045, + "language_loss": 0.82982194, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85231024, + "num_input_tokens_seen": 133653425, + "step": 6221, + "time_per_iteration": 2.6501314640045166 + }, + { + "auxiliary_loss_clip": 0.01137986, + "auxiliary_loss_mlp": 0.01126773, + "balance_loss_clip": 1.00205946, + "balance_loss_mlp": 1.00098395, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 2.1235635762972964, + "language_loss": 0.76329207, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78593969, + "num_input_tokens_seen": 133670220, + "step": 6222, + "time_per_iteration": 2.5776562690734863 + }, + { + "auxiliary_loss_clip": 0.01121312, + "auxiliary_loss_mlp": 0.00747746, + "balance_loss_clip": 1.00212109, + "balance_loss_mlp": 1.00020576, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 2.1871455325086497, + "language_loss": 0.70410144, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72279203, + "num_input_tokens_seen": 133688910, + "step": 6223, + "time_per_iteration": 2.680140495300293 + }, + { + "auxiliary_loss_clip": 0.01121188, + "auxiliary_loss_mlp": 0.01126272, + "balance_loss_clip": 1.0020889, + "balance_loss_mlp": 1.00076818, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 3.943988018522672, + "language_loss": 0.68668342, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.709158, + "num_input_tokens_seen": 133708690, + "step": 6224, + "time_per_iteration": 2.777376413345337 + }, + { + "auxiliary_loss_clip": 0.01120303, + "auxiliary_loss_mlp": 0.01125595, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00075936, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.6903805777036056, + "language_loss": 0.69842821, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72088718, + "num_input_tokens_seen": 133728095, + "step": 6225, + "time_per_iteration": 2.6703436374664307 + }, + { + "auxiliary_loss_clip": 0.01123512, + "auxiliary_loss_mlp": 0.01125897, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00077522, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.984158272367334, + "language_loss": 0.79442251, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81691658, + "num_input_tokens_seen": 133745590, + "step": 6226, + "time_per_iteration": 2.701042413711548 + }, + { + "auxiliary_loss_clip": 0.01137029, + "auxiliary_loss_mlp": 0.01125514, + "balance_loss_clip": 1.00202572, + "balance_loss_mlp": 1.00058246, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 2.1924998593165803, + "language_loss": 0.68097162, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70359695, + "num_input_tokens_seen": 133766155, + "step": 6227, + "time_per_iteration": 2.708747386932373 + }, + { + "auxiliary_loss_clip": 0.01123149, + "auxiliary_loss_mlp": 0.01125763, + "balance_loss_clip": 1.00211811, + "balance_loss_mlp": 1.00054598, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.6414034453837998, + "language_loss": 0.82731265, + "learning_rate": 2.879253987586635e-06, + "loss": 0.84980178, + "num_input_tokens_seen": 133783185, + "step": 6228, + "time_per_iteration": 2.678920269012451 + }, + { + "auxiliary_loss_clip": 0.01121254, + "auxiliary_loss_mlp": 0.01125585, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.00074887, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.8066288239748385, + "language_loss": 0.74668825, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76915669, + "num_input_tokens_seen": 133800975, + "step": 6229, + "time_per_iteration": 2.655693769454956 + }, + { + "auxiliary_loss_clip": 0.01122096, + "auxiliary_loss_mlp": 0.01126279, + "balance_loss_clip": 1.00187039, + "balance_loss_mlp": 1.00067997, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.835880034782068, + "language_loss": 0.83397186, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85645556, + "num_input_tokens_seen": 133818020, + "step": 6230, + "time_per_iteration": 2.615703582763672 + }, + { + "auxiliary_loss_clip": 0.01153458, + "auxiliary_loss_mlp": 0.01126518, + "balance_loss_clip": 1.0021174, + "balance_loss_mlp": 1.00082397, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 2.507763424185904, + "language_loss": 0.73739624, + "learning_rate": 2.878204417014456e-06, + "loss": 0.76019603, + "num_input_tokens_seen": 133840690, + "step": 6231, + "time_per_iteration": 2.6473090648651123 + }, + { + "auxiliary_loss_clip": 0.011538, + "auxiliary_loss_mlp": 0.01125738, + "balance_loss_clip": 1.00206053, + "balance_loss_mlp": 1.00080705, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.2049249437485097, + "language_loss": 0.73785734, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.76065272, + "num_input_tokens_seen": 133858350, + "step": 6232, + "time_per_iteration": 2.5318410396575928 + }, + { + "auxiliary_loss_clip": 0.01138267, + "auxiliary_loss_mlp": 0.01125315, + "balance_loss_clip": 1.00216198, + "balance_loss_mlp": 1.00057447, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.6458438399443533, + "language_loss": 0.77185106, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79448688, + "num_input_tokens_seen": 133879775, + "step": 6233, + "time_per_iteration": 2.651249408721924 + }, + { + "auxiliary_loss_clip": 0.01138242, + "auxiliary_loss_mlp": 0.01125246, + "balance_loss_clip": 1.0021317, + "balance_loss_mlp": 1.00079203, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.5895913508037531, + "language_loss": 0.69215339, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71478832, + "num_input_tokens_seen": 133898295, + "step": 6234, + "time_per_iteration": 2.5897417068481445 + }, + { + "auxiliary_loss_clip": 0.01154438, + "auxiliary_loss_mlp": 0.01126031, + "balance_loss_clip": 1.00210392, + "balance_loss_mlp": 1.00071788, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 2.446754011436338, + "language_loss": 0.82322323, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84602797, + "num_input_tokens_seen": 133915230, + "step": 6235, + "time_per_iteration": 2.5730178356170654 + }, + { + "auxiliary_loss_clip": 0.01170112, + "auxiliary_loss_mlp": 0.01125654, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.00072289, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 2.2997700540374124, + "language_loss": 0.78069067, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.80364829, + "num_input_tokens_seen": 133934110, + "step": 6236, + "time_per_iteration": 2.5239176750183105 + }, + { + "auxiliary_loss_clip": 0.01155146, + "auxiliary_loss_mlp": 0.01126404, + "balance_loss_clip": 1.00209332, + "balance_loss_mlp": 1.0008055, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 1.9787392120286447, + "language_loss": 0.73281634, + "learning_rate": 2.876104377085234e-06, + "loss": 0.75563186, + "num_input_tokens_seen": 133952395, + "step": 6237, + "time_per_iteration": 2.564133882522583 + }, + { + "auxiliary_loss_clip": 0.01144732, + "auxiliary_loss_mlp": 0.00747612, + "balance_loss_clip": 1.0025146, + "balance_loss_mlp": 1.00010264, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 1.9191294283826974, + "language_loss": 0.92742169, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.94634509, + "num_input_tokens_seen": 133969635, + "step": 6238, + "time_per_iteration": 2.587542772293091 + }, + { + "auxiliary_loss_clip": 0.01170101, + "auxiliary_loss_mlp": 0.01125379, + "balance_loss_clip": 1.00219655, + "balance_loss_mlp": 1.00073433, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 2.461010700036284, + "language_loss": 0.71187234, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73482716, + "num_input_tokens_seen": 133987215, + "step": 6239, + "time_per_iteration": 2.520855188369751 + }, + { + "auxiliary_loss_clip": 0.01075055, + "auxiliary_loss_mlp": 0.01126179, + "balance_loss_clip": 1.00178218, + "balance_loss_mlp": 1.0006758, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.4530710001764913, + "language_loss": 0.6560424, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67805469, + "num_input_tokens_seen": 134009250, + "step": 6240, + "time_per_iteration": 2.859966278076172 + }, + { + "auxiliary_loss_clip": 0.01121649, + "auxiliary_loss_mlp": 0.00747559, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00009227, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 4.406008376273177, + "language_loss": 0.75253367, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77122581, + "num_input_tokens_seen": 134026875, + "step": 6241, + "time_per_iteration": 2.6174824237823486 + }, + { + "auxiliary_loss_clip": 0.01121832, + "auxiliary_loss_mlp": 0.01126014, + "balance_loss_clip": 1.0019964, + "balance_loss_mlp": 1.00070143, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 7.673840516818712, + "language_loss": 0.83871168, + "learning_rate": 2.874353430085213e-06, + "loss": 0.8611902, + "num_input_tokens_seen": 134047185, + "step": 6242, + "time_per_iteration": 2.6953976154327393 + }, + { + "auxiliary_loss_clip": 0.01136797, + "auxiliary_loss_mlp": 0.01125804, + "balance_loss_clip": 1.00200498, + "balance_loss_mlp": 1.00068223, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 3.4207431019029793, + "language_loss": 0.67940485, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70203084, + "num_input_tokens_seen": 134067330, + "step": 6243, + "time_per_iteration": 2.6663053035736084 + }, + { + "auxiliary_loss_clip": 0.01077016, + "auxiliary_loss_mlp": 0.00747663, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00012684, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 2.0084676579695833, + "language_loss": 0.83816415, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85641092, + "num_input_tokens_seen": 134085525, + "step": 6244, + "time_per_iteration": 4.1867759227752686 + }, + { + "auxiliary_loss_clip": 0.01106584, + "auxiliary_loss_mlp": 0.01124724, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00074673, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.658599467901043, + "language_loss": 0.82932591, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85163903, + "num_input_tokens_seen": 134101855, + "step": 6245, + "time_per_iteration": 2.671832323074341 + }, + { + "auxiliary_loss_clip": 0.01139954, + "auxiliary_loss_mlp": 0.01125952, + "balance_loss_clip": 1.00217605, + "balance_loss_mlp": 1.00063896, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 7.850235227964367, + "language_loss": 0.64086008, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66351914, + "num_input_tokens_seen": 134119360, + "step": 6246, + "time_per_iteration": 3.963733196258545 + }, + { + "auxiliary_loss_clip": 0.01138623, + "auxiliary_loss_mlp": 0.01126079, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.00076604, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.75237111153452, + "language_loss": 0.74830818, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77095526, + "num_input_tokens_seen": 134137475, + "step": 6247, + "time_per_iteration": 4.0516088008880615 + }, + { + "auxiliary_loss_clip": 0.0115324, + "auxiliary_loss_mlp": 0.01125883, + "balance_loss_clip": 1.00200486, + "balance_loss_mlp": 1.00066531, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 3.0759420384139764, + "language_loss": 0.54128754, + "learning_rate": 2.872251199697598e-06, + "loss": 0.56407869, + "num_input_tokens_seen": 134154580, + "step": 6248, + "time_per_iteration": 2.602553367614746 + }, + { + "auxiliary_loss_clip": 0.01154779, + "auxiliary_loss_mlp": 0.01125581, + "balance_loss_clip": 1.00213599, + "balance_loss_mlp": 1.00084066, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 2.417450186903226, + "language_loss": 0.84220064, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86500418, + "num_input_tokens_seen": 134174285, + "step": 6249, + "time_per_iteration": 2.619168758392334 + }, + { + "auxiliary_loss_clip": 0.01137132, + "auxiliary_loss_mlp": 0.01125712, + "balance_loss_clip": 1.00207448, + "balance_loss_mlp": 1.00058985, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.6805036084364744, + "language_loss": 0.67705607, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.6996845, + "num_input_tokens_seen": 134195940, + "step": 6250, + "time_per_iteration": 4.211400508880615 + }, + { + "auxiliary_loss_clip": 0.01138027, + "auxiliary_loss_mlp": 0.01125525, + "balance_loss_clip": 1.00206697, + "balance_loss_mlp": 1.00078416, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 1.8550223559001169, + "language_loss": 0.77315664, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79579222, + "num_input_tokens_seen": 134212235, + "step": 6251, + "time_per_iteration": 2.5988030433654785 + }, + { + "auxiliary_loss_clip": 0.01153658, + "auxiliary_loss_mlp": 0.01125757, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.00063527, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 1.8485483961013573, + "language_loss": 0.57747245, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60026658, + "num_input_tokens_seen": 134233810, + "step": 6252, + "time_per_iteration": 2.667603015899658 + }, + { + "auxiliary_loss_clip": 0.01136725, + "auxiliary_loss_mlp": 0.01125997, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00077939, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 2.8067611197321845, + "language_loss": 0.89916968, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.92179698, + "num_input_tokens_seen": 134252020, + "step": 6253, + "time_per_iteration": 2.63018536567688 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01125022, + "balance_loss_clip": 1.00206864, + "balance_loss_mlp": 1.00066304, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 3.413141984102399, + "language_loss": 0.76923084, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.79168129, + "num_input_tokens_seen": 134269495, + "step": 6254, + "time_per_iteration": 2.6389269828796387 + }, + { + "auxiliary_loss_clip": 0.01122868, + "auxiliary_loss_mlp": 0.01126865, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00078952, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.3453519493436144, + "language_loss": 0.61649346, + "learning_rate": 2.869797092829169e-06, + "loss": 0.63899076, + "num_input_tokens_seen": 134287035, + "step": 6255, + "time_per_iteration": 2.615931987762451 + }, + { + "auxiliary_loss_clip": 0.01153332, + "auxiliary_loss_mlp": 0.01126164, + "balance_loss_clip": 1.00210631, + "balance_loss_mlp": 1.00066042, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.1907942858368545, + "language_loss": 0.73916364, + "learning_rate": 2.869446374096135e-06, + "loss": 0.7619586, + "num_input_tokens_seen": 134304840, + "step": 6256, + "time_per_iteration": 2.571603298187256 + }, + { + "auxiliary_loss_clip": 0.01154849, + "auxiliary_loss_mlp": 0.01125648, + "balance_loss_clip": 1.00216019, + "balance_loss_mlp": 1.00071645, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 2.8629890353179186, + "language_loss": 0.7027204, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72552538, + "num_input_tokens_seen": 134323180, + "step": 6257, + "time_per_iteration": 2.547165632247925 + }, + { + "auxiliary_loss_clip": 0.0113825, + "auxiliary_loss_mlp": 0.01125266, + "balance_loss_clip": 1.00210905, + "balance_loss_mlp": 1.00062096, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.710469708085884, + "language_loss": 0.84352517, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86616039, + "num_input_tokens_seen": 134341390, + "step": 6258, + "time_per_iteration": 2.569366455078125 + }, + { + "auxiliary_loss_clip": 0.01106546, + "auxiliary_loss_mlp": 0.01125295, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00093627, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.3965870088294796, + "language_loss": 0.80674481, + "learning_rate": 2.868394020133277e-06, + "loss": 0.82906324, + "num_input_tokens_seen": 134360425, + "step": 6259, + "time_per_iteration": 2.699427843093872 + }, + { + "auxiliary_loss_clip": 0.01106267, + "auxiliary_loss_mlp": 0.01126408, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.00080955, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 4.569395433087733, + "language_loss": 0.71405482, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73638153, + "num_input_tokens_seen": 134379775, + "step": 6260, + "time_per_iteration": 2.7259795665740967 + }, + { + "auxiliary_loss_clip": 0.01137896, + "auxiliary_loss_mlp": 0.01126067, + "balance_loss_clip": 1.00213265, + "balance_loss_mlp": 1.00065887, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 2.7133369282008704, + "language_loss": 0.78204304, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80468267, + "num_input_tokens_seen": 134400315, + "step": 6261, + "time_per_iteration": 2.629910707473755 + }, + { + "auxiliary_loss_clip": 0.01138422, + "auxiliary_loss_mlp": 0.01126294, + "balance_loss_clip": 1.00216007, + "balance_loss_mlp": 1.0008862, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.632471159663585, + "language_loss": 0.80124736, + "learning_rate": 2.867341369804132e-06, + "loss": 0.8238945, + "num_input_tokens_seen": 134422875, + "step": 6262, + "time_per_iteration": 2.7229249477386475 + }, + { + "auxiliary_loss_clip": 0.01155097, + "auxiliary_loss_mlp": 0.0112522, + "balance_loss_clip": 1.0021801, + "balance_loss_mlp": 1.0007658, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 2.1685573194249628, + "language_loss": 0.80462128, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82742441, + "num_input_tokens_seen": 134443025, + "step": 6263, + "time_per_iteration": 2.7097082138061523 + }, + { + "auxiliary_loss_clip": 0.01170111, + "auxiliary_loss_mlp": 0.01126096, + "balance_loss_clip": 1.00223637, + "balance_loss_mlp": 1.00087845, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 4.170466550657517, + "language_loss": 0.8003009, + "learning_rate": 2.866639438447501e-06, + "loss": 0.82326293, + "num_input_tokens_seen": 134460945, + "step": 6264, + "time_per_iteration": 2.4933855533599854 + }, + { + "auxiliary_loss_clip": 0.01169841, + "auxiliary_loss_mlp": 0.01125254, + "balance_loss_clip": 1.00205088, + "balance_loss_mlp": 1.00089478, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.0460060656306793, + "language_loss": 0.73402071, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75697166, + "num_input_tokens_seen": 134480440, + "step": 6265, + "time_per_iteration": 2.5484800338745117 + }, + { + "auxiliary_loss_clip": 0.01153385, + "auxiliary_loss_mlp": 0.01124823, + "balance_loss_clip": 1.00218892, + "balance_loss_mlp": 1.00084567, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.7471574620343127, + "language_loss": 0.68540692, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70818901, + "num_input_tokens_seen": 134501110, + "step": 6266, + "time_per_iteration": 2.6475188732147217 + }, + { + "auxiliary_loss_clip": 0.0115349, + "auxiliary_loss_mlp": 0.01127072, + "balance_loss_clip": 1.00212181, + "balance_loss_mlp": 1.00080514, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 2.3238572501433064, + "language_loss": 0.6275202, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65032589, + "num_input_tokens_seen": 134522460, + "step": 6267, + "time_per_iteration": 2.635821580886841 + }, + { + "auxiliary_loss_clip": 0.01151726, + "auxiliary_loss_mlp": 0.01106174, + "balance_loss_clip": 1.00270295, + "balance_loss_mlp": 1.00012517, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7146758428534361, + "language_loss": 0.58876663, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.61134565, + "num_input_tokens_seen": 134589545, + "step": 6268, + "time_per_iteration": 3.252556562423706 + }, + { + "auxiliary_loss_clip": 0.01170073, + "auxiliary_loss_mlp": 0.01125752, + "balance_loss_clip": 1.00219417, + "balance_loss_mlp": 1.00082099, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 2.7443569258587774, + "language_loss": 0.64845693, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67141521, + "num_input_tokens_seen": 134610550, + "step": 6269, + "time_per_iteration": 2.583604574203491 + }, + { + "auxiliary_loss_clip": 0.01119951, + "auxiliary_loss_mlp": 0.0112551, + "balance_loss_clip": 1.00208759, + "balance_loss_mlp": 1.00076914, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.6285314995071418, + "language_loss": 0.70796168, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.73041636, + "num_input_tokens_seen": 134630485, + "step": 6270, + "time_per_iteration": 2.7158753871917725 + }, + { + "auxiliary_loss_clip": 0.01166619, + "auxiliary_loss_mlp": 0.01106062, + "balance_loss_clip": 1.00263667, + "balance_loss_mlp": 1.00001359, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7183904914092942, + "language_loss": 0.56089127, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.5836181, + "num_input_tokens_seen": 134693510, + "step": 6271, + "time_per_iteration": 3.090937376022339 + }, + { + "auxiliary_loss_clip": 0.01153602, + "auxiliary_loss_mlp": 0.01124711, + "balance_loss_clip": 1.00209975, + "balance_loss_mlp": 1.00073326, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 1.9325142855881754, + "language_loss": 0.79870403, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82148719, + "num_input_tokens_seen": 134713115, + "step": 6272, + "time_per_iteration": 2.5721044540405273 + }, + { + "auxiliary_loss_clip": 0.01154513, + "auxiliary_loss_mlp": 0.01124958, + "balance_loss_clip": 1.00218093, + "balance_loss_mlp": 1.00069451, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.5838938941834322, + "language_loss": 0.73932421, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76211882, + "num_input_tokens_seen": 134732635, + "step": 6273, + "time_per_iteration": 2.5758914947509766 + }, + { + "auxiliary_loss_clip": 0.01153676, + "auxiliary_loss_mlp": 0.01125555, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00081444, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.5978040922316787, + "language_loss": 0.71973193, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.74252421, + "num_input_tokens_seen": 134750695, + "step": 6274, + "time_per_iteration": 2.541865110397339 + }, + { + "auxiliary_loss_clip": 0.01138597, + "auxiliary_loss_mlp": 0.01125264, + "balance_loss_clip": 1.0022862, + "balance_loss_mlp": 1.0009048, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.9388144680018107, + "language_loss": 0.84187353, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.86451215, + "num_input_tokens_seen": 134768935, + "step": 6275, + "time_per_iteration": 2.5727338790893555 + }, + { + "auxiliary_loss_clip": 0.01105131, + "auxiliary_loss_mlp": 0.01124511, + "balance_loss_clip": 1.00204372, + "balance_loss_mlp": 1.00081944, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.5058601755123031, + "language_loss": 0.7519362, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77423263, + "num_input_tokens_seen": 134791260, + "step": 6276, + "time_per_iteration": 2.7828361988067627 + }, + { + "auxiliary_loss_clip": 0.01139633, + "auxiliary_loss_mlp": 0.01126216, + "balance_loss_clip": 1.00219297, + "balance_loss_mlp": 1.00080824, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.8321637782215685, + "language_loss": 0.85688418, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87954271, + "num_input_tokens_seen": 134808350, + "step": 6277, + "time_per_iteration": 2.6101744174957275 + }, + { + "auxiliary_loss_clip": 0.01154185, + "auxiliary_loss_mlp": 0.01124667, + "balance_loss_clip": 1.00222218, + "balance_loss_mlp": 1.00059378, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 1.6216611973037205, + "language_loss": 0.77940226, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80219078, + "num_input_tokens_seen": 134826005, + "step": 6278, + "time_per_iteration": 2.591747283935547 + }, + { + "auxiliary_loss_clip": 0.01138626, + "auxiliary_loss_mlp": 0.01126152, + "balance_loss_clip": 1.00211012, + "balance_loss_mlp": 1.00083899, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.8800348133551534, + "language_loss": 0.83106923, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85371703, + "num_input_tokens_seen": 134844995, + "step": 6279, + "time_per_iteration": 2.6801984310150146 + }, + { + "auxiliary_loss_clip": 0.01137423, + "auxiliary_loss_mlp": 0.01125139, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.0006845, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 2.70209655395851, + "language_loss": 0.74967098, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77229661, + "num_input_tokens_seen": 134865285, + "step": 6280, + "time_per_iteration": 2.670423746109009 + }, + { + "auxiliary_loss_clip": 0.01169908, + "auxiliary_loss_mlp": 0.01125444, + "balance_loss_clip": 1.00221229, + "balance_loss_mlp": 1.00070381, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 2.16198529552875, + "language_loss": 0.76528209, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78823555, + "num_input_tokens_seen": 134886535, + "step": 6281, + "time_per_iteration": 3.9317495822906494 + }, + { + "auxiliary_loss_clip": 0.01138456, + "auxiliary_loss_mlp": 0.01125066, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00080228, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.7262794922658402, + "language_loss": 0.84044313, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86307836, + "num_input_tokens_seen": 134907435, + "step": 6282, + "time_per_iteration": 2.6243855953216553 + }, + { + "auxiliary_loss_clip": 0.01154416, + "auxiliary_loss_mlp": 0.01124675, + "balance_loss_clip": 1.00222456, + "balance_loss_mlp": 1.00060201, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 4.365482808377811, + "language_loss": 0.70044839, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.7232393, + "num_input_tokens_seen": 134925360, + "step": 6283, + "time_per_iteration": 3.9794423580169678 + }, + { + "auxiliary_loss_clip": 0.01093811, + "auxiliary_loss_mlp": 0.01125694, + "balance_loss_clip": 1.00221086, + "balance_loss_mlp": 1.00076222, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.7580103406760892, + "language_loss": 0.75966185, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78185689, + "num_input_tokens_seen": 134944205, + "step": 6284, + "time_per_iteration": 4.154117822647095 + }, + { + "auxiliary_loss_clip": 0.01170174, + "auxiliary_loss_mlp": 0.01125913, + "balance_loss_clip": 1.00225711, + "balance_loss_mlp": 1.00079107, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.255384671969951, + "language_loss": 0.85162258, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.87458348, + "num_input_tokens_seen": 134960255, + "step": 6285, + "time_per_iteration": 2.481748342514038 + }, + { + "auxiliary_loss_clip": 0.01136842, + "auxiliary_loss_mlp": 0.01125318, + "balance_loss_clip": 1.00216436, + "balance_loss_mlp": 1.00076795, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.836631493442048, + "language_loss": 0.84309775, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86571938, + "num_input_tokens_seen": 134978605, + "step": 6286, + "time_per_iteration": 2.60890793800354 + }, + { + "auxiliary_loss_clip": 0.01153709, + "auxiliary_loss_mlp": 0.01125929, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00080657, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.034269109809273, + "language_loss": 0.81859958, + "learning_rate": 2.858557806518775e-06, + "loss": 0.84139597, + "num_input_tokens_seen": 134995020, + "step": 6287, + "time_per_iteration": 2.5279288291931152 + }, + { + "auxiliary_loss_clip": 0.01154843, + "auxiliary_loss_mlp": 0.01125573, + "balance_loss_clip": 1.00216115, + "balance_loss_mlp": 1.0006417, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.565604677198403, + "language_loss": 0.72644156, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.74924576, + "num_input_tokens_seen": 135012620, + "step": 6288, + "time_per_iteration": 3.9944756031036377 + }, + { + "auxiliary_loss_clip": 0.01153444, + "auxiliary_loss_mlp": 0.01124934, + "balance_loss_clip": 1.00222003, + "balance_loss_mlp": 1.00067019, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.6740519816024244, + "language_loss": 0.75047338, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77325714, + "num_input_tokens_seen": 135033365, + "step": 6289, + "time_per_iteration": 2.6243836879730225 + }, + { + "auxiliary_loss_clip": 0.01153311, + "auxiliary_loss_mlp": 0.01125195, + "balance_loss_clip": 1.00213361, + "balance_loss_mlp": 1.000741, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 2.102832917292204, + "language_loss": 0.73346919, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75625426, + "num_input_tokens_seen": 135052185, + "step": 6290, + "time_per_iteration": 2.5755693912506104 + }, + { + "auxiliary_loss_clip": 0.01121366, + "auxiliary_loss_mlp": 0.01125794, + "balance_loss_clip": 1.00185895, + "balance_loss_mlp": 1.00067139, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.295337058633798, + "language_loss": 0.79383516, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81630671, + "num_input_tokens_seen": 135070425, + "step": 6291, + "time_per_iteration": 2.625943183898926 + }, + { + "auxiliary_loss_clip": 0.01119473, + "auxiliary_loss_mlp": 0.01125884, + "balance_loss_clip": 1.00193369, + "balance_loss_mlp": 1.00047588, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.7987661124662024, + "language_loss": 0.75917047, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78162408, + "num_input_tokens_seen": 135090525, + "step": 6292, + "time_per_iteration": 2.642395496368408 + }, + { + "auxiliary_loss_clip": 0.01155122, + "auxiliary_loss_mlp": 0.01125734, + "balance_loss_clip": 1.00223112, + "balance_loss_mlp": 1.00089788, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.6695174721724997, + "language_loss": 0.69347072, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71627927, + "num_input_tokens_seen": 135109575, + "step": 6293, + "time_per_iteration": 2.570223093032837 + }, + { + "auxiliary_loss_clip": 0.01169859, + "auxiliary_loss_mlp": 0.01125247, + "balance_loss_clip": 1.00214422, + "balance_loss_mlp": 1.00079274, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.8449611274035573, + "language_loss": 0.71063113, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73358214, + "num_input_tokens_seen": 135127000, + "step": 6294, + "time_per_iteration": 2.545198678970337 + }, + { + "auxiliary_loss_clip": 0.01138622, + "auxiliary_loss_mlp": 0.0112589, + "balance_loss_clip": 1.00204635, + "balance_loss_mlp": 1.00076747, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 3.129521164418168, + "language_loss": 0.82122856, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84387368, + "num_input_tokens_seen": 135145285, + "step": 6295, + "time_per_iteration": 2.638713836669922 + }, + { + "auxiliary_loss_clip": 0.01155071, + "auxiliary_loss_mlp": 0.0112545, + "balance_loss_clip": 1.00217283, + "balance_loss_mlp": 1.00080442, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.8475142855009463, + "language_loss": 0.71616554, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73897076, + "num_input_tokens_seen": 135165240, + "step": 6296, + "time_per_iteration": 2.6019091606140137 + }, + { + "auxiliary_loss_clip": 0.01170008, + "auxiliary_loss_mlp": 0.01125139, + "balance_loss_clip": 1.002285, + "balance_loss_mlp": 1.000875, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.6072308266046949, + "language_loss": 0.76880592, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79175735, + "num_input_tokens_seen": 135184045, + "step": 6297, + "time_per_iteration": 2.5537171363830566 + }, + { + "auxiliary_loss_clip": 0.0113851, + "auxiliary_loss_mlp": 0.01125601, + "balance_loss_clip": 1.00211382, + "balance_loss_mlp": 1.0006696, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 2.012575928112926, + "language_loss": 0.79170406, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81434518, + "num_input_tokens_seen": 135202365, + "step": 6298, + "time_per_iteration": 2.571075916290283 + }, + { + "auxiliary_loss_clip": 0.01108069, + "auxiliary_loss_mlp": 0.01124757, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00087464, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 1.7395869395345827, + "language_loss": 0.84298754, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86531579, + "num_input_tokens_seen": 135220955, + "step": 6299, + "time_per_iteration": 2.6836650371551514 + }, + { + "auxiliary_loss_clip": 0.01123156, + "auxiliary_loss_mlp": 0.01124751, + "balance_loss_clip": 1.0019747, + "balance_loss_mlp": 1.0005827, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.194830628614543, + "language_loss": 0.75596583, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.77844489, + "num_input_tokens_seen": 135239715, + "step": 6300, + "time_per_iteration": 2.6250762939453125 + }, + { + "auxiliary_loss_clip": 0.01138379, + "auxiliary_loss_mlp": 0.011263, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00060534, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 2.5174701317234414, + "language_loss": 0.82018387, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84283066, + "num_input_tokens_seen": 135257035, + "step": 6301, + "time_per_iteration": 2.6748290061950684 + }, + { + "auxiliary_loss_clip": 0.01153134, + "auxiliary_loss_mlp": 0.0112537, + "balance_loss_clip": 1.00194395, + "balance_loss_mlp": 1.00062954, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 4.220090808555152, + "language_loss": 0.6789006, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.70168561, + "num_input_tokens_seen": 135275720, + "step": 6302, + "time_per_iteration": 2.5691072940826416 + }, + { + "auxiliary_loss_clip": 0.01107697, + "auxiliary_loss_mlp": 0.01124734, + "balance_loss_clip": 1.00207365, + "balance_loss_mlp": 1.00066066, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.7564043828290428, + "language_loss": 0.68105638, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.7033807, + "num_input_tokens_seen": 135294140, + "step": 6303, + "time_per_iteration": 2.7337646484375 + }, + { + "auxiliary_loss_clip": 0.0117001, + "auxiliary_loss_mlp": 0.01125169, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00071478, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.4814245078756756, + "language_loss": 0.77785265, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.80080438, + "num_input_tokens_seen": 135314845, + "step": 6304, + "time_per_iteration": 2.581048011779785 + }, + { + "auxiliary_loss_clip": 0.0117032, + "auxiliary_loss_mlp": 0.01126464, + "balance_loss_clip": 1.00233555, + "balance_loss_mlp": 1.00067425, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 3.2162285903577392, + "language_loss": 0.80279499, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82576281, + "num_input_tokens_seen": 135333055, + "step": 6305, + "time_per_iteration": 2.5136220455169678 + }, + { + "auxiliary_loss_clip": 0.01151151, + "auxiliary_loss_mlp": 0.0110532, + "balance_loss_clip": 1.00273967, + "balance_loss_mlp": 1.00003421, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9760682321462915, + "language_loss": 0.64485949, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66742426, + "num_input_tokens_seen": 135387865, + "step": 6306, + "time_per_iteration": 3.050687551498413 + }, + { + "auxiliary_loss_clip": 0.01137846, + "auxiliary_loss_mlp": 0.01125786, + "balance_loss_clip": 1.00207615, + "balance_loss_mlp": 1.00104523, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.5399877921592415, + "language_loss": 0.73296463, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75560099, + "num_input_tokens_seen": 135409095, + "step": 6307, + "time_per_iteration": 2.6595001220703125 + }, + { + "auxiliary_loss_clip": 0.01138253, + "auxiliary_loss_mlp": 0.0112628, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00087214, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.6524675594911493, + "language_loss": 0.78170729, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80435264, + "num_input_tokens_seen": 135429585, + "step": 6308, + "time_per_iteration": 2.6411736011505127 + }, + { + "auxiliary_loss_clip": 0.01121708, + "auxiliary_loss_mlp": 0.01125102, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00074279, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 10.508536922395674, + "language_loss": 0.73030609, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75277412, + "num_input_tokens_seen": 135446320, + "step": 6309, + "time_per_iteration": 2.644881010055542 + }, + { + "auxiliary_loss_clip": 0.01107447, + "auxiliary_loss_mlp": 0.01125685, + "balance_loss_clip": 1.00208855, + "balance_loss_mlp": 1.00075376, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.4699870587601893, + "language_loss": 0.78826481, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.81059611, + "num_input_tokens_seen": 135465720, + "step": 6310, + "time_per_iteration": 2.703578472137451 + }, + { + "auxiliary_loss_clip": 0.01154415, + "auxiliary_loss_mlp": 0.00747593, + "balance_loss_clip": 1.00214362, + "balance_loss_mlp": 1.00015938, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 2.1986304888619363, + "language_loss": 0.75779003, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.77681005, + "num_input_tokens_seen": 135485155, + "step": 6311, + "time_per_iteration": 2.5980169773101807 + }, + { + "auxiliary_loss_clip": 0.01143044, + "auxiliary_loss_mlp": 0.01125349, + "balance_loss_clip": 1.0023582, + "balance_loss_mlp": 1.0007987, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.5228938377099737, + "language_loss": 0.70901215, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73169601, + "num_input_tokens_seen": 135502675, + "step": 6312, + "time_per_iteration": 2.6487905979156494 + }, + { + "auxiliary_loss_clip": 0.01117488, + "auxiliary_loss_mlp": 0.01105333, + "balance_loss_clip": 1.00248647, + "balance_loss_mlp": 1.00004756, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7829512248917696, + "language_loss": 0.56120706, + "learning_rate": 2.849401318669608e-06, + "loss": 0.5834353, + "num_input_tokens_seen": 135562005, + "step": 6313, + "time_per_iteration": 3.1787126064300537 + }, + { + "auxiliary_loss_clip": 0.0112165, + "auxiliary_loss_mlp": 0.01125575, + "balance_loss_clip": 1.00203049, + "balance_loss_mlp": 1.00073886, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 1.68147828250366, + "language_loss": 0.71156961, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73404181, + "num_input_tokens_seen": 135582600, + "step": 6314, + "time_per_iteration": 2.7227399349212646 + }, + { + "auxiliary_loss_clip": 0.011533, + "auxiliary_loss_mlp": 0.01125132, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00086784, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 6.72786981661173, + "language_loss": 0.7351107, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75789499, + "num_input_tokens_seen": 135600280, + "step": 6315, + "time_per_iteration": 2.5403735637664795 + }, + { + "auxiliary_loss_clip": 0.01159578, + "auxiliary_loss_mlp": 0.01125504, + "balance_loss_clip": 1.00229037, + "balance_loss_mlp": 1.00066805, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.070771927932933, + "language_loss": 0.70693201, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.72978282, + "num_input_tokens_seen": 135621560, + "step": 6316, + "time_per_iteration": 2.7126505374908447 + }, + { + "auxiliary_loss_clip": 0.01120326, + "auxiliary_loss_mlp": 0.01125263, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00071311, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.8706195129617023, + "language_loss": 0.65417194, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67662787, + "num_input_tokens_seen": 135641745, + "step": 6317, + "time_per_iteration": 2.742018699645996 + }, + { + "auxiliary_loss_clip": 0.01153213, + "auxiliary_loss_mlp": 0.01124969, + "balance_loss_clip": 1.00210404, + "balance_loss_mlp": 1.00061035, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.2393272120204397, + "language_loss": 0.8523581, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87513995, + "num_input_tokens_seen": 135660650, + "step": 6318, + "time_per_iteration": 2.561803102493286 + }, + { + "auxiliary_loss_clip": 0.01137927, + "auxiliary_loss_mlp": 0.01126152, + "balance_loss_clip": 1.00213027, + "balance_loss_mlp": 1.00083888, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 1.7990162322140935, + "language_loss": 0.76118469, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.78382552, + "num_input_tokens_seen": 135679980, + "step": 6319, + "time_per_iteration": 2.5693728923797607 + }, + { + "auxiliary_loss_clip": 0.0117015, + "auxiliary_loss_mlp": 0.01125984, + "balance_loss_clip": 1.00234401, + "balance_loss_mlp": 1.00086188, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 58.767143878947834, + "language_loss": 0.64385307, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66681433, + "num_input_tokens_seen": 135699400, + "step": 6320, + "time_per_iteration": 3.903615713119507 + }, + { + "auxiliary_loss_clip": 0.01107198, + "auxiliary_loss_mlp": 0.0112462, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.0008328, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 2.8492767514757893, + "language_loss": 0.71188015, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73419833, + "num_input_tokens_seen": 135723455, + "step": 6321, + "time_per_iteration": 4.161892890930176 + }, + { + "auxiliary_loss_clip": 0.01122931, + "auxiliary_loss_mlp": 0.01125686, + "balance_loss_clip": 1.00210559, + "balance_loss_mlp": 1.00056386, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.7993278692734704, + "language_loss": 0.748698, + "learning_rate": 2.846226680280859e-06, + "loss": 0.77118421, + "num_input_tokens_seen": 135744335, + "step": 6322, + "time_per_iteration": 4.158124208450317 + }, + { + "auxiliary_loss_clip": 0.01154926, + "auxiliary_loss_mlp": 0.01125031, + "balance_loss_clip": 1.00218284, + "balance_loss_mlp": 1.0008626, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.6366647110083816, + "language_loss": 0.84935415, + "learning_rate": 2.845873782058725e-06, + "loss": 0.8721537, + "num_input_tokens_seen": 135761440, + "step": 6323, + "time_per_iteration": 2.5861287117004395 + }, + { + "auxiliary_loss_clip": 0.01139129, + "auxiliary_loss_mlp": 0.01125533, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.0007925, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 4.267514043046548, + "language_loss": 0.73524582, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75789237, + "num_input_tokens_seen": 135779955, + "step": 6324, + "time_per_iteration": 2.6080007553100586 + }, + { + "auxiliary_loss_clip": 0.01121619, + "auxiliary_loss_mlp": 0.01125448, + "balance_loss_clip": 1.00198948, + "balance_loss_mlp": 1.00089836, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.8937154099509093, + "language_loss": 0.83895731, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86142796, + "num_input_tokens_seen": 135799840, + "step": 6325, + "time_per_iteration": 2.6644814014434814 + }, + { + "auxiliary_loss_clip": 0.01136568, + "auxiliary_loss_mlp": 0.01124958, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00059915, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.6801761893931926, + "language_loss": 0.79357398, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81618923, + "num_input_tokens_seen": 135817880, + "step": 6326, + "time_per_iteration": 3.9629287719726562 + }, + { + "auxiliary_loss_clip": 0.01153245, + "auxiliary_loss_mlp": 0.01124626, + "balance_loss_clip": 1.00214553, + "balance_loss_mlp": 1.00083899, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.7480875806335692, + "language_loss": 0.73063391, + "learning_rate": 2.844461868547842e-06, + "loss": 0.7534126, + "num_input_tokens_seen": 135838940, + "step": 6327, + "time_per_iteration": 2.6890065670013428 + }, + { + "auxiliary_loss_clip": 0.01169997, + "auxiliary_loss_mlp": 0.00747589, + "balance_loss_clip": 1.00223815, + "balance_loss_mlp": 1.00016189, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.6734927600606286, + "language_loss": 0.8274923, + "learning_rate": 2.844108810081459e-06, + "loss": 0.84666812, + "num_input_tokens_seen": 135858325, + "step": 6328, + "time_per_iteration": 2.5443224906921387 + }, + { + "auxiliary_loss_clip": 0.01153339, + "auxiliary_loss_mlp": 0.01124634, + "balance_loss_clip": 1.00206947, + "balance_loss_mlp": 1.00075173, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.4257234524102707, + "language_loss": 0.61773109, + "learning_rate": 2.843755719606385e-06, + "loss": 0.6405108, + "num_input_tokens_seen": 135878430, + "step": 6329, + "time_per_iteration": 2.5728187561035156 + }, + { + "auxiliary_loss_clip": 0.01144527, + "auxiliary_loss_mlp": 0.01125542, + "balance_loss_clip": 1.00263023, + "balance_loss_mlp": 1.00070643, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.764578461330831, + "language_loss": 0.55849838, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58119905, + "num_input_tokens_seen": 135894755, + "step": 6330, + "time_per_iteration": 2.5952978134155273 + }, + { + "auxiliary_loss_clip": 0.01120149, + "auxiliary_loss_mlp": 0.0112475, + "balance_loss_clip": 1.00192952, + "balance_loss_mlp": 1.00086808, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.4371132784872402, + "language_loss": 0.65925503, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.68170398, + "num_input_tokens_seen": 135918275, + "step": 6331, + "time_per_iteration": 2.6974844932556152 + }, + { + "auxiliary_loss_clip": 0.01153633, + "auxiliary_loss_mlp": 0.01125758, + "balance_loss_clip": 1.0020169, + "balance_loss_mlp": 1.00092185, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.7238007156261612, + "language_loss": 0.75922322, + "learning_rate": 2.842696256262919e-06, + "loss": 0.78201711, + "num_input_tokens_seen": 135937430, + "step": 6332, + "time_per_iteration": 2.5334601402282715 + }, + { + "auxiliary_loss_clip": 0.01094608, + "auxiliary_loss_mlp": 0.0074775, + "balance_loss_clip": 1.00187659, + "balance_loss_mlp": 1.00017393, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 1.9131696595401755, + "language_loss": 0.81776595, + "learning_rate": 2.842343037886987e-06, + "loss": 0.83618951, + "num_input_tokens_seen": 135954210, + "step": 6333, + "time_per_iteration": 2.6945443153381348 + }, + { + "auxiliary_loss_clip": 0.01153323, + "auxiliary_loss_mlp": 0.01124797, + "balance_loss_clip": 1.00205982, + "balance_loss_mlp": 1.0005331, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.4434388165130692, + "language_loss": 0.86207753, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88485873, + "num_input_tokens_seen": 135974425, + "step": 6334, + "time_per_iteration": 2.6180872917175293 + }, + { + "auxiliary_loss_clip": 0.0115446, + "auxiliary_loss_mlp": 0.01125073, + "balance_loss_clip": 1.00215077, + "balance_loss_mlp": 1.00080919, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 1.8949255331675492, + "language_loss": 0.78863347, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81142879, + "num_input_tokens_seen": 135991985, + "step": 6335, + "time_per_iteration": 2.5368900299072266 + }, + { + "auxiliary_loss_clip": 0.01153438, + "auxiliary_loss_mlp": 0.01125468, + "balance_loss_clip": 1.00210392, + "balance_loss_mlp": 1.00063205, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 2.0359510549910875, + "language_loss": 0.72761196, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75040102, + "num_input_tokens_seen": 136010015, + "step": 6336, + "time_per_iteration": 2.5444679260253906 + }, + { + "auxiliary_loss_clip": 0.01153117, + "auxiliary_loss_mlp": 0.01124531, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.00064862, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 2.0472358593432225, + "language_loss": 0.69078487, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71356136, + "num_input_tokens_seen": 136028440, + "step": 6337, + "time_per_iteration": 2.5492584705352783 + }, + { + "auxiliary_loss_clip": 0.0113968, + "auxiliary_loss_mlp": 0.01124886, + "balance_loss_clip": 1.00214267, + "balance_loss_mlp": 1.00062227, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.701664963451428, + "language_loss": 0.63733375, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65997946, + "num_input_tokens_seen": 136048360, + "step": 6338, + "time_per_iteration": 2.6973633766174316 + }, + { + "auxiliary_loss_clip": 0.01140064, + "auxiliary_loss_mlp": 0.01125335, + "balance_loss_clip": 1.0022043, + "balance_loss_mlp": 1.00078499, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.6811509743203155, + "language_loss": 0.69485235, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71750629, + "num_input_tokens_seen": 136065500, + "step": 6339, + "time_per_iteration": 2.572493553161621 + }, + { + "auxiliary_loss_clip": 0.01138225, + "auxiliary_loss_mlp": 0.01125657, + "balance_loss_clip": 1.00223136, + "balance_loss_mlp": 1.00101197, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.2535568427516632, + "language_loss": 0.67744726, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70008612, + "num_input_tokens_seen": 136084060, + "step": 6340, + "time_per_iteration": 2.592571973800659 + }, + { + "auxiliary_loss_clip": 0.0111982, + "auxiliary_loss_mlp": 0.01125312, + "balance_loss_clip": 1.00202942, + "balance_loss_mlp": 1.00066686, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.079652521670823, + "language_loss": 0.89568096, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91813231, + "num_input_tokens_seen": 136102310, + "step": 6341, + "time_per_iteration": 2.655740261077881 + }, + { + "auxiliary_loss_clip": 0.01153388, + "auxiliary_loss_mlp": 0.01125499, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00085425, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.6618890129401993, + "language_loss": 0.75077796, + "learning_rate": 2.83916263673333e-06, + "loss": 0.77356684, + "num_input_tokens_seen": 136120725, + "step": 6342, + "time_per_iteration": 2.6360886096954346 + }, + { + "auxiliary_loss_clip": 0.01136846, + "auxiliary_loss_mlp": 0.0112527, + "balance_loss_clip": 1.0020504, + "balance_loss_mlp": 1.00062466, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 2.595667128472582, + "language_loss": 0.83536136, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85798246, + "num_input_tokens_seen": 136139105, + "step": 6343, + "time_per_iteration": 2.6159555912017822 + }, + { + "auxiliary_loss_clip": 0.01096128, + "auxiliary_loss_mlp": 0.01125179, + "balance_loss_clip": 1.00212121, + "balance_loss_mlp": 1.00081992, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.571477180204232, + "language_loss": 0.76626849, + "learning_rate": 2.838455530544959e-06, + "loss": 0.78848159, + "num_input_tokens_seen": 136158265, + "step": 6344, + "time_per_iteration": 2.716951847076416 + }, + { + "auxiliary_loss_clip": 0.01122406, + "auxiliary_loss_mlp": 0.01126016, + "balance_loss_clip": 1.00198877, + "balance_loss_mlp": 1.00079894, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.1068086927836336, + "language_loss": 0.72623819, + "learning_rate": 2.838101929752593e-06, + "loss": 0.74872237, + "num_input_tokens_seen": 136176100, + "step": 6345, + "time_per_iteration": 2.6675949096679688 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.00747621, + "balance_loss_clip": 1.00196409, + "balance_loss_mlp": 1.00015509, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.719925613837822, + "language_loss": 0.70075798, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71944714, + "num_input_tokens_seen": 136195125, + "step": 6346, + "time_per_iteration": 2.651874303817749 + }, + { + "auxiliary_loss_clip": 0.01155032, + "auxiliary_loss_mlp": 0.01124857, + "balance_loss_clip": 1.00221658, + "balance_loss_mlp": 1.00059366, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.9359764323462074, + "language_loss": 0.75062871, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.77342767, + "num_input_tokens_seen": 136213885, + "step": 6347, + "time_per_iteration": 2.567007064819336 + }, + { + "auxiliary_loss_clip": 0.01153211, + "auxiliary_loss_mlp": 0.01125164, + "balance_loss_clip": 1.00208259, + "balance_loss_mlp": 1.00080502, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.4877459213688995, + "language_loss": 0.74404275, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76682651, + "num_input_tokens_seen": 136232700, + "step": 6348, + "time_per_iteration": 2.571948289871216 + }, + { + "auxiliary_loss_clip": 0.01136759, + "auxiliary_loss_mlp": 0.0112472, + "balance_loss_clip": 1.00189734, + "balance_loss_mlp": 1.00064695, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.0812550699056285, + "language_loss": 0.87470317, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89731789, + "num_input_tokens_seen": 136248975, + "step": 6349, + "time_per_iteration": 2.587019205093384 + }, + { + "auxiliary_loss_clip": 0.01153146, + "auxiliary_loss_mlp": 0.01125501, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00085616, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 2.185347303402776, + "language_loss": 0.76487052, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78765702, + "num_input_tokens_seen": 136266710, + "step": 6350, + "time_per_iteration": 2.5376157760620117 + }, + { + "auxiliary_loss_clip": 0.01122585, + "auxiliary_loss_mlp": 0.01124798, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00062966, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.200760962001197, + "language_loss": 0.75898981, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78146362, + "num_input_tokens_seen": 136284445, + "step": 6351, + "time_per_iteration": 2.6131787300109863 + }, + { + "auxiliary_loss_clip": 0.01154602, + "auxiliary_loss_mlp": 0.01125416, + "balance_loss_clip": 1.00206423, + "balance_loss_mlp": 1.00086617, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.9518384207065735, + "language_loss": 0.73959893, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76239908, + "num_input_tokens_seen": 136305730, + "step": 6352, + "time_per_iteration": 2.658189535140991 + }, + { + "auxiliary_loss_clip": 0.01122384, + "auxiliary_loss_mlp": 0.01124519, + "balance_loss_clip": 1.00211823, + "balance_loss_mlp": 1.00063694, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.8794041225819325, + "language_loss": 0.63925105, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66172004, + "num_input_tokens_seen": 136323850, + "step": 6353, + "time_per_iteration": 2.6090872287750244 + }, + { + "auxiliary_loss_clip": 0.01169941, + "auxiliary_loss_mlp": 0.01124738, + "balance_loss_clip": 1.00220966, + "balance_loss_mlp": 1.00076079, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.5262365141725895, + "language_loss": 0.83344674, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85639352, + "num_input_tokens_seen": 136344880, + "step": 6354, + "time_per_iteration": 2.548159599304199 + }, + { + "auxiliary_loss_clip": 0.01169904, + "auxiliary_loss_mlp": 0.01124646, + "balance_loss_clip": 1.00223839, + "balance_loss_mlp": 1.00066817, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.6886639597038746, + "language_loss": 0.80583417, + "learning_rate": 2.834564176091943e-06, + "loss": 0.8287797, + "num_input_tokens_seen": 136366060, + "step": 6355, + "time_per_iteration": 2.558764696121216 + }, + { + "auxiliary_loss_clip": 0.01122679, + "auxiliary_loss_mlp": 0.01125126, + "balance_loss_clip": 1.00201845, + "balance_loss_mlp": 1.00067115, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.9668935814372939, + "language_loss": 0.75567752, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77815557, + "num_input_tokens_seen": 136385625, + "step": 6356, + "time_per_iteration": 2.6707162857055664 + }, + { + "auxiliary_loss_clip": 0.0115431, + "auxiliary_loss_mlp": 0.00747752, + "balance_loss_clip": 1.00220203, + "balance_loss_mlp": 1.00017953, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.8987543688944668, + "language_loss": 0.81562823, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83464879, + "num_input_tokens_seen": 136405750, + "step": 6357, + "time_per_iteration": 4.0218892097473145 + }, + { + "auxiliary_loss_clip": 0.01136175, + "auxiliary_loss_mlp": 0.01125976, + "balance_loss_clip": 1.0021559, + "balance_loss_mlp": 1.00094938, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 3.0194479121615583, + "language_loss": 0.77957141, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.80219293, + "num_input_tokens_seen": 136426085, + "step": 6358, + "time_per_iteration": 2.5960447788238525 + }, + { + "auxiliary_loss_clip": 0.01137757, + "auxiliary_loss_mlp": 0.01125759, + "balance_loss_clip": 1.00197136, + "balance_loss_mlp": 1.00082731, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 1.9335879364913815, + "language_loss": 0.78983974, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.81247491, + "num_input_tokens_seen": 136442670, + "step": 6359, + "time_per_iteration": 4.03188681602478 + }, + { + "auxiliary_loss_clip": 0.01091131, + "auxiliary_loss_mlp": 0.01125654, + "balance_loss_clip": 1.00203621, + "balance_loss_mlp": 1.00072241, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 1.562194898247459, + "language_loss": 0.69297278, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.71514064, + "num_input_tokens_seen": 136465730, + "step": 6360, + "time_per_iteration": 4.435871362686157 + }, + { + "auxiliary_loss_clip": 0.01138274, + "auxiliary_loss_mlp": 0.01125624, + "balance_loss_clip": 1.00211513, + "balance_loss_mlp": 1.00069308, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.4670053121345308, + "language_loss": 0.78873295, + "learning_rate": 2.83244000399261e-06, + "loss": 0.81137192, + "num_input_tokens_seen": 136487215, + "step": 6361, + "time_per_iteration": 2.6266026496887207 + }, + { + "auxiliary_loss_clip": 0.01138567, + "auxiliary_loss_mlp": 0.01124352, + "balance_loss_clip": 1.00207829, + "balance_loss_mlp": 1.00075579, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.6294881933813667, + "language_loss": 0.65678537, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67941457, + "num_input_tokens_seen": 136510365, + "step": 6362, + "time_per_iteration": 2.8294517993927 + }, + { + "auxiliary_loss_clip": 0.01169843, + "auxiliary_loss_mlp": 0.01125367, + "balance_loss_clip": 1.00212121, + "balance_loss_mlp": 1.00062633, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.9799450098207012, + "language_loss": 0.81716007, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84011221, + "num_input_tokens_seen": 136527100, + "step": 6363, + "time_per_iteration": 2.5052568912506104 + }, + { + "auxiliary_loss_clip": 0.01104289, + "auxiliary_loss_mlp": 0.01125509, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00086355, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.5882199959188048, + "language_loss": 0.58435833, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60665631, + "num_input_tokens_seen": 136550870, + "step": 6364, + "time_per_iteration": 4.326344966888428 + }, + { + "auxiliary_loss_clip": 0.01139372, + "auxiliary_loss_mlp": 0.01125188, + "balance_loss_clip": 1.00212073, + "balance_loss_mlp": 1.00063801, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 2.095142246541576, + "language_loss": 0.68876237, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71140796, + "num_input_tokens_seen": 136569895, + "step": 6365, + "time_per_iteration": 2.6432230472564697 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.01125213, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00066352, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 9.067357223121112, + "language_loss": 0.7299605, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75274938, + "num_input_tokens_seen": 136588585, + "step": 6366, + "time_per_iteration": 2.631648302078247 + }, + { + "auxiliary_loss_clip": 0.01137837, + "auxiliary_loss_mlp": 0.01125806, + "balance_loss_clip": 1.00221443, + "balance_loss_mlp": 1.00077939, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.1692437608439787, + "language_loss": 0.68401498, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70665139, + "num_input_tokens_seen": 136606640, + "step": 6367, + "time_per_iteration": 2.650189161300659 + }, + { + "auxiliary_loss_clip": 0.01154865, + "auxiliary_loss_mlp": 0.01124627, + "balance_loss_clip": 1.00226474, + "balance_loss_mlp": 1.00064981, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 1.7223465786287573, + "language_loss": 0.64421201, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66700697, + "num_input_tokens_seen": 136624940, + "step": 6368, + "time_per_iteration": 2.6221208572387695 + }, + { + "auxiliary_loss_clip": 0.01169845, + "auxiliary_loss_mlp": 0.01125302, + "balance_loss_clip": 1.00225878, + "balance_loss_mlp": 1.00084698, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.4138618285717497, + "language_loss": 0.68189585, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70484722, + "num_input_tokens_seen": 136645540, + "step": 6369, + "time_per_iteration": 2.5699656009674072 + }, + { + "auxiliary_loss_clip": 0.0110602, + "auxiliary_loss_mlp": 0.01125032, + "balance_loss_clip": 1.00191987, + "balance_loss_mlp": 1.00067306, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.7005517077084247, + "language_loss": 0.78444469, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80675519, + "num_input_tokens_seen": 136664530, + "step": 6370, + "time_per_iteration": 2.689908266067505 + }, + { + "auxiliary_loss_clip": 0.01154804, + "auxiliary_loss_mlp": 0.01125169, + "balance_loss_clip": 1.00223804, + "balance_loss_mlp": 1.00071502, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.3399189483645086, + "language_loss": 0.64238816, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.6651879, + "num_input_tokens_seen": 136682315, + "step": 6371, + "time_per_iteration": 2.6142807006835938 + }, + { + "auxiliary_loss_clip": 0.01122052, + "auxiliary_loss_mlp": 0.01125585, + "balance_loss_clip": 1.00201571, + "balance_loss_mlp": 1.0006541, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.8281371324241908, + "language_loss": 0.73222089, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.7546972, + "num_input_tokens_seen": 136701185, + "step": 6372, + "time_per_iteration": 2.701303243637085 + }, + { + "auxiliary_loss_clip": 0.01153354, + "auxiliary_loss_mlp": 0.01125211, + "balance_loss_clip": 1.00208104, + "balance_loss_mlp": 1.00056577, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.8983967200422178, + "language_loss": 0.8495748, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.87236047, + "num_input_tokens_seen": 136721265, + "step": 6373, + "time_per_iteration": 2.5698165893554688 + }, + { + "auxiliary_loss_clip": 0.01106209, + "auxiliary_loss_mlp": 0.011259, + "balance_loss_clip": 1.00200689, + "balance_loss_mlp": 1.0008738, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 2.0068411533912567, + "language_loss": 0.74801207, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.77033317, + "num_input_tokens_seen": 136741885, + "step": 6374, + "time_per_iteration": 2.810574769973755 + }, + { + "auxiliary_loss_clip": 0.01159686, + "auxiliary_loss_mlp": 0.01125641, + "balance_loss_clip": 1.00257695, + "balance_loss_mlp": 1.00080466, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.3925432823747106, + "language_loss": 0.76136816, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.78422141, + "num_input_tokens_seen": 136760905, + "step": 6375, + "time_per_iteration": 2.55350399017334 + }, + { + "auxiliary_loss_clip": 0.01152759, + "auxiliary_loss_mlp": 0.01124713, + "balance_loss_clip": 1.002123, + "balance_loss_mlp": 1.00064039, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 2.9225081764323564, + "language_loss": 0.72546023, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.74823499, + "num_input_tokens_seen": 136777240, + "step": 6376, + "time_per_iteration": 2.532818555831909 + }, + { + "auxiliary_loss_clip": 0.01153188, + "auxiliary_loss_mlp": 0.01125293, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.00064778, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.6199619943617176, + "language_loss": 0.67702234, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69980717, + "num_input_tokens_seen": 136801040, + "step": 6377, + "time_per_iteration": 2.6268527507781982 + }, + { + "auxiliary_loss_clip": 0.01121937, + "auxiliary_loss_mlp": 0.01125581, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.00074542, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 9.25068454479723, + "language_loss": 0.72798842, + "learning_rate": 2.826415354814344e-06, + "loss": 0.7504636, + "num_input_tokens_seen": 136819495, + "step": 6378, + "time_per_iteration": 2.7059731483459473 + }, + { + "auxiliary_loss_clip": 0.01109864, + "auxiliary_loss_mlp": 0.0112515, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.00079072, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.6365702268848268, + "language_loss": 0.6914202, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71377033, + "num_input_tokens_seen": 136838840, + "step": 6379, + "time_per_iteration": 2.727320432662964 + }, + { + "auxiliary_loss_clip": 0.01153237, + "auxiliary_loss_mlp": 0.0112479, + "balance_loss_clip": 1.00222373, + "balance_loss_mlp": 1.00081205, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.9964456255721676, + "language_loss": 0.83389515, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85667545, + "num_input_tokens_seen": 136854425, + "step": 6380, + "time_per_iteration": 2.5723893642425537 + }, + { + "auxiliary_loss_clip": 0.01169894, + "auxiliary_loss_mlp": 0.01124812, + "balance_loss_clip": 1.00231433, + "balance_loss_mlp": 1.00064397, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.4166669254641364, + "language_loss": 0.81239825, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83534533, + "num_input_tokens_seen": 136874355, + "step": 6381, + "time_per_iteration": 2.5944578647613525 + }, + { + "auxiliary_loss_clip": 0.01166353, + "auxiliary_loss_mlp": 0.01106192, + "balance_loss_clip": 1.0025115, + "balance_loss_mlp": 1.00014293, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.7926735625300533, + "language_loss": 0.60473907, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62746453, + "num_input_tokens_seen": 136937475, + "step": 6382, + "time_per_iteration": 3.119464874267578 + }, + { + "auxiliary_loss_clip": 0.01169801, + "auxiliary_loss_mlp": 0.01125617, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00068545, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.127111202588785, + "language_loss": 0.66993368, + "learning_rate": 2.824641672639794e-06, + "loss": 0.69288784, + "num_input_tokens_seen": 136955805, + "step": 6383, + "time_per_iteration": 2.577333927154541 + }, + { + "auxiliary_loss_clip": 0.01122903, + "auxiliary_loss_mlp": 0.01124955, + "balance_loss_clip": 1.00211799, + "balance_loss_mlp": 1.00069165, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 4.385757950008972, + "language_loss": 0.74815136, + "learning_rate": 2.824286842339587e-06, + "loss": 0.77062994, + "num_input_tokens_seen": 136975240, + "step": 6384, + "time_per_iteration": 2.6751410961151123 + }, + { + "auxiliary_loss_clip": 0.01152917, + "auxiliary_loss_mlp": 0.01125615, + "balance_loss_clip": 1.00206864, + "balance_loss_mlp": 1.00087488, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.3520327445911238, + "language_loss": 0.76351446, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78629982, + "num_input_tokens_seen": 136994985, + "step": 6385, + "time_per_iteration": 2.5755977630615234 + }, + { + "auxiliary_loss_clip": 0.01149874, + "auxiliary_loss_mlp": 0.01106218, + "balance_loss_clip": 1.00242496, + "balance_loss_mlp": 1.00016963, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9119707092250182, + "language_loss": 0.67022693, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69278789, + "num_input_tokens_seen": 137046290, + "step": 6386, + "time_per_iteration": 3.0221667289733887 + }, + { + "auxiliary_loss_clip": 0.01122849, + "auxiliary_loss_mlp": 0.01124719, + "balance_loss_clip": 1.00218105, + "balance_loss_mlp": 1.00074089, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.5988779000401176, + "language_loss": 0.72637743, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74885309, + "num_input_tokens_seen": 137064725, + "step": 6387, + "time_per_iteration": 2.67261004447937 + }, + { + "auxiliary_loss_clip": 0.01169839, + "auxiliary_loss_mlp": 0.01125111, + "balance_loss_clip": 1.00235593, + "balance_loss_mlp": 1.00094223, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.5554065690338603, + "language_loss": 0.81127024, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83421969, + "num_input_tokens_seen": 137086030, + "step": 6388, + "time_per_iteration": 2.673523187637329 + }, + { + "auxiliary_loss_clip": 0.01138237, + "auxiliary_loss_mlp": 0.0112417, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.0006696, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.7456550324227627, + "language_loss": 0.76332366, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.7859478, + "num_input_tokens_seen": 137105400, + "step": 6389, + "time_per_iteration": 2.6709821224212646 + }, + { + "auxiliary_loss_clip": 0.01138175, + "auxiliary_loss_mlp": 0.01126282, + "balance_loss_clip": 1.00208533, + "balance_loss_mlp": 1.00077891, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.631659621999689, + "language_loss": 0.76240778, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.7850523, + "num_input_tokens_seen": 137124985, + "step": 6390, + "time_per_iteration": 2.6181395053863525 + }, + { + "auxiliary_loss_clip": 0.01106289, + "auxiliary_loss_mlp": 0.01126189, + "balance_loss_clip": 1.00196242, + "balance_loss_mlp": 1.00097156, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.6939702560992331, + "language_loss": 0.70014036, + "learning_rate": 2.821802155794668e-06, + "loss": 0.7224651, + "num_input_tokens_seen": 137146745, + "step": 6391, + "time_per_iteration": 2.814173698425293 + }, + { + "auxiliary_loss_clip": 0.01153546, + "auxiliary_loss_mlp": 0.0112526, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00071001, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.7145213553389766, + "language_loss": 0.83990836, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86269641, + "num_input_tokens_seen": 137163195, + "step": 6392, + "time_per_iteration": 2.654759407043457 + }, + { + "auxiliary_loss_clip": 0.01154645, + "auxiliary_loss_mlp": 0.01125315, + "balance_loss_clip": 1.00215173, + "balance_loss_mlp": 1.00067008, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 1.8511433253334895, + "language_loss": 0.612194, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63499361, + "num_input_tokens_seen": 137179330, + "step": 6393, + "time_per_iteration": 2.5506882667541504 + }, + { + "auxiliary_loss_clip": 0.01121571, + "auxiliary_loss_mlp": 0.01125795, + "balance_loss_clip": 1.00195706, + "balance_loss_mlp": 1.00057793, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 1.854000752290318, + "language_loss": 0.71075487, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73322856, + "num_input_tokens_seen": 137198655, + "step": 6394, + "time_per_iteration": 2.7120108604431152 + }, + { + "auxiliary_loss_clip": 0.01154895, + "auxiliary_loss_mlp": 0.01125876, + "balance_loss_clip": 1.00229311, + "balance_loss_mlp": 1.00065827, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 1.923944667711953, + "language_loss": 0.81266904, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83547676, + "num_input_tokens_seen": 137217120, + "step": 6395, + "time_per_iteration": 3.9660744667053223 + }, + { + "auxiliary_loss_clip": 0.01159602, + "auxiliary_loss_mlp": 0.01125303, + "balance_loss_clip": 1.00270677, + "balance_loss_mlp": 1.00084865, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 5.2450876163511975, + "language_loss": 0.70706654, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.72991562, + "num_input_tokens_seen": 137234410, + "step": 6396, + "time_per_iteration": 2.5289971828460693 + }, + { + "auxiliary_loss_clip": 0.01149058, + "auxiliary_loss_mlp": 0.01106219, + "balance_loss_clip": 1.00216544, + "balance_loss_mlp": 1.00017071, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8906283918725625, + "language_loss": 0.59712511, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.6196779, + "num_input_tokens_seen": 137294940, + "step": 6397, + "time_per_iteration": 4.554362773895264 + }, + { + "auxiliary_loss_clip": 0.01169911, + "auxiliary_loss_mlp": 0.01125605, + "balance_loss_clip": 1.00229657, + "balance_loss_mlp": 1.00057817, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 2.058017109944821, + "language_loss": 0.85019803, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87315321, + "num_input_tokens_seen": 137315035, + "step": 6398, + "time_per_iteration": 4.000813245773315 + }, + { + "auxiliary_loss_clip": 0.01169846, + "auxiliary_loss_mlp": 0.01125306, + "balance_loss_clip": 1.00220621, + "balance_loss_mlp": 1.00085187, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 3.913111463254507, + "language_loss": 0.79555875, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.81851029, + "num_input_tokens_seen": 137333155, + "step": 6399, + "time_per_iteration": 2.526111364364624 + }, + { + "auxiliary_loss_clip": 0.01169888, + "auxiliary_loss_mlp": 0.00747727, + "balance_loss_clip": 1.00216556, + "balance_loss_mlp": 1.00012612, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 2.8883198014434885, + "language_loss": 0.67012739, + "learning_rate": 2.818605315732038e-06, + "loss": 0.68930358, + "num_input_tokens_seen": 137351515, + "step": 6400, + "time_per_iteration": 2.518768548965454 + }, + { + "auxiliary_loss_clip": 0.01137841, + "auxiliary_loss_mlp": 0.01126069, + "balance_loss_clip": 1.0022366, + "balance_loss_mlp": 1.00104237, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.8488871467329169, + "language_loss": 0.73260522, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75524437, + "num_input_tokens_seen": 137371255, + "step": 6401, + "time_per_iteration": 2.6134448051452637 + }, + { + "auxiliary_loss_clip": 0.01119959, + "auxiliary_loss_mlp": 0.01124786, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00080836, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 1.911974981495275, + "language_loss": 0.72165704, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.7441045, + "num_input_tokens_seen": 137388980, + "step": 6402, + "time_per_iteration": 4.122695446014404 + }, + { + "auxiliary_loss_clip": 0.01169643, + "auxiliary_loss_mlp": 0.01124558, + "balance_loss_clip": 1.0021522, + "balance_loss_mlp": 1.00058007, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.8108890165143656, + "language_loss": 0.82773775, + "learning_rate": 2.817539143144128e-06, + "loss": 0.85067976, + "num_input_tokens_seen": 137406885, + "step": 6403, + "time_per_iteration": 2.510930061340332 + }, + { + "auxiliary_loss_clip": 0.01106084, + "auxiliary_loss_mlp": 0.01125183, + "balance_loss_clip": 1.00218666, + "balance_loss_mlp": 1.00072861, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 2.0099443853257855, + "language_loss": 0.8301236, + "learning_rate": 2.817183690261189e-06, + "loss": 0.8524363, + "num_input_tokens_seen": 137425535, + "step": 6404, + "time_per_iteration": 2.6935312747955322 + }, + { + "auxiliary_loss_clip": 0.0114288, + "auxiliary_loss_mlp": 0.01124883, + "balance_loss_clip": 1.00232434, + "balance_loss_mlp": 1.00081015, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.4795500059458189, + "language_loss": 0.69818461, + "learning_rate": 2.816828206390563e-06, + "loss": 0.72086227, + "num_input_tokens_seen": 137447700, + "step": 6405, + "time_per_iteration": 2.6914730072021484 + }, + { + "auxiliary_loss_clip": 0.01137795, + "auxiliary_loss_mlp": 0.01125087, + "balance_loss_clip": 1.00207591, + "balance_loss_mlp": 1.00072777, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 2.002140254801679, + "language_loss": 0.79219818, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81482708, + "num_input_tokens_seen": 137462245, + "step": 6406, + "time_per_iteration": 2.5951733589172363 + }, + { + "auxiliary_loss_clip": 0.0115336, + "auxiliary_loss_mlp": 0.01125229, + "balance_loss_clip": 1.0022738, + "balance_loss_mlp": 1.00067878, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.0579345540221587, + "language_loss": 0.84243673, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86522257, + "num_input_tokens_seen": 137476455, + "step": 6407, + "time_per_iteration": 2.5633368492126465 + }, + { + "auxiliary_loss_clip": 0.0114917, + "auxiliary_loss_mlp": 0.01105249, + "balance_loss_clip": 1.00207388, + "balance_loss_mlp": 0.99996299, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.872885940126306, + "language_loss": 0.65005934, + "learning_rate": 2.815761568987365e-06, + "loss": 0.67260355, + "num_input_tokens_seen": 137539845, + "step": 6408, + "time_per_iteration": 3.207949161529541 + }, + { + "auxiliary_loss_clip": 0.01136586, + "auxiliary_loss_mlp": 0.01124978, + "balance_loss_clip": 1.00187051, + "balance_loss_mlp": 1.00080991, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.6206173982203442, + "language_loss": 0.7351476, + "learning_rate": 2.8154059613008e-06, + "loss": 0.75776321, + "num_input_tokens_seen": 137559880, + "step": 6409, + "time_per_iteration": 2.6217432022094727 + }, + { + "auxiliary_loss_clip": 0.01105854, + "auxiliary_loss_mlp": 0.01125877, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00085032, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.4082137210076464, + "language_loss": 0.70764863, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72996593, + "num_input_tokens_seen": 137578225, + "step": 6410, + "time_per_iteration": 2.691221237182617 + }, + { + "auxiliary_loss_clip": 0.01101286, + "auxiliary_loss_mlp": 0.0074674, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00026751, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6735118362458469, + "language_loss": 0.60289121, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62137145, + "num_input_tokens_seen": 137645770, + "step": 6411, + "time_per_iteration": 3.3294742107391357 + }, + { + "auxiliary_loss_clip": 0.011075, + "auxiliary_loss_mlp": 0.01124408, + "balance_loss_clip": 1.00200677, + "balance_loss_mlp": 1.00052583, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 4.5967182204606125, + "language_loss": 0.77677846, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79909754, + "num_input_tokens_seen": 137664090, + "step": 6412, + "time_per_iteration": 2.6970572471618652 + }, + { + "auxiliary_loss_clip": 0.01122583, + "auxiliary_loss_mlp": 0.01126364, + "balance_loss_clip": 1.00205982, + "balance_loss_mlp": 1.00076509, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.838382889994768, + "language_loss": 0.7760399, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.79852933, + "num_input_tokens_seen": 137683190, + "step": 6413, + "time_per_iteration": 2.6666433811187744 + }, + { + "auxiliary_loss_clip": 0.01166078, + "auxiliary_loss_mlp": 0.0110533, + "balance_loss_clip": 1.00237846, + "balance_loss_mlp": 1.00004411, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8011948766363912, + "language_loss": 0.61307049, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63578457, + "num_input_tokens_seen": 137737315, + "step": 6414, + "time_per_iteration": 2.95289945602417 + }, + { + "auxiliary_loss_clip": 0.0112143, + "auxiliary_loss_mlp": 0.01125261, + "balance_loss_clip": 1.00199485, + "balance_loss_mlp": 1.00071084, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.743946861104331, + "language_loss": 0.77308381, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.7955507, + "num_input_tokens_seen": 137753535, + "step": 6415, + "time_per_iteration": 2.6538245677948 + }, + { + "auxiliary_loss_clip": 0.01136468, + "auxiliary_loss_mlp": 0.01124391, + "balance_loss_clip": 1.0022099, + "balance_loss_mlp": 1.00060439, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.9740814877164097, + "language_loss": 0.79904962, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.82165813, + "num_input_tokens_seen": 137773405, + "step": 6416, + "time_per_iteration": 2.689046621322632 + }, + { + "auxiliary_loss_clip": 0.01159188, + "auxiliary_loss_mlp": 0.00747668, + "balance_loss_clip": 1.00213528, + "balance_loss_mlp": 1.00012922, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.8985650746018339, + "language_loss": 0.79442304, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81349158, + "num_input_tokens_seen": 137790810, + "step": 6417, + "time_per_iteration": 2.5876271724700928 + }, + { + "auxiliary_loss_clip": 0.01136584, + "auxiliary_loss_mlp": 0.01124953, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00078416, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 1.9869673615781083, + "language_loss": 0.80027676, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82289213, + "num_input_tokens_seen": 137810265, + "step": 6418, + "time_per_iteration": 2.6133666038513184 + }, + { + "auxiliary_loss_clip": 0.01137709, + "auxiliary_loss_mlp": 0.0112464, + "balance_loss_clip": 1.00195181, + "balance_loss_mlp": 1.00066257, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.8008165588827454, + "language_loss": 0.79293561, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81555915, + "num_input_tokens_seen": 137828580, + "step": 6419, + "time_per_iteration": 2.598604679107666 + }, + { + "auxiliary_loss_clip": 0.0113733, + "auxiliary_loss_mlp": 0.01125238, + "balance_loss_clip": 1.00210583, + "balance_loss_mlp": 1.00068784, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 1.8230645233620526, + "language_loss": 0.672212, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69483763, + "num_input_tokens_seen": 137846145, + "step": 6420, + "time_per_iteration": 2.647944450378418 + }, + { + "auxiliary_loss_clip": 0.01120893, + "auxiliary_loss_mlp": 0.01124545, + "balance_loss_clip": 1.00196445, + "balance_loss_mlp": 1.00075793, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 1.870532054282997, + "language_loss": 0.80778152, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83023584, + "num_input_tokens_seen": 137863705, + "step": 6421, + "time_per_iteration": 2.6228973865509033 + }, + { + "auxiliary_loss_clip": 0.01136138, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_clip": 1.00191331, + "balance_loss_mlp": 1.0005542, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.2241978700548453, + "language_loss": 0.71786487, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74047065, + "num_input_tokens_seen": 137880285, + "step": 6422, + "time_per_iteration": 2.6063694953918457 + }, + { + "auxiliary_loss_clip": 0.01135712, + "auxiliary_loss_mlp": 0.01124081, + "balance_loss_clip": 1.00202143, + "balance_loss_mlp": 1.00086594, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.7347440523863293, + "language_loss": 0.66397518, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.68657309, + "num_input_tokens_seen": 137898335, + "step": 6423, + "time_per_iteration": 2.5931758880615234 + }, + { + "auxiliary_loss_clip": 0.01154089, + "auxiliary_loss_mlp": 0.0112506, + "balance_loss_clip": 1.0021472, + "balance_loss_mlp": 1.00060606, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 2.0475544295838537, + "language_loss": 0.68820089, + "learning_rate": 2.810068143123449e-06, + "loss": 0.71099234, + "num_input_tokens_seen": 137918605, + "step": 6424, + "time_per_iteration": 2.7081949710845947 + }, + { + "auxiliary_loss_clip": 0.01120751, + "auxiliary_loss_mlp": 0.01124006, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00069571, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.5150322850505946, + "language_loss": 0.7251507, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74759823, + "num_input_tokens_seen": 137938245, + "step": 6425, + "time_per_iteration": 2.649470090866089 + }, + { + "auxiliary_loss_clip": 0.01122688, + "auxiliary_loss_mlp": 0.00747756, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00014734, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.187810594703151, + "language_loss": 0.81070465, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82940912, + "num_input_tokens_seen": 137956770, + "step": 6426, + "time_per_iteration": 2.719029188156128 + }, + { + "auxiliary_loss_clip": 0.01154362, + "auxiliary_loss_mlp": 0.01124655, + "balance_loss_clip": 1.00220871, + "balance_loss_mlp": 1.00067699, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 1.9293317022069834, + "language_loss": 0.7460708, + "learning_rate": 2.80899974864781e-06, + "loss": 0.76886094, + "num_input_tokens_seen": 137977040, + "step": 6427, + "time_per_iteration": 2.5967307090759277 + }, + { + "auxiliary_loss_clip": 0.01107641, + "auxiliary_loss_mlp": 0.01124161, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00075507, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 2.315992046153486, + "language_loss": 0.70030338, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.72262144, + "num_input_tokens_seen": 137993545, + "step": 6428, + "time_per_iteration": 2.6768851280212402 + }, + { + "auxiliary_loss_clip": 0.01137212, + "auxiliary_loss_mlp": 0.01124673, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00088608, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 3.0783499593172134, + "language_loss": 0.8432281, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86584699, + "num_input_tokens_seen": 138010140, + "step": 6429, + "time_per_iteration": 2.5829124450683594 + }, + { + "auxiliary_loss_clip": 0.01139749, + "auxiliary_loss_mlp": 0.01125139, + "balance_loss_clip": 1.00217032, + "balance_loss_mlp": 1.00078046, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.3334519232012685, + "language_loss": 0.8136214, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83627027, + "num_input_tokens_seen": 138028880, + "step": 6430, + "time_per_iteration": 2.6280856132507324 + }, + { + "auxiliary_loss_clip": 0.01117652, + "auxiliary_loss_mlp": 0.0110522, + "balance_loss_clip": 1.00188839, + "balance_loss_mlp": 0.99993402, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7176283451734042, + "language_loss": 0.58847713, + "learning_rate": 2.807574793260416e-06, + "loss": 0.61070585, + "num_input_tokens_seen": 138098090, + "step": 6431, + "time_per_iteration": 3.2658751010894775 + }, + { + "auxiliary_loss_clip": 0.01092283, + "auxiliary_loss_mlp": 0.01124761, + "balance_loss_clip": 1.00184906, + "balance_loss_mlp": 1.00068808, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.8637674982814905, + "language_loss": 0.79495394, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81712443, + "num_input_tokens_seen": 138114735, + "step": 6432, + "time_per_iteration": 4.103477954864502 + }, + { + "auxiliary_loss_clip": 0.01154768, + "auxiliary_loss_mlp": 0.01125304, + "balance_loss_clip": 1.0019964, + "balance_loss_mlp": 1.0008496, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 1.8617907169700876, + "language_loss": 0.80628705, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82908773, + "num_input_tokens_seen": 138130480, + "step": 6433, + "time_per_iteration": 2.559351921081543 + }, + { + "auxiliary_loss_clip": 0.01136351, + "auxiliary_loss_mlp": 0.01125297, + "balance_loss_clip": 1.00203574, + "balance_loss_mlp": 1.00065207, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.6330594541108951, + "language_loss": 0.70965934, + "learning_rate": 2.806505755127765e-06, + "loss": 0.73227584, + "num_input_tokens_seen": 138150640, + "step": 6434, + "time_per_iteration": 2.7224280834198 + }, + { + "auxiliary_loss_clip": 0.01122, + "auxiliary_loss_mlp": 0.01125965, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.00074768, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 1.7579994012207734, + "language_loss": 0.77131742, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79379708, + "num_input_tokens_seen": 138169700, + "step": 6435, + "time_per_iteration": 5.648598670959473 + }, + { + "auxiliary_loss_clip": 0.01152976, + "auxiliary_loss_mlp": 0.01123961, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.000651, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.8615997449684674, + "language_loss": 0.79740363, + "learning_rate": 2.805792910102915e-06, + "loss": 0.82017303, + "num_input_tokens_seen": 138185835, + "step": 6436, + "time_per_iteration": 2.57014536857605 + }, + { + "auxiliary_loss_clip": 0.01136352, + "auxiliary_loss_mlp": 0.01123076, + "balance_loss_clip": 1.0020014, + "balance_loss_mlp": 1.00071967, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.5745731237662046, + "language_loss": 0.76780379, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79039806, + "num_input_tokens_seen": 138204080, + "step": 6437, + "time_per_iteration": 2.604462146759033 + }, + { + "auxiliary_loss_clip": 0.01136715, + "auxiliary_loss_mlp": 0.01123262, + "balance_loss_clip": 1.00202513, + "balance_loss_mlp": 1.00071502, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.076648408608983, + "language_loss": 0.81490445, + "learning_rate": 2.805079942855074e-06, + "loss": 0.83750415, + "num_input_tokens_seen": 138220710, + "step": 6438, + "time_per_iteration": 2.5730087757110596 + }, + { + "auxiliary_loss_clip": 0.01139085, + "auxiliary_loss_mlp": 0.00747536, + "balance_loss_clip": 1.00198328, + "balance_loss_mlp": 1.00019479, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.3896479004018047, + "language_loss": 0.75367773, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77254391, + "num_input_tokens_seen": 138241720, + "step": 6439, + "time_per_iteration": 4.123622417449951 + }, + { + "auxiliary_loss_clip": 0.01169532, + "auxiliary_loss_mlp": 0.01123611, + "balance_loss_clip": 1.00223124, + "balance_loss_mlp": 1.00058675, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.4357162138125634, + "language_loss": 0.73406386, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.75699538, + "num_input_tokens_seen": 138261885, + "step": 6440, + "time_per_iteration": 2.534536600112915 + }, + { + "auxiliary_loss_clip": 0.01153091, + "auxiliary_loss_mlp": 0.01124672, + "balance_loss_clip": 1.00205946, + "balance_loss_mlp": 1.00079012, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 2.173169826868708, + "language_loss": 0.82127154, + "learning_rate": 2.804010263051774e-06, + "loss": 0.8440491, + "num_input_tokens_seen": 138280255, + "step": 6441, + "time_per_iteration": 2.580911874771118 + }, + { + "auxiliary_loss_clip": 0.01169823, + "auxiliary_loss_mlp": 0.01124324, + "balance_loss_clip": 1.00227046, + "balance_loss_mlp": 1.00082314, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.0522189182956905, + "language_loss": 0.80953127, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83247268, + "num_input_tokens_seen": 138296675, + "step": 6442, + "time_per_iteration": 2.530946731567383 + }, + { + "auxiliary_loss_clip": 0.01121246, + "auxiliary_loss_mlp": 0.01123969, + "balance_loss_clip": 1.00202501, + "balance_loss_mlp": 1.00065851, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.6766552362368172, + "language_loss": 0.83690262, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85935479, + "num_input_tokens_seen": 138314985, + "step": 6443, + "time_per_iteration": 2.6460256576538086 + }, + { + "auxiliary_loss_clip": 0.01132209, + "auxiliary_loss_mlp": 0.01104463, + "balance_loss_clip": 1.00209308, + "balance_loss_mlp": 0.99993992, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7587889354035995, + "language_loss": 0.50274074, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.5251075, + "num_input_tokens_seen": 138373275, + "step": 6444, + "time_per_iteration": 3.192721128463745 + }, + { + "auxiliary_loss_clip": 0.01122645, + "auxiliary_loss_mlp": 0.00747503, + "balance_loss_clip": 1.00202143, + "balance_loss_mlp": 1.00009036, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.8349946363369876, + "language_loss": 0.78920615, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80790764, + "num_input_tokens_seen": 138391145, + "step": 6445, + "time_per_iteration": 2.633728504180908 + }, + { + "auxiliary_loss_clip": 0.01154597, + "auxiliary_loss_mlp": 0.01123947, + "balance_loss_clip": 1.00210643, + "balance_loss_mlp": 1.00073266, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.9443550136380492, + "language_loss": 0.80869198, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83147746, + "num_input_tokens_seen": 138409875, + "step": 6446, + "time_per_iteration": 2.574735164642334 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.01124408, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00081229, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.721721180252303, + "language_loss": 0.76884705, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79145259, + "num_input_tokens_seen": 138428965, + "step": 6447, + "time_per_iteration": 2.6010231971740723 + }, + { + "auxiliary_loss_clip": 0.0113789, + "auxiliary_loss_mlp": 0.01123656, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00072694, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 2.0083782860364128, + "language_loss": 0.7642442, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78685963, + "num_input_tokens_seen": 138448090, + "step": 6448, + "time_per_iteration": 2.633194923400879 + }, + { + "auxiliary_loss_clip": 0.01137854, + "auxiliary_loss_mlp": 0.011248, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.00063181, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.527621419329995, + "language_loss": 0.76081854, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78344512, + "num_input_tokens_seen": 138466105, + "step": 6449, + "time_per_iteration": 2.5782461166381836 + }, + { + "auxiliary_loss_clip": 0.01121312, + "auxiliary_loss_mlp": 0.00747604, + "balance_loss_clip": 1.00186467, + "balance_loss_mlp": 1.00007832, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.8318001230129026, + "language_loss": 0.78404003, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80272925, + "num_input_tokens_seen": 138485160, + "step": 6450, + "time_per_iteration": 2.6857151985168457 + }, + { + "auxiliary_loss_clip": 0.01169814, + "auxiliary_loss_mlp": 0.01125214, + "balance_loss_clip": 1.00206804, + "balance_loss_mlp": 1.00085521, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.429997888822364, + "language_loss": 0.77394277, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79689306, + "num_input_tokens_seen": 138504135, + "step": 6451, + "time_per_iteration": 2.592303991317749 + }, + { + "auxiliary_loss_clip": 0.0116938, + "auxiliary_loss_mlp": 0.01123342, + "balance_loss_clip": 1.00205779, + "balance_loss_mlp": 1.00060451, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.7679166414963443, + "language_loss": 0.76677531, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78970253, + "num_input_tokens_seen": 138523955, + "step": 6452, + "time_per_iteration": 2.554110050201416 + }, + { + "auxiliary_loss_clip": 0.01138153, + "auxiliary_loss_mlp": 0.01124078, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.0009582, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5290217061271705, + "language_loss": 0.79330945, + "learning_rate": 2.799728803557182e-06, + "loss": 0.8159318, + "num_input_tokens_seen": 138541655, + "step": 6453, + "time_per_iteration": 2.587303876876831 + }, + { + "auxiliary_loss_clip": 0.01154742, + "auxiliary_loss_mlp": 0.01124674, + "balance_loss_clip": 1.00214553, + "balance_loss_mlp": 1.0006963, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.852886219131101, + "language_loss": 0.71643806, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73923218, + "num_input_tokens_seen": 138560860, + "step": 6454, + "time_per_iteration": 2.5779333114624023 + }, + { + "auxiliary_loss_clip": 0.01169845, + "auxiliary_loss_mlp": 0.01124992, + "balance_loss_clip": 1.00223196, + "balance_loss_mlp": 1.00072825, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 1.6112991041508602, + "language_loss": 0.77396256, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.79691094, + "num_input_tokens_seen": 138580200, + "step": 6455, + "time_per_iteration": 2.519779920578003 + }, + { + "auxiliary_loss_clip": 0.01169572, + "auxiliary_loss_mlp": 0.01123653, + "balance_loss_clip": 1.00219011, + "balance_loss_mlp": 1.00072408, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.5002315575484324, + "language_loss": 0.75745243, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78038466, + "num_input_tokens_seen": 138598315, + "step": 6456, + "time_per_iteration": 2.5421016216278076 + }, + { + "auxiliary_loss_clip": 0.01088204, + "auxiliary_loss_mlp": 0.01124712, + "balance_loss_clip": 1.0016166, + "balance_loss_mlp": 1.00063944, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.186426955724044, + "language_loss": 0.60682905, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62895823, + "num_input_tokens_seen": 138615695, + "step": 6457, + "time_per_iteration": 2.7094149589538574 + }, + { + "auxiliary_loss_clip": 0.0116964, + "auxiliary_loss_mlp": 0.01124902, + "balance_loss_clip": 1.00213408, + "balance_loss_mlp": 1.00063801, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 1.9951231087225365, + "language_loss": 0.80290663, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82585204, + "num_input_tokens_seen": 138633180, + "step": 6458, + "time_per_iteration": 2.5402116775512695 + }, + { + "auxiliary_loss_clip": 0.01106714, + "auxiliary_loss_mlp": 0.01124404, + "balance_loss_clip": 1.00189853, + "balance_loss_mlp": 1.00071275, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.8015855787782913, + "language_loss": 0.81671405, + "learning_rate": 2.797586434755509e-06, + "loss": 0.8390252, + "num_input_tokens_seen": 138654785, + "step": 6459, + "time_per_iteration": 2.72228741645813 + }, + { + "auxiliary_loss_clip": 0.01137751, + "auxiliary_loss_mlp": 0.0112382, + "balance_loss_clip": 1.00203562, + "balance_loss_mlp": 1.00070071, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 2.667274520276296, + "language_loss": 0.6172322, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63984793, + "num_input_tokens_seen": 138673330, + "step": 6460, + "time_per_iteration": 2.603254795074463 + }, + { + "auxiliary_loss_clip": 0.01153006, + "auxiliary_loss_mlp": 0.01123833, + "balance_loss_clip": 1.00215065, + "balance_loss_mlp": 1.00071359, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 2.8828345453938202, + "language_loss": 0.86449075, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88725913, + "num_input_tokens_seen": 138694185, + "step": 6461, + "time_per_iteration": 2.5938830375671387 + }, + { + "auxiliary_loss_clip": 0.01152785, + "auxiliary_loss_mlp": 0.01124351, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00065947, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 2.1193644210593288, + "language_loss": 0.70875859, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73152995, + "num_input_tokens_seen": 138714625, + "step": 6462, + "time_per_iteration": 2.6156864166259766 + }, + { + "auxiliary_loss_clip": 0.0112408, + "auxiliary_loss_mlp": 0.01124655, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.00077224, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.1821673368523804, + "language_loss": 0.75884044, + "learning_rate": 2.796157583816052e-06, + "loss": 0.78132784, + "num_input_tokens_seen": 138733585, + "step": 6463, + "time_per_iteration": 2.6895525455474854 + }, + { + "auxiliary_loss_clip": 0.0111958, + "auxiliary_loss_mlp": 0.01124472, + "balance_loss_clip": 1.00189495, + "balance_loss_mlp": 1.00058961, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 2.149789085141164, + "language_loss": 0.70237899, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72481954, + "num_input_tokens_seen": 138752335, + "step": 6464, + "time_per_iteration": 2.638171434402466 + }, + { + "auxiliary_loss_clip": 0.01136223, + "auxiliary_loss_mlp": 0.01123701, + "balance_loss_clip": 1.00202239, + "balance_loss_mlp": 1.0005815, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 2.0481776357820465, + "language_loss": 0.69373226, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.71633154, + "num_input_tokens_seen": 138768450, + "step": 6465, + "time_per_iteration": 2.630908966064453 + }, + { + "auxiliary_loss_clip": 0.01121489, + "auxiliary_loss_mlp": 0.01124155, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.00074995, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.103543870978224, + "language_loss": 0.78394639, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.8064028, + "num_input_tokens_seen": 138786775, + "step": 6466, + "time_per_iteration": 2.6953163146972656 + }, + { + "auxiliary_loss_clip": 0.01123141, + "auxiliary_loss_mlp": 0.01123958, + "balance_loss_clip": 1.00197959, + "balance_loss_mlp": 1.00064826, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.7861719090788533, + "language_loss": 0.69567382, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71814477, + "num_input_tokens_seen": 138810100, + "step": 6467, + "time_per_iteration": 2.7458910942077637 + }, + { + "auxiliary_loss_clip": 0.01121563, + "auxiliary_loss_mlp": 0.01124199, + "balance_loss_clip": 1.00188386, + "balance_loss_mlp": 1.00060296, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.9023482639905835, + "language_loss": 0.83293545, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85539305, + "num_input_tokens_seen": 138825140, + "step": 6468, + "time_per_iteration": 2.6163902282714844 + }, + { + "auxiliary_loss_clip": 0.01142491, + "auxiliary_loss_mlp": 0.01123423, + "balance_loss_clip": 1.00244188, + "balance_loss_mlp": 1.00068486, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 1.9185002795643706, + "language_loss": 0.8439132, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86657238, + "num_input_tokens_seen": 138844115, + "step": 6469, + "time_per_iteration": 2.6043355464935303 + }, + { + "auxiliary_loss_clip": 0.01120981, + "auxiliary_loss_mlp": 0.01124099, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00059867, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.7286291102048545, + "language_loss": 0.74743992, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76989073, + "num_input_tokens_seen": 138860860, + "step": 6470, + "time_per_iteration": 4.087491989135742 + }, + { + "auxiliary_loss_clip": 0.01119729, + "auxiliary_loss_mlp": 0.00747637, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00004733, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.5443541170795394, + "language_loss": 0.74404263, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.76271629, + "num_input_tokens_seen": 138881910, + "step": 6471, + "time_per_iteration": 2.6984357833862305 + }, + { + "auxiliary_loss_clip": 0.01107089, + "auxiliary_loss_mlp": 0.01124668, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00088072, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.6374782460622415, + "language_loss": 0.67661482, + "learning_rate": 2.792940904386562e-06, + "loss": 0.69893241, + "num_input_tokens_seen": 138900975, + "step": 6472, + "time_per_iteration": 2.744562864303589 + }, + { + "auxiliary_loss_clip": 0.01122146, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_clip": 1.0019263, + "balance_loss_mlp": 1.00075579, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.6449417920311764, + "language_loss": 0.76648796, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.78894907, + "num_input_tokens_seen": 138920795, + "step": 6473, + "time_per_iteration": 4.152123689651489 + }, + { + "auxiliary_loss_clip": 0.01135727, + "auxiliary_loss_mlp": 0.01124095, + "balance_loss_clip": 1.00201249, + "balance_loss_mlp": 1.00088, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 2.1313962095158545, + "language_loss": 0.70938492, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73198313, + "num_input_tokens_seen": 138938770, + "step": 6474, + "time_per_iteration": 4.010738134384155 + }, + { + "auxiliary_loss_clip": 0.01169512, + "auxiliary_loss_mlp": 0.0112413, + "balance_loss_clip": 1.00210547, + "balance_loss_mlp": 1.00072479, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.493924974905727, + "language_loss": 0.6859889, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.70892531, + "num_input_tokens_seen": 138958880, + "step": 6475, + "time_per_iteration": 2.53520131111145 + }, + { + "auxiliary_loss_clip": 0.0113904, + "auxiliary_loss_mlp": 0.0112482, + "balance_loss_clip": 1.00215816, + "balance_loss_mlp": 1.0007472, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 1.791823954247792, + "language_loss": 0.75774813, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78038675, + "num_input_tokens_seen": 138977240, + "step": 6476, + "time_per_iteration": 2.6175010204315186 + }, + { + "auxiliary_loss_clip": 0.01136138, + "auxiliary_loss_mlp": 0.01103723, + "balance_loss_clip": 1.00219274, + "balance_loss_mlp": 0.99996358, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7849202713247103, + "language_loss": 0.58291799, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60531658, + "num_input_tokens_seen": 139039035, + "step": 6477, + "time_per_iteration": 4.6364240646362305 + }, + { + "auxiliary_loss_clip": 0.01103922, + "auxiliary_loss_mlp": 0.01124229, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00063241, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 3.7881399803639235, + "language_loss": 0.78319645, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80547798, + "num_input_tokens_seen": 139055560, + "step": 6478, + "time_per_iteration": 2.6730082035064697 + }, + { + "auxiliary_loss_clip": 0.01154336, + "auxiliary_loss_mlp": 0.01124274, + "balance_loss_clip": 1.00215137, + "balance_loss_mlp": 1.0006783, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 2.2189399202966094, + "language_loss": 0.82044202, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.8432281, + "num_input_tokens_seen": 139071865, + "step": 6479, + "time_per_iteration": 2.5237581729888916 + }, + { + "auxiliary_loss_clip": 0.011696, + "auxiliary_loss_mlp": 0.01123533, + "balance_loss_clip": 1.00223732, + "balance_loss_mlp": 1.00069988, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.5874328352950404, + "language_loss": 0.80026913, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82320046, + "num_input_tokens_seen": 139089640, + "step": 6480, + "time_per_iteration": 2.656841993331909 + }, + { + "auxiliary_loss_clip": 0.01137683, + "auxiliary_loss_mlp": 0.01123354, + "balance_loss_clip": 1.00201714, + "balance_loss_mlp": 1.00042534, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.823211446733237, + "language_loss": 0.83002925, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85263956, + "num_input_tokens_seen": 139109365, + "step": 6481, + "time_per_iteration": 2.629239082336426 + }, + { + "auxiliary_loss_clip": 0.01139584, + "auxiliary_loss_mlp": 0.01122988, + "balance_loss_clip": 1.00218976, + "balance_loss_mlp": 1.00072658, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.587507275771779, + "language_loss": 0.75329143, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77591717, + "num_input_tokens_seen": 139128260, + "step": 6482, + "time_per_iteration": 2.610187530517578 + }, + { + "auxiliary_loss_clip": 0.0112069, + "auxiliary_loss_mlp": 0.01123959, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00074458, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 1.9158084418478791, + "language_loss": 0.78544265, + "learning_rate": 2.78900610077756e-06, + "loss": 0.8078891, + "num_input_tokens_seen": 139147315, + "step": 6483, + "time_per_iteration": 2.6632256507873535 + }, + { + "auxiliary_loss_clip": 0.01152777, + "auxiliary_loss_mlp": 0.0112336, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00052714, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4696093357999729, + "language_loss": 0.79948622, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82224751, + "num_input_tokens_seen": 139167270, + "step": 6484, + "time_per_iteration": 2.618624448776245 + }, + { + "auxiliary_loss_clip": 0.0115446, + "auxiliary_loss_mlp": 0.01124839, + "balance_loss_clip": 1.00230014, + "balance_loss_mlp": 1.00076628, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.5276878663853415, + "language_loss": 0.77755666, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80034965, + "num_input_tokens_seen": 139185970, + "step": 6485, + "time_per_iteration": 2.5723390579223633 + }, + { + "auxiliary_loss_clip": 0.01106771, + "auxiliary_loss_mlp": 0.01123544, + "balance_loss_clip": 1.00199962, + "balance_loss_mlp": 1.00061572, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 8.776858979420387, + "language_loss": 0.84838855, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87069166, + "num_input_tokens_seen": 139203730, + "step": 6486, + "time_per_iteration": 2.7164077758789062 + }, + { + "auxiliary_loss_clip": 0.01142585, + "auxiliary_loss_mlp": 0.01123992, + "balance_loss_clip": 1.00227916, + "balance_loss_mlp": 1.00058651, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 2.0534683939061082, + "language_loss": 0.85347354, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87613934, + "num_input_tokens_seen": 139222560, + "step": 6487, + "time_per_iteration": 2.703249931335449 + }, + { + "auxiliary_loss_clip": 0.01137579, + "auxiliary_loss_mlp": 0.01123215, + "balance_loss_clip": 1.00210083, + "balance_loss_mlp": 1.0005722, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.475410786587211, + "language_loss": 0.72935581, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75196373, + "num_input_tokens_seen": 139242165, + "step": 6488, + "time_per_iteration": 2.617128849029541 + }, + { + "auxiliary_loss_clip": 0.01120681, + "auxiliary_loss_mlp": 0.01124579, + "balance_loss_clip": 1.00206327, + "balance_loss_mlp": 1.0006963, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 2.260641561825691, + "language_loss": 0.68592823, + "learning_rate": 2.786858317231779e-06, + "loss": 0.70838082, + "num_input_tokens_seen": 139262525, + "step": 6489, + "time_per_iteration": 2.655931234359741 + }, + { + "auxiliary_loss_clip": 0.01136492, + "auxiliary_loss_mlp": 0.01123124, + "balance_loss_clip": 1.00197566, + "balance_loss_mlp": 1.00086284, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.564213078604573, + "language_loss": 0.80857271, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83116889, + "num_input_tokens_seen": 139282835, + "step": 6490, + "time_per_iteration": 2.6698179244995117 + }, + { + "auxiliary_loss_clip": 0.01154526, + "auxiliary_loss_mlp": 0.01124158, + "balance_loss_clip": 1.00212216, + "balance_loss_mlp": 1.00065672, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 1.919805336303671, + "language_loss": 0.89787149, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.92065829, + "num_input_tokens_seen": 139299490, + "step": 6491, + "time_per_iteration": 2.5714430809020996 + }, + { + "auxiliary_loss_clip": 0.01120782, + "auxiliary_loss_mlp": 0.01124267, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00076652, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.6896286021543059, + "language_loss": 0.78724194, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80969244, + "num_input_tokens_seen": 139317865, + "step": 6492, + "time_per_iteration": 2.715667486190796 + }, + { + "auxiliary_loss_clip": 0.01138869, + "auxiliary_loss_mlp": 0.01123591, + "balance_loss_clip": 1.00201428, + "balance_loss_mlp": 1.00066233, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.7359554503064776, + "language_loss": 0.74209803, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76472265, + "num_input_tokens_seen": 139339840, + "step": 6493, + "time_per_iteration": 2.6372220516204834 + }, + { + "auxiliary_loss_clip": 0.0111118, + "auxiliary_loss_mlp": 0.0112508, + "balance_loss_clip": 1.00235391, + "balance_loss_mlp": 1.0007211, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 2.622683867745734, + "language_loss": 0.75918883, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78155148, + "num_input_tokens_seen": 139357555, + "step": 6494, + "time_per_iteration": 2.656184434890747 + }, + { + "auxiliary_loss_clip": 0.01153207, + "auxiliary_loss_mlp": 0.01125594, + "balance_loss_clip": 1.00207448, + "balance_loss_mlp": 1.00094855, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 2.0072905583552743, + "language_loss": 0.74331689, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76610488, + "num_input_tokens_seen": 139374455, + "step": 6495, + "time_per_iteration": 2.519296407699585 + }, + { + "auxiliary_loss_clip": 0.01169588, + "auxiliary_loss_mlp": 0.01124556, + "balance_loss_clip": 1.00227022, + "balance_loss_mlp": 1.00086462, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.6996974275657275, + "language_loss": 0.68033195, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70327342, + "num_input_tokens_seen": 139394770, + "step": 6496, + "time_per_iteration": 2.5959482192993164 + }, + { + "auxiliary_loss_clip": 0.01117238, + "auxiliary_loss_mlp": 0.01104432, + "balance_loss_clip": 1.00174499, + "balance_loss_mlp": 0.99990922, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.7710387713533754, + "language_loss": 0.53997821, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56219488, + "num_input_tokens_seen": 139454760, + "step": 6497, + "time_per_iteration": 3.2881431579589844 + }, + { + "auxiliary_loss_clip": 0.01120804, + "auxiliary_loss_mlp": 0.00747655, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00019979, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 3.0894714792850513, + "language_loss": 0.68741655, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.70610106, + "num_input_tokens_seen": 139472645, + "step": 6498, + "time_per_iteration": 2.674820899963379 + }, + { + "auxiliary_loss_clip": 0.01119398, + "auxiliary_loss_mlp": 0.01104524, + "balance_loss_clip": 1.00207913, + "balance_loss_mlp": 1.00000083, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.7262091248381445, + "language_loss": 0.51805526, + "learning_rate": 2.783276292417936e-06, + "loss": 0.54029441, + "num_input_tokens_seen": 139536730, + "step": 6499, + "time_per_iteration": 3.2791080474853516 + }, + { + "auxiliary_loss_clip": 0.0115298, + "auxiliary_loss_mlp": 0.01124625, + "balance_loss_clip": 1.00201559, + "balance_loss_mlp": 1.00074255, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 2.422665477312353, + "language_loss": 0.73793447, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76071054, + "num_input_tokens_seen": 139557540, + "step": 6500, + "time_per_iteration": 2.665712833404541 + }, + { + "auxiliary_loss_clip": 0.01153047, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_clip": 1.00212491, + "balance_loss_mlp": 1.00070131, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 2.092059421377357, + "language_loss": 0.69330633, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.71608543, + "num_input_tokens_seen": 139576875, + "step": 6501, + "time_per_iteration": 2.5839731693267822 + }, + { + "auxiliary_loss_clip": 0.01152882, + "auxiliary_loss_mlp": 0.01124472, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00078058, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.695999015050967, + "language_loss": 0.78756243, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81033599, + "num_input_tokens_seen": 139594295, + "step": 6502, + "time_per_iteration": 2.5784060955047607 + }, + { + "auxiliary_loss_clip": 0.01136287, + "auxiliary_loss_mlp": 0.01123608, + "balance_loss_clip": 1.00214398, + "balance_loss_mlp": 1.00077426, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 3.1561027794269263, + "language_loss": 0.7963419, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.81894088, + "num_input_tokens_seen": 139614080, + "step": 6503, + "time_per_iteration": 2.6491541862487793 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01123544, + "balance_loss_clip": 1.00200891, + "balance_loss_mlp": 1.00051963, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.7275887243352275, + "language_loss": 0.71347111, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73606843, + "num_input_tokens_seen": 139632755, + "step": 6504, + "time_per_iteration": 2.6148126125335693 + }, + { + "auxiliary_loss_clip": 0.01169476, + "auxiliary_loss_mlp": 0.01123375, + "balance_loss_clip": 1.00206649, + "balance_loss_mlp": 1.00063705, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.6077667517265766, + "language_loss": 0.8307519, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85368037, + "num_input_tokens_seen": 139654205, + "step": 6505, + "time_per_iteration": 2.569417715072632 + }, + { + "auxiliary_loss_clip": 0.01169539, + "auxiliary_loss_mlp": 0.01124286, + "balance_loss_clip": 1.00211716, + "balance_loss_mlp": 1.00059438, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 2.2699274570386034, + "language_loss": 0.70683259, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.72977084, + "num_input_tokens_seen": 139673595, + "step": 6506, + "time_per_iteration": 2.5540013313293457 + }, + { + "auxiliary_loss_clip": 0.01136873, + "auxiliary_loss_mlp": 0.01123866, + "balance_loss_clip": 1.00208783, + "balance_loss_mlp": 1.00065124, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 1.9732265533806526, + "language_loss": 0.75438929, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77699673, + "num_input_tokens_seen": 139690565, + "step": 6507, + "time_per_iteration": 4.065236806869507 + }, + { + "auxiliary_loss_clip": 0.01165537, + "auxiliary_loss_mlp": 0.01103695, + "balance_loss_clip": 1.00207746, + "balance_loss_mlp": 0.99993575, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7595234499691025, + "language_loss": 0.56569505, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58838737, + "num_input_tokens_seen": 139749420, + "step": 6508, + "time_per_iteration": 3.2624197006225586 + }, + { + "auxiliary_loss_clip": 0.01152895, + "auxiliary_loss_mlp": 0.01123299, + "balance_loss_clip": 1.00210392, + "balance_loss_mlp": 1.00075138, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.7309877486190532, + "language_loss": 0.7619909, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78475285, + "num_input_tokens_seen": 139766265, + "step": 6509, + "time_per_iteration": 2.6112239360809326 + }, + { + "auxiliary_loss_clip": 0.01123765, + "auxiliary_loss_mlp": 0.0112443, + "balance_loss_clip": 1.00191498, + "balance_loss_mlp": 1.00064254, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 3.1707760019063342, + "language_loss": 0.82991225, + "learning_rate": 2.779332635075825e-06, + "loss": 0.85239422, + "num_input_tokens_seen": 139782400, + "step": 6510, + "time_per_iteration": 4.033705949783325 + }, + { + "auxiliary_loss_clip": 0.01154108, + "auxiliary_loss_mlp": 0.01124174, + "balance_loss_clip": 1.00209653, + "balance_loss_mlp": 1.00067294, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 2.8117663656379412, + "language_loss": 0.76280975, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78559256, + "num_input_tokens_seen": 139801435, + "step": 6511, + "time_per_iteration": 4.0182785987854 + }, + { + "auxiliary_loss_clip": 0.01134287, + "auxiliary_loss_mlp": 0.0110446, + "balance_loss_clip": 1.00201774, + "balance_loss_mlp": 0.99993747, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7140993253288431, + "language_loss": 0.57818836, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.60057586, + "num_input_tokens_seen": 139869700, + "step": 6512, + "time_per_iteration": 3.242393970489502 + }, + { + "auxiliary_loss_clip": 0.01169707, + "auxiliary_loss_mlp": 0.01124559, + "balance_loss_clip": 1.00225115, + "balance_loss_mlp": 1.00058174, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.6468166105106843, + "language_loss": 0.69561291, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.71855557, + "num_input_tokens_seen": 139890140, + "step": 6513, + "time_per_iteration": 2.618267059326172 + }, + { + "auxiliary_loss_clip": 0.01107122, + "auxiliary_loss_mlp": 0.01125631, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00069952, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.5042551822283103, + "language_loss": 0.75891542, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.78124297, + "num_input_tokens_seen": 139908020, + "step": 6514, + "time_per_iteration": 4.101559638977051 + }, + { + "auxiliary_loss_clip": 0.01127714, + "auxiliary_loss_mlp": 0.01124578, + "balance_loss_clip": 1.00203276, + "balance_loss_mlp": 1.00069571, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 2.2334486154946522, + "language_loss": 0.77468705, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79720998, + "num_input_tokens_seen": 139926180, + "step": 6515, + "time_per_iteration": 2.6246514320373535 + }, + { + "auxiliary_loss_clip": 0.01123974, + "auxiliary_loss_mlp": 0.01123796, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00096273, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.368357506687499, + "language_loss": 0.7983079, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.82078564, + "num_input_tokens_seen": 139947420, + "step": 6516, + "time_per_iteration": 2.7438175678253174 + }, + { + "auxiliary_loss_clip": 0.01104911, + "auxiliary_loss_mlp": 0.01123609, + "balance_loss_clip": 1.00190055, + "balance_loss_mlp": 1.00077534, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 2.2444325908948994, + "language_loss": 0.70428669, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72657198, + "num_input_tokens_seen": 139965800, + "step": 6517, + "time_per_iteration": 2.6637110710144043 + }, + { + "auxiliary_loss_clip": 0.01122574, + "auxiliary_loss_mlp": 0.01124444, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00075221, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.8110947411541871, + "language_loss": 0.71769154, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74016166, + "num_input_tokens_seen": 139988140, + "step": 6518, + "time_per_iteration": 2.7564737796783447 + }, + { + "auxiliary_loss_clip": 0.01152782, + "auxiliary_loss_mlp": 0.01124107, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00070179, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 3.2609783118829005, + "language_loss": 0.61797321, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.64074212, + "num_input_tokens_seen": 140010060, + "step": 6519, + "time_per_iteration": 2.697484254837036 + }, + { + "auxiliary_loss_clip": 0.01169768, + "auxiliary_loss_mlp": 0.01125265, + "balance_loss_clip": 1.00223887, + "balance_loss_mlp": 1.00061989, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.185229825232258, + "language_loss": 0.66687119, + "learning_rate": 2.775744388563563e-06, + "loss": 0.68982148, + "num_input_tokens_seen": 140029400, + "step": 6520, + "time_per_iteration": 2.5438644886016846 + }, + { + "auxiliary_loss_clip": 0.01169618, + "auxiliary_loss_mlp": 0.01123503, + "balance_loss_clip": 1.00221705, + "balance_loss_mlp": 1.00057435, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.7005802659134845, + "language_loss": 0.78813934, + "learning_rate": 2.775385401898104e-06, + "loss": 0.8110705, + "num_input_tokens_seen": 140048940, + "step": 6521, + "time_per_iteration": 2.494635820388794 + }, + { + "auxiliary_loss_clip": 0.01152982, + "auxiliary_loss_mlp": 0.01125002, + "balance_loss_clip": 1.00210428, + "balance_loss_mlp": 1.00064301, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.8816390510142362, + "language_loss": 0.69688398, + "learning_rate": 2.775026385829952e-06, + "loss": 0.7196638, + "num_input_tokens_seen": 140066380, + "step": 6522, + "time_per_iteration": 2.545975685119629 + }, + { + "auxiliary_loss_clip": 0.0113801, + "auxiliary_loss_mlp": 0.01124011, + "balance_loss_clip": 1.00196767, + "balance_loss_mlp": 1.0006057, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.921008228543385, + "language_loss": 0.76963514, + "learning_rate": 2.774667340372722e-06, + "loss": 0.79225528, + "num_input_tokens_seen": 140085275, + "step": 6523, + "time_per_iteration": 2.591494083404541 + }, + { + "auxiliary_loss_clip": 0.01137638, + "auxiliary_loss_mlp": 0.01123877, + "balance_loss_clip": 1.00202334, + "balance_loss_mlp": 1.00075805, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.2714569137801193, + "language_loss": 0.62343705, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.64605218, + "num_input_tokens_seen": 140105105, + "step": 6524, + "time_per_iteration": 2.7089293003082275 + }, + { + "auxiliary_loss_clip": 0.01169452, + "auxiliary_loss_mlp": 0.01123378, + "balance_loss_clip": 1.00207162, + "balance_loss_mlp": 1.00064051, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.6607314320742115, + "language_loss": 0.73747915, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76040745, + "num_input_tokens_seen": 140125645, + "step": 6525, + "time_per_iteration": 2.585904836654663 + }, + { + "auxiliary_loss_clip": 0.01136249, + "auxiliary_loss_mlp": 0.01123707, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00077868, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.416309548102488, + "language_loss": 0.80950189, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83210146, + "num_input_tokens_seen": 140141925, + "step": 6526, + "time_per_iteration": 2.6408026218414307 + }, + { + "auxiliary_loss_clip": 0.0115282, + "auxiliary_loss_mlp": 0.01124047, + "balance_loss_clip": 1.00203168, + "balance_loss_mlp": 1.00073659, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.6824169794901669, + "language_loss": 0.69751632, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.720285, + "num_input_tokens_seen": 140160965, + "step": 6527, + "time_per_iteration": 2.599231481552124 + }, + { + "auxiliary_loss_clip": 0.0112107, + "auxiliary_loss_mlp": 0.01123682, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00065827, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.252316906327759, + "language_loss": 0.81805062, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84049821, + "num_input_tokens_seen": 140177780, + "step": 6528, + "time_per_iteration": 2.633859395980835 + }, + { + "auxiliary_loss_clip": 0.01136063, + "auxiliary_loss_mlp": 0.01123285, + "balance_loss_clip": 1.00200248, + "balance_loss_mlp": 1.00064254, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 2.2140889154100796, + "language_loss": 0.68435192, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70694536, + "num_input_tokens_seen": 140201660, + "step": 6529, + "time_per_iteration": 2.682316541671753 + }, + { + "auxiliary_loss_clip": 0.01139044, + "auxiliary_loss_mlp": 0.01124012, + "balance_loss_clip": 1.00203884, + "balance_loss_mlp": 1.00079703, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 4.4562788675844, + "language_loss": 0.80476964, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.82740027, + "num_input_tokens_seen": 140218585, + "step": 6530, + "time_per_iteration": 2.667703866958618 + }, + { + "auxiliary_loss_clip": 0.01153947, + "auxiliary_loss_mlp": 0.01123901, + "balance_loss_clip": 1.00202489, + "balance_loss_mlp": 1.0008769, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.4676266368164221, + "language_loss": 0.75400746, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77678591, + "num_input_tokens_seen": 140239905, + "step": 6531, + "time_per_iteration": 2.6010665893554688 + }, + { + "auxiliary_loss_clip": 0.01165734, + "auxiliary_loss_mlp": 0.01103768, + "balance_loss_clip": 1.00218272, + "balance_loss_mlp": 1.00000823, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.9299633251749203, + "language_loss": 0.60390353, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.6265986, + "num_input_tokens_seen": 140293820, + "step": 6532, + "time_per_iteration": 2.980694055557251 + }, + { + "auxiliary_loss_clip": 0.01131684, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.00016773, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7844129957580798, + "language_loss": 0.55523229, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57758832, + "num_input_tokens_seen": 140360420, + "step": 6533, + "time_per_iteration": 3.252922534942627 + }, + { + "auxiliary_loss_clip": 0.01142512, + "auxiliary_loss_mlp": 0.01124017, + "balance_loss_clip": 1.00226068, + "balance_loss_mlp": 1.00080276, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 5.340716670540427, + "language_loss": 0.76172745, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78439277, + "num_input_tokens_seen": 140381950, + "step": 6534, + "time_per_iteration": 2.6912789344787598 + }, + { + "auxiliary_loss_clip": 0.01154444, + "auxiliary_loss_mlp": 0.01124398, + "balance_loss_clip": 1.00213921, + "balance_loss_mlp": 1.00070643, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.5842862335422105, + "language_loss": 0.78071523, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80350369, + "num_input_tokens_seen": 140399410, + "step": 6535, + "time_per_iteration": 2.5469648838043213 + }, + { + "auxiliary_loss_clip": 0.01120875, + "auxiliary_loss_mlp": 0.01123209, + "balance_loss_clip": 1.00179422, + "balance_loss_mlp": 1.00075722, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.5352001886202333, + "language_loss": 0.68227828, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70471919, + "num_input_tokens_seen": 140419055, + "step": 6536, + "time_per_iteration": 2.685875415802002 + }, + { + "auxiliary_loss_clip": 0.01137471, + "auxiliary_loss_mlp": 0.01122355, + "balance_loss_clip": 1.00199223, + "balance_loss_mlp": 1.00066626, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.7464889438846203, + "language_loss": 0.68867826, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71127653, + "num_input_tokens_seen": 140438800, + "step": 6537, + "time_per_iteration": 2.6719775199890137 + }, + { + "auxiliary_loss_clip": 0.0115268, + "auxiliary_loss_mlp": 0.01123548, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00080979, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6548986979989015, + "language_loss": 0.78952467, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81228691, + "num_input_tokens_seen": 140456880, + "step": 6538, + "time_per_iteration": 2.5304319858551025 + }, + { + "auxiliary_loss_clip": 0.01107126, + "auxiliary_loss_mlp": 0.01103936, + "balance_loss_clip": 1.00202131, + "balance_loss_mlp": 1.00017631, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8060677362591334, + "language_loss": 0.61894685, + "learning_rate": 2.768918627255683e-06, + "loss": 0.64105749, + "num_input_tokens_seen": 140507510, + "step": 6539, + "time_per_iteration": 3.047112226486206 + }, + { + "auxiliary_loss_clip": 0.01135823, + "auxiliary_loss_mlp": 0.01123713, + "balance_loss_clip": 1.00180387, + "balance_loss_mlp": 1.0005939, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.2438046280093022, + "language_loss": 0.68032551, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70292091, + "num_input_tokens_seen": 140528740, + "step": 6540, + "time_per_iteration": 2.724429130554199 + }, + { + "auxiliary_loss_clip": 0.01136082, + "auxiliary_loss_mlp": 0.0112337, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00072742, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 4.395963462230061, + "language_loss": 0.7239145, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74650908, + "num_input_tokens_seen": 140547560, + "step": 6541, + "time_per_iteration": 2.622133731842041 + }, + { + "auxiliary_loss_clip": 0.01165583, + "auxiliary_loss_mlp": 0.01103058, + "balance_loss_clip": 1.00207376, + "balance_loss_mlp": 1.00006104, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.822542437873271, + "language_loss": 0.60382819, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62651455, + "num_input_tokens_seen": 140601175, + "step": 6542, + "time_per_iteration": 2.9436123371124268 + }, + { + "auxiliary_loss_clip": 0.01153044, + "auxiliary_loss_mlp": 0.01122995, + "balance_loss_clip": 1.00207281, + "balance_loss_mlp": 1.00073361, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.5363061878360158, + "language_loss": 0.82356381, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84632421, + "num_input_tokens_seen": 140622200, + "step": 6543, + "time_per_iteration": 2.6234776973724365 + }, + { + "auxiliary_loss_clip": 0.01137059, + "auxiliary_loss_mlp": 0.011229, + "balance_loss_clip": 1.00194347, + "balance_loss_mlp": 1.00063872, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 2.033527763478386, + "language_loss": 0.68951011, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71210974, + "num_input_tokens_seen": 140643125, + "step": 6544, + "time_per_iteration": 2.689079523086548 + }, + { + "auxiliary_loss_clip": 0.01137865, + "auxiliary_loss_mlp": 0.01124167, + "balance_loss_clip": 1.00206459, + "balance_loss_mlp": 1.00066578, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 1.9175246900279173, + "language_loss": 0.75446928, + "learning_rate": 2.76676093244553e-06, + "loss": 0.7770896, + "num_input_tokens_seen": 140662500, + "step": 6545, + "time_per_iteration": 4.007375955581665 + }, + { + "auxiliary_loss_clip": 0.01121201, + "auxiliary_loss_mlp": 0.0112153, + "balance_loss_clip": 1.00210142, + "balance_loss_mlp": 1.00069952, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.4678291687199312, + "language_loss": 0.74863338, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.77106076, + "num_input_tokens_seen": 140681960, + "step": 6546, + "time_per_iteration": 2.645636558532715 + }, + { + "auxiliary_loss_clip": 0.01136673, + "auxiliary_loss_mlp": 0.01124406, + "balance_loss_clip": 1.00196469, + "balance_loss_mlp": 1.00061893, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 1.8215427553068715, + "language_loss": 0.81751096, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.84012175, + "num_input_tokens_seen": 140699170, + "step": 6547, + "time_per_iteration": 2.60313081741333 + }, + { + "auxiliary_loss_clip": 0.01154248, + "auxiliary_loss_mlp": 0.00747611, + "balance_loss_clip": 1.0021075, + "balance_loss_mlp": 1.00031066, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 1.8169245179091926, + "language_loss": 0.8436572, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86267579, + "num_input_tokens_seen": 140714920, + "step": 6548, + "time_per_iteration": 3.935645580291748 + }, + { + "auxiliary_loss_clip": 0.01153695, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.00206888, + "balance_loss_mlp": 1.00018287, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 2.049149513145325, + "language_loss": 0.72618347, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74519539, + "num_input_tokens_seen": 140734595, + "step": 6549, + "time_per_iteration": 4.009974479675293 + }, + { + "auxiliary_loss_clip": 0.01088519, + "auxiliary_loss_mlp": 0.01123249, + "balance_loss_clip": 1.00184643, + "balance_loss_mlp": 1.00060606, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.540368222537567, + "language_loss": 0.77479976, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79691744, + "num_input_tokens_seen": 140754050, + "step": 6550, + "time_per_iteration": 2.740144968032837 + }, + { + "auxiliary_loss_clip": 0.01126043, + "auxiliary_loss_mlp": 0.01123134, + "balance_loss_clip": 1.00209272, + "balance_loss_mlp": 1.00058675, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.6586157632715686, + "language_loss": 0.81203043, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83452213, + "num_input_tokens_seen": 140771440, + "step": 6551, + "time_per_iteration": 2.6451780796051025 + }, + { + "auxiliary_loss_clip": 0.01152826, + "auxiliary_loss_mlp": 0.01122342, + "balance_loss_clip": 1.00205648, + "balance_loss_mlp": 1.0007484, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.0341378125812626, + "language_loss": 0.79988587, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82263756, + "num_input_tokens_seen": 140786715, + "step": 6552, + "time_per_iteration": 4.036467552185059 + }, + { + "auxiliary_loss_clip": 0.01169598, + "auxiliary_loss_mlp": 0.01123965, + "balance_loss_clip": 1.0021733, + "balance_loss_mlp": 1.0009408, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 2.968562693507934, + "language_loss": 0.71057892, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73351455, + "num_input_tokens_seen": 140804950, + "step": 6553, + "time_per_iteration": 2.51975679397583 + }, + { + "auxiliary_loss_clip": 0.01152639, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.00210214, + "balance_loss_mlp": 1.00017905, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.6587801714305341, + "language_loss": 0.63964772, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65864915, + "num_input_tokens_seen": 140822800, + "step": 6554, + "time_per_iteration": 2.625584840774536 + }, + { + "auxiliary_loss_clip": 0.01144191, + "auxiliary_loss_mlp": 0.0112321, + "balance_loss_clip": 1.00242043, + "balance_loss_mlp": 1.00075817, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 1.8713666546638161, + "language_loss": 0.79289919, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81557316, + "num_input_tokens_seen": 140842940, + "step": 6555, + "time_per_iteration": 2.7018229961395264 + }, + { + "auxiliary_loss_clip": 0.01137942, + "auxiliary_loss_mlp": 0.01123455, + "balance_loss_clip": 1.00202978, + "balance_loss_mlp": 1.00081229, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.7435649265253979, + "language_loss": 0.7139641, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73657805, + "num_input_tokens_seen": 140863060, + "step": 6556, + "time_per_iteration": 2.6372175216674805 + }, + { + "auxiliary_loss_clip": 0.01169524, + "auxiliary_loss_mlp": 0.01123017, + "balance_loss_clip": 1.00214887, + "balance_loss_mlp": 1.00066042, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1.7268582144336966, + "language_loss": 0.83636492, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.85929036, + "num_input_tokens_seen": 140883795, + "step": 6557, + "time_per_iteration": 2.6194705963134766 + }, + { + "auxiliary_loss_clip": 0.01153093, + "auxiliary_loss_mlp": 0.01122846, + "balance_loss_clip": 1.00205374, + "balance_loss_mlp": 1.00068021, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 2.3549678995325443, + "language_loss": 0.80404603, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.82680541, + "num_input_tokens_seen": 140903055, + "step": 6558, + "time_per_iteration": 2.585538387298584 + }, + { + "auxiliary_loss_clip": 0.01169685, + "auxiliary_loss_mlp": 0.01123611, + "balance_loss_clip": 1.00228667, + "balance_loss_mlp": 1.00077736, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.6736470024862016, + "language_loss": 0.70932597, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73225886, + "num_input_tokens_seen": 140920685, + "step": 6559, + "time_per_iteration": 2.486969470977783 + }, + { + "auxiliary_loss_clip": 0.01136733, + "auxiliary_loss_mlp": 0.01123585, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00056076, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 1.920665950623801, + "language_loss": 0.80822158, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.83082473, + "num_input_tokens_seen": 140937320, + "step": 6560, + "time_per_iteration": 2.557673454284668 + }, + { + "auxiliary_loss_clip": 0.01142412, + "auxiliary_loss_mlp": 0.01123876, + "balance_loss_clip": 1.00216985, + "balance_loss_mlp": 1.00085163, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 2.19021013031117, + "language_loss": 0.83061755, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85328043, + "num_input_tokens_seen": 140954855, + "step": 6561, + "time_per_iteration": 2.5638232231140137 + }, + { + "auxiliary_loss_clip": 0.01152933, + "auxiliary_loss_mlp": 0.01123005, + "balance_loss_clip": 1.00216699, + "balance_loss_mlp": 1.00083983, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.3013939759471067, + "language_loss": 0.79683942, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.81959879, + "num_input_tokens_seen": 140973250, + "step": 6562, + "time_per_iteration": 2.532918930053711 + }, + { + "auxiliary_loss_clip": 0.01137649, + "auxiliary_loss_mlp": 0.01122677, + "balance_loss_clip": 1.00215018, + "balance_loss_mlp": 1.0008924, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.4499186268093425, + "language_loss": 0.81097072, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83357394, + "num_input_tokens_seen": 140993050, + "step": 6563, + "time_per_iteration": 2.6121420860290527 + }, + { + "auxiliary_loss_clip": 0.01122941, + "auxiliary_loss_mlp": 0.0112299, + "balance_loss_clip": 1.00192833, + "balance_loss_mlp": 1.00082469, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 1.9340736059402657, + "language_loss": 0.70058203, + "learning_rate": 2.759921340790127e-06, + "loss": 0.72304136, + "num_input_tokens_seen": 141010815, + "step": 6564, + "time_per_iteration": 2.63199782371521 + }, + { + "auxiliary_loss_clip": 0.01154528, + "auxiliary_loss_mlp": 0.01123742, + "balance_loss_clip": 1.00211811, + "balance_loss_mlp": 1.00062275, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 2.4518357594492577, + "language_loss": 0.82691717, + "learning_rate": 2.759561073299676e-06, + "loss": 0.84969985, + "num_input_tokens_seen": 141028720, + "step": 6565, + "time_per_iteration": 2.5571656227111816 + }, + { + "auxiliary_loss_clip": 0.01124126, + "auxiliary_loss_mlp": 0.01123124, + "balance_loss_clip": 1.00200176, + "balance_loss_mlp": 1.00076723, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 2.225144609622429, + "language_loss": 0.83143163, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85390413, + "num_input_tokens_seen": 141046025, + "step": 6566, + "time_per_iteration": 2.630221128463745 + }, + { + "auxiliary_loss_clip": 0.0116978, + "auxiliary_loss_mlp": 0.01124062, + "balance_loss_clip": 1.00222564, + "balance_loss_mlp": 1.00094247, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 1.7274049815857864, + "language_loss": 0.77204335, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79498172, + "num_input_tokens_seen": 141066865, + "step": 6567, + "time_per_iteration": 2.558929204940796 + }, + { + "auxiliary_loss_clip": 0.01154272, + "auxiliary_loss_mlp": 0.01121738, + "balance_loss_clip": 1.00208271, + "balance_loss_mlp": 1.00071716, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 1.8520595127356867, + "language_loss": 0.80087382, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82363391, + "num_input_tokens_seen": 141084210, + "step": 6568, + "time_per_iteration": 2.5483815670013428 + }, + { + "auxiliary_loss_clip": 0.01121509, + "auxiliary_loss_mlp": 0.01122431, + "balance_loss_clip": 1.00199902, + "balance_loss_mlp": 1.00064659, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 2.0183733671596147, + "language_loss": 0.84712243, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86956179, + "num_input_tokens_seen": 141103895, + "step": 6569, + "time_per_iteration": 2.6658670902252197 + }, + { + "auxiliary_loss_clip": 0.01090694, + "auxiliary_loss_mlp": 0.01122499, + "balance_loss_clip": 1.00188208, + "balance_loss_mlp": 1.00061893, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.7535834629146883, + "language_loss": 0.74546587, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.76759779, + "num_input_tokens_seen": 141124000, + "step": 6570, + "time_per_iteration": 2.7520413398742676 + }, + { + "auxiliary_loss_clip": 0.01127585, + "auxiliary_loss_mlp": 0.01122483, + "balance_loss_clip": 1.00212932, + "balance_loss_mlp": 1.00050807, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 2.094390284808306, + "language_loss": 0.79776901, + "learning_rate": 2.757398863979922e-06, + "loss": 0.8202697, + "num_input_tokens_seen": 141142535, + "step": 6571, + "time_per_iteration": 2.654808521270752 + }, + { + "auxiliary_loss_clip": 0.01137492, + "auxiliary_loss_mlp": 0.01122716, + "balance_loss_clip": 1.00202584, + "balance_loss_mlp": 1.00083637, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.6001817350100431, + "language_loss": 0.77378118, + "learning_rate": 2.757038395157997e-06, + "loss": 0.7963832, + "num_input_tokens_seen": 141161575, + "step": 6572, + "time_per_iteration": 2.608499050140381 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.01123025, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00076425, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.7730226092174755, + "language_loss": 0.75040948, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77284664, + "num_input_tokens_seen": 141181150, + "step": 6573, + "time_per_iteration": 2.7070727348327637 + }, + { + "auxiliary_loss_clip": 0.0115421, + "auxiliary_loss_mlp": 0.01123042, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00068593, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.7987102764716876, + "language_loss": 0.67973948, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70251203, + "num_input_tokens_seen": 141206310, + "step": 6574, + "time_per_iteration": 2.810215950012207 + }, + { + "auxiliary_loss_clip": 0.01090673, + "auxiliary_loss_mlp": 0.01123479, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00064588, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.3202269510466906, + "language_loss": 0.7149775, + "learning_rate": 2.755956816505072e-06, + "loss": 0.73711908, + "num_input_tokens_seen": 141223925, + "step": 6575, + "time_per_iteration": 2.7320849895477295 + }, + { + "auxiliary_loss_clip": 0.01144099, + "auxiliary_loss_mlp": 0.01123967, + "balance_loss_clip": 1.00236773, + "balance_loss_mlp": 1.00084805, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 1.9802447983543887, + "language_loss": 0.73398656, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.7566672, + "num_input_tokens_seen": 141239010, + "step": 6576, + "time_per_iteration": 2.5654566287994385 + }, + { + "auxiliary_loss_clip": 0.01169438, + "auxiliary_loss_mlp": 0.01122844, + "balance_loss_clip": 1.00210476, + "balance_loss_mlp": 1.00067794, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.4116984592576265, + "language_loss": 0.84636915, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.8692919, + "num_input_tokens_seen": 141252255, + "step": 6577, + "time_per_iteration": 2.481943368911743 + }, + { + "auxiliary_loss_clip": 0.01139484, + "auxiliary_loss_mlp": 0.01123225, + "balance_loss_clip": 1.00214481, + "balance_loss_mlp": 1.00077307, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.8079649234335604, + "language_loss": 0.9039979, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92662501, + "num_input_tokens_seen": 141269325, + "step": 6578, + "time_per_iteration": 2.6050546169281006 + }, + { + "auxiliary_loss_clip": 0.01136055, + "auxiliary_loss_mlp": 0.01123593, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00066423, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 3.6943800197827383, + "language_loss": 0.77890146, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80149794, + "num_input_tokens_seen": 141288505, + "step": 6579, + "time_per_iteration": 2.615037679672241 + }, + { + "auxiliary_loss_clip": 0.01105732, + "auxiliary_loss_mlp": 0.01123268, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.00072074, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.09794072186909, + "language_loss": 0.6839515, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70624149, + "num_input_tokens_seen": 141303680, + "step": 6580, + "time_per_iteration": 2.693931818008423 + }, + { + "auxiliary_loss_clip": 0.01152678, + "auxiliary_loss_mlp": 0.01122903, + "balance_loss_clip": 1.00202143, + "balance_loss_mlp": 1.00064182, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.9068355049465038, + "language_loss": 0.58775467, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.61051047, + "num_input_tokens_seen": 141324090, + "step": 6581, + "time_per_iteration": 2.604191780090332 + }, + { + "auxiliary_loss_clip": 0.0113872, + "auxiliary_loss_mlp": 0.01123972, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00075746, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 2.2700813521359295, + "language_loss": 0.69261479, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.71524173, + "num_input_tokens_seen": 141342235, + "step": 6582, + "time_per_iteration": 2.567394495010376 + }, + { + "auxiliary_loss_clip": 0.0116929, + "auxiliary_loss_mlp": 0.00747607, + "balance_loss_clip": 1.00204241, + "balance_loss_mlp": 1.00038588, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.087323537566121, + "language_loss": 0.76225698, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78142589, + "num_input_tokens_seen": 141361195, + "step": 6583, + "time_per_iteration": 3.844649314880371 + }, + { + "auxiliary_loss_clip": 0.01112115, + "auxiliary_loss_mlp": 0.00747615, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00035203, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.6584576850627537, + "language_loss": 0.65997159, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.6785689, + "num_input_tokens_seen": 141378275, + "step": 6584, + "time_per_iteration": 2.652876377105713 + }, + { + "auxiliary_loss_clip": 0.0112065, + "auxiliary_loss_mlp": 0.01123898, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00068307, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.3311105802044754, + "language_loss": 0.72498405, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74742949, + "num_input_tokens_seen": 141396960, + "step": 6585, + "time_per_iteration": 4.169750213623047 + }, + { + "auxiliary_loss_clip": 0.01121097, + "auxiliary_loss_mlp": 0.01122732, + "balance_loss_clip": 1.00180161, + "balance_loss_mlp": 1.00075698, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.7613472186079802, + "language_loss": 0.73396754, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75640589, + "num_input_tokens_seen": 141417320, + "step": 6586, + "time_per_iteration": 4.1285271644592285 + }, + { + "auxiliary_loss_clip": 0.01143855, + "auxiliary_loss_mlp": 0.01123446, + "balance_loss_clip": 1.00215805, + "balance_loss_mlp": 1.0007081, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 1.685401724521949, + "language_loss": 0.71219224, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73486525, + "num_input_tokens_seen": 141435985, + "step": 6587, + "time_per_iteration": 2.607814073562622 + }, + { + "auxiliary_loss_clip": 0.01106674, + "auxiliary_loss_mlp": 0.01104093, + "balance_loss_clip": 1.0016005, + "balance_loss_mlp": 1.00033307, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9032296206235005, + "language_loss": 0.61175579, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63386345, + "num_input_tokens_seen": 141486075, + "step": 6588, + "time_per_iteration": 3.0568323135375977 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.00747693, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00038326, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.8990048304312634, + "language_loss": 0.81617701, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83501154, + "num_input_tokens_seen": 141505280, + "step": 6589, + "time_per_iteration": 2.611820697784424 + }, + { + "auxiliary_loss_clip": 0.01136561, + "auxiliary_loss_mlp": 0.01123352, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.0007093, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.168576256291904, + "language_loss": 0.70281911, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72541821, + "num_input_tokens_seen": 141523930, + "step": 6590, + "time_per_iteration": 4.012592554092407 + }, + { + "auxiliary_loss_clip": 0.01154256, + "auxiliary_loss_mlp": 0.01123572, + "balance_loss_clip": 1.00217938, + "balance_loss_mlp": 1.00092959, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 2.1839303494744553, + "language_loss": 0.75946665, + "learning_rate": 2.750184048805956e-06, + "loss": 0.78224492, + "num_input_tokens_seen": 141541320, + "step": 6591, + "time_per_iteration": 2.5787811279296875 + }, + { + "auxiliary_loss_clip": 0.01057299, + "auxiliary_loss_mlp": 0.01123562, + "balance_loss_clip": 1.00179458, + "balance_loss_mlp": 1.00111055, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.638289237287097, + "language_loss": 0.78900534, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81081402, + "num_input_tokens_seen": 141561880, + "step": 6592, + "time_per_iteration": 2.93196439743042 + }, + { + "auxiliary_loss_clip": 0.01088849, + "auxiliary_loss_mlp": 0.01122305, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00052071, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.8036482143861632, + "language_loss": 0.69202316, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71413469, + "num_input_tokens_seen": 141586460, + "step": 6593, + "time_per_iteration": 3.2221577167510986 + }, + { + "auxiliary_loss_clip": 0.01080613, + "auxiliary_loss_mlp": 0.0112365, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.00072122, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 3.469497915809491, + "language_loss": 0.77852637, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.800569, + "num_input_tokens_seen": 141605955, + "step": 6594, + "time_per_iteration": 2.7540841102600098 + }, + { + "auxiliary_loss_clip": 0.01117083, + "auxiliary_loss_mlp": 0.01104781, + "balance_loss_clip": 1.00154769, + "balance_loss_mlp": 1.0002588, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9390777425175943, + "language_loss": 0.62895936, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65117806, + "num_input_tokens_seen": 141673140, + "step": 6595, + "time_per_iteration": 3.260573387145996 + }, + { + "auxiliary_loss_clip": 0.01120694, + "auxiliary_loss_mlp": 0.01123483, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00084019, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 3.7173294126607104, + "language_loss": 0.63170993, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65415168, + "num_input_tokens_seen": 141692955, + "step": 6596, + "time_per_iteration": 2.6955528259277344 + }, + { + "auxiliary_loss_clip": 0.01153969, + "auxiliary_loss_mlp": 0.01122174, + "balance_loss_clip": 1.0022068, + "balance_loss_mlp": 1.00086629, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 1.8257728268618139, + "language_loss": 0.78701735, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80977875, + "num_input_tokens_seen": 141710680, + "step": 6597, + "time_per_iteration": 2.5918476581573486 + }, + { + "auxiliary_loss_clip": 0.0112215, + "auxiliary_loss_mlp": 0.00747693, + "balance_loss_clip": 1.00196028, + "balance_loss_mlp": 1.00035071, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 1.7849607605589883, + "language_loss": 0.67377079, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69246924, + "num_input_tokens_seen": 141729860, + "step": 6598, + "time_per_iteration": 2.6653125286102295 + }, + { + "auxiliary_loss_clip": 0.01169535, + "auxiliary_loss_mlp": 0.01123792, + "balance_loss_clip": 1.00212336, + "balance_loss_mlp": 1.00095868, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 2.1296706248545942, + "language_loss": 0.78908783, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81202108, + "num_input_tokens_seen": 141749060, + "step": 6599, + "time_per_iteration": 2.52932071685791 + }, + { + "auxiliary_loss_clip": 0.01128536, + "auxiliary_loss_mlp": 0.01123262, + "balance_loss_clip": 1.00224805, + "balance_loss_mlp": 1.00071526, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.886473283234651, + "language_loss": 0.72607434, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74859232, + "num_input_tokens_seen": 141769860, + "step": 6600, + "time_per_iteration": 2.662750482559204 + }, + { + "auxiliary_loss_clip": 0.01122825, + "auxiliary_loss_mlp": 0.01122874, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00061297, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 2.080395087712009, + "language_loss": 0.85716408, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87962103, + "num_input_tokens_seen": 141788465, + "step": 6601, + "time_per_iteration": 2.62813663482666 + }, + { + "auxiliary_loss_clip": 0.01121506, + "auxiliary_loss_mlp": 0.01123677, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.0007484, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.60810311536461, + "language_loss": 0.70479643, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72724819, + "num_input_tokens_seen": 141804955, + "step": 6602, + "time_per_iteration": 2.6039111614227295 + }, + { + "auxiliary_loss_clip": 0.01169433, + "auxiliary_loss_mlp": 0.01123369, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00101244, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.8646119546407207, + "language_loss": 0.83131146, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85423946, + "num_input_tokens_seen": 141820025, + "step": 6603, + "time_per_iteration": 2.474841833114624 + }, + { + "auxiliary_loss_clip": 0.01138508, + "auxiliary_loss_mlp": 0.01122453, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00076401, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.7536203911157162, + "language_loss": 0.73048186, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.75309145, + "num_input_tokens_seen": 141838735, + "step": 6604, + "time_per_iteration": 2.6152260303497314 + }, + { + "auxiliary_loss_clip": 0.01137303, + "auxiliary_loss_mlp": 0.01122115, + "balance_loss_clip": 1.00208521, + "balance_loss_mlp": 1.00061679, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.5198008970760648, + "language_loss": 0.82618928, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84878349, + "num_input_tokens_seen": 141858090, + "step": 6605, + "time_per_iteration": 2.640460729598999 + }, + { + "auxiliary_loss_clip": 0.01169349, + "auxiliary_loss_mlp": 0.01122406, + "balance_loss_clip": 1.00209129, + "balance_loss_mlp": 1.00062191, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.4566146908057858, + "language_loss": 0.7385838, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.76150131, + "num_input_tokens_seen": 141877540, + "step": 6606, + "time_per_iteration": 2.5709598064422607 + }, + { + "auxiliary_loss_clip": 0.01121172, + "auxiliary_loss_mlp": 0.01122825, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00084996, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.8460077860175255, + "language_loss": 0.73957044, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76201046, + "num_input_tokens_seen": 141897315, + "step": 6607, + "time_per_iteration": 2.6750741004943848 + }, + { + "auxiliary_loss_clip": 0.01154568, + "auxiliary_loss_mlp": 0.01123312, + "balance_loss_clip": 1.00218701, + "balance_loss_mlp": 1.00076461, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.4744059138461345, + "language_loss": 0.67991155, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70269036, + "num_input_tokens_seen": 141919580, + "step": 6608, + "time_per_iteration": 2.7789695262908936 + }, + { + "auxiliary_loss_clip": 0.01122725, + "auxiliary_loss_mlp": 0.01123419, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.00077653, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 1.9596893367503005, + "language_loss": 0.74449688, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.7669583, + "num_input_tokens_seen": 141937045, + "step": 6609, + "time_per_iteration": 2.612809181213379 + }, + { + "auxiliary_loss_clip": 0.01142534, + "auxiliary_loss_mlp": 0.01122962, + "balance_loss_clip": 1.00242496, + "balance_loss_mlp": 1.00070095, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 1.562900993167201, + "language_loss": 0.71411598, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73677087, + "num_input_tokens_seen": 141956695, + "step": 6610, + "time_per_iteration": 2.61222767829895 + }, + { + "auxiliary_loss_clip": 0.0115406, + "auxiliary_loss_mlp": 0.01122069, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.00057077, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 4.514317423629555, + "language_loss": 0.78637218, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80913341, + "num_input_tokens_seen": 141975935, + "step": 6611, + "time_per_iteration": 2.5675268173217773 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01122935, + "balance_loss_clip": 1.00219274, + "balance_loss_mlp": 1.00067449, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.8257489562031979, + "language_loss": 0.78997862, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81273687, + "num_input_tokens_seen": 141995750, + "step": 6612, + "time_per_iteration": 2.629365921020508 + }, + { + "auxiliary_loss_clip": 0.01115051, + "auxiliary_loss_mlp": 0.01104095, + "balance_loss_clip": 1.00150073, + "balance_loss_mlp": 1.00033522, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8314324270997324, + "language_loss": 0.64962804, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67181945, + "num_input_tokens_seen": 142057655, + "step": 6613, + "time_per_iteration": 3.1344854831695557 + }, + { + "auxiliary_loss_clip": 0.01139167, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_clip": 1.00218391, + "balance_loss_mlp": 1.00071955, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.19819965154448, + "language_loss": 0.71632487, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73894823, + "num_input_tokens_seen": 142076020, + "step": 6614, + "time_per_iteration": 2.6103265285491943 + }, + { + "auxiliary_loss_clip": 0.01152755, + "auxiliary_loss_mlp": 0.01122822, + "balance_loss_clip": 1.00209808, + "balance_loss_mlp": 1.00065672, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.8429916706228078, + "language_loss": 0.81559855, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83835435, + "num_input_tokens_seen": 142093790, + "step": 6615, + "time_per_iteration": 2.5468740463256836 + }, + { + "auxiliary_loss_clip": 0.01125783, + "auxiliary_loss_mlp": 0.01122644, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.0005734, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.3076909363163463, + "language_loss": 0.67527109, + "learning_rate": 2.741149541231434e-06, + "loss": 0.6977554, + "num_input_tokens_seen": 142110545, + "step": 6616, + "time_per_iteration": 2.5821917057037354 + }, + { + "auxiliary_loss_clip": 0.01169639, + "auxiliary_loss_mlp": 0.01123441, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00079834, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.2026107156952066, + "language_loss": 0.83860856, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86153936, + "num_input_tokens_seen": 142128695, + "step": 6617, + "time_per_iteration": 2.5326170921325684 + }, + { + "auxiliary_loss_clip": 0.01169359, + "auxiliary_loss_mlp": 0.01122075, + "balance_loss_clip": 1.00221252, + "balance_loss_mlp": 1.00067234, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.534537375165497, + "language_loss": 0.72358656, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74650091, + "num_input_tokens_seen": 142148375, + "step": 6618, + "time_per_iteration": 2.51764178276062 + }, + { + "auxiliary_loss_clip": 0.01136408, + "auxiliary_loss_mlp": 0.01121932, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.00062501, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 2.3756117923030513, + "language_loss": 0.65598196, + "learning_rate": 2.740064215712231e-06, + "loss": 0.67856538, + "num_input_tokens_seen": 142169735, + "step": 6619, + "time_per_iteration": 2.7071025371551514 + }, + { + "auxiliary_loss_clip": 0.01165282, + "auxiliary_loss_mlp": 0.0110298, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 0.99998343, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7701412461550857, + "language_loss": 0.58176613, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60444868, + "num_input_tokens_seen": 142229520, + "step": 6620, + "time_per_iteration": 3.068873405456543 + }, + { + "auxiliary_loss_clip": 0.01138077, + "auxiliary_loss_mlp": 0.01122006, + "balance_loss_clip": 1.00202215, + "balance_loss_mlp": 1.00079358, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 2.2665143779530617, + "language_loss": 0.78920197, + "learning_rate": 2.739340525026686e-06, + "loss": 0.8118028, + "num_input_tokens_seen": 142247660, + "step": 6621, + "time_per_iteration": 4.2617576122283936 + }, + { + "auxiliary_loss_clip": 0.01136941, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 1.00057864, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 6.267888959003586, + "language_loss": 0.77991027, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80250329, + "num_input_tokens_seen": 142266990, + "step": 6622, + "time_per_iteration": 2.639019727706909 + }, + { + "auxiliary_loss_clip": 0.01137497, + "auxiliary_loss_mlp": 0.01122862, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00060081, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.5444375946281534, + "language_loss": 0.750085, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77268851, + "num_input_tokens_seen": 142287170, + "step": 6623, + "time_per_iteration": 3.9683282375335693 + }, + { + "auxiliary_loss_clip": 0.01120839, + "auxiliary_loss_mlp": 0.01122761, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00088167, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.9294897864204714, + "language_loss": 0.79656768, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.8190037, + "num_input_tokens_seen": 142305405, + "step": 6624, + "time_per_iteration": 4.055820941925049 + }, + { + "auxiliary_loss_clip": 0.01169592, + "auxiliary_loss_mlp": 0.01123115, + "balance_loss_clip": 1.00221014, + "balance_loss_mlp": 1.00085437, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 1.9680711276026834, + "language_loss": 0.83641887, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85934591, + "num_input_tokens_seen": 142322710, + "step": 6625, + "time_per_iteration": 2.5628814697265625 + }, + { + "auxiliary_loss_clip": 0.01152457, + "auxiliary_loss_mlp": 0.01122618, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.0006429, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.168386791033075, + "language_loss": 0.86498082, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88773155, + "num_input_tokens_seen": 142338535, + "step": 6626, + "time_per_iteration": 2.5383942127227783 + }, + { + "auxiliary_loss_clip": 0.01090918, + "auxiliary_loss_mlp": 0.00747714, + "balance_loss_clip": 1.001966, + "balance_loss_mlp": 1.00042534, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.622921926960965, + "language_loss": 0.83769464, + "learning_rate": 2.737168780548417e-06, + "loss": 0.85608101, + "num_input_tokens_seen": 142354570, + "step": 6627, + "time_per_iteration": 4.096599340438843 + }, + { + "auxiliary_loss_clip": 0.01127143, + "auxiliary_loss_mlp": 0.00747493, + "balance_loss_clip": 1.00211644, + "balance_loss_mlp": 1.00032771, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.5246112705997465, + "language_loss": 0.82796055, + "learning_rate": 2.736806725217998e-06, + "loss": 0.84670693, + "num_input_tokens_seen": 142374395, + "step": 6628, + "time_per_iteration": 2.684396743774414 + }, + { + "auxiliary_loss_clip": 0.01122422, + "auxiliary_loss_mlp": 0.01123419, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00096679, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.4973016003287365, + "language_loss": 0.71181738, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73427576, + "num_input_tokens_seen": 142396040, + "step": 6629, + "time_per_iteration": 2.6818442344665527 + }, + { + "auxiliary_loss_clip": 0.01120595, + "auxiliary_loss_mlp": 0.01121888, + "balance_loss_clip": 1.0019834, + "balance_loss_mlp": 1.0007714, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 1.8561371469422316, + "language_loss": 0.8070153, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82944012, + "num_input_tokens_seen": 142415495, + "step": 6630, + "time_per_iteration": 2.6605851650238037 + }, + { + "auxiliary_loss_clip": 0.01087446, + "auxiliary_loss_mlp": 0.01122165, + "balance_loss_clip": 1.00159776, + "balance_loss_mlp": 1.00057125, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 2.5500835164870272, + "language_loss": 0.75234008, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77443612, + "num_input_tokens_seen": 142431865, + "step": 6631, + "time_per_iteration": 2.68129563331604 + }, + { + "auxiliary_loss_clip": 0.01122146, + "auxiliary_loss_mlp": 0.01122291, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00069785, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 1.8900005716067403, + "language_loss": 0.71431184, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73675621, + "num_input_tokens_seen": 142450595, + "step": 6632, + "time_per_iteration": 2.6825859546661377 + }, + { + "auxiliary_loss_clip": 0.01087794, + "auxiliary_loss_mlp": 0.00747439, + "balance_loss_clip": 1.00165927, + "balance_loss_mlp": 1.00026512, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.7356343885294578, + "language_loss": 0.75217539, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.77052772, + "num_input_tokens_seen": 142466650, + "step": 6633, + "time_per_iteration": 2.7015652656555176 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01121446, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00052047, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.999516149776047, + "language_loss": 0.81733555, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83992481, + "num_input_tokens_seen": 142486165, + "step": 6634, + "time_per_iteration": 2.6516833305358887 + }, + { + "auxiliary_loss_clip": 0.01137912, + "auxiliary_loss_mlp": 0.01121814, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00069785, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.731092881935023, + "language_loss": 0.74712503, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.76972234, + "num_input_tokens_seen": 142505035, + "step": 6635, + "time_per_iteration": 2.5879809856414795 + }, + { + "auxiliary_loss_clip": 0.01127249, + "auxiliary_loss_mlp": 0.01123099, + "balance_loss_clip": 1.00210822, + "balance_loss_mlp": 1.00083804, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 2.181999909351239, + "language_loss": 0.66536027, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68786383, + "num_input_tokens_seen": 142521870, + "step": 6636, + "time_per_iteration": 2.641247272491455 + }, + { + "auxiliary_loss_clip": 0.01152816, + "auxiliary_loss_mlp": 0.01121563, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.0007329, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 2.0314253143088545, + "language_loss": 0.81298041, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83572423, + "num_input_tokens_seen": 142540455, + "step": 6637, + "time_per_iteration": 2.5452165603637695 + }, + { + "auxiliary_loss_clip": 0.01118385, + "auxiliary_loss_mlp": 0.01103048, + "balance_loss_clip": 1.00175047, + "balance_loss_mlp": 1.00005102, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7276729491724327, + "language_loss": 0.53170949, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55392385, + "num_input_tokens_seen": 142599665, + "step": 6638, + "time_per_iteration": 3.2885067462921143 + }, + { + "auxiliary_loss_clip": 0.01136155, + "auxiliary_loss_mlp": 0.00747647, + "balance_loss_clip": 1.00200295, + "balance_loss_mlp": 1.00035822, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.3943588782551888, + "language_loss": 0.75598919, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77482724, + "num_input_tokens_seen": 142618845, + "step": 6639, + "time_per_iteration": 2.604485511779785 + }, + { + "auxiliary_loss_clip": 0.01091987, + "auxiliary_loss_mlp": 0.01121741, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.00071955, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.6231856932749544, + "language_loss": 0.76197577, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78411305, + "num_input_tokens_seen": 142640885, + "step": 6640, + "time_per_iteration": 2.7936840057373047 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01121983, + "balance_loss_clip": 1.00190032, + "balance_loss_mlp": 1.00077128, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.163420255903758, + "language_loss": 0.8202498, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.8426826, + "num_input_tokens_seen": 142659340, + "step": 6641, + "time_per_iteration": 2.63826584815979 + }, + { + "auxiliary_loss_clip": 0.011694, + "auxiliary_loss_mlp": 0.01122312, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.00062323, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 1.9120304190863289, + "language_loss": 0.76697433, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78989142, + "num_input_tokens_seen": 142677085, + "step": 6642, + "time_per_iteration": 2.5270092487335205 + }, + { + "auxiliary_loss_clip": 0.01135869, + "auxiliary_loss_mlp": 0.01122459, + "balance_loss_clip": 1.00198364, + "balance_loss_mlp": 1.00058007, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.2285696072532986, + "language_loss": 0.72248685, + "learning_rate": 2.731372550178393e-06, + "loss": 0.7450701, + "num_input_tokens_seen": 142694595, + "step": 6643, + "time_per_iteration": 2.6137094497680664 + }, + { + "auxiliary_loss_clip": 0.01152626, + "auxiliary_loss_mlp": 0.01122233, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.00054431, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.6104594217473578, + "language_loss": 0.66334462, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68609327, + "num_input_tokens_seen": 142714175, + "step": 6644, + "time_per_iteration": 2.561720848083496 + }, + { + "auxiliary_loss_clip": 0.01169239, + "auxiliary_loss_mlp": 0.01121663, + "balance_loss_clip": 1.00206792, + "balance_loss_mlp": 1.00073731, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 2.261429286055161, + "language_loss": 0.78205431, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80496329, + "num_input_tokens_seen": 142730955, + "step": 6645, + "time_per_iteration": 2.5148708820343018 + }, + { + "auxiliary_loss_clip": 0.01152807, + "auxiliary_loss_mlp": 0.01122658, + "balance_loss_clip": 1.00208783, + "balance_loss_mlp": 1.00058782, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 2.000272655933688, + "language_loss": 0.69670963, + "learning_rate": 2.73028496487595e-06, + "loss": 0.7194643, + "num_input_tokens_seen": 142751200, + "step": 6646, + "time_per_iteration": 2.5724289417266846 + }, + { + "auxiliary_loss_clip": 0.0110567, + "auxiliary_loss_mlp": 0.01121593, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00076294, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.8433415912778892, + "language_loss": 0.71549934, + "learning_rate": 2.729922381038513e-06, + "loss": 0.73777199, + "num_input_tokens_seen": 142770170, + "step": 6647, + "time_per_iteration": 2.6905717849731445 + }, + { + "auxiliary_loss_clip": 0.01120901, + "auxiliary_loss_mlp": 0.01120899, + "balance_loss_clip": 1.00175488, + "balance_loss_mlp": 1.00064015, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.9498128086606312, + "language_loss": 0.74204499, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76446295, + "num_input_tokens_seen": 142792680, + "step": 6648, + "time_per_iteration": 2.6988630294799805 + }, + { + "auxiliary_loss_clip": 0.01169277, + "auxiliary_loss_mlp": 0.01121602, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.00058115, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 9.876494908425302, + "language_loss": 0.65826631, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68117511, + "num_input_tokens_seen": 142810510, + "step": 6649, + "time_per_iteration": 2.538862705230713 + }, + { + "auxiliary_loss_clip": 0.01122011, + "auxiliary_loss_mlp": 0.01121851, + "balance_loss_clip": 1.00202155, + "balance_loss_mlp": 1.00073433, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.6863546027145733, + "language_loss": 0.75063604, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77307469, + "num_input_tokens_seen": 142832455, + "step": 6650, + "time_per_iteration": 2.701211929321289 + }, + { + "auxiliary_loss_clip": 0.01169283, + "auxiliary_loss_mlp": 0.0112226, + "balance_loss_clip": 1.00208259, + "balance_loss_mlp": 1.00085676, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.5416169964911834, + "language_loss": 0.7170397, + "learning_rate": 2.728471769038975e-06, + "loss": 0.73995513, + "num_input_tokens_seen": 142852590, + "step": 6651, + "time_per_iteration": 2.5451979637145996 + }, + { + "auxiliary_loss_clip": 0.01169262, + "auxiliary_loss_mlp": 0.01122, + "balance_loss_clip": 1.00203919, + "balance_loss_mlp": 1.00059676, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.826306715049131, + "language_loss": 0.72930193, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75221461, + "num_input_tokens_seen": 142870595, + "step": 6652, + "time_per_iteration": 2.52575421333313 + }, + { + "auxiliary_loss_clip": 0.01117071, + "auxiliary_loss_mlp": 0.01102995, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 0.99999857, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8473635757924783, + "language_loss": 0.60668778, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62888849, + "num_input_tokens_seen": 142925805, + "step": 6653, + "time_per_iteration": 3.1653220653533936 + }, + { + "auxiliary_loss_clip": 0.01119128, + "auxiliary_loss_mlp": 0.0112189, + "balance_loss_clip": 1.00185847, + "balance_loss_mlp": 1.0008688, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 1.8850234225452835, + "language_loss": 0.66880846, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.69121861, + "num_input_tokens_seen": 142943145, + "step": 6654, + "time_per_iteration": 2.6401069164276123 + }, + { + "auxiliary_loss_clip": 0.01152472, + "auxiliary_loss_mlp": 0.01121722, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00079584, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.019215111423813, + "language_loss": 0.89771426, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92045623, + "num_input_tokens_seen": 142956925, + "step": 6655, + "time_per_iteration": 2.5269062519073486 + }, + { + "auxiliary_loss_clip": 0.01138409, + "auxiliary_loss_mlp": 0.01120749, + "balance_loss_clip": 1.00214005, + "balance_loss_mlp": 1.00058591, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.6406712314485117, + "language_loss": 0.73635375, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75894535, + "num_input_tokens_seen": 142978040, + "step": 6656, + "time_per_iteration": 2.6725504398345947 + }, + { + "auxiliary_loss_clip": 0.01169298, + "auxiliary_loss_mlp": 0.01122115, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.00071263, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.478310191494871, + "language_loss": 0.7333433, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75625741, + "num_input_tokens_seen": 142998390, + "step": 6657, + "time_per_iteration": 2.578458309173584 + }, + { + "auxiliary_loss_clip": 0.01169397, + "auxiliary_loss_mlp": 0.01122785, + "balance_loss_clip": 1.00224495, + "balance_loss_mlp": 1.00081038, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.7394497978377685, + "language_loss": 0.7954542, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81837606, + "num_input_tokens_seen": 143021505, + "step": 6658, + "time_per_iteration": 4.1097071170806885 + }, + { + "auxiliary_loss_clip": 0.01154302, + "auxiliary_loss_mlp": 0.01123, + "balance_loss_clip": 1.00206327, + "balance_loss_mlp": 1.0008347, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.9775029379618416, + "language_loss": 0.77571744, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79849041, + "num_input_tokens_seen": 143041375, + "step": 6659, + "time_per_iteration": 2.625861883163452 + }, + { + "auxiliary_loss_clip": 0.01168865, + "auxiliary_loss_mlp": 0.01119998, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00069344, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.807567762416596, + "language_loss": 0.72449076, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.74737936, + "num_input_tokens_seen": 143058725, + "step": 6660, + "time_per_iteration": 3.9381325244903564 + }, + { + "auxiliary_loss_clip": 0.01137531, + "auxiliary_loss_mlp": 0.01121627, + "balance_loss_clip": 1.00204098, + "balance_loss_mlp": 1.00070131, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.8910678980653237, + "language_loss": 0.71170682, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73429841, + "num_input_tokens_seen": 143076995, + "step": 6661, + "time_per_iteration": 2.6208667755126953 + }, + { + "auxiliary_loss_clip": 0.01169323, + "auxiliary_loss_mlp": 0.0112209, + "balance_loss_clip": 1.00214803, + "balance_loss_mlp": 1.00078213, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 2.0874054785825735, + "language_loss": 0.75297213, + "learning_rate": 2.724480309731437e-06, + "loss": 0.7758863, + "num_input_tokens_seen": 143096780, + "step": 6662, + "time_per_iteration": 4.0850255489349365 + }, + { + "auxiliary_loss_clip": 0.01154388, + "auxiliary_loss_mlp": 0.01121999, + "balance_loss_clip": 1.00220954, + "balance_loss_mlp": 1.00069153, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 1.9214992883486057, + "language_loss": 0.6648739, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68763769, + "num_input_tokens_seen": 143112590, + "step": 6663, + "time_per_iteration": 2.5449535846710205 + }, + { + "auxiliary_loss_clip": 0.0115409, + "auxiliary_loss_mlp": 0.01121974, + "balance_loss_clip": 1.00206709, + "balance_loss_mlp": 1.00057137, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.1145420476024395, + "language_loss": 0.85790825, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.88066882, + "num_input_tokens_seen": 143130220, + "step": 6664, + "time_per_iteration": 2.54706072807312 + }, + { + "auxiliary_loss_clip": 0.01152563, + "auxiliary_loss_mlp": 0.01121962, + "balance_loss_clip": 1.00192559, + "balance_loss_mlp": 1.00055909, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 2.017448833922069, + "language_loss": 0.84500909, + "learning_rate": 2.723391152229917e-06, + "loss": 0.8677544, + "num_input_tokens_seen": 143147160, + "step": 6665, + "time_per_iteration": 4.074612140655518 + }, + { + "auxiliary_loss_clip": 0.01152916, + "auxiliary_loss_mlp": 0.01122055, + "balance_loss_clip": 1.00205684, + "balance_loss_mlp": 1.00055671, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.5924659060778772, + "language_loss": 0.7814064, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80415612, + "num_input_tokens_seen": 143164605, + "step": 6666, + "time_per_iteration": 2.5622637271881104 + }, + { + "auxiliary_loss_clip": 0.01152624, + "auxiliary_loss_mlp": 0.01122823, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00065708, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.689954568660306, + "language_loss": 0.73678362, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75953805, + "num_input_tokens_seen": 143183965, + "step": 6667, + "time_per_iteration": 2.6069395542144775 + }, + { + "auxiliary_loss_clip": 0.0115451, + "auxiliary_loss_mlp": 0.01122388, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00079417, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.422582676464539, + "language_loss": 0.75680119, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77957016, + "num_input_tokens_seen": 143204965, + "step": 6668, + "time_per_iteration": 2.6144938468933105 + }, + { + "auxiliary_loss_clip": 0.01121802, + "auxiliary_loss_mlp": 0.01122068, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.00076032, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 4.015232156175009, + "language_loss": 0.82191652, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84435523, + "num_input_tokens_seen": 143225015, + "step": 6669, + "time_per_iteration": 2.7259488105773926 + }, + { + "auxiliary_loss_clip": 0.01139018, + "auxiliary_loss_mlp": 0.01103091, + "balance_loss_clip": 1.00224447, + "balance_loss_mlp": 1.00009465, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.7062735277863977, + "language_loss": 0.53406608, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55648714, + "num_input_tokens_seen": 143294925, + "step": 6670, + "time_per_iteration": 3.358551263809204 + }, + { + "auxiliary_loss_clip": 0.01104085, + "auxiliary_loss_mlp": 0.01122091, + "balance_loss_clip": 1.00183952, + "balance_loss_mlp": 1.00068784, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.6202826681522273, + "language_loss": 0.88747323, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.90973496, + "num_input_tokens_seen": 143314170, + "step": 6671, + "time_per_iteration": 2.747267723083496 + }, + { + "auxiliary_loss_clip": 0.01154223, + "auxiliary_loss_mlp": 0.01122129, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00063097, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.8314287636457174, + "language_loss": 0.79238778, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81515133, + "num_input_tokens_seen": 143330050, + "step": 6672, + "time_per_iteration": 2.561671018600464 + }, + { + "auxiliary_loss_clip": 0.01122804, + "auxiliary_loss_mlp": 0.01121789, + "balance_loss_clip": 1.00200689, + "balance_loss_mlp": 1.00057745, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 2.234651427100597, + "language_loss": 0.63177431, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65422022, + "num_input_tokens_seen": 143348650, + "step": 6673, + "time_per_iteration": 2.646263837814331 + }, + { + "auxiliary_loss_clip": 0.011361, + "auxiliary_loss_mlp": 0.00747695, + "balance_loss_clip": 1.00198984, + "balance_loss_mlp": 1.00055981, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.381874012180602, + "language_loss": 0.79993999, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.81877792, + "num_input_tokens_seen": 143370275, + "step": 6674, + "time_per_iteration": 2.6550402641296387 + }, + { + "auxiliary_loss_clip": 0.0110377, + "auxiliary_loss_mlp": 0.01122435, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.000651, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 7.587697050430161, + "language_loss": 0.82280684, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84506893, + "num_input_tokens_seen": 143385390, + "step": 6675, + "time_per_iteration": 2.6664299964904785 + }, + { + "auxiliary_loss_clip": 0.01154261, + "auxiliary_loss_mlp": 0.01122952, + "balance_loss_clip": 1.00208437, + "balance_loss_mlp": 1.00059509, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.7135967466106825, + "language_loss": 0.93222666, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95499885, + "num_input_tokens_seen": 143404215, + "step": 6676, + "time_per_iteration": 2.597635269165039 + }, + { + "auxiliary_loss_clip": 0.01152746, + "auxiliary_loss_mlp": 0.01123467, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00063372, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 2.076776598927601, + "language_loss": 0.799909, + "learning_rate": 2.719032057146399e-06, + "loss": 0.82267112, + "num_input_tokens_seen": 143422245, + "step": 6677, + "time_per_iteration": 2.5539536476135254 + }, + { + "auxiliary_loss_clip": 0.01142325, + "auxiliary_loss_mlp": 0.01122502, + "balance_loss_clip": 1.00262249, + "balance_loss_mlp": 1.00062287, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 1.8988220986893254, + "language_loss": 0.83598685, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85863507, + "num_input_tokens_seen": 143443130, + "step": 6678, + "time_per_iteration": 2.6113977432250977 + }, + { + "auxiliary_loss_clip": 0.01169291, + "auxiliary_loss_mlp": 0.01122337, + "balance_loss_clip": 1.0021224, + "balance_loss_mlp": 1.00074387, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.6611884069804734, + "language_loss": 0.63466465, + "learning_rate": 2.718305158935434e-06, + "loss": 0.65758097, + "num_input_tokens_seen": 143461385, + "step": 6679, + "time_per_iteration": 2.543762683868408 + }, + { + "auxiliary_loss_clip": 0.01137524, + "auxiliary_loss_mlp": 0.01121888, + "balance_loss_clip": 1.00216413, + "balance_loss_mlp": 1.00077152, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.5374754049821966, + "language_loss": 0.78962398, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.81221807, + "num_input_tokens_seen": 143481750, + "step": 6680, + "time_per_iteration": 2.629720687866211 + }, + { + "auxiliary_loss_clip": 0.01121159, + "auxiliary_loss_mlp": 0.00747917, + "balance_loss_clip": 1.00184917, + "balance_loss_mlp": 1.00068116, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.518872002774598, + "language_loss": 0.75855434, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.7772451, + "num_input_tokens_seen": 143501540, + "step": 6681, + "time_per_iteration": 2.6734702587127686 + }, + { + "auxiliary_loss_clip": 0.01111853, + "auxiliary_loss_mlp": 0.01123666, + "balance_loss_clip": 1.0022831, + "balance_loss_mlp": 1.00054681, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 1.8655538894898847, + "language_loss": 0.63879001, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66114515, + "num_input_tokens_seen": 143520530, + "step": 6682, + "time_per_iteration": 2.6997463703155518 + }, + { + "auxiliary_loss_clip": 0.01105214, + "auxiliary_loss_mlp": 0.01122169, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00067103, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 2.606912777100563, + "language_loss": 0.73257774, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75485158, + "num_input_tokens_seen": 143540210, + "step": 6683, + "time_per_iteration": 2.7134037017822266 + }, + { + "auxiliary_loss_clip": 0.01152602, + "auxiliary_loss_mlp": 0.01122598, + "balance_loss_clip": 1.00212836, + "balance_loss_mlp": 1.00081348, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.7353538762406902, + "language_loss": 0.72919869, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75195068, + "num_input_tokens_seen": 143560940, + "step": 6684, + "time_per_iteration": 2.648656129837036 + }, + { + "auxiliary_loss_clip": 0.01154979, + "auxiliary_loss_mlp": 0.01103186, + "balance_loss_clip": 1.00219798, + "balance_loss_mlp": 1.00018907, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8034950592110879, + "language_loss": 0.60414493, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62672651, + "num_input_tokens_seen": 143624015, + "step": 6685, + "time_per_iteration": 3.263019323348999 + }, + { + "auxiliary_loss_clip": 0.01159093, + "auxiliary_loss_mlp": 0.0112316, + "balance_loss_clip": 1.00226486, + "balance_loss_mlp": 1.0006125, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 1.6232550847451253, + "language_loss": 0.69792736, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72074991, + "num_input_tokens_seen": 143642750, + "step": 6686, + "time_per_iteration": 2.5888068675994873 + }, + { + "auxiliary_loss_clip": 0.01136456, + "auxiliary_loss_mlp": 0.01122177, + "balance_loss_clip": 1.00206876, + "balance_loss_mlp": 1.00067925, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.4050537857279488, + "language_loss": 0.74678361, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76936996, + "num_input_tokens_seen": 143664515, + "step": 6687, + "time_per_iteration": 2.624908208847046 + }, + { + "auxiliary_loss_clip": 0.01136241, + "auxiliary_loss_mlp": 0.0112332, + "balance_loss_clip": 1.00213242, + "balance_loss_mlp": 1.0005821, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.743635340045074, + "language_loss": 0.70979947, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73239505, + "num_input_tokens_seen": 143683135, + "step": 6688, + "time_per_iteration": 2.6028687953948975 + }, + { + "auxiliary_loss_clip": 0.01137716, + "auxiliary_loss_mlp": 0.01124566, + "balance_loss_clip": 1.00212097, + "balance_loss_mlp": 1.00077891, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.6050377447173132, + "language_loss": 0.6460799, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.66870272, + "num_input_tokens_seen": 143703985, + "step": 6689, + "time_per_iteration": 2.6182994842529297 + }, + { + "auxiliary_loss_clip": 0.01153878, + "auxiliary_loss_mlp": 0.01122976, + "balance_loss_clip": 1.00205588, + "balance_loss_mlp": 1.00052476, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.6800576736205315, + "language_loss": 0.73245388, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75522244, + "num_input_tokens_seen": 143719245, + "step": 6690, + "time_per_iteration": 2.5577993392944336 + }, + { + "auxiliary_loss_clip": 0.01122633, + "auxiliary_loss_mlp": 0.0112306, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00079882, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.6412721218019668, + "language_loss": 0.74876225, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.77121913, + "num_input_tokens_seen": 143739575, + "step": 6691, + "time_per_iteration": 2.701050281524658 + }, + { + "auxiliary_loss_clip": 0.01137628, + "auxiliary_loss_mlp": 0.01123659, + "balance_loss_clip": 1.00220835, + "balance_loss_mlp": 1.00073087, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.701228363261786, + "language_loss": 0.72589201, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74850488, + "num_input_tokens_seen": 143758515, + "step": 6692, + "time_per_iteration": 2.653071641921997 + }, + { + "auxiliary_loss_clip": 0.01105685, + "auxiliary_loss_mlp": 0.01122758, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00078309, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 2.039662610664577, + "language_loss": 0.84429789, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86658233, + "num_input_tokens_seen": 143776770, + "step": 6693, + "time_per_iteration": 2.6913137435913086 + }, + { + "auxiliary_loss_clip": 0.01120971, + "auxiliary_loss_mlp": 0.01122789, + "balance_loss_clip": 1.00200331, + "balance_loss_mlp": 1.00091004, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.6654534377056773, + "language_loss": 0.71006757, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73250514, + "num_input_tokens_seen": 143798450, + "step": 6694, + "time_per_iteration": 2.7573230266571045 + }, + { + "auxiliary_loss_clip": 0.01136244, + "auxiliary_loss_mlp": 0.01122463, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00067854, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.6204947914298296, + "language_loss": 0.67569017, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.69827724, + "num_input_tokens_seen": 143816995, + "step": 6695, + "time_per_iteration": 2.598299264907837 + }, + { + "auxiliary_loss_clip": 0.01137397, + "auxiliary_loss_mlp": 0.01122854, + "balance_loss_clip": 1.00200546, + "balance_loss_mlp": 1.00059271, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.0541895410067483, + "language_loss": 0.7925306, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81513315, + "num_input_tokens_seen": 143842090, + "step": 6696, + "time_per_iteration": 4.331331729888916 + }, + { + "auxiliary_loss_clip": 0.01137423, + "auxiliary_loss_mlp": 0.01123234, + "balance_loss_clip": 1.00206828, + "balance_loss_mlp": 1.00078189, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.770345057397368, + "language_loss": 0.70763993, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73024642, + "num_input_tokens_seen": 143860800, + "step": 6697, + "time_per_iteration": 3.992157220840454 + }, + { + "auxiliary_loss_clip": 0.0115403, + "auxiliary_loss_mlp": 0.01122511, + "balance_loss_clip": 1.00209057, + "balance_loss_mlp": 1.00091779, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.0854803670582034, + "language_loss": 0.61307454, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63583994, + "num_input_tokens_seen": 143878950, + "step": 6698, + "time_per_iteration": 2.609496593475342 + }, + { + "auxiliary_loss_clip": 0.01152612, + "auxiliary_loss_mlp": 0.0112239, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00070095, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.78749461762002, + "language_loss": 0.76256156, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78531158, + "num_input_tokens_seen": 143898385, + "step": 6699, + "time_per_iteration": 4.048049211502075 + }, + { + "auxiliary_loss_clip": 0.01120656, + "auxiliary_loss_mlp": 0.01122234, + "balance_loss_clip": 1.00190294, + "balance_loss_mlp": 1.00054538, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.52986131388882, + "language_loss": 0.80132699, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82375586, + "num_input_tokens_seen": 143918795, + "step": 6700, + "time_per_iteration": 2.630654811859131 + }, + { + "auxiliary_loss_clip": 0.01136421, + "auxiliary_loss_mlp": 0.01123504, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00076604, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 2.0386980582725682, + "language_loss": 0.74724221, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.76984143, + "num_input_tokens_seen": 143938245, + "step": 6701, + "time_per_iteration": 2.688796043395996 + }, + { + "auxiliary_loss_clip": 0.0113772, + "auxiliary_loss_mlp": 0.01122522, + "balance_loss_clip": 1.00205731, + "balance_loss_mlp": 1.00064242, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.5352211303710732, + "language_loss": 0.65638381, + "learning_rate": 2.709938026276208e-06, + "loss": 0.67898625, + "num_input_tokens_seen": 143960995, + "step": 6702, + "time_per_iteration": 2.6566596031188965 + }, + { + "auxiliary_loss_clip": 0.01137803, + "auxiliary_loss_mlp": 0.01122984, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00091362, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.8078485623646148, + "language_loss": 0.66121101, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68381888, + "num_input_tokens_seen": 143979910, + "step": 6703, + "time_per_iteration": 3.988788366317749 + }, + { + "auxiliary_loss_clip": 0.01074168, + "auxiliary_loss_mlp": 0.0112304, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.00068402, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 1.7480927950458316, + "language_loss": 0.82216203, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84413409, + "num_input_tokens_seen": 144000095, + "step": 6704, + "time_per_iteration": 2.904780149459839 + }, + { + "auxiliary_loss_clip": 0.01142163, + "auxiliary_loss_mlp": 0.01122917, + "balance_loss_clip": 1.00271821, + "balance_loss_mlp": 1.00065637, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 1.5340184895021913, + "language_loss": 0.73349261, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75614333, + "num_input_tokens_seen": 144019695, + "step": 6705, + "time_per_iteration": 2.834052085876465 + }, + { + "auxiliary_loss_clip": 0.01152289, + "auxiliary_loss_mlp": 0.01121998, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00069022, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.8694326881941459, + "language_loss": 0.66750902, + "learning_rate": 2.708481414320713e-06, + "loss": 0.69025183, + "num_input_tokens_seen": 144038525, + "step": 6706, + "time_per_iteration": 2.553410768508911 + }, + { + "auxiliary_loss_clip": 0.0115264, + "auxiliary_loss_mlp": 0.01122202, + "balance_loss_clip": 1.00212598, + "balance_loss_mlp": 1.00070381, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.4339579126910909, + "language_loss": 0.71252251, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73527098, + "num_input_tokens_seen": 144059485, + "step": 6707, + "time_per_iteration": 2.563655376434326 + }, + { + "auxiliary_loss_clip": 0.01135536, + "auxiliary_loss_mlp": 0.01121137, + "balance_loss_clip": 1.00193715, + "balance_loss_mlp": 1.0004971, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.5492479666623507, + "language_loss": 0.80101478, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82358146, + "num_input_tokens_seen": 144080265, + "step": 6708, + "time_per_iteration": 2.665501594543457 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.0112285, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.00077939, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.0823800762880427, + "language_loss": 0.82722509, + "learning_rate": 2.70738867321606e-06, + "loss": 0.84950149, + "num_input_tokens_seen": 144098040, + "step": 6709, + "time_per_iteration": 2.646529197692871 + }, + { + "auxiliary_loss_clip": 0.01152498, + "auxiliary_loss_mlp": 0.01123211, + "balance_loss_clip": 1.00203276, + "balance_loss_mlp": 1.00075972, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.417589688000101, + "language_loss": 0.71388751, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73664463, + "num_input_tokens_seen": 144118265, + "step": 6710, + "time_per_iteration": 2.610419511795044 + }, + { + "auxiliary_loss_clip": 0.01137558, + "auxiliary_loss_mlp": 0.01122379, + "balance_loss_clip": 1.00201368, + "balance_loss_mlp": 1.00068974, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 1.900428918416893, + "language_loss": 0.85027432, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87287366, + "num_input_tokens_seen": 144133865, + "step": 6711, + "time_per_iteration": 2.5675501823425293 + }, + { + "auxiliary_loss_clip": 0.01154254, + "auxiliary_loss_mlp": 0.01122529, + "balance_loss_clip": 1.00211883, + "balance_loss_mlp": 1.00064957, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 2.8614756847663227, + "language_loss": 0.76444054, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78720838, + "num_input_tokens_seen": 144150125, + "step": 6712, + "time_per_iteration": 2.526949644088745 + }, + { + "auxiliary_loss_clip": 0.01144207, + "auxiliary_loss_mlp": 0.01122454, + "balance_loss_clip": 1.00245452, + "balance_loss_mlp": 1.00066972, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 1.9975984611865383, + "language_loss": 0.7893635, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81203008, + "num_input_tokens_seen": 144169295, + "step": 6713, + "time_per_iteration": 2.6349682807922363 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.01122235, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00064111, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 1.8717732997413945, + "language_loss": 0.87983108, + "learning_rate": 2.705566901740865e-06, + "loss": 0.9022882, + "num_input_tokens_seen": 144185790, + "step": 6714, + "time_per_iteration": 2.666748523712158 + }, + { + "auxiliary_loss_clip": 0.01152809, + "auxiliary_loss_mlp": 0.01122075, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.0005765, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 1.868487196300914, + "language_loss": 0.69196922, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.7147181, + "num_input_tokens_seen": 144205190, + "step": 6715, + "time_per_iteration": 2.556379795074463 + }, + { + "auxiliary_loss_clip": 0.01105448, + "auxiliary_loss_mlp": 0.01122564, + "balance_loss_clip": 1.00191009, + "balance_loss_mlp": 1.00058961, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 1.944509309995629, + "language_loss": 0.77410239, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79638255, + "num_input_tokens_seen": 144222705, + "step": 6716, + "time_per_iteration": 2.6679484844207764 + }, + { + "auxiliary_loss_clip": 0.01105218, + "auxiliary_loss_mlp": 0.01121234, + "balance_loss_clip": 1.00199389, + "balance_loss_mlp": 1.00068951, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.9634309429781025, + "language_loss": 0.76118189, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78344643, + "num_input_tokens_seen": 144239545, + "step": 6717, + "time_per_iteration": 2.6685633659362793 + }, + { + "auxiliary_loss_clip": 0.01138616, + "auxiliary_loss_mlp": 0.01102332, + "balance_loss_clip": 1.00252056, + "balance_loss_mlp": 1.00009847, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.92160428431238, + "language_loss": 0.60759568, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.63000512, + "num_input_tokens_seen": 144288145, + "step": 6718, + "time_per_iteration": 3.1269874572753906 + }, + { + "auxiliary_loss_clip": 0.01169397, + "auxiliary_loss_mlp": 0.01122718, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00083828, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 1.8640159455895051, + "language_loss": 0.74662697, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76954812, + "num_input_tokens_seen": 144302315, + "step": 6719, + "time_per_iteration": 2.5596089363098145 + }, + { + "auxiliary_loss_clip": 0.01154218, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_clip": 1.00208879, + "balance_loss_mlp": 1.00071287, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 1.9786962782041466, + "language_loss": 0.8170315, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83978909, + "num_input_tokens_seen": 144318990, + "step": 6720, + "time_per_iteration": 2.591323137283325 + }, + { + "auxiliary_loss_clip": 0.01135431, + "auxiliary_loss_mlp": 0.0112215, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00065172, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 1.8870398144050646, + "language_loss": 0.76695383, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.78952968, + "num_input_tokens_seen": 144335765, + "step": 6721, + "time_per_iteration": 2.589508295059204 + }, + { + "auxiliary_loss_clip": 0.01120013, + "auxiliary_loss_mlp": 0.01121253, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.00051832, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 1.5745380680824685, + "language_loss": 0.72524434, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74765706, + "num_input_tokens_seen": 144355825, + "step": 6722, + "time_per_iteration": 2.713176965713501 + }, + { + "auxiliary_loss_clip": 0.01152349, + "auxiliary_loss_mlp": 0.01121068, + "balance_loss_clip": 1.00199544, + "balance_loss_mlp": 1.00061882, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.998267300907205, + "language_loss": 0.65775919, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.68049335, + "num_input_tokens_seen": 144374320, + "step": 6723, + "time_per_iteration": 2.5384438037872314 + }, + { + "auxiliary_loss_clip": 0.01152904, + "auxiliary_loss_mlp": 0.01122629, + "balance_loss_clip": 1.0019778, + "balance_loss_mlp": 1.00074935, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.4094453955368047, + "language_loss": 0.73631245, + "learning_rate": 2.701921353880734e-06, + "loss": 0.75906777, + "num_input_tokens_seen": 144394325, + "step": 6724, + "time_per_iteration": 2.560986042022705 + }, + { + "auxiliary_loss_clip": 0.01136028, + "auxiliary_loss_mlp": 0.01120719, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.0006516, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.7339095124563626, + "language_loss": 0.74477744, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76734495, + "num_input_tokens_seen": 144412765, + "step": 6725, + "time_per_iteration": 2.7128255367279053 + }, + { + "auxiliary_loss_clip": 0.01152206, + "auxiliary_loss_mlp": 0.01121531, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00060499, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.5709526292574802, + "language_loss": 0.7690022, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79173958, + "num_input_tokens_seen": 144435400, + "step": 6726, + "time_per_iteration": 2.7660512924194336 + }, + { + "auxiliary_loss_clip": 0.01138498, + "auxiliary_loss_mlp": 0.00747741, + "balance_loss_clip": 1.00202215, + "balance_loss_mlp": 1.00069964, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.6612759391801175, + "language_loss": 0.81100368, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.82986605, + "num_input_tokens_seen": 144452925, + "step": 6727, + "time_per_iteration": 2.6138393878936768 + }, + { + "auxiliary_loss_clip": 0.01169119, + "auxiliary_loss_mlp": 0.01120802, + "balance_loss_clip": 1.00212348, + "balance_loss_mlp": 1.00063932, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 1.8098714308167445, + "language_loss": 0.84543526, + "learning_rate": 2.700462388688447e-06, + "loss": 0.86833453, + "num_input_tokens_seen": 144470195, + "step": 6728, + "time_per_iteration": 2.4903407096862793 + }, + { + "auxiliary_loss_clip": 0.01121064, + "auxiliary_loss_mlp": 0.01121224, + "balance_loss_clip": 1.00202477, + "balance_loss_mlp": 1.00067925, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.9402406439684372, + "language_loss": 0.81328452, + "learning_rate": 2.700097580951786e-06, + "loss": 0.83570737, + "num_input_tokens_seen": 144490320, + "step": 6729, + "time_per_iteration": 2.6399223804473877 + }, + { + "auxiliary_loss_clip": 0.0113687, + "auxiliary_loss_mlp": 0.01122075, + "balance_loss_clip": 1.00199294, + "balance_loss_mlp": 1.00057685, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 2.125194996339627, + "language_loss": 0.73095238, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.75354183, + "num_input_tokens_seen": 144508990, + "step": 6730, + "time_per_iteration": 2.6105258464813232 + }, + { + "auxiliary_loss_clip": 0.01154109, + "auxiliary_loss_mlp": 0.01121823, + "balance_loss_clip": 1.002141, + "balance_loss_mlp": 1.0007062, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 1.6206530436465518, + "language_loss": 0.67390573, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69666505, + "num_input_tokens_seen": 144529550, + "step": 6731, + "time_per_iteration": 2.7082266807556152 + }, + { + "auxiliary_loss_clip": 0.01169119, + "auxiliary_loss_mlp": 0.01121099, + "balance_loss_clip": 1.00209415, + "balance_loss_mlp": 1.00064957, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.8417694676316918, + "language_loss": 0.74106419, + "learning_rate": 2.699002998510517e-06, + "loss": 0.76396632, + "num_input_tokens_seen": 144549310, + "step": 6732, + "time_per_iteration": 2.5306410789489746 + }, + { + "auxiliary_loss_clip": 0.01136166, + "auxiliary_loss_mlp": 0.00747573, + "balance_loss_clip": 1.00206101, + "balance_loss_mlp": 1.00068605, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.7930305576655832, + "language_loss": 0.77478689, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79362428, + "num_input_tokens_seen": 144567430, + "step": 6733, + "time_per_iteration": 3.964627981185913 + }, + { + "auxiliary_loss_clip": 0.01138937, + "auxiliary_loss_mlp": 0.01121636, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00071061, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.7976670746511256, + "language_loss": 0.76720613, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78981185, + "num_input_tokens_seen": 144585975, + "step": 6734, + "time_per_iteration": 2.607673406600952 + }, + { + "auxiliary_loss_clip": 0.01137304, + "auxiliary_loss_mlp": 0.01121869, + "balance_loss_clip": 1.00189459, + "balance_loss_mlp": 1.00056183, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.0868660873270697, + "language_loss": 0.65095097, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67354262, + "num_input_tokens_seen": 144605225, + "step": 6735, + "time_per_iteration": 4.237279653549194 + }, + { + "auxiliary_loss_clip": 0.011216, + "auxiliary_loss_mlp": 0.01121099, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.00064945, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 6.032949988761962, + "language_loss": 0.83313334, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85556036, + "num_input_tokens_seen": 144624145, + "step": 6736, + "time_per_iteration": 2.7547688484191895 + }, + { + "auxiliary_loss_clip": 0.01121479, + "auxiliary_loss_mlp": 0.00747735, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00069535, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.6279811150244905, + "language_loss": 0.74971509, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.76840723, + "num_input_tokens_seen": 144644470, + "step": 6737, + "time_per_iteration": 4.18938946723938 + }, + { + "auxiliary_loss_clip": 0.01152444, + "auxiliary_loss_mlp": 0.01120707, + "balance_loss_clip": 1.00207818, + "balance_loss_mlp": 1.00083005, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 1.945711651578084, + "language_loss": 0.71664935, + "learning_rate": 2.696813118332519e-06, + "loss": 0.73938084, + "num_input_tokens_seen": 144661055, + "step": 6738, + "time_per_iteration": 2.5375547409057617 + }, + { + "auxiliary_loss_clip": 0.0112086, + "auxiliary_loss_mlp": 0.01120551, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00057924, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 1.7399750525571127, + "language_loss": 0.74838948, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77080357, + "num_input_tokens_seen": 144677935, + "step": 6739, + "time_per_iteration": 2.628018379211426 + }, + { + "auxiliary_loss_clip": 0.0112282, + "auxiliary_loss_mlp": 0.0112158, + "balance_loss_clip": 1.00187147, + "balance_loss_mlp": 1.00055897, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 4.818383725929556, + "language_loss": 0.73957777, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76202178, + "num_input_tokens_seen": 144697725, + "step": 6740, + "time_per_iteration": 4.111506700515747 + }, + { + "auxiliary_loss_clip": 0.01153857, + "auxiliary_loss_mlp": 0.01120763, + "balance_loss_clip": 1.00216806, + "balance_loss_mlp": 1.00050426, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.5662787513615744, + "language_loss": 0.76987267, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79261887, + "num_input_tokens_seen": 144718805, + "step": 6741, + "time_per_iteration": 2.579547882080078 + }, + { + "auxiliary_loss_clip": 0.01169018, + "auxiliary_loss_mlp": 0.01121593, + "balance_loss_clip": 1.00201941, + "balance_loss_mlp": 1.0006671, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 2.708689667340875, + "language_loss": 0.713745, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.73665106, + "num_input_tokens_seen": 144737105, + "step": 6742, + "time_per_iteration": 2.5219669342041016 + }, + { + "auxiliary_loss_clip": 0.01169124, + "auxiliary_loss_mlp": 0.01121288, + "balance_loss_clip": 1.00210643, + "balance_loss_mlp": 1.00055242, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.304697076201456, + "language_loss": 0.72776204, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.7506662, + "num_input_tokens_seen": 144751350, + "step": 6743, + "time_per_iteration": 2.494419574737549 + }, + { + "auxiliary_loss_clip": 0.01138688, + "auxiliary_loss_mlp": 0.01122695, + "balance_loss_clip": 1.00219297, + "balance_loss_mlp": 1.00062513, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 1.8913809176874226, + "language_loss": 0.71006143, + "learning_rate": 2.694622286918588e-06, + "loss": 0.73267531, + "num_input_tokens_seen": 144770030, + "step": 6744, + "time_per_iteration": 2.625213861465454 + }, + { + "auxiliary_loss_clip": 0.01152277, + "auxiliary_loss_mlp": 0.01120823, + "balance_loss_clip": 1.0019623, + "balance_loss_mlp": 1.00066042, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.4928751268578975, + "language_loss": 0.79876006, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82149106, + "num_input_tokens_seen": 144790965, + "step": 6745, + "time_per_iteration": 2.615931749343872 + }, + { + "auxiliary_loss_clip": 0.01143369, + "auxiliary_loss_mlp": 0.01121502, + "balance_loss_clip": 1.00250447, + "balance_loss_mlp": 1.00067127, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 1.8190949875136504, + "language_loss": 0.67078364, + "learning_rate": 2.693891798911731e-06, + "loss": 0.69343233, + "num_input_tokens_seen": 144807755, + "step": 6746, + "time_per_iteration": 2.535529851913452 + }, + { + "auxiliary_loss_clip": 0.01121005, + "auxiliary_loss_mlp": 0.01121208, + "balance_loss_clip": 1.00204122, + "balance_loss_mlp": 1.00056827, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.5705664613225405, + "language_loss": 0.57147473, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59389687, + "num_input_tokens_seen": 144832405, + "step": 6747, + "time_per_iteration": 2.8446946144104004 + }, + { + "auxiliary_loss_clip": 0.01125677, + "auxiliary_loss_mlp": 0.01122117, + "balance_loss_clip": 1.00228763, + "balance_loss_mlp": 1.00080931, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.6055440569962431, + "language_loss": 0.84675711, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86923504, + "num_input_tokens_seen": 144853890, + "step": 6748, + "time_per_iteration": 2.663261890411377 + }, + { + "auxiliary_loss_clip": 0.01136173, + "auxiliary_loss_mlp": 0.01121781, + "balance_loss_clip": 1.00190473, + "balance_loss_mlp": 1.00066459, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 1.9690756627262134, + "language_loss": 0.81756568, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.84014523, + "num_input_tokens_seen": 144871395, + "step": 6749, + "time_per_iteration": 2.54962158203125 + }, + { + "auxiliary_loss_clip": 0.01154117, + "auxiliary_loss_mlp": 0.00747771, + "balance_loss_clip": 1.0022018, + "balance_loss_mlp": 1.00054502, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.670308119127838, + "language_loss": 0.75238514, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77140397, + "num_input_tokens_seen": 144890975, + "step": 6750, + "time_per_iteration": 2.581246852874756 + }, + { + "auxiliary_loss_clip": 0.01137484, + "auxiliary_loss_mlp": 0.01122211, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00061738, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 23.72897218538313, + "language_loss": 0.73690814, + "learning_rate": 2.692065118669195e-06, + "loss": 0.75950509, + "num_input_tokens_seen": 144908170, + "step": 6751, + "time_per_iteration": 2.5908684730529785 + }, + { + "auxiliary_loss_clip": 0.01111288, + "auxiliary_loss_mlp": 0.01122178, + "balance_loss_clip": 1.00218701, + "balance_loss_mlp": 1.00067949, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 1.443565714156606, + "language_loss": 0.66841882, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.69075346, + "num_input_tokens_seen": 144928020, + "step": 6752, + "time_per_iteration": 2.730588912963867 + }, + { + "auxiliary_loss_clip": 0.01104584, + "auxiliary_loss_mlp": 0.01121845, + "balance_loss_clip": 1.00194061, + "balance_loss_mlp": 1.00072789, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.6588613522591942, + "language_loss": 0.707039, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72930324, + "num_input_tokens_seen": 144951240, + "step": 6753, + "time_per_iteration": 2.9174273014068604 + }, + { + "auxiliary_loss_clip": 0.01139126, + "auxiliary_loss_mlp": 0.01122515, + "balance_loss_clip": 1.00208926, + "balance_loss_mlp": 1.00073123, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.14475864554652, + "language_loss": 0.72037345, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74298984, + "num_input_tokens_seen": 144969100, + "step": 6754, + "time_per_iteration": 2.618762969970703 + }, + { + "auxiliary_loss_clip": 0.01127068, + "auxiliary_loss_mlp": 0.01123087, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00082636, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 2.0050157110979985, + "language_loss": 0.82914853, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85165006, + "num_input_tokens_seen": 144987065, + "step": 6755, + "time_per_iteration": 2.6512069702148438 + }, + { + "auxiliary_loss_clip": 0.01111365, + "auxiliary_loss_mlp": 0.01122079, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00067639, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.553895187275539, + "language_loss": 0.70543951, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72777396, + "num_input_tokens_seen": 145007310, + "step": 6756, + "time_per_iteration": 2.7123913764953613 + }, + { + "auxiliary_loss_clip": 0.01091964, + "auxiliary_loss_mlp": 0.00747817, + "balance_loss_clip": 1.00175214, + "balance_loss_mlp": 1.00049233, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.713225391797213, + "language_loss": 0.78513169, + "learning_rate": 2.689872236505755e-06, + "loss": 0.8035295, + "num_input_tokens_seen": 145026210, + "step": 6757, + "time_per_iteration": 2.755568265914917 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01122166, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00057209, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 2.3233750817774648, + "language_loss": 0.78582883, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.80841136, + "num_input_tokens_seen": 145045475, + "step": 6758, + "time_per_iteration": 2.6374552249908447 + }, + { + "auxiliary_loss_clip": 0.01119528, + "auxiliary_loss_mlp": 0.01122024, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.00062132, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 1.9291341131463378, + "language_loss": 0.8899169, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.91233242, + "num_input_tokens_seen": 145062260, + "step": 6759, + "time_per_iteration": 2.630267858505249 + }, + { + "auxiliary_loss_clip": 0.01125668, + "auxiliary_loss_mlp": 0.01121619, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00059831, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 2.0757657573256076, + "language_loss": 0.64156067, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66403353, + "num_input_tokens_seen": 145082470, + "step": 6760, + "time_per_iteration": 2.6645524501800537 + }, + { + "auxiliary_loss_clip": 0.01154266, + "auxiliary_loss_mlp": 0.0112281, + "balance_loss_clip": 1.00202131, + "balance_loss_mlp": 1.00073993, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.4092108428817403, + "language_loss": 0.75290108, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77567184, + "num_input_tokens_seen": 145105685, + "step": 6761, + "time_per_iteration": 2.6151814460754395 + }, + { + "auxiliary_loss_clip": 0.01135923, + "auxiliary_loss_mlp": 0.01121118, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00066936, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 2.0715346652536524, + "language_loss": 0.7001403, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72271067, + "num_input_tokens_seen": 145125590, + "step": 6762, + "time_per_iteration": 2.587104320526123 + }, + { + "auxiliary_loss_clip": 0.01152875, + "auxiliary_loss_mlp": 0.01122531, + "balance_loss_clip": 1.00195253, + "balance_loss_mlp": 1.00084186, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 3.763143293805348, + "language_loss": 0.73234838, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.7551024, + "num_input_tokens_seen": 145146810, + "step": 6763, + "time_per_iteration": 2.612001419067383 + }, + { + "auxiliary_loss_clip": 0.01124097, + "auxiliary_loss_mlp": 0.01122598, + "balance_loss_clip": 1.00207961, + "balance_loss_mlp": 1.00062323, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 1.9228240035728899, + "language_loss": 0.69281125, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71527827, + "num_input_tokens_seen": 145163130, + "step": 6764, + "time_per_iteration": 2.6330630779266357 + }, + { + "auxiliary_loss_clip": 0.01123196, + "auxiliary_loss_mlp": 0.01123528, + "balance_loss_clip": 1.0019486, + "balance_loss_mlp": 1.00079048, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 3.5376640299554554, + "language_loss": 0.91285372, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93532091, + "num_input_tokens_seen": 145181420, + "step": 6765, + "time_per_iteration": 2.740845203399658 + }, + { + "auxiliary_loss_clip": 0.01154274, + "auxiliary_loss_mlp": 0.01123548, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00081027, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.5045129498354815, + "language_loss": 0.79118514, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.81396335, + "num_input_tokens_seen": 145198545, + "step": 6766, + "time_per_iteration": 2.558107852935791 + }, + { + "auxiliary_loss_clip": 0.01169184, + "auxiliary_loss_mlp": 0.01122452, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00066745, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.817157425990952, + "language_loss": 0.7647205, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78763688, + "num_input_tokens_seen": 145215835, + "step": 6767, + "time_per_iteration": 2.4776735305786133 + }, + { + "auxiliary_loss_clip": 0.01152952, + "auxiliary_loss_mlp": 0.01122185, + "balance_loss_clip": 1.00210094, + "balance_loss_mlp": 1.00078225, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.6868215588164188, + "language_loss": 0.77628201, + "learning_rate": 2.685849508738034e-06, + "loss": 0.7990334, + "num_input_tokens_seen": 145236555, + "step": 6768, + "time_per_iteration": 2.5978455543518066 + }, + { + "auxiliary_loss_clip": 0.01169262, + "auxiliary_loss_mlp": 0.01121721, + "balance_loss_clip": 1.00209415, + "balance_loss_mlp": 1.00069976, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 1.8599715335700795, + "language_loss": 0.87053883, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89344871, + "num_input_tokens_seen": 145254595, + "step": 6769, + "time_per_iteration": 2.5078864097595215 + }, + { + "auxiliary_loss_clip": 0.01137264, + "auxiliary_loss_mlp": 0.01122014, + "balance_loss_clip": 1.00204313, + "balance_loss_mlp": 1.00080192, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.6861133788721419, + "language_loss": 0.81002486, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83261764, + "num_input_tokens_seen": 145274005, + "step": 6770, + "time_per_iteration": 4.061398506164551 + }, + { + "auxiliary_loss_clip": 0.01169395, + "auxiliary_loss_mlp": 0.01122474, + "balance_loss_clip": 1.00214481, + "balance_loss_mlp": 1.00068963, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.6116281984073635, + "language_loss": 0.80384219, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82676089, + "num_input_tokens_seen": 145294850, + "step": 6771, + "time_per_iteration": 2.568748712539673 + }, + { + "auxiliary_loss_clip": 0.01122115, + "auxiliary_loss_mlp": 0.01121718, + "balance_loss_clip": 1.00194883, + "balance_loss_mlp": 1.00079167, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.6487321835843345, + "language_loss": 0.76045829, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78289664, + "num_input_tokens_seen": 145317050, + "step": 6772, + "time_per_iteration": 4.057543754577637 + }, + { + "auxiliary_loss_clip": 0.01135932, + "auxiliary_loss_mlp": 0.01122801, + "balance_loss_clip": 1.00179791, + "balance_loss_mlp": 1.00092185, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.7965805423301848, + "language_loss": 0.8129847, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83557206, + "num_input_tokens_seen": 145334480, + "step": 6773, + "time_per_iteration": 2.576051950454712 + }, + { + "auxiliary_loss_clip": 0.01135424, + "auxiliary_loss_mlp": 0.0110321, + "balance_loss_clip": 1.00190043, + "balance_loss_mlp": 1.00021291, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8167127922187669, + "language_loss": 0.64358503, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66597134, + "num_input_tokens_seen": 145388695, + "step": 6774, + "time_per_iteration": 3.0684831142425537 + }, + { + "auxiliary_loss_clip": 0.01103401, + "auxiliary_loss_mlp": 0.01122615, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00063992, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 1.8634041571862934, + "language_loss": 0.72371471, + "learning_rate": 2.683287951431446e-06, + "loss": 0.7459749, + "num_input_tokens_seen": 145408240, + "step": 6775, + "time_per_iteration": 4.151534557342529 + }, + { + "auxiliary_loss_clip": 0.01142545, + "auxiliary_loss_mlp": 0.00747823, + "balance_loss_clip": 1.00229073, + "balance_loss_mlp": 1.00061107, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.3831394168623925, + "language_loss": 0.77639318, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.79529691, + "num_input_tokens_seen": 145428395, + "step": 6776, + "time_per_iteration": 2.6159403324127197 + }, + { + "auxiliary_loss_clip": 0.01152645, + "auxiliary_loss_mlp": 0.0112347, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.00082695, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 1.8650966296336577, + "language_loss": 0.78809494, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81085604, + "num_input_tokens_seen": 145448290, + "step": 6777, + "time_per_iteration": 4.083766937255859 + }, + { + "auxiliary_loss_clip": 0.01165137, + "auxiliary_loss_mlp": 0.01102433, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00019956, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.67908307678666, + "language_loss": 0.53107584, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55375159, + "num_input_tokens_seen": 145509785, + "step": 6778, + "time_per_iteration": 3.102202892303467 + }, + { + "auxiliary_loss_clip": 0.01169441, + "auxiliary_loss_mlp": 0.00747799, + "balance_loss_clip": 1.00226617, + "balance_loss_mlp": 1.00078154, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 1.9573221014306303, + "language_loss": 0.82699823, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84617066, + "num_input_tokens_seen": 145528620, + "step": 6779, + "time_per_iteration": 2.5746982097625732 + }, + { + "auxiliary_loss_clip": 0.01154377, + "auxiliary_loss_mlp": 0.01122635, + "balance_loss_clip": 1.00221086, + "balance_loss_mlp": 1.00075555, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.693131351323906, + "language_loss": 0.76022208, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78299218, + "num_input_tokens_seen": 145547775, + "step": 6780, + "time_per_iteration": 2.6159870624542236 + }, + { + "auxiliary_loss_clip": 0.01152227, + "auxiliary_loss_mlp": 0.01121479, + "balance_loss_clip": 1.0019449, + "balance_loss_mlp": 1.0007441, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 1.8250829845239482, + "language_loss": 0.66699749, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68973458, + "num_input_tokens_seen": 145564465, + "step": 6781, + "time_per_iteration": 2.5507380962371826 + }, + { + "auxiliary_loss_clip": 0.01137473, + "auxiliary_loss_mlp": 0.01121696, + "balance_loss_clip": 1.00193787, + "balance_loss_mlp": 1.00057983, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 1.5430327926898582, + "language_loss": 0.71318138, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73577309, + "num_input_tokens_seen": 145585965, + "step": 6782, + "time_per_iteration": 2.8288512229919434 + }, + { + "auxiliary_loss_clip": 0.01154121, + "auxiliary_loss_mlp": 0.01122575, + "balance_loss_clip": 1.00199521, + "balance_loss_mlp": 1.00050426, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 2.235001760671316, + "language_loss": 0.82489794, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84766489, + "num_input_tokens_seen": 145605000, + "step": 6783, + "time_per_iteration": 2.6795573234558105 + }, + { + "auxiliary_loss_clip": 0.01152855, + "auxiliary_loss_mlp": 0.01123126, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00067401, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 2.308049217170444, + "language_loss": 0.80966425, + "learning_rate": 2.679992655730283e-06, + "loss": 0.83242404, + "num_input_tokens_seen": 145623740, + "step": 6784, + "time_per_iteration": 2.629544258117676 + }, + { + "auxiliary_loss_clip": 0.01121193, + "auxiliary_loss_mlp": 0.01123629, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00070071, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.6673462025248678, + "language_loss": 0.65717435, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67962265, + "num_input_tokens_seen": 145643515, + "step": 6785, + "time_per_iteration": 2.6619584560394287 + }, + { + "auxiliary_loss_clip": 0.01152739, + "auxiliary_loss_mlp": 0.01122187, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00049841, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 2.02641895838182, + "language_loss": 0.79767954, + "learning_rate": 2.679260083800989e-06, + "loss": 0.82042885, + "num_input_tokens_seen": 145660890, + "step": 6786, + "time_per_iteration": 2.5220420360565186 + }, + { + "auxiliary_loss_clip": 0.01169326, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_clip": 1.00213623, + "balance_loss_mlp": 1.0008415, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.5767607339558034, + "language_loss": 0.81985193, + "learning_rate": 2.678893759192982e-06, + "loss": 0.84276766, + "num_input_tokens_seen": 145680070, + "step": 6787, + "time_per_iteration": 2.506953477859497 + }, + { + "auxiliary_loss_clip": 0.01152028, + "auxiliary_loss_mlp": 0.01122036, + "balance_loss_clip": 1.00204241, + "balance_loss_mlp": 1.000633, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.9834048229880163, + "language_loss": 0.67827111, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70101178, + "num_input_tokens_seen": 145698010, + "step": 6788, + "time_per_iteration": 2.5208239555358887 + }, + { + "auxiliary_loss_clip": 0.01137334, + "auxiliary_loss_mlp": 0.01122096, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00078893, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 1.7929161757328105, + "language_loss": 0.65800858, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68060291, + "num_input_tokens_seen": 145722215, + "step": 6789, + "time_per_iteration": 2.738853931427002 + }, + { + "auxiliary_loss_clip": 0.01105589, + "auxiliary_loss_mlp": 0.01121691, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00057435, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.8617026870259805, + "language_loss": 0.60520649, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62747931, + "num_input_tokens_seen": 145741090, + "step": 6790, + "time_per_iteration": 2.654484987258911 + }, + { + "auxiliary_loss_clip": 0.0115422, + "auxiliary_loss_mlp": 0.01122585, + "balance_loss_clip": 1.00210893, + "balance_loss_mlp": 1.00080132, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 4.966280651899098, + "language_loss": 0.69477463, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71754271, + "num_input_tokens_seen": 145754985, + "step": 6791, + "time_per_iteration": 2.551797866821289 + }, + { + "auxiliary_loss_clip": 0.0114849, + "auxiliary_loss_mlp": 0.0110238, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00014639, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7467711046246319, + "language_loss": 0.59633303, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61884177, + "num_input_tokens_seen": 145815260, + "step": 6792, + "time_per_iteration": 3.1301562786102295 + }, + { + "auxiliary_loss_clip": 0.01169196, + "auxiliary_loss_mlp": 0.01122471, + "balance_loss_clip": 1.00220394, + "balance_loss_mlp": 1.00068736, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 1.5686643154099715, + "language_loss": 0.8000803, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82299697, + "num_input_tokens_seen": 145832665, + "step": 6793, + "time_per_iteration": 2.5219240188598633 + }, + { + "auxiliary_loss_clip": 0.01152976, + "auxiliary_loss_mlp": 0.01122395, + "balance_loss_clip": 1.00210583, + "balance_loss_mlp": 1.00061047, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 2.8418773662964663, + "language_loss": 0.84914124, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87189496, + "num_input_tokens_seen": 145850240, + "step": 6794, + "time_per_iteration": 2.599952459335327 + }, + { + "auxiliary_loss_clip": 0.01120862, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_clip": 1.00199187, + "balance_loss_mlp": 1.00076222, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.5741284906090094, + "language_loss": 0.80191839, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.8243534, + "num_input_tokens_seen": 145869545, + "step": 6795, + "time_per_iteration": 2.612851142883301 + }, + { + "auxiliary_loss_clip": 0.01152842, + "auxiliary_loss_mlp": 0.01123288, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00074053, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 2.328009395143927, + "language_loss": 0.70285451, + "learning_rate": 2.675595680920792e-06, + "loss": 0.7256158, + "num_input_tokens_seen": 145884025, + "step": 6796, + "time_per_iteration": 2.4942314624786377 + }, + { + "auxiliary_loss_clip": 0.01154041, + "auxiliary_loss_mlp": 0.00747887, + "balance_loss_clip": 1.00209868, + "balance_loss_mlp": 1.00060511, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.6035853338403878, + "language_loss": 0.77860087, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.79762018, + "num_input_tokens_seen": 145903210, + "step": 6797, + "time_per_iteration": 2.5760409832000732 + }, + { + "auxiliary_loss_clip": 0.01154162, + "auxiliary_loss_mlp": 0.01122341, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00084245, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 1.8030375163480419, + "language_loss": 0.85283864, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87560368, + "num_input_tokens_seen": 145920985, + "step": 6798, + "time_per_iteration": 2.5214223861694336 + }, + { + "auxiliary_loss_clip": 0.01169281, + "auxiliary_loss_mlp": 0.01120962, + "balance_loss_clip": 1.00216568, + "balance_loss_mlp": 1.00060797, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.5040056523319505, + "language_loss": 0.84355694, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86645931, + "num_input_tokens_seen": 145940350, + "step": 6799, + "time_per_iteration": 2.5313453674316406 + }, + { + "auxiliary_loss_clip": 0.01122537, + "auxiliary_loss_mlp": 0.01122661, + "balance_loss_clip": 1.00213504, + "balance_loss_mlp": 1.00078189, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.0796621668641624, + "language_loss": 0.83572161, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85817349, + "num_input_tokens_seen": 145957460, + "step": 6800, + "time_per_iteration": 2.6143295764923096 + }, + { + "auxiliary_loss_clip": 0.01152382, + "auxiliary_loss_mlp": 0.01122792, + "balance_loss_clip": 1.00201643, + "balance_loss_mlp": 1.00091267, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 3.608821995165352, + "language_loss": 0.74693966, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76969135, + "num_input_tokens_seen": 145975285, + "step": 6801, + "time_per_iteration": 2.5200865268707275 + }, + { + "auxiliary_loss_clip": 0.01153727, + "auxiliary_loss_mlp": 0.0112256, + "balance_loss_clip": 1.00200284, + "balance_loss_mlp": 1.00077558, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 1.8590934008897908, + "language_loss": 0.79920983, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82197273, + "num_input_tokens_seen": 145989150, + "step": 6802, + "time_per_iteration": 2.529055118560791 + }, + { + "auxiliary_loss_clip": 0.0115297, + "auxiliary_loss_mlp": 0.01123202, + "balance_loss_clip": 1.002002, + "balance_loss_mlp": 1.00075066, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 2.5189507855014326, + "language_loss": 0.75165248, + "learning_rate": 2.673029073767934e-06, + "loss": 0.77441418, + "num_input_tokens_seen": 146006980, + "step": 6803, + "time_per_iteration": 2.5569417476654053 + }, + { + "auxiliary_loss_clip": 0.01088288, + "auxiliary_loss_mlp": 0.00747857, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.0005095, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.750527956032946, + "language_loss": 0.78600836, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80436981, + "num_input_tokens_seen": 146025125, + "step": 6804, + "time_per_iteration": 2.684685230255127 + }, + { + "auxiliary_loss_clip": 0.01169328, + "auxiliary_loss_mlp": 0.01122834, + "balance_loss_clip": 1.00208521, + "balance_loss_mlp": 1.0008589, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.8062362084849215, + "language_loss": 0.75094903, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77387065, + "num_input_tokens_seen": 146044990, + "step": 6805, + "time_per_iteration": 2.561389207839966 + }, + { + "auxiliary_loss_clip": 0.01105579, + "auxiliary_loss_mlp": 0.01123069, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00080776, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.6392890852348332, + "language_loss": 0.79536617, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81765264, + "num_input_tokens_seen": 146066045, + "step": 6806, + "time_per_iteration": 2.740605115890503 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.0112236, + "balance_loss_clip": 1.00201344, + "balance_loss_mlp": 1.00057626, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.9290058594607964, + "language_loss": 0.7222684, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74502152, + "num_input_tokens_seen": 146086280, + "step": 6807, + "time_per_iteration": 2.602830648422241 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_clip": 1.00276792, + "balance_loss_mlp": 1.00017786, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8332909096793751, + "language_loss": 0.58797151, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.61039394, + "num_input_tokens_seen": 146148840, + "step": 6808, + "time_per_iteration": 4.649322986602783 + }, + { + "auxiliary_loss_clip": 0.01138465, + "auxiliary_loss_mlp": 0.0112219, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00069213, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.8205684291543662, + "language_loss": 0.54833114, + "learning_rate": 2.670828129267242e-06, + "loss": 0.57093775, + "num_input_tokens_seen": 146166195, + "step": 6809, + "time_per_iteration": 2.6138064861297607 + }, + { + "auxiliary_loss_clip": 0.01139048, + "auxiliary_loss_mlp": 0.0112226, + "balance_loss_clip": 1.00197995, + "balance_loss_mlp": 1.00057113, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 2.014171083469514, + "language_loss": 0.83371496, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85632801, + "num_input_tokens_seen": 146185045, + "step": 6810, + "time_per_iteration": 3.983950614929199 + }, + { + "auxiliary_loss_clip": 0.01137224, + "auxiliary_loss_mlp": 0.01123434, + "balance_loss_clip": 1.00214267, + "balance_loss_mlp": 1.00088739, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.217109192033476, + "language_loss": 0.77618909, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79879564, + "num_input_tokens_seen": 146204655, + "step": 6811, + "time_per_iteration": 2.5868144035339355 + }, + { + "auxiliary_loss_clip": 0.01169315, + "auxiliary_loss_mlp": 0.01122334, + "balance_loss_clip": 1.00222385, + "balance_loss_mlp": 1.00064552, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.6630573747805155, + "language_loss": 0.69940054, + "learning_rate": 2.669727313417857e-06, + "loss": 0.72231698, + "num_input_tokens_seen": 146222000, + "step": 6812, + "time_per_iteration": 2.4855704307556152 + }, + { + "auxiliary_loss_clip": 0.01169279, + "auxiliary_loss_mlp": 0.01122224, + "balance_loss_clip": 1.00216162, + "balance_loss_mlp": 1.00072575, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.4380785302310415, + "language_loss": 0.66320771, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68612266, + "num_input_tokens_seen": 146242630, + "step": 6813, + "time_per_iteration": 3.9614644050598145 + }, + { + "auxiliary_loss_clip": 0.01154213, + "auxiliary_loss_mlp": 0.00747753, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.00045681, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.8198528051259357, + "language_loss": 0.73731411, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.75633383, + "num_input_tokens_seen": 146263070, + "step": 6814, + "time_per_iteration": 2.6143686771392822 + }, + { + "auxiliary_loss_clip": 0.0110634, + "auxiliary_loss_mlp": 0.01123329, + "balance_loss_clip": 1.00190246, + "balance_loss_mlp": 1.00059152, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 1.966162075584711, + "language_loss": 0.66275799, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68505472, + "num_input_tokens_seen": 146282890, + "step": 6815, + "time_per_iteration": 2.675494909286499 + }, + { + "auxiliary_loss_clip": 0.01152373, + "auxiliary_loss_mlp": 0.01121992, + "balance_loss_clip": 1.00212157, + "balance_loss_mlp": 1.00087523, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.7473155496814485, + "language_loss": 0.76894891, + "learning_rate": 2.668259203471188e-06, + "loss": 0.7916925, + "num_input_tokens_seen": 146301755, + "step": 6816, + "time_per_iteration": 4.009680986404419 + }, + { + "auxiliary_loss_clip": 0.01138621, + "auxiliary_loss_mlp": 0.01122185, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00097322, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.019133904913215, + "language_loss": 0.81947988, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.84208798, + "num_input_tokens_seen": 146316835, + "step": 6817, + "time_per_iteration": 2.574108362197876 + }, + { + "auxiliary_loss_clip": 0.01137842, + "auxiliary_loss_mlp": 0.01123135, + "balance_loss_clip": 1.00207019, + "balance_loss_mlp": 1.00077844, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.5818991935799203, + "language_loss": 0.8014164, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82402623, + "num_input_tokens_seen": 146336650, + "step": 6818, + "time_per_iteration": 2.662900686264038 + }, + { + "auxiliary_loss_clip": 0.01123508, + "auxiliary_loss_mlp": 0.01121768, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.0006516, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.5292766245926086, + "language_loss": 0.6620816, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68453443, + "num_input_tokens_seen": 146357640, + "step": 6819, + "time_per_iteration": 2.7155847549438477 + }, + { + "auxiliary_loss_clip": 0.01137802, + "auxiliary_loss_mlp": 0.0112335, + "balance_loss_clip": 1.00199986, + "balance_loss_mlp": 1.00070786, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.5748253909231757, + "language_loss": 0.85341454, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87602603, + "num_input_tokens_seen": 146379325, + "step": 6820, + "time_per_iteration": 2.6323013305664062 + }, + { + "auxiliary_loss_clip": 0.01152924, + "auxiliary_loss_mlp": 0.01121812, + "balance_loss_clip": 1.00211644, + "balance_loss_mlp": 1.00069511, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.7748684994814643, + "language_loss": 0.71177864, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73452598, + "num_input_tokens_seen": 146398635, + "step": 6821, + "time_per_iteration": 2.5811150074005127 + }, + { + "auxiliary_loss_clip": 0.01152814, + "auxiliary_loss_mlp": 0.0112189, + "balance_loss_clip": 1.00203788, + "balance_loss_mlp": 1.00086856, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 1.8757583337796042, + "language_loss": 0.74579358, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76854062, + "num_input_tokens_seen": 146417585, + "step": 6822, + "time_per_iteration": 2.584746837615967 + }, + { + "auxiliary_loss_clip": 0.01137007, + "auxiliary_loss_mlp": 0.01122166, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00076342, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 2.082791915646714, + "language_loss": 0.75952446, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.78211617, + "num_input_tokens_seen": 146437035, + "step": 6823, + "time_per_iteration": 2.637957811355591 + }, + { + "auxiliary_loss_clip": 0.01102499, + "auxiliary_loss_mlp": 0.01124177, + "balance_loss_clip": 1.00181329, + "balance_loss_mlp": 1.00096226, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 1.9330467475767472, + "language_loss": 0.73293442, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75520122, + "num_input_tokens_seen": 146457370, + "step": 6824, + "time_per_iteration": 2.7457244396209717 + }, + { + "auxiliary_loss_clip": 0.01120907, + "auxiliary_loss_mlp": 0.01122646, + "balance_loss_clip": 1.00193441, + "balance_loss_mlp": 1.00067139, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 1.6616469340515894, + "language_loss": 0.72094285, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.7433784, + "num_input_tokens_seen": 146478105, + "step": 6825, + "time_per_iteration": 2.7449100017547607 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.01122501, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00081229, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 2.017778337280895, + "language_loss": 0.84530723, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86775893, + "num_input_tokens_seen": 146497835, + "step": 6826, + "time_per_iteration": 2.7322728633880615 + }, + { + "auxiliary_loss_clip": 0.0113573, + "auxiliary_loss_mlp": 0.00747754, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00039363, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.732873175221032, + "language_loss": 0.66173071, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68056554, + "num_input_tokens_seen": 146517735, + "step": 6827, + "time_per_iteration": 2.716012477874756 + }, + { + "auxiliary_loss_clip": 0.01137364, + "auxiliary_loss_mlp": 0.01122087, + "balance_loss_clip": 1.00196266, + "balance_loss_mlp": 1.00068426, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.3326716844390347, + "language_loss": 0.72435343, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74694794, + "num_input_tokens_seen": 146537640, + "step": 6828, + "time_per_iteration": 2.613224744796753 + }, + { + "auxiliary_loss_clip": 0.01137747, + "auxiliary_loss_mlp": 0.01123711, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.00078201, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 1.8406469375003918, + "language_loss": 0.8308059, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.8534205, + "num_input_tokens_seen": 146554695, + "step": 6829, + "time_per_iteration": 2.5706496238708496 + }, + { + "auxiliary_loss_clip": 0.01152583, + "auxiliary_loss_mlp": 0.01121551, + "balance_loss_clip": 1.00216055, + "balance_loss_mlp": 1.00081575, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.4862027918801644, + "language_loss": 0.89968717, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92242861, + "num_input_tokens_seen": 146573740, + "step": 6830, + "time_per_iteration": 2.6609458923339844 + }, + { + "auxiliary_loss_clip": 0.01120926, + "auxiliary_loss_mlp": 0.01121999, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00059664, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 1.731679253070625, + "language_loss": 0.66078991, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68321913, + "num_input_tokens_seen": 146592885, + "step": 6831, + "time_per_iteration": 2.652956247329712 + }, + { + "auxiliary_loss_clip": 0.01169136, + "auxiliary_loss_mlp": 0.01122367, + "balance_loss_clip": 1.00210404, + "balance_loss_mlp": 1.00058234, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.764681013034461, + "language_loss": 0.69334686, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71626186, + "num_input_tokens_seen": 146611995, + "step": 6832, + "time_per_iteration": 2.6017720699310303 + }, + { + "auxiliary_loss_clip": 0.01108746, + "auxiliary_loss_mlp": 0.01122448, + "balance_loss_clip": 1.00216722, + "balance_loss_mlp": 1.00075889, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.0578718990058325, + "language_loss": 0.73493588, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75724787, + "num_input_tokens_seen": 146628045, + "step": 6833, + "time_per_iteration": 2.633436679840088 + }, + { + "auxiliary_loss_clip": 0.0110561, + "auxiliary_loss_mlp": 0.01123213, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00066638, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.910163390554234, + "language_loss": 0.72909951, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.75138772, + "num_input_tokens_seen": 146648355, + "step": 6834, + "time_per_iteration": 2.706179141998291 + }, + { + "auxiliary_loss_clip": 0.01152693, + "auxiliary_loss_mlp": 0.01122829, + "balance_loss_clip": 1.00204098, + "balance_loss_mlp": 1.00085425, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 1.8511874589145951, + "language_loss": 0.71047741, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73323262, + "num_input_tokens_seen": 146668370, + "step": 6835, + "time_per_iteration": 2.6038553714752197 + }, + { + "auxiliary_loss_clip": 0.01154039, + "auxiliary_loss_mlp": 0.01122358, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.0006696, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 2.2058008730752756, + "language_loss": 0.8708806, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89364457, + "num_input_tokens_seen": 146686665, + "step": 6836, + "time_per_iteration": 2.535010814666748 + }, + { + "auxiliary_loss_clip": 0.01151806, + "auxiliary_loss_mlp": 0.01121726, + "balance_loss_clip": 1.00201106, + "balance_loss_mlp": 1.00089586, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 1.8566161710826214, + "language_loss": 0.68551981, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.70825511, + "num_input_tokens_seen": 146706570, + "step": 6837, + "time_per_iteration": 2.5671844482421875 + }, + { + "auxiliary_loss_clip": 0.011691, + "auxiliary_loss_mlp": 0.01122981, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.00081515, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 2.747649358302247, + "language_loss": 0.75537086, + "learning_rate": 2.660177375289599e-06, + "loss": 0.7782917, + "num_input_tokens_seen": 146723425, + "step": 6838, + "time_per_iteration": 2.5331568717956543 + }, + { + "auxiliary_loss_clip": 0.01120351, + "auxiliary_loss_mlp": 0.01122577, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00060236, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.9996280164478024, + "language_loss": 0.81877476, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84120405, + "num_input_tokens_seen": 146741640, + "step": 6839, + "time_per_iteration": 2.622706413269043 + }, + { + "auxiliary_loss_clip": 0.0116911, + "auxiliary_loss_mlp": 0.01122242, + "balance_loss_clip": 1.00202894, + "balance_loss_mlp": 1.00064898, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.8258061665270797, + "language_loss": 0.80257022, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.8254838, + "num_input_tokens_seen": 146759195, + "step": 6840, + "time_per_iteration": 2.546175718307495 + }, + { + "auxiliary_loss_clip": 0.01151738, + "auxiliary_loss_mlp": 0.01121, + "balance_loss_clip": 1.00195158, + "balance_loss_mlp": 1.00064588, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.8632129676383447, + "language_loss": 0.68138492, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.70411229, + "num_input_tokens_seen": 146774990, + "step": 6841, + "time_per_iteration": 2.542839765548706 + }, + { + "auxiliary_loss_clip": 0.01148383, + "auxiliary_loss_mlp": 0.01101587, + "balance_loss_clip": 1.00174701, + "balance_loss_mlp": 1.00011599, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7651543609746716, + "language_loss": 0.59721231, + "learning_rate": 2.65870664586847e-06, + "loss": 0.619712, + "num_input_tokens_seen": 146839610, + "step": 6842, + "time_per_iteration": 3.2160329818725586 + }, + { + "auxiliary_loss_clip": 0.01152571, + "auxiliary_loss_mlp": 0.01121637, + "balance_loss_clip": 1.00204551, + "balance_loss_mlp": 1.0007112, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.2629251506866845, + "language_loss": 0.69936025, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.72210228, + "num_input_tokens_seen": 146857360, + "step": 6843, + "time_per_iteration": 2.6215693950653076 + }, + { + "auxiliary_loss_clip": 0.01115673, + "auxiliary_loss_mlp": 0.01102407, + "balance_loss_clip": 1.00165248, + "balance_loss_mlp": 1.00017309, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7202412057235845, + "language_loss": 0.53650141, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.5586822, + "num_input_tokens_seen": 146917055, + "step": 6844, + "time_per_iteration": 3.21181321144104 + }, + { + "auxiliary_loss_clip": 0.01152456, + "auxiliary_loss_mlp": 0.0112227, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00067639, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 3.0963383282690806, + "language_loss": 0.66091514, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68366241, + "num_input_tokens_seen": 146935215, + "step": 6845, + "time_per_iteration": 2.5632126331329346 + }, + { + "auxiliary_loss_clip": 0.01169088, + "auxiliary_loss_mlp": 0.01121743, + "balance_loss_clip": 1.00218606, + "balance_loss_mlp": 1.00072217, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.9476886331858707, + "language_loss": 0.70568281, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72859108, + "num_input_tokens_seen": 146951970, + "step": 6846, + "time_per_iteration": 3.960334300994873 + }, + { + "auxiliary_loss_clip": 0.01142158, + "auxiliary_loss_mlp": 0.01122094, + "balance_loss_clip": 1.00237441, + "balance_loss_mlp": 1.00069141, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.5214060581552387, + "language_loss": 0.65191829, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67456079, + "num_input_tokens_seen": 146975615, + "step": 6847, + "time_per_iteration": 2.705914258956909 + }, + { + "auxiliary_loss_clip": 0.01141973, + "auxiliary_loss_mlp": 0.01121207, + "balance_loss_clip": 1.002285, + "balance_loss_mlp": 1.00085294, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.5741374797711105, + "language_loss": 0.70420647, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72683829, + "num_input_tokens_seen": 146998855, + "step": 6848, + "time_per_iteration": 4.056392669677734 + }, + { + "auxiliary_loss_clip": 0.0113383, + "auxiliary_loss_mlp": 0.00746532, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 0.99972737, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8844445492594946, + "language_loss": 0.56285805, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58166164, + "num_input_tokens_seen": 147062710, + "step": 6849, + "time_per_iteration": 3.245002269744873 + }, + { + "auxiliary_loss_clip": 0.01138413, + "auxiliary_loss_mlp": 0.01122437, + "balance_loss_clip": 1.00210917, + "balance_loss_mlp": 1.00055742, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.6938386566481962, + "language_loss": 0.75929624, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.7819047, + "num_input_tokens_seen": 147086075, + "step": 6850, + "time_per_iteration": 2.7308058738708496 + }, + { + "auxiliary_loss_clip": 0.01103476, + "auxiliary_loss_mlp": 0.01121363, + "balance_loss_clip": 1.00186372, + "balance_loss_mlp": 1.00081825, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.4847225386651373, + "language_loss": 0.67425668, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.69650507, + "num_input_tokens_seen": 147107590, + "step": 6851, + "time_per_iteration": 4.277297496795654 + }, + { + "auxiliary_loss_clip": 0.01119471, + "auxiliary_loss_mlp": 0.01122763, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00078785, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.604309612861755, + "language_loss": 0.7967239, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81914628, + "num_input_tokens_seen": 147123715, + "step": 6852, + "time_per_iteration": 2.621502161026001 + }, + { + "auxiliary_loss_clip": 0.0116934, + "auxiliary_loss_mlp": 0.01122251, + "balance_loss_clip": 1.00222945, + "balance_loss_mlp": 1.00065732, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 2.166338058277151, + "language_loss": 0.77177471, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.79469061, + "num_input_tokens_seen": 147144290, + "step": 6853, + "time_per_iteration": 3.993600606918335 + }, + { + "auxiliary_loss_clip": 0.01154222, + "auxiliary_loss_mlp": 0.01123487, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00074911, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.6642181617611542, + "language_loss": 0.65864193, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.68141901, + "num_input_tokens_seen": 147166340, + "step": 6854, + "time_per_iteration": 2.694525957107544 + }, + { + "auxiliary_loss_clip": 0.0113714, + "auxiliary_loss_mlp": 0.01121239, + "balance_loss_clip": 1.00203526, + "balance_loss_mlp": 1.0005033, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.8202238504231818, + "language_loss": 0.84048712, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.86307091, + "num_input_tokens_seen": 147184025, + "step": 6855, + "time_per_iteration": 2.6353542804718018 + }, + { + "auxiliary_loss_clip": 0.01152496, + "auxiliary_loss_mlp": 0.01120964, + "balance_loss_clip": 1.00208199, + "balance_loss_mlp": 1.00080085, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.9004982546896476, + "language_loss": 0.78781497, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.8105495, + "num_input_tokens_seen": 147202730, + "step": 6856, + "time_per_iteration": 2.570002555847168 + }, + { + "auxiliary_loss_clip": 0.01121132, + "auxiliary_loss_mlp": 0.01121711, + "balance_loss_clip": 1.00210416, + "balance_loss_mlp": 1.00068986, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 2.613279761862885, + "language_loss": 0.79626286, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81869125, + "num_input_tokens_seen": 147215315, + "step": 6857, + "time_per_iteration": 2.6159706115722656 + }, + { + "auxiliary_loss_clip": 0.01152292, + "auxiliary_loss_mlp": 0.00747689, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.00032413, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 2.8073892451755436, + "language_loss": 0.70830554, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72730529, + "num_input_tokens_seen": 147233330, + "step": 6858, + "time_per_iteration": 2.5752952098846436 + }, + { + "auxiliary_loss_clip": 0.01152476, + "auxiliary_loss_mlp": 0.01121597, + "balance_loss_clip": 1.0020839, + "balance_loss_mlp": 1.00076604, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.6642679423396018, + "language_loss": 0.59251416, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61525494, + "num_input_tokens_seen": 147257780, + "step": 6859, + "time_per_iteration": 2.7743189334869385 + }, + { + "auxiliary_loss_clip": 0.01169047, + "auxiliary_loss_mlp": 0.01121304, + "balance_loss_clip": 1.00206935, + "balance_loss_mlp": 1.00066447, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.1993732123986987, + "language_loss": 0.74059224, + "learning_rate": 2.652083430674264e-06, + "loss": 0.7634958, + "num_input_tokens_seen": 147276055, + "step": 6860, + "time_per_iteration": 2.4961163997650146 + }, + { + "auxiliary_loss_clip": 0.01076021, + "auxiliary_loss_mlp": 0.01121224, + "balance_loss_clip": 1.00169241, + "balance_loss_mlp": 1.00058413, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.7221958834171716, + "language_loss": 0.74128139, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76325381, + "num_input_tokens_seen": 147293200, + "step": 6861, + "time_per_iteration": 2.7303171157836914 + }, + { + "auxiliary_loss_clip": 0.01138794, + "auxiliary_loss_mlp": 0.01121265, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00062537, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.072733748233414, + "language_loss": 0.79894638, + "learning_rate": 2.651347021844765e-06, + "loss": 0.82154703, + "num_input_tokens_seen": 147310640, + "step": 6862, + "time_per_iteration": 2.594532012939453 + }, + { + "auxiliary_loss_clip": 0.01139002, + "auxiliary_loss_mlp": 0.01121737, + "balance_loss_clip": 1.00214398, + "balance_loss_mlp": 1.00062013, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 2.0399559509344085, + "language_loss": 0.75645769, + "learning_rate": 2.650978780374318e-06, + "loss": 0.77906501, + "num_input_tokens_seen": 147329435, + "step": 6863, + "time_per_iteration": 2.5999841690063477 + }, + { + "auxiliary_loss_clip": 0.0114972, + "auxiliary_loss_mlp": 0.01101552, + "balance_loss_clip": 1.001719, + "balance_loss_mlp": 1.00008106, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.7082571272958527, + "language_loss": 0.52673787, + "learning_rate": 2.650610514218691e-06, + "loss": 0.5492506, + "num_input_tokens_seen": 147385805, + "step": 6864, + "time_per_iteration": 3.1031768321990967 + }, + { + "auxiliary_loss_clip": 0.01169312, + "auxiliary_loss_mlp": 0.01122357, + "balance_loss_clip": 1.00210178, + "balance_loss_mlp": 1.00066769, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.9275285852549549, + "language_loss": 0.72674584, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74966258, + "num_input_tokens_seen": 147405160, + "step": 6865, + "time_per_iteration": 2.5815863609313965 + }, + { + "auxiliary_loss_clip": 0.01164621, + "auxiliary_loss_mlp": 0.01101572, + "balance_loss_clip": 1.00164533, + "balance_loss_mlp": 1.00010109, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9176588744488275, + "language_loss": 0.66609353, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68875545, + "num_input_tokens_seen": 147460245, + "step": 6866, + "time_per_iteration": 2.9559757709503174 + }, + { + "auxiliary_loss_clip": 0.01168969, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.00059474, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 2.5512753180548575, + "language_loss": 0.80920315, + "learning_rate": 2.649505567780375e-06, + "loss": 0.83210039, + "num_input_tokens_seen": 147476200, + "step": 6867, + "time_per_iteration": 2.519219398498535 + }, + { + "auxiliary_loss_clip": 0.01135924, + "auxiliary_loss_mlp": 0.01121663, + "balance_loss_clip": 1.00200546, + "balance_loss_mlp": 1.00064158, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.041722119796444, + "language_loss": 0.77269745, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.7952733, + "num_input_tokens_seen": 147494315, + "step": 6868, + "time_per_iteration": 2.6308090686798096 + }, + { + "auxiliary_loss_clip": 0.01148044, + "auxiliary_loss_mlp": 0.01101703, + "balance_loss_clip": 1.00157285, + "balance_loss_mlp": 1.0002321, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8974787863424235, + "language_loss": 0.57892084, + "learning_rate": 2.64876881365164e-06, + "loss": 0.60141838, + "num_input_tokens_seen": 147543665, + "step": 6869, + "time_per_iteration": 2.8800711631774902 + }, + { + "auxiliary_loss_clip": 0.01152294, + "auxiliary_loss_mlp": 0.01121394, + "balance_loss_clip": 1.00208473, + "balance_loss_mlp": 1.00065911, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.719462389897274, + "language_loss": 0.75186229, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77459919, + "num_input_tokens_seen": 147564870, + "step": 6870, + "time_per_iteration": 2.605090379714966 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01121261, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00081182, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.5317121539076761, + "language_loss": 0.83488727, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85729533, + "num_input_tokens_seen": 147584840, + "step": 6871, + "time_per_iteration": 2.6684694290161133 + }, + { + "auxiliary_loss_clip": 0.01120271, + "auxiliary_loss_mlp": 0.01121609, + "balance_loss_clip": 1.00200021, + "balance_loss_mlp": 1.0007782, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 2.3189948593995617, + "language_loss": 0.68238401, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.70480287, + "num_input_tokens_seen": 147604635, + "step": 6872, + "time_per_iteration": 2.6555655002593994 + }, + { + "auxiliary_loss_clip": 0.01138017, + "auxiliary_loss_mlp": 0.01121337, + "balance_loss_clip": 1.00209856, + "balance_loss_mlp": 1.0006969, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.8797363087958123, + "language_loss": 0.75931299, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.78190649, + "num_input_tokens_seen": 147620700, + "step": 6873, + "time_per_iteration": 2.595771551132202 + }, + { + "auxiliary_loss_clip": 0.0113734, + "auxiliary_loss_mlp": 0.01121665, + "balance_loss_clip": 1.00212502, + "balance_loss_mlp": 1.00064409, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 1.8254899073108253, + "language_loss": 0.83162022, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85421026, + "num_input_tokens_seen": 147639490, + "step": 6874, + "time_per_iteration": 2.60073184967041 + }, + { + "auxiliary_loss_clip": 0.01122235, + "auxiliary_loss_mlp": 0.011217, + "balance_loss_clip": 1.00185168, + "balance_loss_mlp": 1.00058317, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 1.7890493899346303, + "language_loss": 0.72096455, + "learning_rate": 2.646557961279436e-06, + "loss": 0.74340391, + "num_input_tokens_seen": 147657205, + "step": 6875, + "time_per_iteration": 2.612973690032959 + }, + { + "auxiliary_loss_clip": 0.01139005, + "auxiliary_loss_mlp": 0.01120772, + "balance_loss_clip": 1.00203919, + "balance_loss_mlp": 1.00089478, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 2.12362230228729, + "language_loss": 0.82627767, + "learning_rate": 2.646189399991154e-06, + "loss": 0.8488754, + "num_input_tokens_seen": 147677005, + "step": 6876, + "time_per_iteration": 2.6225945949554443 + }, + { + "auxiliary_loss_clip": 0.0115265, + "auxiliary_loss_mlp": 0.01122407, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.00062323, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 4.691413731648509, + "language_loss": 0.65001035, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.67276096, + "num_input_tokens_seen": 147693435, + "step": 6877, + "time_per_iteration": 2.629509687423706 + }, + { + "auxiliary_loss_clip": 0.01152361, + "auxiliary_loss_mlp": 0.01121685, + "balance_loss_clip": 1.00209284, + "balance_loss_mlp": 1.00066328, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 1.9952111207790972, + "language_loss": 0.76702797, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78976846, + "num_input_tokens_seen": 147714000, + "step": 6878, + "time_per_iteration": 2.626193046569824 + }, + { + "auxiliary_loss_clip": 0.01152464, + "auxiliary_loss_mlp": 0.007476, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00038981, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.848924592915773, + "language_loss": 0.80154449, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82054514, + "num_input_tokens_seen": 147731010, + "step": 6879, + "time_per_iteration": 2.5642659664154053 + }, + { + "auxiliary_loss_clip": 0.0116904, + "auxiliary_loss_mlp": 0.01122108, + "balance_loss_clip": 1.00212836, + "balance_loss_mlp": 1.00070477, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.7645528677348585, + "language_loss": 0.84669042, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.86960185, + "num_input_tokens_seen": 147750880, + "step": 6880, + "time_per_iteration": 2.6156768798828125 + }, + { + "auxiliary_loss_clip": 0.01136197, + "auxiliary_loss_mlp": 0.01122073, + "balance_loss_clip": 1.00202596, + "balance_loss_mlp": 1.00057542, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.6023289528383506, + "language_loss": 0.70309043, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72567308, + "num_input_tokens_seen": 147771360, + "step": 6881, + "time_per_iteration": 2.611616849899292 + }, + { + "auxiliary_loss_clip": 0.01169011, + "auxiliary_loss_mlp": 0.0112134, + "balance_loss_clip": 1.00223541, + "balance_loss_mlp": 1.00089097, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 3.2808026729331705, + "language_loss": 0.81117731, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83408087, + "num_input_tokens_seen": 147787440, + "step": 6882, + "time_per_iteration": 2.5095207691192627 + }, + { + "auxiliary_loss_clip": 0.01137424, + "auxiliary_loss_mlp": 0.01122735, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00095081, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 1.9637160081249823, + "language_loss": 0.69832963, + "learning_rate": 2.643608785656077e-06, + "loss": 0.72093117, + "num_input_tokens_seen": 147805720, + "step": 6883, + "time_per_iteration": 4.070748567581177 + }, + { + "auxiliary_loss_clip": 0.01153965, + "auxiliary_loss_mlp": 0.01121344, + "balance_loss_clip": 1.00213599, + "balance_loss_mlp": 1.00060856, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 2.3862673730779274, + "language_loss": 0.75425309, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77700621, + "num_input_tokens_seen": 147824605, + "step": 6884, + "time_per_iteration": 2.5829057693481445 + }, + { + "auxiliary_loss_clip": 0.01127148, + "auxiliary_loss_mlp": 0.01121411, + "balance_loss_clip": 1.00234854, + "balance_loss_mlp": 1.00086713, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.482443026146994, + "language_loss": 0.75727987, + "learning_rate": 2.642871247413523e-06, + "loss": 0.77976549, + "num_input_tokens_seen": 147845445, + "step": 6885, + "time_per_iteration": 2.7149951457977295 + }, + { + "auxiliary_loss_clip": 0.01169003, + "auxiliary_loss_mlp": 0.01122189, + "balance_loss_clip": 1.0020833, + "balance_loss_mlp": 1.0007863, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 2.2945154871880575, + "language_loss": 0.69951826, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.72243023, + "num_input_tokens_seen": 147865580, + "step": 6886, + "time_per_iteration": 3.987778425216675 + }, + { + "auxiliary_loss_clip": 0.01169128, + "auxiliary_loss_mlp": 0.00747587, + "balance_loss_clip": 1.00210857, + "balance_loss_mlp": 1.00032687, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.5645808478159435, + "language_loss": 0.75099981, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77016699, + "num_input_tokens_seen": 147885230, + "step": 6887, + "time_per_iteration": 2.5322020053863525 + }, + { + "auxiliary_loss_clip": 0.01152311, + "auxiliary_loss_mlp": 0.01120623, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.00055599, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 1.8643913973670014, + "language_loss": 0.70090646, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72363579, + "num_input_tokens_seen": 147903035, + "step": 6888, + "time_per_iteration": 4.040741682052612 + }, + { + "auxiliary_loss_clip": 0.01169024, + "auxiliary_loss_mlp": 0.01120677, + "balance_loss_clip": 1.00214326, + "balance_loss_mlp": 1.00060928, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 2.347136064987256, + "language_loss": 0.76331508, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.78621209, + "num_input_tokens_seen": 147918745, + "step": 6889, + "time_per_iteration": 2.4917447566986084 + }, + { + "auxiliary_loss_clip": 0.01120667, + "auxiliary_loss_mlp": 0.00747483, + "balance_loss_clip": 1.00194895, + "balance_loss_mlp": 1.00037551, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 2.715099409467736, + "language_loss": 0.79941595, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.81809747, + "num_input_tokens_seen": 147938265, + "step": 6890, + "time_per_iteration": 2.696401596069336 + }, + { + "auxiliary_loss_clip": 0.01168969, + "auxiliary_loss_mlp": 0.01121673, + "balance_loss_clip": 1.0021534, + "balance_loss_mlp": 1.00074673, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.434689411804743, + "language_loss": 0.74472743, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76763386, + "num_input_tokens_seen": 147957320, + "step": 6891, + "time_per_iteration": 3.867279291152954 + }, + { + "auxiliary_loss_clip": 0.01106988, + "auxiliary_loss_mlp": 0.01122257, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00075936, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.7903884381637936, + "language_loss": 0.8407377, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86303008, + "num_input_tokens_seen": 147977045, + "step": 6892, + "time_per_iteration": 2.6952524185180664 + }, + { + "auxiliary_loss_clip": 0.01122346, + "auxiliary_loss_mlp": 0.00747537, + "balance_loss_clip": 1.0020076, + "balance_loss_mlp": 1.00031257, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.7290280735621386, + "language_loss": 0.70470119, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72340006, + "num_input_tokens_seen": 147996905, + "step": 6893, + "time_per_iteration": 2.7575786113739014 + }, + { + "auxiliary_loss_clip": 0.01169051, + "auxiliary_loss_mlp": 0.01121739, + "balance_loss_clip": 1.00215411, + "balance_loss_mlp": 1.00062275, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.3089783227274843, + "language_loss": 0.72912908, + "learning_rate": 2.639551120239279e-06, + "loss": 0.75203699, + "num_input_tokens_seen": 148017875, + "step": 6894, + "time_per_iteration": 2.575841188430786 + }, + { + "auxiliary_loss_clip": 0.01153943, + "auxiliary_loss_mlp": 0.01121333, + "balance_loss_clip": 1.00203621, + "balance_loss_mlp": 1.00059831, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 23.63881230800306, + "language_loss": 0.62479633, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64754909, + "num_input_tokens_seen": 148032300, + "step": 6895, + "time_per_iteration": 2.499282121658325 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01121703, + "balance_loss_clip": 1.00194275, + "balance_loss_mlp": 1.00096726, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.5584553610056397, + "language_loss": 0.70313257, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72555643, + "num_input_tokens_seen": 148053260, + "step": 6896, + "time_per_iteration": 2.7048017978668213 + }, + { + "auxiliary_loss_clip": 0.01168977, + "auxiliary_loss_mlp": 0.01122145, + "balance_loss_clip": 1.00207829, + "balance_loss_mlp": 1.00093317, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.7005802539123092, + "language_loss": 0.73504281, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75795406, + "num_input_tokens_seen": 148072965, + "step": 6897, + "time_per_iteration": 2.571573257446289 + }, + { + "auxiliary_loss_clip": 0.0115231, + "auxiliary_loss_mlp": 0.01121786, + "balance_loss_clip": 1.00218046, + "balance_loss_mlp": 1.00086045, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 19.292334882355707, + "language_loss": 0.84709442, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86983538, + "num_input_tokens_seen": 148093240, + "step": 6898, + "time_per_iteration": 2.600155830383301 + }, + { + "auxiliary_loss_clip": 0.01103815, + "auxiliary_loss_mlp": 0.01121515, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.0006845, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.5891786785870559, + "language_loss": 0.74438328, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76663661, + "num_input_tokens_seen": 148110925, + "step": 6899, + "time_per_iteration": 2.674159049987793 + }, + { + "auxiliary_loss_clip": 0.01122254, + "auxiliary_loss_mlp": 0.01122198, + "balance_loss_clip": 1.00178504, + "balance_loss_mlp": 1.0007, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.7924453253486332, + "language_loss": 0.75331855, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.77576303, + "num_input_tokens_seen": 148130670, + "step": 6900, + "time_per_iteration": 2.6742448806762695 + }, + { + "auxiliary_loss_clip": 0.0115248, + "auxiliary_loss_mlp": 0.01120938, + "balance_loss_clip": 1.00206113, + "balance_loss_mlp": 1.00067961, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.048113737019502, + "language_loss": 0.79391885, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.81665307, + "num_input_tokens_seen": 148148350, + "step": 6901, + "time_per_iteration": 2.526167631149292 + }, + { + "auxiliary_loss_clip": 0.01120487, + "auxiliary_loss_mlp": 0.01120697, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00062907, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.7494384275507504, + "language_loss": 0.69813728, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.72054911, + "num_input_tokens_seen": 148167550, + "step": 6902, + "time_per_iteration": 2.6489665508270264 + }, + { + "auxiliary_loss_clip": 0.01135327, + "auxiliary_loss_mlp": 0.00747557, + "balance_loss_clip": 1.00192583, + "balance_loss_mlp": 1.00043464, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 1.7774094518236205, + "language_loss": 0.83837759, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.8572064, + "num_input_tokens_seen": 148184740, + "step": 6903, + "time_per_iteration": 2.579122304916382 + }, + { + "auxiliary_loss_clip": 0.01169033, + "auxiliary_loss_mlp": 0.01122885, + "balance_loss_clip": 1.00212419, + "balance_loss_mlp": 1.0006243, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 2.270323065345625, + "language_loss": 0.67856026, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.70147943, + "num_input_tokens_seen": 148204605, + "step": 6904, + "time_per_iteration": 2.5790159702301025 + }, + { + "auxiliary_loss_clip": 0.01169081, + "auxiliary_loss_mlp": 0.00747624, + "balance_loss_clip": 1.00207353, + "balance_loss_mlp": 1.00029492, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.958624581708663, + "language_loss": 0.77604288, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79520988, + "num_input_tokens_seen": 148224675, + "step": 6905, + "time_per_iteration": 2.578639030456543 + }, + { + "auxiliary_loss_clip": 0.01169244, + "auxiliary_loss_mlp": 0.0112167, + "balance_loss_clip": 1.00224471, + "balance_loss_mlp": 1.0006485, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 1.5527634512644537, + "language_loss": 0.68273604, + "learning_rate": 2.635121230039025e-06, + "loss": 0.7056452, + "num_input_tokens_seen": 148243375, + "step": 6906, + "time_per_iteration": 2.589355230331421 + }, + { + "auxiliary_loss_clip": 0.01136934, + "auxiliary_loss_mlp": 0.01120976, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00062263, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 2.0596394959875948, + "language_loss": 0.67743075, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.70000988, + "num_input_tokens_seen": 148261140, + "step": 6907, + "time_per_iteration": 2.5864005088806152 + }, + { + "auxiliary_loss_clip": 0.01120178, + "auxiliary_loss_mlp": 0.01121914, + "balance_loss_clip": 1.00203395, + "balance_loss_mlp": 1.00079703, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.8078798644590415, + "language_loss": 0.77205765, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.7944786, + "num_input_tokens_seen": 148279655, + "step": 6908, + "time_per_iteration": 2.629718780517578 + }, + { + "auxiliary_loss_clip": 0.01134965, + "auxiliary_loss_mlp": 0.01101497, + "balance_loss_clip": 1.00169468, + "balance_loss_mlp": 1.00002599, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7729457630481411, + "language_loss": 0.64846611, + "learning_rate": 2.634013214657026e-06, + "loss": 0.67083085, + "num_input_tokens_seen": 148339005, + "step": 6909, + "time_per_iteration": 3.1251916885375977 + }, + { + "auxiliary_loss_clip": 0.01126026, + "auxiliary_loss_mlp": 0.01120711, + "balance_loss_clip": 1.00273538, + "balance_loss_mlp": 1.00073862, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.5238633040110854, + "language_loss": 0.87022513, + "learning_rate": 2.633643828093996e-06, + "loss": 0.89269245, + "num_input_tokens_seen": 148358715, + "step": 6910, + "time_per_iteration": 2.6368906497955322 + }, + { + "auxiliary_loss_clip": 0.0114995, + "auxiliary_loss_mlp": 0.01101451, + "balance_loss_clip": 1.00172687, + "balance_loss_mlp": 0.99998057, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8064851163892227, + "language_loss": 0.62192267, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64443672, + "num_input_tokens_seen": 148417280, + "step": 6911, + "time_per_iteration": 3.09419584274292 + }, + { + "auxiliary_loss_clip": 0.01169364, + "auxiliary_loss_mlp": 0.01122742, + "balance_loss_clip": 1.00228405, + "balance_loss_mlp": 1.0006721, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.827003554138974, + "language_loss": 0.87764215, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90056324, + "num_input_tokens_seen": 148432610, + "step": 6912, + "time_per_iteration": 2.4848265647888184 + }, + { + "auxiliary_loss_clip": 0.01152284, + "auxiliary_loss_mlp": 0.01120912, + "balance_loss_clip": 1.00205112, + "balance_loss_mlp": 1.000844, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 2.542104882588853, + "language_loss": 0.62929857, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65203047, + "num_input_tokens_seen": 148451510, + "step": 6913, + "time_per_iteration": 2.635856866836548 + }, + { + "auxiliary_loss_clip": 0.01137231, + "auxiliary_loss_mlp": 0.00747615, + "balance_loss_clip": 1.00205147, + "balance_loss_mlp": 1.00034118, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.9033837160986198, + "language_loss": 0.75349784, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77234632, + "num_input_tokens_seen": 148469945, + "step": 6914, + "time_per_iteration": 2.6077358722686768 + }, + { + "auxiliary_loss_clip": 0.01104199, + "auxiliary_loss_mlp": 0.01121881, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00076461, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 3.1882570244512483, + "language_loss": 0.87612921, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89839005, + "num_input_tokens_seen": 148486655, + "step": 6915, + "time_per_iteration": 2.6644294261932373 + }, + { + "auxiliary_loss_clip": 0.01143868, + "auxiliary_loss_mlp": 0.01121469, + "balance_loss_clip": 1.00261736, + "balance_loss_mlp": 1.00092483, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 1.9491466415679084, + "language_loss": 0.70861971, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73127306, + "num_input_tokens_seen": 148505035, + "step": 6916, + "time_per_iteration": 2.614203929901123 + }, + { + "auxiliary_loss_clip": 0.01169134, + "auxiliary_loss_mlp": 0.01122876, + "balance_loss_clip": 1.00215769, + "balance_loss_mlp": 1.00071013, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.330994220990246, + "language_loss": 0.71943069, + "learning_rate": 2.631057450157852e-06, + "loss": 0.7423507, + "num_input_tokens_seen": 148525575, + "step": 6917, + "time_per_iteration": 2.550321578979492 + }, + { + "auxiliary_loss_clip": 0.01141967, + "auxiliary_loss_mlp": 0.01121521, + "balance_loss_clip": 1.00234985, + "balance_loss_mlp": 1.00059474, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.587208847029964, + "language_loss": 0.81122398, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83385885, + "num_input_tokens_seen": 148547270, + "step": 6918, + "time_per_iteration": 2.66414213180542 + }, + { + "auxiliary_loss_clip": 0.0115225, + "auxiliary_loss_mlp": 0.01121619, + "balance_loss_clip": 1.00204933, + "balance_loss_mlp": 1.00078797, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.5185182156122263, + "language_loss": 0.70349097, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72622967, + "num_input_tokens_seen": 148572100, + "step": 6919, + "time_per_iteration": 2.763679027557373 + }, + { + "auxiliary_loss_clip": 0.0113739, + "auxiliary_loss_mlp": 0.01121788, + "balance_loss_clip": 1.00208759, + "balance_loss_mlp": 1.00067091, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 1.8915971863348608, + "language_loss": 0.81458628, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83717805, + "num_input_tokens_seen": 148591245, + "step": 6920, + "time_per_iteration": 2.600426435470581 + }, + { + "auxiliary_loss_clip": 0.01135855, + "auxiliary_loss_mlp": 0.01122032, + "balance_loss_clip": 1.00212908, + "balance_loss_mlp": 1.0007242, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 3.356094164916974, + "language_loss": 0.66194105, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.68451989, + "num_input_tokens_seen": 148607980, + "step": 6921, + "time_per_iteration": 3.9936230182647705 + }, + { + "auxiliary_loss_clip": 0.01137183, + "auxiliary_loss_mlp": 0.01122178, + "balance_loss_clip": 1.00203335, + "balance_loss_mlp": 1.00077486, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 1.9945931921123, + "language_loss": 0.80341059, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82600427, + "num_input_tokens_seen": 148624490, + "step": 6922, + "time_per_iteration": 2.567962169647217 + }, + { + "auxiliary_loss_clip": 0.01135765, + "auxiliary_loss_mlp": 0.01122421, + "balance_loss_clip": 1.00206387, + "balance_loss_mlp": 1.00082719, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 2.331735891718798, + "language_loss": 0.67854506, + "learning_rate": 2.628839621341247e-06, + "loss": 0.70112693, + "num_input_tokens_seen": 148646490, + "step": 6923, + "time_per_iteration": 2.6495463848114014 + }, + { + "auxiliary_loss_clip": 0.01137861, + "auxiliary_loss_mlp": 0.01122369, + "balance_loss_clip": 1.00207496, + "balance_loss_mlp": 1.0008707, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.8344596306621912, + "language_loss": 0.75950068, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.782103, + "num_input_tokens_seen": 148668580, + "step": 6924, + "time_per_iteration": 4.055726051330566 + }, + { + "auxiliary_loss_clip": 0.01169012, + "auxiliary_loss_mlp": 0.01121549, + "balance_loss_clip": 1.0020647, + "balance_loss_mlp": 1.00062323, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 2.1154999684647056, + "language_loss": 0.73115551, + "learning_rate": 2.62810015415423e-06, + "loss": 0.7540611, + "num_input_tokens_seen": 148688410, + "step": 6925, + "time_per_iteration": 2.5542497634887695 + }, + { + "auxiliary_loss_clip": 0.01137171, + "auxiliary_loss_mlp": 0.01121392, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00065744, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 4.018777374065677, + "language_loss": 0.8366946, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.85928023, + "num_input_tokens_seen": 148704855, + "step": 6926, + "time_per_iteration": 4.041667699813843 + }, + { + "auxiliary_loss_clip": 0.0113567, + "auxiliary_loss_mlp": 0.0112086, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00060141, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.6271910025520566, + "language_loss": 0.86662185, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.8891871, + "num_input_tokens_seen": 148723065, + "step": 6927, + "time_per_iteration": 2.5867950916290283 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01121662, + "balance_loss_clip": 1.00218344, + "balance_loss_mlp": 1.00083101, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 2.5210400965384214, + "language_loss": 0.72566807, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74841136, + "num_input_tokens_seen": 148741780, + "step": 6928, + "time_per_iteration": 3.927920341491699 + }, + { + "auxiliary_loss_clip": 0.01135466, + "auxiliary_loss_mlp": 0.01121935, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00062728, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 1.8129012435802663, + "language_loss": 0.78010786, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80268186, + "num_input_tokens_seen": 148759795, + "step": 6929, + "time_per_iteration": 2.61877703666687 + }, + { + "auxiliary_loss_clip": 0.01168928, + "auxiliary_loss_mlp": 0.01121341, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00060606, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 2.7266759055520593, + "language_loss": 0.70689321, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.72979593, + "num_input_tokens_seen": 148778680, + "step": 6930, + "time_per_iteration": 2.516317129135132 + }, + { + "auxiliary_loss_clip": 0.01135471, + "auxiliary_loss_mlp": 0.01122217, + "balance_loss_clip": 1.00183141, + "balance_loss_mlp": 1.00071931, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.636201679923217, + "language_loss": 0.81437987, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83695674, + "num_input_tokens_seen": 148796470, + "step": 6931, + "time_per_iteration": 2.605774402618408 + }, + { + "auxiliary_loss_clip": 0.0110558, + "auxiliary_loss_mlp": 0.0112111, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.00075674, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 2.0811255948375287, + "language_loss": 0.78960371, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81187057, + "num_input_tokens_seen": 148815300, + "step": 6932, + "time_per_iteration": 2.69512677192688 + }, + { + "auxiliary_loss_clip": 0.01135624, + "auxiliary_loss_mlp": 0.00747604, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00027084, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 2.024156776829609, + "language_loss": 0.81445271, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83328497, + "num_input_tokens_seen": 148834315, + "step": 6933, + "time_per_iteration": 2.685673475265503 + }, + { + "auxiliary_loss_clip": 0.01169092, + "auxiliary_loss_mlp": 0.01122216, + "balance_loss_clip": 1.00205028, + "balance_loss_mlp": 1.00071752, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 1.8296821343575553, + "language_loss": 0.77146578, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79437888, + "num_input_tokens_seen": 148852420, + "step": 6934, + "time_per_iteration": 2.510406017303467 + }, + { + "auxiliary_loss_clip": 0.01152379, + "auxiliary_loss_mlp": 0.01121908, + "balance_loss_clip": 1.00207448, + "balance_loss_mlp": 1.00088644, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.7955786515540473, + "language_loss": 0.67723334, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69997615, + "num_input_tokens_seen": 148869305, + "step": 6935, + "time_per_iteration": 2.572005271911621 + }, + { + "auxiliary_loss_clip": 0.01135368, + "auxiliary_loss_mlp": 0.01121587, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00085163, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.6594192092459554, + "language_loss": 0.72596085, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.74853039, + "num_input_tokens_seen": 148886395, + "step": 6936, + "time_per_iteration": 2.584775447845459 + }, + { + "auxiliary_loss_clip": 0.01152399, + "auxiliary_loss_mlp": 0.01121374, + "balance_loss_clip": 1.00196707, + "balance_loss_mlp": 1.00102067, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 1.7230401348816635, + "language_loss": 0.73969787, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.76243567, + "num_input_tokens_seen": 148905235, + "step": 6937, + "time_per_iteration": 2.546138286590576 + }, + { + "auxiliary_loss_clip": 0.0113564, + "auxiliary_loss_mlp": 0.01121242, + "balance_loss_clip": 1.00196564, + "balance_loss_mlp": 1.00069773, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.405600446877171, + "language_loss": 0.8409946, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86356342, + "num_input_tokens_seen": 148928130, + "step": 6938, + "time_per_iteration": 2.677546739578247 + }, + { + "auxiliary_loss_clip": 0.01141915, + "auxiliary_loss_mlp": 0.0112229, + "balance_loss_clip": 1.00229502, + "balance_loss_mlp": 1.00069702, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 2.1659443078478455, + "language_loss": 0.74627841, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76892042, + "num_input_tokens_seen": 148948790, + "step": 6939, + "time_per_iteration": 2.668921709060669 + }, + { + "auxiliary_loss_clip": 0.01153407, + "auxiliary_loss_mlp": 0.01121522, + "balance_loss_clip": 1.002074, + "balance_loss_mlp": 1.00069177, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.8011746787298162, + "language_loss": 0.75048387, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77323318, + "num_input_tokens_seen": 148967690, + "step": 6940, + "time_per_iteration": 2.5918285846710205 + }, + { + "auxiliary_loss_clip": 0.01169071, + "auxiliary_loss_mlp": 0.011212, + "balance_loss_clip": 1.00214911, + "balance_loss_mlp": 1.0007515, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.5283091568553882, + "language_loss": 0.71641076, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73931348, + "num_input_tokens_seen": 148987150, + "step": 6941, + "time_per_iteration": 2.575336456298828 + }, + { + "auxiliary_loss_clip": 0.01152739, + "auxiliary_loss_mlp": 0.01122688, + "balance_loss_clip": 1.00202131, + "balance_loss_mlp": 1.00071347, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 1.8253891093701498, + "language_loss": 0.73800063, + "learning_rate": 2.621810847844104e-06, + "loss": 0.76075494, + "num_input_tokens_seen": 149004895, + "step": 6942, + "time_per_iteration": 2.606529712677002 + }, + { + "auxiliary_loss_clip": 0.01122648, + "auxiliary_loss_mlp": 0.01122154, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00075126, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.260543177135512, + "language_loss": 0.72442007, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74686801, + "num_input_tokens_seen": 149020970, + "step": 6943, + "time_per_iteration": 2.658264636993408 + }, + { + "auxiliary_loss_clip": 0.01122372, + "auxiliary_loss_mlp": 0.00747644, + "balance_loss_clip": 1.00202453, + "balance_loss_mlp": 1.00024235, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 8.551674701712635, + "language_loss": 0.63587242, + "learning_rate": 2.621070480118111e-06, + "loss": 0.65457255, + "num_input_tokens_seen": 149041795, + "step": 6944, + "time_per_iteration": 2.7253963947296143 + }, + { + "auxiliary_loss_clip": 0.0113875, + "auxiliary_loss_mlp": 0.01121626, + "balance_loss_clip": 1.00213552, + "balance_loss_mlp": 1.00069976, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.7421693730230863, + "language_loss": 0.70072544, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72332919, + "num_input_tokens_seen": 149063700, + "step": 6945, + "time_per_iteration": 2.6854777336120605 + }, + { + "auxiliary_loss_clip": 0.01121979, + "auxiliary_loss_mlp": 0.01121783, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00076163, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.6775122508314992, + "language_loss": 0.80768347, + "learning_rate": 2.620330018187899e-06, + "loss": 0.83012104, + "num_input_tokens_seen": 149082410, + "step": 6946, + "time_per_iteration": 2.6716270446777344 + }, + { + "auxiliary_loss_clip": 0.0115849, + "auxiliary_loss_mlp": 0.01121068, + "balance_loss_clip": 1.0022366, + "balance_loss_mlp": 1.00071383, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.5828189902754177, + "language_loss": 0.77976561, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.80256122, + "num_input_tokens_seen": 149098745, + "step": 6947, + "time_per_iteration": 2.52716064453125 + }, + { + "auxiliary_loss_clip": 0.01169089, + "auxiliary_loss_mlp": 0.0112211, + "balance_loss_clip": 1.0022161, + "balance_loss_mlp": 1.00089788, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 1.6264710925200447, + "language_loss": 0.71613872, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73905063, + "num_input_tokens_seen": 149122255, + "step": 6948, + "time_per_iteration": 2.602193593978882 + }, + { + "auxiliary_loss_clip": 0.01152172, + "auxiliary_loss_mlp": 0.01121135, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00068593, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.5253272395287536, + "language_loss": 0.77032387, + "learning_rate": 2.619219148905362e-06, + "loss": 0.79305696, + "num_input_tokens_seen": 149142845, + "step": 6949, + "time_per_iteration": 2.6018660068511963 + }, + { + "auxiliary_loss_clip": 0.01137734, + "auxiliary_loss_mlp": 0.01121534, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.0007987, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.6608837978320705, + "language_loss": 0.81771153, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.8403042, + "num_input_tokens_seen": 149163375, + "step": 6950, + "time_per_iteration": 2.6371066570281982 + }, + { + "auxiliary_loss_clip": 0.01135838, + "auxiliary_loss_mlp": 0.00747438, + "balance_loss_clip": 1.00200546, + "balance_loss_mlp": 1.00030017, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 2.3543890773324105, + "language_loss": 0.76199985, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78083265, + "num_input_tokens_seen": 149185610, + "step": 6951, + "time_per_iteration": 2.6616106033325195 + }, + { + "auxiliary_loss_clip": 0.01102965, + "auxiliary_loss_mlp": 0.01121872, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.00066018, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 1.6454248294911533, + "language_loss": 0.73015988, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75240815, + "num_input_tokens_seen": 149203990, + "step": 6952, + "time_per_iteration": 2.675877094268799 + }, + { + "auxiliary_loss_clip": 0.01152659, + "auxiliary_loss_mlp": 0.01121762, + "balance_loss_clip": 1.00203729, + "balance_loss_mlp": 1.00074124, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 3.251991426118674, + "language_loss": 0.71319532, + "learning_rate": 2.617737661195593e-06, + "loss": 0.7359395, + "num_input_tokens_seen": 149221385, + "step": 6953, + "time_per_iteration": 2.5169289112091064 + }, + { + "auxiliary_loss_clip": 0.01153612, + "auxiliary_loss_mlp": 0.01121577, + "balance_loss_clip": 1.0021342, + "balance_loss_mlp": 1.00074601, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 2.483224892606047, + "language_loss": 0.7560187, + "learning_rate": 2.617367230671353e-06, + "loss": 0.77877057, + "num_input_tokens_seen": 149241175, + "step": 6954, + "time_per_iteration": 2.552647829055786 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01121769, + "balance_loss_clip": 1.00178027, + "balance_loss_mlp": 1.00074744, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.957654732988287, + "language_loss": 0.84572625, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86798453, + "num_input_tokens_seen": 149259115, + "step": 6955, + "time_per_iteration": 2.658522844314575 + }, + { + "auxiliary_loss_clip": 0.01153187, + "auxiliary_loss_mlp": 0.01121275, + "balance_loss_clip": 1.00204468, + "balance_loss_mlp": 1.00073111, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.588596206375089, + "language_loss": 0.83253986, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85528445, + "num_input_tokens_seen": 149278705, + "step": 6956, + "time_per_iteration": 2.6039984226226807 + }, + { + "auxiliary_loss_clip": 0.01120039, + "auxiliary_loss_mlp": 0.01121644, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00081325, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 1.9672119451360959, + "language_loss": 0.71842498, + "learning_rate": 2.616255798691059e-06, + "loss": 0.74084181, + "num_input_tokens_seen": 149294040, + "step": 6957, + "time_per_iteration": 2.666809320449829 + }, + { + "auxiliary_loss_clip": 0.01120673, + "auxiliary_loss_mlp": 0.01121662, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00102186, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 3.734366111179811, + "language_loss": 0.74968708, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77211034, + "num_input_tokens_seen": 149310385, + "step": 6958, + "time_per_iteration": 4.024616718292236 + }, + { + "auxiliary_loss_clip": 0.01107195, + "auxiliary_loss_mlp": 0.00747606, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00028598, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.716082006380858, + "language_loss": 0.7687695, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.78731751, + "num_input_tokens_seen": 149328235, + "step": 6959, + "time_per_iteration": 2.7088418006896973 + }, + { + "auxiliary_loss_clip": 0.01120617, + "auxiliary_loss_mlp": 0.00747665, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00032175, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 1.672917827413931, + "language_loss": 0.76785719, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78654003, + "num_input_tokens_seen": 149347465, + "step": 6960, + "time_per_iteration": 2.6572043895721436 + }, + { + "auxiliary_loss_clip": 0.01135533, + "auxiliary_loss_mlp": 0.01120603, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00063074, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 2.090719837101269, + "language_loss": 0.75387776, + "learning_rate": 2.614773562290835e-06, + "loss": 0.77643913, + "num_input_tokens_seen": 149366685, + "step": 6961, + "time_per_iteration": 2.6185648441314697 + }, + { + "auxiliary_loss_clip": 0.01117519, + "auxiliary_loss_mlp": 0.01101547, + "balance_loss_clip": 1.00171971, + "balance_loss_mlp": 1.00007629, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7789711061000048, + "language_loss": 0.54730821, + "learning_rate": 2.61440294487496e-06, + "loss": 0.5694989, + "num_input_tokens_seen": 149422925, + "step": 6962, + "time_per_iteration": 4.468150854110718 + }, + { + "auxiliary_loss_clip": 0.01152546, + "auxiliary_loss_mlp": 0.01122351, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00085247, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.9408450555330874, + "language_loss": 0.85350394, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87625289, + "num_input_tokens_seen": 149440820, + "step": 6963, + "time_per_iteration": 3.933694362640381 + }, + { + "auxiliary_loss_clip": 0.01135949, + "auxiliary_loss_mlp": 0.01121034, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00077558, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.472917593830425, + "language_loss": 0.70081043, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72338033, + "num_input_tokens_seen": 149461060, + "step": 6964, + "time_per_iteration": 2.6049323081970215 + }, + { + "auxiliary_loss_clip": 0.01168809, + "auxiliary_loss_mlp": 0.01120238, + "balance_loss_clip": 1.00211143, + "balance_loss_mlp": 1.00074303, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.659467046420573, + "language_loss": 0.71005976, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73295021, + "num_input_tokens_seen": 149483115, + "step": 6965, + "time_per_iteration": 4.084373950958252 + }, + { + "auxiliary_loss_clip": 0.01103508, + "auxiliary_loss_mlp": 0.01121125, + "balance_loss_clip": 1.0017333, + "balance_loss_mlp": 1.00067639, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.6804077118588487, + "language_loss": 0.72228765, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74453402, + "num_input_tokens_seen": 149501495, + "step": 6966, + "time_per_iteration": 2.67218279838562 + }, + { + "auxiliary_loss_clip": 0.01153586, + "auxiliary_loss_mlp": 0.01122095, + "balance_loss_clip": 1.00206435, + "balance_loss_mlp": 1.00069237, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.0292825925340003, + "language_loss": 0.70825821, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73101503, + "num_input_tokens_seen": 149523170, + "step": 6967, + "time_per_iteration": 2.722557544708252 + }, + { + "auxiliary_loss_clip": 0.01149321, + "auxiliary_loss_mlp": 0.01101519, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00004768, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.671743031239697, + "language_loss": 0.46283388, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48534226, + "num_input_tokens_seen": 149583955, + "step": 6968, + "time_per_iteration": 3.1537296772003174 + }, + { + "auxiliary_loss_clip": 0.01152336, + "auxiliary_loss_mlp": 0.01122773, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.00089288, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.8152509060233148, + "language_loss": 0.75103885, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77378994, + "num_input_tokens_seen": 149604440, + "step": 6969, + "time_per_iteration": 2.6314098834991455 + }, + { + "auxiliary_loss_clip": 0.01135263, + "auxiliary_loss_mlp": 0.01120867, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.00070417, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 1.8626264449228411, + "language_loss": 0.80293334, + "learning_rate": 2.611437167992705e-06, + "loss": 0.82549459, + "num_input_tokens_seen": 149623745, + "step": 6970, + "time_per_iteration": 2.6887946128845215 + }, + { + "auxiliary_loss_clip": 0.01152096, + "auxiliary_loss_mlp": 0.0112113, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.00068104, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 1.754627959623878, + "language_loss": 0.83164698, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.8543793, + "num_input_tokens_seen": 149643025, + "step": 6971, + "time_per_iteration": 2.6241962909698486 + }, + { + "auxiliary_loss_clip": 0.01136023, + "auxiliary_loss_mlp": 0.01122051, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.000934, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.8085863777055293, + "language_loss": 0.74932861, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.77190936, + "num_input_tokens_seen": 149660695, + "step": 6972, + "time_per_iteration": 2.6143155097961426 + }, + { + "auxiliary_loss_clip": 0.01138591, + "auxiliary_loss_mlp": 0.01120731, + "balance_loss_clip": 1.00208747, + "balance_loss_mlp": 1.00066376, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.4406322521979942, + "language_loss": 0.73174191, + "learning_rate": 2.610324618710212e-06, + "loss": 0.75433511, + "num_input_tokens_seen": 149682040, + "step": 6973, + "time_per_iteration": 2.7310805320739746 + }, + { + "auxiliary_loss_clip": 0.01119369, + "auxiliary_loss_mlp": 0.01122317, + "balance_loss_clip": 1.00186574, + "balance_loss_mlp": 1.00081897, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 1.990103122650308, + "language_loss": 0.75065351, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77307034, + "num_input_tokens_seen": 149700855, + "step": 6974, + "time_per_iteration": 2.642789840698242 + }, + { + "auxiliary_loss_clip": 0.01153362, + "auxiliary_loss_mlp": 0.01121008, + "balance_loss_clip": 1.0019995, + "balance_loss_mlp": 1.00055933, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 1.7042432814751136, + "language_loss": 0.72543734, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74818105, + "num_input_tokens_seen": 149717360, + "step": 6975, + "time_per_iteration": 2.5446810722351074 + }, + { + "auxiliary_loss_clip": 0.01154074, + "auxiliary_loss_mlp": 0.01122025, + "balance_loss_clip": 1.00221121, + "balance_loss_mlp": 1.00071764, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.4564930699181278, + "language_loss": 0.80944169, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83220279, + "num_input_tokens_seen": 149738975, + "step": 6976, + "time_per_iteration": 2.605219602584839 + }, + { + "auxiliary_loss_clip": 0.01143336, + "auxiliary_loss_mlp": 0.01121558, + "balance_loss_clip": 1.00232327, + "balance_loss_mlp": 1.00053668, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 1.9799243995906837, + "language_loss": 0.67441034, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.69705927, + "num_input_tokens_seen": 149757055, + "step": 6977, + "time_per_iteration": 2.594623327255249 + }, + { + "auxiliary_loss_clip": 0.01152225, + "auxiliary_loss_mlp": 0.01121648, + "balance_loss_clip": 1.00210285, + "balance_loss_mlp": 1.00081754, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.28801184437371, + "language_loss": 0.81188148, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83462024, + "num_input_tokens_seen": 149772885, + "step": 6978, + "time_per_iteration": 2.563450336456299 + }, + { + "auxiliary_loss_clip": 0.01168918, + "auxiliary_loss_mlp": 0.01121838, + "balance_loss_clip": 1.00202918, + "balance_loss_mlp": 1.00081658, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 2.2868386461122667, + "language_loss": 0.82506669, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84797418, + "num_input_tokens_seen": 149791515, + "step": 6979, + "time_per_iteration": 2.535348415374756 + }, + { + "auxiliary_loss_clip": 0.01168752, + "auxiliary_loss_mlp": 0.01121625, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00079465, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 1.745882682357461, + "language_loss": 0.83401233, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85691607, + "num_input_tokens_seen": 149807250, + "step": 6980, + "time_per_iteration": 2.4945356845855713 + }, + { + "auxiliary_loss_clip": 0.01169001, + "auxiliary_loss_mlp": 0.01121673, + "balance_loss_clip": 1.00209379, + "balance_loss_mlp": 1.00084281, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.9619973837972315, + "language_loss": 0.79234153, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81524825, + "num_input_tokens_seen": 149821640, + "step": 6981, + "time_per_iteration": 2.571986436843872 + }, + { + "auxiliary_loss_clip": 0.01120526, + "auxiliary_loss_mlp": 0.01120562, + "balance_loss_clip": 1.00189495, + "balance_loss_mlp": 1.00059009, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.6283149292394885, + "language_loss": 0.83888161, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86129248, + "num_input_tokens_seen": 149840545, + "step": 6982, + "time_per_iteration": 2.6613805294036865 + }, + { + "auxiliary_loss_clip": 0.01158557, + "auxiliary_loss_mlp": 0.0112229, + "balance_loss_clip": 1.00210857, + "balance_loss_mlp": 1.00069606, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 1.8380786959722668, + "language_loss": 0.56681871, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58962721, + "num_input_tokens_seen": 149860375, + "step": 6983, + "time_per_iteration": 2.621137857437134 + }, + { + "auxiliary_loss_clip": 0.0115209, + "auxiliary_loss_mlp": 0.01121361, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00072098, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 1.8226623586144355, + "language_loss": 0.82426214, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84699667, + "num_input_tokens_seen": 149877850, + "step": 6984, + "time_per_iteration": 2.5315685272216797 + }, + { + "auxiliary_loss_clip": 0.01154053, + "auxiliary_loss_mlp": 0.01121116, + "balance_loss_clip": 1.00207424, + "balance_loss_mlp": 1.0007627, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.6038687862071614, + "language_loss": 0.7893613, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81211299, + "num_input_tokens_seen": 149896110, + "step": 6985, + "time_per_iteration": 2.6118240356445312 + }, + { + "auxiliary_loss_clip": 0.01169023, + "auxiliary_loss_mlp": 0.01122249, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.00065601, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 1.8026200391283649, + "language_loss": 0.7843523, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.80726504, + "num_input_tokens_seen": 149916495, + "step": 6986, + "time_per_iteration": 2.5666885375976562 + }, + { + "auxiliary_loss_clip": 0.01137829, + "auxiliary_loss_mlp": 0.01120572, + "balance_loss_clip": 1.00194442, + "balance_loss_mlp": 1.00059974, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.4645903348672111, + "language_loss": 0.71968186, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74226588, + "num_input_tokens_seen": 149936445, + "step": 6987, + "time_per_iteration": 2.637228012084961 + }, + { + "auxiliary_loss_clip": 0.01138774, + "auxiliary_loss_mlp": 0.00747691, + "balance_loss_clip": 1.00211167, + "balance_loss_mlp": 1.00036085, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.3479220399748235, + "language_loss": 0.74733138, + "learning_rate": 2.604758755512104e-06, + "loss": 0.76619601, + "num_input_tokens_seen": 149959430, + "step": 6988, + "time_per_iteration": 2.6974549293518066 + }, + { + "auxiliary_loss_clip": 0.01152319, + "auxiliary_loss_mlp": 0.0112188, + "balance_loss_clip": 1.00208843, + "balance_loss_mlp": 1.00066853, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.5725963201699442, + "language_loss": 0.74156684, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76430881, + "num_input_tokens_seen": 149980365, + "step": 6989, + "time_per_iteration": 2.580324411392212 + }, + { + "auxiliary_loss_clip": 0.01136713, + "auxiliary_loss_mlp": 0.01121339, + "balance_loss_clip": 1.00191009, + "balance_loss_mlp": 1.00069952, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.275232189395873, + "language_loss": 0.7107181, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.7332986, + "num_input_tokens_seen": 149997375, + "step": 6990, + "time_per_iteration": 2.55983829498291 + }, + { + "auxiliary_loss_clip": 0.0114833, + "auxiliary_loss_mlp": 0.00746395, + "balance_loss_clip": 1.00187337, + "balance_loss_mlp": 0.99978369, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8274023447268146, + "language_loss": 0.60518038, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62412763, + "num_input_tokens_seen": 150051230, + "step": 6991, + "time_per_iteration": 3.0087459087371826 + }, + { + "auxiliary_loss_clip": 0.01168982, + "auxiliary_loss_mlp": 0.01122157, + "balance_loss_clip": 1.00220478, + "balance_loss_mlp": 1.00075483, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.7346248616112974, + "language_loss": 0.83514577, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85805714, + "num_input_tokens_seen": 150071135, + "step": 6992, + "time_per_iteration": 2.558166027069092 + }, + { + "auxiliary_loss_clip": 0.01164447, + "auxiliary_loss_mlp": 0.01101493, + "balance_loss_clip": 1.00172853, + "balance_loss_mlp": 1.00002217, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8062260296087558, + "language_loss": 0.65488839, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67754781, + "num_input_tokens_seen": 150125220, + "step": 6993, + "time_per_iteration": 3.072889566421509 + }, + { + "auxiliary_loss_clip": 0.01168953, + "auxiliary_loss_mlp": 0.01122224, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00072598, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 1.7962910511133314, + "language_loss": 0.83856535, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.86147708, + "num_input_tokens_seen": 150142300, + "step": 6994, + "time_per_iteration": 2.4910645484924316 + }, + { + "auxiliary_loss_clip": 0.01153844, + "auxiliary_loss_mlp": 0.00747711, + "balance_loss_clip": 1.0021553, + "balance_loss_mlp": 1.00034142, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.5195941664925607, + "language_loss": 0.78205079, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.80106634, + "num_input_tokens_seen": 150161345, + "step": 6995, + "time_per_iteration": 2.5876200199127197 + }, + { + "auxiliary_loss_clip": 0.01123682, + "auxiliary_loss_mlp": 0.01120713, + "balance_loss_clip": 1.00212097, + "balance_loss_mlp": 1.00064564, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 2.040732887946467, + "language_loss": 0.79974306, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82218707, + "num_input_tokens_seen": 150182420, + "step": 6996, + "time_per_iteration": 4.073574542999268 + }, + { + "auxiliary_loss_clip": 0.01137227, + "auxiliary_loss_mlp": 0.00747786, + "balance_loss_clip": 1.0020411, + "balance_loss_mlp": 1.0003686, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 1.8731903125240232, + "language_loss": 0.75640225, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77525234, + "num_input_tokens_seen": 150200175, + "step": 6997, + "time_per_iteration": 2.609922409057617 + }, + { + "auxiliary_loss_clip": 0.01168806, + "auxiliary_loss_mlp": 0.01121123, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00076914, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 1.6608592027689577, + "language_loss": 0.7506119, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.77351117, + "num_input_tokens_seen": 150217100, + "step": 6998, + "time_per_iteration": 2.506263494491577 + }, + { + "auxiliary_loss_clip": 0.01168927, + "auxiliary_loss_mlp": 0.01121729, + "balance_loss_clip": 1.00214028, + "balance_loss_mlp": 1.00089884, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.7755027287422247, + "language_loss": 0.75946653, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78237307, + "num_input_tokens_seen": 150239830, + "step": 6999, + "time_per_iteration": 3.9831316471099854 + }, + { + "auxiliary_loss_clip": 0.01122749, + "auxiliary_loss_mlp": 0.01121115, + "balance_loss_clip": 1.00200796, + "balance_loss_mlp": 1.00076127, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 1.8470492958715394, + "language_loss": 0.64328849, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66572714, + "num_input_tokens_seen": 150260690, + "step": 7000, + "time_per_iteration": 4.111445665359497 + }, + { + "auxiliary_loss_clip": 0.0111985, + "auxiliary_loss_mlp": 0.01122, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.0005976, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 2.5143718936258206, + "language_loss": 0.76165861, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78407717, + "num_input_tokens_seen": 150279885, + "step": 7001, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.01125097, + "auxiliary_loss_mlp": 0.00747671, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00038457, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.6852037092554524, + "language_loss": 0.86934024, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.8880679, + "num_input_tokens_seen": 150297390, + "step": 7002, + "time_per_iteration": 2.6431667804718018 + }, + { + "auxiliary_loss_clip": 0.01119056, + "auxiliary_loss_mlp": 0.01120938, + "balance_loss_clip": 1.00189519, + "balance_loss_mlp": 1.00067937, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 1.8768305387101027, + "language_loss": 0.68026692, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.70266682, + "num_input_tokens_seen": 150317390, + "step": 7003, + "time_per_iteration": 4.064581394195557 + }, + { + "auxiliary_loss_clip": 0.0116885, + "auxiliary_loss_mlp": 0.01121738, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.0006218, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 2.0930207961898573, + "language_loss": 0.77304626, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79595214, + "num_input_tokens_seen": 150337455, + "step": 7004, + "time_per_iteration": 2.5630152225494385 + }, + { + "auxiliary_loss_clip": 0.01168755, + "auxiliary_loss_mlp": 0.01121385, + "balance_loss_clip": 1.00213265, + "balance_loss_mlp": 1.00074553, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.6495257204085179, + "language_loss": 0.68031645, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70321786, + "num_input_tokens_seen": 150355385, + "step": 7005, + "time_per_iteration": 2.5402400493621826 + }, + { + "auxiliary_loss_clip": 0.01152116, + "auxiliary_loss_mlp": 0.01121104, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.0006547, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 2.162917315273628, + "language_loss": 0.72220933, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74494153, + "num_input_tokens_seen": 150371750, + "step": 7006, + "time_per_iteration": 2.523716449737549 + }, + { + "auxiliary_loss_clip": 0.01168873, + "auxiliary_loss_mlp": 0.01121757, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00064063, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.6892647959776255, + "language_loss": 0.70868814, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.73159444, + "num_input_tokens_seen": 150389955, + "step": 7007, + "time_per_iteration": 2.520695447921753 + }, + { + "auxiliary_loss_clip": 0.01136505, + "auxiliary_loss_mlp": 0.00747613, + "balance_loss_clip": 1.00199687, + "balance_loss_mlp": 1.00034487, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.942873198118971, + "language_loss": 0.82932228, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84816349, + "num_input_tokens_seen": 150405780, + "step": 7008, + "time_per_iteration": 2.571589231491089 + }, + { + "auxiliary_loss_clip": 0.01118336, + "auxiliary_loss_mlp": 0.01121681, + "balance_loss_clip": 1.00174022, + "balance_loss_mlp": 1.00075531, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 5.794321865868056, + "language_loss": 0.71900201, + "learning_rate": 2.596957889196831e-06, + "loss": 0.74140215, + "num_input_tokens_seen": 150425615, + "step": 7009, + "time_per_iteration": 2.6789300441741943 + }, + { + "auxiliary_loss_clip": 0.01168982, + "auxiliary_loss_mlp": 0.01121171, + "balance_loss_clip": 1.00203645, + "balance_loss_mlp": 1.00053155, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 3.2996742172389846, + "language_loss": 0.66257823, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68547976, + "num_input_tokens_seen": 150445765, + "step": 7010, + "time_per_iteration": 2.557439088821411 + }, + { + "auxiliary_loss_clip": 0.01121363, + "auxiliary_loss_mlp": 0.01120865, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00070214, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.5439932495783746, + "language_loss": 0.72607291, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74849522, + "num_input_tokens_seen": 150464405, + "step": 7011, + "time_per_iteration": 2.647298812866211 + }, + { + "auxiliary_loss_clip": 0.01149602, + "auxiliary_loss_mlp": 0.01101696, + "balance_loss_clip": 1.00158286, + "balance_loss_mlp": 1.00022542, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.79618736039196, + "language_loss": 0.54329348, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56580651, + "num_input_tokens_seen": 150520430, + "step": 7012, + "time_per_iteration": 3.0442142486572266 + }, + { + "auxiliary_loss_clip": 0.01152262, + "auxiliary_loss_mlp": 0.0112153, + "balance_loss_clip": 1.00211239, + "balance_loss_mlp": 1.0006038, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.4503951564561344, + "language_loss": 0.78378838, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.8065263, + "num_input_tokens_seen": 150542610, + "step": 7013, + "time_per_iteration": 2.6051714420318604 + }, + { + "auxiliary_loss_clip": 0.01168922, + "auxiliary_loss_mlp": 0.01120994, + "balance_loss_clip": 1.00206876, + "balance_loss_mlp": 1.00073564, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.695884334772607, + "language_loss": 0.81135917, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83425832, + "num_input_tokens_seen": 150560970, + "step": 7014, + "time_per_iteration": 2.557050943374634 + }, + { + "auxiliary_loss_clip": 0.01153901, + "auxiliary_loss_mlp": 0.01121577, + "balance_loss_clip": 1.00211751, + "balance_loss_mlp": 1.00074613, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.6026326105465472, + "language_loss": 0.78156269, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.80431747, + "num_input_tokens_seen": 150582615, + "step": 7015, + "time_per_iteration": 2.610905170440674 + }, + { + "auxiliary_loss_clip": 0.01168901, + "auxiliary_loss_mlp": 0.01122482, + "balance_loss_clip": 1.00215244, + "balance_loss_mlp": 1.00069797, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.3666151705090366, + "language_loss": 0.82214808, + "learning_rate": 2.594355375584368e-06, + "loss": 0.8450619, + "num_input_tokens_seen": 150603640, + "step": 7016, + "time_per_iteration": 2.5599758625030518 + }, + { + "auxiliary_loss_clip": 0.01121896, + "auxiliary_loss_mlp": 0.01120459, + "balance_loss_clip": 1.00207078, + "balance_loss_mlp": 1.0005821, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 2.764527718321033, + "language_loss": 0.67846721, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70089078, + "num_input_tokens_seen": 150622490, + "step": 7017, + "time_per_iteration": 2.6385886669158936 + }, + { + "auxiliary_loss_clip": 0.01147798, + "auxiliary_loss_mlp": 0.01101545, + "balance_loss_clip": 1.00164294, + "balance_loss_mlp": 1.00007439, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6737746273758346, + "language_loss": 0.59447014, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61696362, + "num_input_tokens_seen": 150689545, + "step": 7018, + "time_per_iteration": 3.2668135166168213 + }, + { + "auxiliary_loss_clip": 0.01153863, + "auxiliary_loss_mlp": 0.01122013, + "balance_loss_clip": 1.00213742, + "balance_loss_mlp": 1.00061035, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.782890307408357, + "language_loss": 0.75442708, + "learning_rate": 2.593239674255382e-06, + "loss": 0.7771858, + "num_input_tokens_seen": 150707610, + "step": 7019, + "time_per_iteration": 2.534634590148926 + }, + { + "auxiliary_loss_clip": 0.01142756, + "auxiliary_loss_mlp": 0.01121564, + "balance_loss_clip": 1.00216746, + "balance_loss_mlp": 1.00063777, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 1.848123322442268, + "language_loss": 0.68913841, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71178162, + "num_input_tokens_seen": 150724530, + "step": 7020, + "time_per_iteration": 2.577315092086792 + }, + { + "auxiliary_loss_clip": 0.01136605, + "auxiliary_loss_mlp": 0.00747622, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00033617, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.6015867544763047, + "language_loss": 0.80872232, + "learning_rate": 2.592495760867347e-06, + "loss": 0.8275646, + "num_input_tokens_seen": 150742870, + "step": 7021, + "time_per_iteration": 2.620570182800293 + }, + { + "auxiliary_loss_clip": 0.01091564, + "auxiliary_loss_mlp": 0.01121611, + "balance_loss_clip": 1.00184917, + "balance_loss_mlp": 1.00058937, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.6225697671758796, + "language_loss": 0.69922352, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.72135526, + "num_input_tokens_seen": 150765500, + "step": 7022, + "time_per_iteration": 2.808675527572632 + }, + { + "auxiliary_loss_clip": 0.01153327, + "auxiliary_loss_mlp": 0.01120158, + "balance_loss_clip": 1.00211966, + "balance_loss_mlp": 1.00075841, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.6484933942551496, + "language_loss": 0.67575061, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69848543, + "num_input_tokens_seen": 150784945, + "step": 7023, + "time_per_iteration": 2.666102886199951 + }, + { + "auxiliary_loss_clip": 0.01136778, + "auxiliary_loss_mlp": 0.01120537, + "balance_loss_clip": 1.00201964, + "balance_loss_mlp": 1.00066054, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.621699996696186, + "language_loss": 0.69041979, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71299303, + "num_input_tokens_seen": 150803120, + "step": 7024, + "time_per_iteration": 2.6117963790893555 + }, + { + "auxiliary_loss_clip": 0.01168909, + "auxiliary_loss_mlp": 0.01120566, + "balance_loss_clip": 1.0021503, + "balance_loss_mlp": 1.00068927, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.6767801882147884, + "language_loss": 0.76783389, + "learning_rate": 2.591007664594147e-06, + "loss": 0.79072857, + "num_input_tokens_seen": 150823135, + "step": 7025, + "time_per_iteration": 2.557896137237549 + }, + { + "auxiliary_loss_clip": 0.0113717, + "auxiliary_loss_mlp": 0.01120281, + "balance_loss_clip": 1.00203788, + "balance_loss_mlp": 1.00078547, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.5172079292692204, + "language_loss": 0.79419422, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81676865, + "num_input_tokens_seen": 150842070, + "step": 7026, + "time_per_iteration": 2.6267120838165283 + }, + { + "auxiliary_loss_clip": 0.01164207, + "auxiliary_loss_mlp": 0.01100804, + "balance_loss_clip": 1.00151014, + "balance_loss_mlp": 1.0000962, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7180259317794346, + "language_loss": 0.61910707, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.64175713, + "num_input_tokens_seen": 150907450, + "step": 7027, + "time_per_iteration": 3.215183734893799 + }, + { + "auxiliary_loss_clip": 0.01168783, + "auxiliary_loss_mlp": 0.01120235, + "balance_loss_clip": 1.00212419, + "balance_loss_mlp": 1.00073934, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 2.0390869904908, + "language_loss": 0.70941174, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.73230195, + "num_input_tokens_seen": 150928040, + "step": 7028, + "time_per_iteration": 2.5838139057159424 + }, + { + "auxiliary_loss_clip": 0.01141899, + "auxiliary_loss_mlp": 0.0112235, + "balance_loss_clip": 1.00214756, + "balance_loss_mlp": 1.00066066, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.858496398119985, + "language_loss": 0.82364404, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84628654, + "num_input_tokens_seen": 150945760, + "step": 7029, + "time_per_iteration": 2.6105499267578125 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01121304, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.00095081, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 1.9867151803357521, + "language_loss": 0.75089777, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77316296, + "num_input_tokens_seen": 150965665, + "step": 7030, + "time_per_iteration": 2.6842427253723145 + }, + { + "auxiliary_loss_clip": 0.01168784, + "auxiliary_loss_mlp": 0.01121122, + "balance_loss_clip": 1.00210106, + "balance_loss_mlp": 1.00067329, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 3.8564257103148067, + "language_loss": 0.86426985, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88716894, + "num_input_tokens_seen": 150982260, + "step": 7031, + "time_per_iteration": 2.532850503921509 + }, + { + "auxiliary_loss_clip": 0.01152151, + "auxiliary_loss_mlp": 0.01120738, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00076628, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 1.9140132200683146, + "language_loss": 0.73416662, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75689554, + "num_input_tokens_seen": 150999990, + "step": 7032, + "time_per_iteration": 2.515047550201416 + }, + { + "auxiliary_loss_clip": 0.01137448, + "auxiliary_loss_mlp": 0.01121113, + "balance_loss_clip": 1.00205326, + "balance_loss_mlp": 1.00085473, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.9232692022465094, + "language_loss": 0.70033598, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72292161, + "num_input_tokens_seen": 151021105, + "step": 7033, + "time_per_iteration": 2.6434926986694336 + }, + { + "auxiliary_loss_clip": 0.01138023, + "auxiliary_loss_mlp": 0.00747686, + "balance_loss_clip": 1.00210333, + "balance_loss_mlp": 1.00041139, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 1.728340911045166, + "language_loss": 0.90031016, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.91916728, + "num_input_tokens_seen": 151040665, + "step": 7034, + "time_per_iteration": 3.987199306488037 + }, + { + "auxiliary_loss_clip": 0.01137145, + "auxiliary_loss_mlp": 0.01120835, + "balance_loss_clip": 1.00195217, + "balance_loss_mlp": 1.0009582, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.6355438537071665, + "language_loss": 0.76802808, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79060787, + "num_input_tokens_seen": 151061240, + "step": 7035, + "time_per_iteration": 2.627765417098999 + }, + { + "auxiliary_loss_clip": 0.01152513, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_clip": 1.00202155, + "balance_loss_mlp": 1.00092316, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.898710035472012, + "language_loss": 0.82148004, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.844217, + "num_input_tokens_seen": 151076870, + "step": 7036, + "time_per_iteration": 2.5485901832580566 + }, + { + "auxiliary_loss_clip": 0.01135546, + "auxiliary_loss_mlp": 0.01120791, + "balance_loss_clip": 1.00202394, + "balance_loss_mlp": 1.00072384, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.5535967368615207, + "language_loss": 0.70582128, + "learning_rate": 2.58654122792447e-06, + "loss": 0.72838461, + "num_input_tokens_seen": 151095110, + "step": 7037, + "time_per_iteration": 4.100390911102295 + }, + { + "auxiliary_loss_clip": 0.01118463, + "auxiliary_loss_mlp": 0.00747735, + "balance_loss_clip": 1.00181532, + "balance_loss_mlp": 1.00035548, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.952794694422387, + "language_loss": 0.77890623, + "learning_rate": 2.586168879961155e-06, + "loss": 0.7975682, + "num_input_tokens_seen": 151114355, + "step": 7038, + "time_per_iteration": 4.064671277999878 + }, + { + "auxiliary_loss_clip": 0.01111772, + "auxiliary_loss_mlp": 0.0112212, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00081241, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.169235878671685, + "language_loss": 0.67685497, + "learning_rate": 2.585796509770259e-06, + "loss": 0.69919384, + "num_input_tokens_seen": 151131505, + "step": 7039, + "time_per_iteration": 2.6823270320892334 + }, + { + "auxiliary_loss_clip": 0.01153357, + "auxiliary_loss_mlp": 0.01122641, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00076151, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 1.688004862921097, + "language_loss": 0.75957108, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78233111, + "num_input_tokens_seen": 151151555, + "step": 7040, + "time_per_iteration": 2.5817971229553223 + }, + { + "auxiliary_loss_clip": 0.01152077, + "auxiliary_loss_mlp": 0.01121333, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00059819, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.4963994353727512, + "language_loss": 0.64772165, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67045569, + "num_input_tokens_seen": 151172385, + "step": 7041, + "time_per_iteration": 4.014049291610718 + }, + { + "auxiliary_loss_clip": 0.01137393, + "auxiliary_loss_mlp": 0.01121563, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00073218, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.738028445426122, + "language_loss": 0.73570722, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.75829673, + "num_input_tokens_seen": 151194930, + "step": 7042, + "time_per_iteration": 2.7939209938049316 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01120542, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.00076008, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.5851177994383256, + "language_loss": 0.82336617, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84609228, + "num_input_tokens_seen": 151217905, + "step": 7043, + "time_per_iteration": 2.6347157955169678 + }, + { + "auxiliary_loss_clip": 0.01135691, + "auxiliary_loss_mlp": 0.01121291, + "balance_loss_clip": 1.00212991, + "balance_loss_mlp": 1.00074697, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.08244544881868, + "language_loss": 0.65011418, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67268395, + "num_input_tokens_seen": 151234580, + "step": 7044, + "time_per_iteration": 2.634916067123413 + }, + { + "auxiliary_loss_clip": 0.01154214, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_clip": 1.00225472, + "balance_loss_mlp": 1.00106025, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.7391310652208738, + "language_loss": 0.7569828, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77974772, + "num_input_tokens_seen": 151254765, + "step": 7045, + "time_per_iteration": 2.7150511741638184 + }, + { + "auxiliary_loss_clip": 0.01121179, + "auxiliary_loss_mlp": 0.01120621, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.00074446, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.1637032247505807, + "language_loss": 0.80527759, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82769561, + "num_input_tokens_seen": 151269045, + "step": 7046, + "time_per_iteration": 2.6345651149749756 + }, + { + "auxiliary_loss_clip": 0.01065166, + "auxiliary_loss_mlp": 0.01120913, + "balance_loss_clip": 1.00178742, + "balance_loss_mlp": 1.00075042, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.7399089544269477, + "language_loss": 0.77086502, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.79272592, + "num_input_tokens_seen": 151287530, + "step": 7047, + "time_per_iteration": 2.8452818393707275 + }, + { + "auxiliary_loss_clip": 0.01168705, + "auxiliary_loss_mlp": 0.01121237, + "balance_loss_clip": 1.00210023, + "balance_loss_mlp": 1.00069213, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.6766543101618476, + "language_loss": 0.68263757, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70553702, + "num_input_tokens_seen": 151308905, + "step": 7048, + "time_per_iteration": 2.5675783157348633 + }, + { + "auxiliary_loss_clip": 0.01153913, + "auxiliary_loss_mlp": 0.01121334, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00078964, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 2.109163686425486, + "language_loss": 0.78099394, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80374646, + "num_input_tokens_seen": 151326525, + "step": 7049, + "time_per_iteration": 2.5441713333129883 + }, + { + "auxiliary_loss_clip": 0.01152307, + "auxiliary_loss_mlp": 0.01121944, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00092292, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 2.155911009815105, + "language_loss": 0.82778084, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.85052335, + "num_input_tokens_seen": 151344675, + "step": 7050, + "time_per_iteration": 2.5434114933013916 + }, + { + "auxiliary_loss_clip": 0.01168871, + "auxiliary_loss_mlp": 0.01121514, + "balance_loss_clip": 1.00211835, + "balance_loss_mlp": 1.00068343, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.17044440057814, + "language_loss": 0.73267698, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75558078, + "num_input_tokens_seen": 151360730, + "step": 7051, + "time_per_iteration": 2.4841036796569824 + }, + { + "auxiliary_loss_clip": 0.0112045, + "auxiliary_loss_mlp": 0.01121885, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00067258, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.5555272543412575, + "language_loss": 0.86213863, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.88456196, + "num_input_tokens_seen": 151380445, + "step": 7052, + "time_per_iteration": 2.6607418060302734 + }, + { + "auxiliary_loss_clip": 0.01135301, + "auxiliary_loss_mlp": 0.01120861, + "balance_loss_clip": 1.00192499, + "balance_loss_mlp": 1.00107932, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.5793017274197472, + "language_loss": 0.72551811, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74807966, + "num_input_tokens_seen": 151399325, + "step": 7053, + "time_per_iteration": 2.610227108001709 + }, + { + "auxiliary_loss_clip": 0.01123231, + "auxiliary_loss_mlp": 0.00747719, + "balance_loss_clip": 1.00218534, + "balance_loss_mlp": 1.00041306, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.7336520273062455, + "language_loss": 0.82240415, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84111363, + "num_input_tokens_seen": 151417240, + "step": 7054, + "time_per_iteration": 2.6449568271636963 + }, + { + "auxiliary_loss_clip": 0.01147578, + "auxiliary_loss_mlp": 0.01101025, + "balance_loss_clip": 1.00139952, + "balance_loss_mlp": 1.00031686, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7753962623039864, + "language_loss": 0.60446274, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62694871, + "num_input_tokens_seen": 151476015, + "step": 7055, + "time_per_iteration": 3.0789499282836914 + }, + { + "auxiliary_loss_clip": 0.01168968, + "auxiliary_loss_mlp": 0.01121712, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.0007863, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.4137884044978835, + "language_loss": 0.76765817, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.79056489, + "num_input_tokens_seen": 151492035, + "step": 7056, + "time_per_iteration": 2.4903500080108643 + }, + { + "auxiliary_loss_clip": 0.01152501, + "auxiliary_loss_mlp": 0.01122009, + "balance_loss_clip": 1.00201607, + "balance_loss_mlp": 1.00079668, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 1.8170220685147378, + "language_loss": 0.8408165, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86356163, + "num_input_tokens_seen": 151508970, + "step": 7057, + "time_per_iteration": 2.564493179321289 + }, + { + "auxiliary_loss_clip": 0.01118628, + "auxiliary_loss_mlp": 0.01122037, + "balance_loss_clip": 1.00185335, + "balance_loss_mlp": 1.00082493, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 2.114159181454496, + "language_loss": 0.83291382, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85532051, + "num_input_tokens_seen": 151525295, + "step": 7058, + "time_per_iteration": 2.6467323303222656 + }, + { + "auxiliary_loss_clip": 0.01134661, + "auxiliary_loss_mlp": 0.00747728, + "balance_loss_clip": 1.00207365, + "balance_loss_mlp": 1.00041759, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 2.6253866532846186, + "language_loss": 0.80266678, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82149071, + "num_input_tokens_seen": 151544435, + "step": 7059, + "time_per_iteration": 2.629269599914551 + }, + { + "auxiliary_loss_clip": 0.01168837, + "auxiliary_loss_mlp": 0.01121829, + "balance_loss_clip": 1.00205541, + "balance_loss_mlp": 1.00080824, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 3.1635152377734523, + "language_loss": 0.69928837, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72219503, + "num_input_tokens_seen": 151559520, + "step": 7060, + "time_per_iteration": 2.495985269546509 + }, + { + "auxiliary_loss_clip": 0.01153921, + "auxiliary_loss_mlp": 0.01121763, + "balance_loss_clip": 1.00213814, + "balance_loss_mlp": 1.00074208, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.4802888787370645, + "language_loss": 0.75940347, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78216034, + "num_input_tokens_seen": 151579790, + "step": 7061, + "time_per_iteration": 2.583662748336792 + }, + { + "auxiliary_loss_clip": 0.01152172, + "auxiliary_loss_mlp": 0.01121333, + "balance_loss_clip": 1.00201213, + "balance_loss_mlp": 1.00088358, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 1.9090834442943745, + "language_loss": 0.72365713, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.74639219, + "num_input_tokens_seen": 151598285, + "step": 7062, + "time_per_iteration": 2.546290636062622 + }, + { + "auxiliary_loss_clip": 0.0113678, + "auxiliary_loss_mlp": 0.01121217, + "balance_loss_clip": 1.0019753, + "balance_loss_mlp": 1.00105381, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.7510459756543884, + "language_loss": 0.66266614, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68524605, + "num_input_tokens_seen": 151615430, + "step": 7063, + "time_per_iteration": 2.6194427013397217 + }, + { + "auxiliary_loss_clip": 0.0113711, + "auxiliary_loss_mlp": 0.00747647, + "balance_loss_clip": 1.00206244, + "balance_loss_mlp": 1.00041151, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.799630981227592, + "language_loss": 0.78870547, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80755305, + "num_input_tokens_seen": 151637030, + "step": 7064, + "time_per_iteration": 2.702090263366699 + }, + { + "auxiliary_loss_clip": 0.01168904, + "auxiliary_loss_mlp": 0.01121314, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.00086546, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 1.8897985522807281, + "language_loss": 0.74829519, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77119732, + "num_input_tokens_seen": 151655745, + "step": 7065, + "time_per_iteration": 2.5761938095092773 + }, + { + "auxiliary_loss_clip": 0.01152355, + "auxiliary_loss_mlp": 0.01120675, + "balance_loss_clip": 1.00227666, + "balance_loss_mlp": 1.00089371, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.3373249162543634, + "language_loss": 0.72431946, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74704969, + "num_input_tokens_seen": 151678040, + "step": 7066, + "time_per_iteration": 2.619581460952759 + }, + { + "auxiliary_loss_clip": 0.01119801, + "auxiliary_loss_mlp": 0.0112083, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00057197, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 1.882410435007162, + "language_loss": 0.80017668, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.82258296, + "num_input_tokens_seen": 151696410, + "step": 7067, + "time_per_iteration": 2.657072067260742 + }, + { + "auxiliary_loss_clip": 0.01164392, + "auxiliary_loss_mlp": 0.01101013, + "balance_loss_clip": 1.00167358, + "balance_loss_mlp": 1.00030458, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.9206327428982595, + "language_loss": 0.63477224, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65742624, + "num_input_tokens_seen": 151756365, + "step": 7068, + "time_per_iteration": 3.0443015098571777 + }, + { + "auxiliary_loss_clip": 0.01168727, + "auxiliary_loss_mlp": 0.01121546, + "balance_loss_clip": 1.00202537, + "balance_loss_mlp": 1.00061989, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.615937167215933, + "language_loss": 0.72228718, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74518991, + "num_input_tokens_seen": 151775165, + "step": 7069, + "time_per_iteration": 2.4979896545410156 + }, + { + "auxiliary_loss_clip": 0.01168822, + "auxiliary_loss_mlp": 0.0112133, + "balance_loss_clip": 1.00218916, + "balance_loss_mlp": 1.00068998, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 2.066168585475718, + "language_loss": 0.7902348, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81313634, + "num_input_tokens_seen": 151792620, + "step": 7070, + "time_per_iteration": 2.529996633529663 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.01120832, + "balance_loss_clip": 1.00209272, + "balance_loss_mlp": 1.00066948, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.6924401689619157, + "language_loss": 0.70370501, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72643125, + "num_input_tokens_seen": 151812850, + "step": 7071, + "time_per_iteration": 3.9743664264678955 + }, + { + "auxiliary_loss_clip": 0.01168793, + "auxiliary_loss_mlp": 0.01120921, + "balance_loss_clip": 1.00217414, + "balance_loss_mlp": 1.00066233, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.6810975676074142, + "language_loss": 0.70818281, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73107994, + "num_input_tokens_seen": 151831785, + "step": 7072, + "time_per_iteration": 2.5477893352508545 + }, + { + "auxiliary_loss_clip": 0.01127028, + "auxiliary_loss_mlp": 0.01121243, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00069857, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 7.380456140131202, + "language_loss": 0.81502908, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83751178, + "num_input_tokens_seen": 151853885, + "step": 7073, + "time_per_iteration": 2.6520941257476807 + }, + { + "auxiliary_loss_clip": 0.01151839, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.00210965, + "balance_loss_mlp": 1.00061464, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 3.47572747196753, + "language_loss": 0.91274047, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.93546188, + "num_input_tokens_seen": 151871780, + "step": 7074, + "time_per_iteration": 2.522688865661621 + }, + { + "auxiliary_loss_clip": 0.01153124, + "auxiliary_loss_mlp": 0.00747683, + "balance_loss_clip": 1.00204229, + "balance_loss_mlp": 1.00036919, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 2.679930369765195, + "language_loss": 0.63988137, + "learning_rate": 2.572376498508805e-06, + "loss": 0.65888947, + "num_input_tokens_seen": 151891600, + "step": 7075, + "time_per_iteration": 4.040294647216797 + }, + { + "auxiliary_loss_clip": 0.011199, + "auxiliary_loss_mlp": 0.01120088, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00059283, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 5.1722932583583185, + "language_loss": 0.73380804, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75620794, + "num_input_tokens_seen": 151911330, + "step": 7076, + "time_per_iteration": 4.103160381317139 + }, + { + "auxiliary_loss_clip": 0.01138566, + "auxiliary_loss_mlp": 0.01121683, + "balance_loss_clip": 1.00207019, + "balance_loss_mlp": 1.00075674, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 2.3478936387172524, + "language_loss": 0.78279555, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80539805, + "num_input_tokens_seen": 151930355, + "step": 7077, + "time_per_iteration": 2.617602586746216 + }, + { + "auxiliary_loss_clip": 0.01137043, + "auxiliary_loss_mlp": 0.0111918, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00054312, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 2.017417174479121, + "language_loss": 0.73343915, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75600135, + "num_input_tokens_seen": 151949695, + "step": 7078, + "time_per_iteration": 2.5989084243774414 + }, + { + "auxiliary_loss_clip": 0.01135261, + "auxiliary_loss_mlp": 0.01120393, + "balance_loss_clip": 1.00222015, + "balance_loss_mlp": 1.00089741, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.6502054238717327, + "language_loss": 0.79393721, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81649375, + "num_input_tokens_seen": 151967640, + "step": 7079, + "time_per_iteration": 2.5935473442077637 + }, + { + "auxiliary_loss_clip": 0.01152157, + "auxiliary_loss_mlp": 0.01120497, + "balance_loss_clip": 1.00226724, + "balance_loss_mlp": 1.00081086, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.6824669914983124, + "language_loss": 0.71722984, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.73995638, + "num_input_tokens_seen": 151994020, + "step": 7080, + "time_per_iteration": 4.182163238525391 + }, + { + "auxiliary_loss_clip": 0.01168505, + "auxiliary_loss_mlp": 0.01119309, + "balance_loss_clip": 1.00195944, + "balance_loss_mlp": 1.0005765, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 2.055506012918468, + "language_loss": 0.80419505, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82707322, + "num_input_tokens_seen": 152013415, + "step": 7081, + "time_per_iteration": 2.526115655899048 + }, + { + "auxiliary_loss_clip": 0.01143152, + "auxiliary_loss_mlp": 0.01119521, + "balance_loss_clip": 1.00220442, + "balance_loss_mlp": 1.00059772, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.7542101439154396, + "language_loss": 0.81630874, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.83893549, + "num_input_tokens_seen": 152030860, + "step": 7082, + "time_per_iteration": 2.5771920680999756 + }, + { + "auxiliary_loss_clip": 0.01152114, + "auxiliary_loss_mlp": 0.01120745, + "balance_loss_clip": 1.00217104, + "balance_loss_mlp": 1.00067711, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 2.1449620239115474, + "language_loss": 0.69757921, + "learning_rate": 2.569390430547065e-06, + "loss": 0.72030771, + "num_input_tokens_seen": 152050395, + "step": 7083, + "time_per_iteration": 2.585721015930176 + }, + { + "auxiliary_loss_clip": 0.01149747, + "auxiliary_loss_mlp": 0.01101034, + "balance_loss_clip": 1.00180686, + "balance_loss_mlp": 1.0003258, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8697131527389634, + "language_loss": 0.67175269, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69426048, + "num_input_tokens_seen": 152113555, + "step": 7084, + "time_per_iteration": 3.2295000553131104 + }, + { + "auxiliary_loss_clip": 0.01151994, + "auxiliary_loss_mlp": 0.01120345, + "balance_loss_clip": 1.00200796, + "balance_loss_mlp": 1.00075412, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 2.2788605671608972, + "language_loss": 0.78489316, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80761659, + "num_input_tokens_seen": 152131575, + "step": 7085, + "time_per_iteration": 2.541862726211548 + }, + { + "auxiliary_loss_clip": 0.01158502, + "auxiliary_loss_mlp": 0.01121476, + "balance_loss_clip": 1.00213218, + "balance_loss_mlp": 1.000741, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 3.7827236465497536, + "language_loss": 0.76117861, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78397846, + "num_input_tokens_seen": 152149435, + "step": 7086, + "time_per_iteration": 2.5413944721221924 + }, + { + "auxiliary_loss_clip": 0.01136846, + "auxiliary_loss_mlp": 0.01120573, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00069594, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 1.8305844176259771, + "language_loss": 0.79826999, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82084417, + "num_input_tokens_seen": 152166860, + "step": 7087, + "time_per_iteration": 2.591407060623169 + }, + { + "auxiliary_loss_clip": 0.01137018, + "auxiliary_loss_mlp": 0.01120478, + "balance_loss_clip": 1.00206256, + "balance_loss_mlp": 1.00060105, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.983451047199347, + "language_loss": 0.65841717, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68099213, + "num_input_tokens_seen": 152187475, + "step": 7088, + "time_per_iteration": 2.622448682785034 + }, + { + "auxiliary_loss_clip": 0.01110995, + "auxiliary_loss_mlp": 0.01120766, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.00069821, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 2.621742801385275, + "language_loss": 0.68302292, + "learning_rate": 2.56714997234313e-06, + "loss": 0.7053405, + "num_input_tokens_seen": 152207235, + "step": 7089, + "time_per_iteration": 2.717041492462158 + }, + { + "auxiliary_loss_clip": 0.01103241, + "auxiliary_loss_mlp": 0.01120765, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00069785, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 4.401841075374576, + "language_loss": 0.7303108, + "learning_rate": 2.566776487287525e-06, + "loss": 0.7525509, + "num_input_tokens_seen": 152224240, + "step": 7090, + "time_per_iteration": 2.6241703033447266 + }, + { + "auxiliary_loss_clip": 0.01137553, + "auxiliary_loss_mlp": 0.01121482, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00084186, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 3.41064335722862, + "language_loss": 0.75006449, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77265489, + "num_input_tokens_seen": 152242595, + "step": 7091, + "time_per_iteration": 2.668941020965576 + }, + { + "auxiliary_loss_clip": 0.01104172, + "auxiliary_loss_mlp": 0.01118969, + "balance_loss_clip": 1.00190532, + "balance_loss_mlp": 1.00061846, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 2.542638551137544, + "language_loss": 0.82386792, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84609938, + "num_input_tokens_seen": 152260840, + "step": 7092, + "time_per_iteration": 2.673846960067749 + }, + { + "auxiliary_loss_clip": 0.01137471, + "auxiliary_loss_mlp": 0.01122145, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00074232, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.5075361063936776, + "language_loss": 0.735291, + "learning_rate": 2.565655903224038e-06, + "loss": 0.75788713, + "num_input_tokens_seen": 152280580, + "step": 7093, + "time_per_iteration": 2.6557083129882812 + }, + { + "auxiliary_loss_clip": 0.01153898, + "auxiliary_loss_mlp": 0.01120855, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.0007875, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.153709401605424, + "language_loss": 0.70059907, + "learning_rate": 2.565282332284532e-06, + "loss": 0.72334659, + "num_input_tokens_seen": 152298455, + "step": 7094, + "time_per_iteration": 2.5838463306427 + }, + { + "auxiliary_loss_clip": 0.01118203, + "auxiliary_loss_mlp": 0.01120639, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00066674, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.652905906294462, + "language_loss": 0.81354475, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83593315, + "num_input_tokens_seen": 152316995, + "step": 7095, + "time_per_iteration": 2.651123285293579 + }, + { + "auxiliary_loss_clip": 0.01168876, + "auxiliary_loss_mlp": 0.01120702, + "balance_loss_clip": 1.00211239, + "balance_loss_mlp": 1.00072932, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 1.9105076542586716, + "language_loss": 0.8069244, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82982016, + "num_input_tokens_seen": 152334800, + "step": 7096, + "time_per_iteration": 2.5057127475738525 + }, + { + "auxiliary_loss_clip": 0.01152235, + "auxiliary_loss_mlp": 0.01120851, + "balance_loss_clip": 1.00204134, + "balance_loss_mlp": 1.00068784, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 1.9247221996916617, + "language_loss": 0.65702271, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67975354, + "num_input_tokens_seen": 152355175, + "step": 7097, + "time_per_iteration": 2.5790061950683594 + }, + { + "auxiliary_loss_clip": 0.01141907, + "auxiliary_loss_mlp": 0.01120563, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.0004952, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.797803142182985, + "language_loss": 0.74347579, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76610041, + "num_input_tokens_seen": 152377245, + "step": 7098, + "time_per_iteration": 2.624180793762207 + }, + { + "auxiliary_loss_clip": 0.01152988, + "auxiliary_loss_mlp": 0.01119813, + "balance_loss_clip": 1.00211668, + "balance_loss_mlp": 1.00050855, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.7973091626526887, + "language_loss": 0.75431597, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77704406, + "num_input_tokens_seen": 152396985, + "step": 7099, + "time_per_iteration": 2.5873911380767822 + }, + { + "auxiliary_loss_clip": 0.01137163, + "auxiliary_loss_mlp": 0.01121414, + "balance_loss_clip": 1.00205731, + "balance_loss_mlp": 1.0006783, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 2.6521765738295127, + "language_loss": 0.83072573, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85331154, + "num_input_tokens_seen": 152415590, + "step": 7100, + "time_per_iteration": 2.5990519523620605 + }, + { + "auxiliary_loss_clip": 0.01136438, + "auxiliary_loss_mlp": 0.01120738, + "balance_loss_clip": 1.00190151, + "balance_loss_mlp": 1.00067019, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.5941811148430551, + "language_loss": 0.82202244, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84459412, + "num_input_tokens_seen": 152436735, + "step": 7101, + "time_per_iteration": 2.6339945793151855 + }, + { + "auxiliary_loss_clip": 0.01168843, + "auxiliary_loss_mlp": 0.01121026, + "balance_loss_clip": 1.00208926, + "balance_loss_mlp": 1.0005765, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 9.945778146714877, + "language_loss": 0.72670472, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74960339, + "num_input_tokens_seen": 152455685, + "step": 7102, + "time_per_iteration": 2.487074851989746 + }, + { + "auxiliary_loss_clip": 0.01153661, + "auxiliary_loss_mlp": 0.01120309, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00052738, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.7677783596156158, + "language_loss": 0.83096719, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.8537069, + "num_input_tokens_seen": 152473500, + "step": 7103, + "time_per_iteration": 2.560014247894287 + }, + { + "auxiliary_loss_clip": 0.01138593, + "auxiliary_loss_mlp": 0.01121606, + "balance_loss_clip": 1.00206518, + "balance_loss_mlp": 1.0005846, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.1394032907136613, + "language_loss": 0.73681933, + "learning_rate": 2.561545446271294e-06, + "loss": 0.75942135, + "num_input_tokens_seen": 152491320, + "step": 7104, + "time_per_iteration": 2.5869078636169434 + }, + { + "auxiliary_loss_clip": 0.01153626, + "auxiliary_loss_mlp": 0.01120062, + "balance_loss_clip": 1.00215912, + "balance_loss_mlp": 1.00066161, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.2567981126943186, + "language_loss": 0.74347484, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.76621169, + "num_input_tokens_seen": 152511970, + "step": 7105, + "time_per_iteration": 2.6506619453430176 + }, + { + "auxiliary_loss_clip": 0.01168901, + "auxiliary_loss_mlp": 0.01121312, + "balance_loss_clip": 1.00220478, + "balance_loss_mlp": 1.00067198, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 1.6845525567360045, + "language_loss": 0.76453722, + "learning_rate": 2.560797813088819e-06, + "loss": 0.78743935, + "num_input_tokens_seen": 152530515, + "step": 7106, + "time_per_iteration": 2.506136894226074 + }, + { + "auxiliary_loss_clip": 0.01136332, + "auxiliary_loss_mlp": 0.01120132, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00063705, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.7659206276332198, + "language_loss": 0.79966962, + "learning_rate": 2.560423964592229e-06, + "loss": 0.82223427, + "num_input_tokens_seen": 152549295, + "step": 7107, + "time_per_iteration": 2.6220333576202393 + }, + { + "auxiliary_loss_clip": 0.01105919, + "auxiliary_loss_mlp": 0.01120164, + "balance_loss_clip": 1.00205088, + "balance_loss_mlp": 1.00076342, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.3928776744570317, + "language_loss": 0.67776948, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70003033, + "num_input_tokens_seen": 152570725, + "step": 7108, + "time_per_iteration": 4.142235994338989 + }, + { + "auxiliary_loss_clip": 0.01141642, + "auxiliary_loss_mlp": 0.01119932, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.00062728, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.8742771614657214, + "language_loss": 0.71489114, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73750687, + "num_input_tokens_seen": 152588950, + "step": 7109, + "time_per_iteration": 2.60097074508667 + }, + { + "auxiliary_loss_clip": 0.01153687, + "auxiliary_loss_mlp": 0.01120551, + "balance_loss_clip": 1.00206518, + "balance_loss_mlp": 1.00057876, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 2.6072419575946437, + "language_loss": 0.64265192, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66539431, + "num_input_tokens_seen": 152608965, + "step": 7110, + "time_per_iteration": 2.5777151584625244 + }, + { + "auxiliary_loss_clip": 0.01168723, + "auxiliary_loss_mlp": 0.00747521, + "balance_loss_clip": 1.00214112, + "balance_loss_mlp": 1.00037086, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.7385000451968704, + "language_loss": 0.76450139, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78366375, + "num_input_tokens_seen": 152630220, + "step": 7111, + "time_per_iteration": 2.5563528537750244 + }, + { + "auxiliary_loss_clip": 0.01120025, + "auxiliary_loss_mlp": 0.01120073, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.00067341, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.7200426305200551, + "language_loss": 0.72930527, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75170618, + "num_input_tokens_seen": 152648835, + "step": 7112, + "time_per_iteration": 2.6996304988861084 + }, + { + "auxiliary_loss_clip": 0.01136927, + "auxiliary_loss_mlp": 0.0112026, + "balance_loss_clip": 1.00202262, + "balance_loss_mlp": 1.00066888, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.927988290529723, + "language_loss": 0.71344846, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73602033, + "num_input_tokens_seen": 152668375, + "step": 7113, + "time_per_iteration": 4.085842609405518 + }, + { + "auxiliary_loss_clip": 0.01151722, + "auxiliary_loss_mlp": 0.01121221, + "balance_loss_clip": 1.00211847, + "balance_loss_mlp": 1.00077164, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 1.554946346751593, + "language_loss": 0.61623317, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.63896263, + "num_input_tokens_seen": 152689725, + "step": 7114, + "time_per_iteration": 4.062589883804321 + }, + { + "auxiliary_loss_clip": 0.0115236, + "auxiliary_loss_mlp": 0.01122075, + "balance_loss_clip": 1.00215065, + "balance_loss_mlp": 1.00095868, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.6932469991152908, + "language_loss": 0.64796615, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.6707105, + "num_input_tokens_seen": 152709375, + "step": 7115, + "time_per_iteration": 2.5767323970794678 + }, + { + "auxiliary_loss_clip": 0.01137214, + "auxiliary_loss_mlp": 0.01120209, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00052285, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.5219563992390157, + "language_loss": 0.73487663, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.75745082, + "num_input_tokens_seen": 152727510, + "step": 7116, + "time_per_iteration": 2.5794410705566406 + }, + { + "auxiliary_loss_clip": 0.0113843, + "auxiliary_loss_mlp": 0.01119188, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00064635, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.659856718416788, + "language_loss": 0.69454551, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71712172, + "num_input_tokens_seen": 152746670, + "step": 7117, + "time_per_iteration": 2.6380972862243652 + }, + { + "auxiliary_loss_clip": 0.0113727, + "auxiliary_loss_mlp": 0.01120714, + "balance_loss_clip": 1.0020349, + "balance_loss_mlp": 1.00074148, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.2060118302717493, + "language_loss": 0.6958667, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.71844649, + "num_input_tokens_seen": 152760545, + "step": 7118, + "time_per_iteration": 4.018347978591919 + }, + { + "auxiliary_loss_clip": 0.01122015, + "auxiliary_loss_mlp": 0.01120417, + "balance_loss_clip": 1.00200379, + "balance_loss_mlp": 1.00092173, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.014848607256823, + "language_loss": 0.74537832, + "learning_rate": 2.55593612908444e-06, + "loss": 0.7678026, + "num_input_tokens_seen": 152780970, + "step": 7119, + "time_per_iteration": 2.757856607437134 + }, + { + "auxiliary_loss_clip": 0.01089661, + "auxiliary_loss_mlp": 0.01120766, + "balance_loss_clip": 1.00194657, + "balance_loss_mlp": 1.00069833, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 2.4182940657483463, + "language_loss": 0.74691927, + "learning_rate": 2.555562005426573e-06, + "loss": 0.76902354, + "num_input_tokens_seen": 152798475, + "step": 7120, + "time_per_iteration": 2.710904359817505 + }, + { + "auxiliary_loss_clip": 0.01135406, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.00206184, + "balance_loss_mlp": 1.00040245, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.8512401256463509, + "language_loss": 0.76993084, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.78875905, + "num_input_tokens_seen": 152817555, + "step": 7121, + "time_per_iteration": 2.653432846069336 + }, + { + "auxiliary_loss_clip": 0.01136466, + "auxiliary_loss_mlp": 0.01119954, + "balance_loss_clip": 1.00204813, + "balance_loss_mlp": 1.00074458, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 2.015418951986211, + "language_loss": 0.85963702, + "learning_rate": 2.554813694924126e-06, + "loss": 0.88220119, + "num_input_tokens_seen": 152836295, + "step": 7122, + "time_per_iteration": 2.60561203956604 + }, + { + "auxiliary_loss_clip": 0.01105336, + "auxiliary_loss_mlp": 0.01119502, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.00057864, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 1.8379944821137275, + "language_loss": 0.81707942, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83932781, + "num_input_tokens_seen": 152854950, + "step": 7123, + "time_per_iteration": 2.7040467262268066 + }, + { + "auxiliary_loss_clip": 0.01118894, + "auxiliary_loss_mlp": 0.01120377, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.00078619, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.6268709703825692, + "language_loss": 0.80858207, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83097476, + "num_input_tokens_seen": 152873995, + "step": 7124, + "time_per_iteration": 2.6591169834136963 + }, + { + "auxiliary_loss_clip": 0.01153693, + "auxiliary_loss_mlp": 0.01120063, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00056779, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 1.6398329975592438, + "language_loss": 0.80349362, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82623118, + "num_input_tokens_seen": 152892925, + "step": 7125, + "time_per_iteration": 2.593703031539917 + }, + { + "auxiliary_loss_clip": 0.01168745, + "auxiliary_loss_mlp": 0.00747323, + "balance_loss_clip": 1.0022161, + "balance_loss_mlp": 1.00026798, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 2.1066529135586496, + "language_loss": 0.7522037, + "learning_rate": 2.553316821569659e-06, + "loss": 0.77136439, + "num_input_tokens_seen": 152910935, + "step": 7126, + "time_per_iteration": 2.516740083694458 + }, + { + "auxiliary_loss_clip": 0.01152145, + "auxiliary_loss_mlp": 0.01120236, + "balance_loss_clip": 1.00208437, + "balance_loss_mlp": 1.00055003, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 1.5747952670541974, + "language_loss": 0.81366044, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83638424, + "num_input_tokens_seen": 152931030, + "step": 7127, + "time_per_iteration": 2.5883736610412598 + }, + { + "auxiliary_loss_clip": 0.01101425, + "auxiliary_loss_mlp": 0.01120802, + "balance_loss_clip": 1.00184596, + "balance_loss_mlp": 1.00083017, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 3.441917832693158, + "language_loss": 0.7639972, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78621948, + "num_input_tokens_seen": 152948085, + "step": 7128, + "time_per_iteration": 2.648688554763794 + }, + { + "auxiliary_loss_clip": 0.01103761, + "auxiliary_loss_mlp": 0.01121322, + "balance_loss_clip": 1.00192595, + "balance_loss_mlp": 1.00068235, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 4.06992492758182, + "language_loss": 0.7376163, + "learning_rate": 2.552193946194937e-06, + "loss": 0.75986713, + "num_input_tokens_seen": 152966265, + "step": 7129, + "time_per_iteration": 2.6937668323516846 + }, + { + "auxiliary_loss_clip": 0.01151721, + "auxiliary_loss_mlp": 0.00747454, + "balance_loss_clip": 1.00218725, + "balance_loss_mlp": 1.00028121, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 2.1798489516860875, + "language_loss": 0.7775147, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79650646, + "num_input_tokens_seen": 152986775, + "step": 7130, + "time_per_iteration": 2.588111400604248 + }, + { + "auxiliary_loss_clip": 0.011357, + "auxiliary_loss_mlp": 0.01120826, + "balance_loss_clip": 1.00222683, + "balance_loss_mlp": 1.00085378, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 2.194441327752025, + "language_loss": 0.73347527, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75604057, + "num_input_tokens_seen": 153003595, + "step": 7131, + "time_per_iteration": 2.5504631996154785 + }, + { + "auxiliary_loss_clip": 0.01136996, + "auxiliary_loss_mlp": 0.01120691, + "balance_loss_clip": 1.00202882, + "balance_loss_mlp": 1.00071847, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.8452491115258702, + "language_loss": 0.77865702, + "learning_rate": 2.551070882366973e-06, + "loss": 0.80123389, + "num_input_tokens_seen": 153021960, + "step": 7132, + "time_per_iteration": 2.581465721130371 + }, + { + "auxiliary_loss_clip": 0.0112039, + "auxiliary_loss_mlp": 0.00747456, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00030279, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.574585431136916, + "language_loss": 0.78956473, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80824316, + "num_input_tokens_seen": 153042110, + "step": 7133, + "time_per_iteration": 2.6862740516662598 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01120252, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00066161, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 1.9443966972640252, + "language_loss": 0.74583757, + "learning_rate": 2.550322068641355e-06, + "loss": 0.76839542, + "num_input_tokens_seen": 153058925, + "step": 7134, + "time_per_iteration": 2.5754289627075195 + }, + { + "auxiliary_loss_clip": 0.01153386, + "auxiliary_loss_mlp": 0.01119022, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.0005759, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 1.894519598836633, + "language_loss": 0.84274995, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86547405, + "num_input_tokens_seen": 153078070, + "step": 7135, + "time_per_iteration": 2.5411932468414307 + }, + { + "auxiliary_loss_clip": 0.01090169, + "auxiliary_loss_mlp": 0.01120132, + "balance_loss_clip": 1.00182116, + "balance_loss_mlp": 1.00063646, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 2.554098972168366, + "language_loss": 0.75127751, + "learning_rate": 2.549573171442666e-06, + "loss": 0.77338052, + "num_input_tokens_seen": 153096680, + "step": 7136, + "time_per_iteration": 2.7628071308135986 + }, + { + "auxiliary_loss_clip": 0.011521, + "auxiliary_loss_mlp": 0.01120584, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00061226, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 1.9126546859240119, + "language_loss": 0.78841269, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81113958, + "num_input_tokens_seen": 153113305, + "step": 7137, + "time_per_iteration": 2.5359184741973877 + }, + { + "auxiliary_loss_clip": 0.01168981, + "auxiliary_loss_mlp": 0.01120503, + "balance_loss_clip": 1.00230551, + "balance_loss_mlp": 1.00072181, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 1.8302265684708636, + "language_loss": 0.76052952, + "learning_rate": 2.548824190884499e-06, + "loss": 0.78342438, + "num_input_tokens_seen": 153132735, + "step": 7138, + "time_per_iteration": 2.5302512645721436 + }, + { + "auxiliary_loss_clip": 0.01132929, + "auxiliary_loss_mlp": 0.01100991, + "balance_loss_clip": 1.001791, + "balance_loss_mlp": 1.00028324, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7703875466490395, + "language_loss": 0.56288695, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58522612, + "num_input_tokens_seen": 153187925, + "step": 7139, + "time_per_iteration": 3.038090229034424 + }, + { + "auxiliary_loss_clip": 0.01168582, + "auxiliary_loss_mlp": 0.00747363, + "balance_loss_clip": 1.00211883, + "balance_loss_mlp": 1.00034833, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.6244683185011786, + "language_loss": 0.80967116, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.8288306, + "num_input_tokens_seen": 153206990, + "step": 7140, + "time_per_iteration": 2.537640333175659 + }, + { + "auxiliary_loss_clip": 0.01151979, + "auxiliary_loss_mlp": 0.01120049, + "balance_loss_clip": 1.0019815, + "balance_loss_mlp": 1.00036287, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.9855908498980495, + "language_loss": 0.82041597, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84313625, + "num_input_tokens_seen": 153222345, + "step": 7141, + "time_per_iteration": 2.556690216064453 + }, + { + "auxiliary_loss_clip": 0.01158539, + "auxiliary_loss_mlp": 0.01120932, + "balance_loss_clip": 1.00245357, + "balance_loss_mlp": 1.00076902, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.8116399900834759, + "language_loss": 0.86482942, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88762414, + "num_input_tokens_seen": 153240570, + "step": 7142, + "time_per_iteration": 2.581059694290161 + }, + { + "auxiliary_loss_clip": 0.01135113, + "auxiliary_loss_mlp": 0.01119394, + "balance_loss_clip": 1.00218248, + "balance_loss_mlp": 1.00066173, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 2.1806974689850693, + "language_loss": 0.78281939, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80536443, + "num_input_tokens_seen": 153259575, + "step": 7143, + "time_per_iteration": 2.6175451278686523 + }, + { + "auxiliary_loss_clip": 0.01106839, + "auxiliary_loss_mlp": 0.01120006, + "balance_loss_clip": 1.00213647, + "balance_loss_mlp": 1.00070167, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 2.255427103380362, + "language_loss": 0.76948112, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.7917496, + "num_input_tokens_seen": 153276650, + "step": 7144, + "time_per_iteration": 2.650686740875244 + }, + { + "auxiliary_loss_clip": 0.01123319, + "auxiliary_loss_mlp": 0.01119678, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.00046873, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 1.8163162567316362, + "language_loss": 0.73649204, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75892204, + "num_input_tokens_seen": 153298025, + "step": 7145, + "time_per_iteration": 2.686134099960327 + }, + { + "auxiliary_loss_clip": 0.01152093, + "auxiliary_loss_mlp": 0.01119643, + "balance_loss_clip": 1.00212896, + "balance_loss_mlp": 1.00062394, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 1.930414151559999, + "language_loss": 0.79110122, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81381857, + "num_input_tokens_seen": 153315775, + "step": 7146, + "time_per_iteration": 3.968742609024048 + }, + { + "auxiliary_loss_clip": 0.01151973, + "auxiliary_loss_mlp": 0.01119565, + "balance_loss_clip": 1.00205195, + "balance_loss_mlp": 1.0008328, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 7.659825643460132, + "language_loss": 0.83338523, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85610056, + "num_input_tokens_seen": 153332765, + "step": 7147, + "time_per_iteration": 2.6039819717407227 + }, + { + "auxiliary_loss_clip": 0.01152385, + "auxiliary_loss_mlp": 0.0112059, + "balance_loss_clip": 1.00206125, + "balance_loss_mlp": 1.00061762, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 2.9717039437922006, + "language_loss": 0.87172627, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89445609, + "num_input_tokens_seen": 153350760, + "step": 7148, + "time_per_iteration": 2.581097364425659 + }, + { + "auxiliary_loss_clip": 0.01143296, + "auxiliary_loss_mlp": 0.01119677, + "balance_loss_clip": 1.00213206, + "balance_loss_mlp": 1.00056338, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.615021243006781, + "language_loss": 0.77871799, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80134773, + "num_input_tokens_seen": 153370765, + "step": 7149, + "time_per_iteration": 2.690464973449707 + }, + { + "auxiliary_loss_clip": 0.01121822, + "auxiliary_loss_mlp": 0.01119638, + "balance_loss_clip": 1.00214934, + "balance_loss_mlp": 1.00061917, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 2.045783909552358, + "language_loss": 0.79919785, + "learning_rate": 2.544328563349256e-06, + "loss": 0.82161248, + "num_input_tokens_seen": 153390725, + "step": 7150, + "time_per_iteration": 2.6762115955352783 + }, + { + "auxiliary_loss_clip": 0.01151697, + "auxiliary_loss_mlp": 0.01120499, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00090814, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.9577889349529436, + "language_loss": 0.74776149, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.77048349, + "num_input_tokens_seen": 153408010, + "step": 7151, + "time_per_iteration": 5.304002523422241 + }, + { + "auxiliary_loss_clip": 0.01122588, + "auxiliary_loss_mlp": 0.01121366, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00082159, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 2.428253571008695, + "language_loss": 0.70339239, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72583187, + "num_input_tokens_seen": 153426865, + "step": 7152, + "time_per_iteration": 2.618877649307251 + }, + { + "auxiliary_loss_clip": 0.01143213, + "auxiliary_loss_mlp": 0.01119568, + "balance_loss_clip": 1.00192988, + "balance_loss_mlp": 1.00064504, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.7712066781421487, + "language_loss": 0.71310973, + "learning_rate": 2.54320419108402e-06, + "loss": 0.7357375, + "num_input_tokens_seen": 153449410, + "step": 7153, + "time_per_iteration": 2.6918303966522217 + }, + { + "auxiliary_loss_clip": 0.0115357, + "auxiliary_loss_mlp": 0.01119778, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.00056851, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 2.69106050476785, + "language_loss": 0.7811197, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80385315, + "num_input_tokens_seen": 153467910, + "step": 7154, + "time_per_iteration": 2.544753313064575 + }, + { + "auxiliary_loss_clip": 0.01137022, + "auxiliary_loss_mlp": 0.01119573, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00055408, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.5241014878702464, + "language_loss": 0.78452206, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80708802, + "num_input_tokens_seen": 153487100, + "step": 7155, + "time_per_iteration": 4.069540739059448 + }, + { + "auxiliary_loss_clip": 0.01135594, + "auxiliary_loss_mlp": 0.01118871, + "balance_loss_clip": 1.00201452, + "balance_loss_mlp": 1.00061488, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.9449763768125186, + "language_loss": 0.88590276, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90844738, + "num_input_tokens_seen": 153505565, + "step": 7156, + "time_per_iteration": 2.5904250144958496 + }, + { + "auxiliary_loss_clip": 0.01168788, + "auxiliary_loss_mlp": 0.01121434, + "balance_loss_clip": 1.00211024, + "balance_loss_mlp": 1.00060391, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 5.251679805359216, + "language_loss": 0.82866538, + "learning_rate": 2.541704739753042e-06, + "loss": 0.85156757, + "num_input_tokens_seen": 153526130, + "step": 7157, + "time_per_iteration": 2.554784059524536 + }, + { + "auxiliary_loss_clip": 0.01168907, + "auxiliary_loss_mlp": 0.01121131, + "balance_loss_clip": 1.0021925, + "balance_loss_mlp": 1.00049078, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.7546214428167421, + "language_loss": 0.7187829, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.7416833, + "num_input_tokens_seen": 153546370, + "step": 7158, + "time_per_iteration": 2.6166446208953857 + }, + { + "auxiliary_loss_clip": 0.01152071, + "auxiliary_loss_mlp": 0.01121017, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.00066376, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.010322257099473, + "language_loss": 0.82644719, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.84917808, + "num_input_tokens_seen": 153562800, + "step": 7159, + "time_per_iteration": 2.5405898094177246 + }, + { + "auxiliary_loss_clip": 0.01136905, + "auxiliary_loss_mlp": 0.01120433, + "balance_loss_clip": 1.00206828, + "balance_loss_mlp": 1.0006516, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.2566503959434963, + "language_loss": 0.83174312, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85431647, + "num_input_tokens_seen": 153578395, + "step": 7160, + "time_per_iteration": 2.6276471614837646 + }, + { + "auxiliary_loss_clip": 0.01152167, + "auxiliary_loss_mlp": 0.01122078, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.00067532, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 2.0019836741560493, + "language_loss": 0.76967281, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79241532, + "num_input_tokens_seen": 153596880, + "step": 7161, + "time_per_iteration": 2.6211719512939453 + }, + { + "auxiliary_loss_clip": 0.01153739, + "auxiliary_loss_mlp": 0.0112051, + "balance_loss_clip": 1.0020653, + "balance_loss_mlp": 1.00072825, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.2056661509677524, + "language_loss": 0.7264713, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.74921376, + "num_input_tokens_seen": 153616570, + "step": 7162, + "time_per_iteration": 2.5952165126800537 + }, + { + "auxiliary_loss_clip": 0.01115565, + "auxiliary_loss_mlp": 0.0074648, + "balance_loss_clip": 1.00162721, + "balance_loss_mlp": 0.99983805, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7946097901799505, + "language_loss": 0.59032965, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.60895014, + "num_input_tokens_seen": 153671450, + "step": 7163, + "time_per_iteration": 3.1126015186309814 + }, + { + "auxiliary_loss_clip": 0.01137054, + "auxiliary_loss_mlp": 0.01119298, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00056601, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.7747075017252614, + "language_loss": 0.79122126, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81378484, + "num_input_tokens_seen": 153691405, + "step": 7164, + "time_per_iteration": 2.6580238342285156 + }, + { + "auxiliary_loss_clip": 0.01168786, + "auxiliary_loss_mlp": 0.01120874, + "balance_loss_clip": 1.00205517, + "balance_loss_mlp": 1.00071073, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 2.8640418968402535, + "language_loss": 0.67514902, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69804561, + "num_input_tokens_seen": 153711555, + "step": 7165, + "time_per_iteration": 2.5885016918182373 + }, + { + "auxiliary_loss_clip": 0.01135605, + "auxiliary_loss_mlp": 0.00747624, + "balance_loss_clip": 1.00191748, + "balance_loss_mlp": 1.0003556, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 2.105399797838868, + "language_loss": 0.75157952, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77041185, + "num_input_tokens_seen": 153730095, + "step": 7166, + "time_per_iteration": 2.6203572750091553 + }, + { + "auxiliary_loss_clip": 0.01151973, + "auxiliary_loss_mlp": 0.01119426, + "balance_loss_clip": 1.00210297, + "balance_loss_mlp": 1.00050282, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.6582483589797932, + "language_loss": 0.7185033, + "learning_rate": 2.537954675511372e-06, + "loss": 0.74121732, + "num_input_tokens_seen": 153749320, + "step": 7167, + "time_per_iteration": 2.621997594833374 + }, + { + "auxiliary_loss_clip": 0.01135095, + "auxiliary_loss_mlp": 0.00747466, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00034165, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.6385419635270402, + "language_loss": 0.78623092, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80505651, + "num_input_tokens_seen": 153767825, + "step": 7168, + "time_per_iteration": 2.620124340057373 + }, + { + "auxiliary_loss_clip": 0.01134755, + "auxiliary_loss_mlp": 0.01120255, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.000664, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.473225118642625, + "language_loss": 0.82145452, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84400463, + "num_input_tokens_seen": 153785350, + "step": 7169, + "time_per_iteration": 2.6244778633117676 + }, + { + "auxiliary_loss_clip": 0.01133055, + "auxiliary_loss_mlp": 0.01100041, + "balance_loss_clip": 1.00167727, + "balance_loss_mlp": 1.00009632, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.670941654800768, + "language_loss": 0.60768342, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.63001436, + "num_input_tokens_seen": 153856400, + "step": 7170, + "time_per_iteration": 3.3371291160583496 + }, + { + "auxiliary_loss_clip": 0.0116858, + "auxiliary_loss_mlp": 0.0111982, + "balance_loss_clip": 1.00208628, + "balance_loss_mlp": 1.00061047, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 1.6953943010984556, + "language_loss": 0.76048338, + "learning_rate": 2.536454077838021e-06, + "loss": 0.7833674, + "num_input_tokens_seen": 153875230, + "step": 7171, + "time_per_iteration": 2.5116894245147705 + }, + { + "auxiliary_loss_clip": 0.01151868, + "auxiliary_loss_mlp": 0.01119732, + "balance_loss_clip": 1.00203085, + "balance_loss_mlp": 1.00061846, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.542823784429054, + "language_loss": 0.77656639, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79928231, + "num_input_tokens_seen": 153894740, + "step": 7172, + "time_per_iteration": 2.611114025115967 + }, + { + "auxiliary_loss_clip": 0.01136758, + "auxiliary_loss_mlp": 0.0112077, + "balance_loss_clip": 1.00200903, + "balance_loss_mlp": 1.00060701, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.5744131821001086, + "language_loss": 0.76434213, + "learning_rate": 2.535703656890086e-06, + "loss": 0.78691745, + "num_input_tokens_seen": 153913230, + "step": 7173, + "time_per_iteration": 2.5935754776000977 + }, + { + "auxiliary_loss_clip": 0.01168626, + "auxiliary_loss_mlp": 0.00747717, + "balance_loss_clip": 1.00210655, + "balance_loss_mlp": 1.00055528, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.572802521263517, + "language_loss": 0.7735728, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79273623, + "num_input_tokens_seen": 153933250, + "step": 7174, + "time_per_iteration": 2.5449635982513428 + }, + { + "auxiliary_loss_clip": 0.01168769, + "auxiliary_loss_mlp": 0.01119991, + "balance_loss_clip": 1.00206852, + "balance_loss_mlp": 1.00059128, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.778198377729104, + "language_loss": 0.8276298, + "learning_rate": 2.534953154686407e-06, + "loss": 0.85051739, + "num_input_tokens_seen": 153951325, + "step": 7175, + "time_per_iteration": 2.5951805114746094 + }, + { + "auxiliary_loss_clip": 0.01122185, + "auxiliary_loss_mlp": 0.01120641, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00066876, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.21277645321079, + "language_loss": 0.7479465, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77037477, + "num_input_tokens_seen": 153966975, + "step": 7176, + "time_per_iteration": 2.6173415184020996 + }, + { + "auxiliary_loss_clip": 0.01153677, + "auxiliary_loss_mlp": 0.01119761, + "balance_loss_clip": 1.00191939, + "balance_loss_mlp": 1.00055158, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.5536512090254329, + "language_loss": 0.73610961, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75884402, + "num_input_tokens_seen": 153986695, + "step": 7177, + "time_per_iteration": 2.5574519634246826 + }, + { + "auxiliary_loss_clip": 0.0113576, + "auxiliary_loss_mlp": 0.01121223, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.00067854, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.7031363085353872, + "language_loss": 0.81601852, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83858836, + "num_input_tokens_seen": 154004710, + "step": 7178, + "time_per_iteration": 2.6184470653533936 + }, + { + "auxiliary_loss_clip": 0.01136881, + "auxiliary_loss_mlp": 0.0111971, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00069141, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4858009966778887, + "language_loss": 0.83870876, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86127466, + "num_input_tokens_seen": 154024320, + "step": 7179, + "time_per_iteration": 2.655797243118286 + }, + { + "auxiliary_loss_clip": 0.01135346, + "auxiliary_loss_mlp": 0.01120108, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00070846, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 1.7994550795608917, + "language_loss": 0.74979734, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77235186, + "num_input_tokens_seen": 154041755, + "step": 7180, + "time_per_iteration": 2.576192855834961 + }, + { + "auxiliary_loss_clip": 0.01138523, + "auxiliary_loss_mlp": 0.00747848, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.00049102, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.7917934265111701, + "language_loss": 0.81660473, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.83546841, + "num_input_tokens_seen": 154056775, + "step": 7181, + "time_per_iteration": 2.577204942703247 + }, + { + "auxiliary_loss_clip": 0.01141771, + "auxiliary_loss_mlp": 0.01120906, + "balance_loss_clip": 1.00210285, + "balance_loss_mlp": 1.00064766, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.9254820665825274, + "language_loss": 0.88833725, + "learning_rate": 2.532325758728165e-06, + "loss": 0.91096401, + "num_input_tokens_seen": 154075015, + "step": 7182, + "time_per_iteration": 2.6019937992095947 + }, + { + "auxiliary_loss_clip": 0.01152347, + "auxiliary_loss_mlp": 0.00747452, + "balance_loss_clip": 1.00203919, + "balance_loss_mlp": 1.00043523, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.5480248594167474, + "language_loss": 0.75662494, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77562296, + "num_input_tokens_seen": 154095170, + "step": 7183, + "time_per_iteration": 2.5829460620880127 + }, + { + "auxiliary_loss_clip": 0.01152014, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_clip": 1.00201321, + "balance_loss_mlp": 1.00054145, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 1.6240127689768844, + "language_loss": 0.77587974, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79860407, + "num_input_tokens_seen": 154116895, + "step": 7184, + "time_per_iteration": 2.6007473468780518 + }, + { + "auxiliary_loss_clip": 0.01136824, + "auxiliary_loss_mlp": 0.01119686, + "balance_loss_clip": 1.00197351, + "balance_loss_mlp": 1.00066781, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.6551647621422114, + "language_loss": 0.73612583, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75869095, + "num_input_tokens_seen": 154138395, + "step": 7185, + "time_per_iteration": 4.051485538482666 + }, + { + "auxiliary_loss_clip": 0.01137641, + "auxiliary_loss_mlp": 0.01121004, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00064993, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.149125700274758, + "language_loss": 0.76012719, + "learning_rate": 2.530823945207421e-06, + "loss": 0.78271365, + "num_input_tokens_seen": 154156775, + "step": 7186, + "time_per_iteration": 2.6276166439056396 + }, + { + "auxiliary_loss_clip": 0.01118484, + "auxiliary_loss_mlp": 0.01120494, + "balance_loss_clip": 1.001881, + "balance_loss_mlp": 1.00052154, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 2.432511890296486, + "language_loss": 0.76586533, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.7882551, + "num_input_tokens_seen": 154177500, + "step": 7187, + "time_per_iteration": 2.652519464492798 + }, + { + "auxiliary_loss_clip": 0.01118131, + "auxiliary_loss_mlp": 0.01101128, + "balance_loss_clip": 1.00175631, + "balance_loss_mlp": 1.00041986, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8697636875321564, + "language_loss": 0.68186831, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70406091, + "num_input_tokens_seen": 154237110, + "step": 7188, + "time_per_iteration": 3.24670672416687 + }, + { + "auxiliary_loss_clip": 0.01136741, + "auxiliary_loss_mlp": 0.0111924, + "balance_loss_clip": 1.00221372, + "balance_loss_mlp": 1.00060272, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.997138453170231, + "language_loss": 0.78372633, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80628616, + "num_input_tokens_seen": 154253910, + "step": 7189, + "time_per_iteration": 5.450939178466797 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01121625, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00088966, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 12.463015019715986, + "language_loss": 0.71756911, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73987401, + "num_input_tokens_seen": 154274770, + "step": 7190, + "time_per_iteration": 2.7158098220825195 + }, + { + "auxiliary_loss_clip": 0.01137282, + "auxiliary_loss_mlp": 0.01119124, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00048661, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.3749486881518145, + "language_loss": 0.79837441, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.82093847, + "num_input_tokens_seen": 154295035, + "step": 7191, + "time_per_iteration": 2.679694414138794 + }, + { + "auxiliary_loss_clip": 0.01102671, + "auxiliary_loss_mlp": 0.01119594, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.0005753, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.457814792478532, + "language_loss": 0.74784207, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.77006471, + "num_input_tokens_seen": 154314905, + "step": 7192, + "time_per_iteration": 2.7032909393310547 + }, + { + "auxiliary_loss_clip": 0.01106041, + "auxiliary_loss_mlp": 0.01120769, + "balance_loss_clip": 1.0019536, + "balance_loss_mlp": 1.00070143, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 1.6998876091112223, + "language_loss": 0.78886098, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81112903, + "num_input_tokens_seen": 154331740, + "step": 7193, + "time_per_iteration": 4.106358289718628 + }, + { + "auxiliary_loss_clip": 0.01137875, + "auxiliary_loss_mlp": 0.01119745, + "balance_loss_clip": 1.00191247, + "balance_loss_mlp": 1.00072634, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.9375430785844496, + "language_loss": 0.7557615, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.77833772, + "num_input_tokens_seen": 154348740, + "step": 7194, + "time_per_iteration": 2.5864810943603516 + }, + { + "auxiliary_loss_clip": 0.01168653, + "auxiliary_loss_mlp": 0.01120694, + "balance_loss_clip": 1.00209498, + "balance_loss_mlp": 1.00062621, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 1.7900183005940085, + "language_loss": 0.59731758, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.62021106, + "num_input_tokens_seen": 154368835, + "step": 7195, + "time_per_iteration": 2.548304796218872 + }, + { + "auxiliary_loss_clip": 0.0113693, + "auxiliary_loss_mlp": 0.01120939, + "balance_loss_clip": 1.00210667, + "balance_loss_mlp": 1.00068057, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 2.1156982755606286, + "language_loss": 0.65561473, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67819339, + "num_input_tokens_seen": 154384620, + "step": 7196, + "time_per_iteration": 2.574993371963501 + }, + { + "auxiliary_loss_clip": 0.01168749, + "auxiliary_loss_mlp": 0.01120891, + "balance_loss_clip": 1.00209904, + "balance_loss_mlp": 1.00063276, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 2.0351615900282836, + "language_loss": 0.72182631, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74472278, + "num_input_tokens_seen": 154402865, + "step": 7197, + "time_per_iteration": 2.506505250930786 + }, + { + "auxiliary_loss_clip": 0.0115196, + "auxiliary_loss_mlp": 0.01120196, + "balance_loss_clip": 1.00213099, + "balance_loss_mlp": 1.00079608, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.425014585072434, + "language_loss": 0.72871208, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.75143361, + "num_input_tokens_seen": 154423625, + "step": 7198, + "time_per_iteration": 2.602193832397461 + }, + { + "auxiliary_loss_clip": 0.01120017, + "auxiliary_loss_mlp": 0.01119502, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00048304, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.3816971978207937, + "language_loss": 0.81008077, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83247596, + "num_input_tokens_seen": 154444775, + "step": 7199, + "time_per_iteration": 2.6846628189086914 + }, + { + "auxiliary_loss_clip": 0.01138226, + "auxiliary_loss_mlp": 0.01120801, + "balance_loss_clip": 1.00211501, + "balance_loss_mlp": 1.00073338, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.223389042472579, + "language_loss": 0.68544477, + "learning_rate": 2.525565067625286e-06, + "loss": 0.70803505, + "num_input_tokens_seen": 154460815, + "step": 7200, + "time_per_iteration": 2.620934009552002 + }, + { + "auxiliary_loss_clip": 0.01135421, + "auxiliary_loss_mlp": 0.007477, + "balance_loss_clip": 1.00200546, + "balance_loss_mlp": 1.0004425, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 1.8396441824106942, + "language_loss": 0.87252617, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89135736, + "num_input_tokens_seen": 154479145, + "step": 7201, + "time_per_iteration": 2.610132932662964 + }, + { + "auxiliary_loss_clip": 0.01089123, + "auxiliary_loss_mlp": 0.01121434, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00069916, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 1.9719290388642554, + "language_loss": 0.65111077, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.67321634, + "num_input_tokens_seen": 154498905, + "step": 7202, + "time_per_iteration": 2.7452666759490967 + }, + { + "auxiliary_loss_clip": 0.0110389, + "auxiliary_loss_mlp": 0.01119427, + "balance_loss_clip": 1.00197077, + "balance_loss_mlp": 1.00050342, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.8016199870062528, + "language_loss": 0.81860685, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.84083998, + "num_input_tokens_seen": 154517270, + "step": 7203, + "time_per_iteration": 2.7451372146606445 + }, + { + "auxiliary_loss_clip": 0.01120568, + "auxiliary_loss_mlp": 0.01120627, + "balance_loss_clip": 1.00184035, + "balance_loss_mlp": 1.00084555, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.867101875157205, + "language_loss": 0.81293559, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83534753, + "num_input_tokens_seen": 154535945, + "step": 7204, + "time_per_iteration": 2.6835403442382812 + }, + { + "auxiliary_loss_clip": 0.01135329, + "auxiliary_loss_mlp": 0.01119873, + "balance_loss_clip": 1.00193048, + "balance_loss_mlp": 1.00056791, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.8069544162373747, + "language_loss": 0.7363894, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75894141, + "num_input_tokens_seen": 154554935, + "step": 7205, + "time_per_iteration": 2.6373112201690674 + }, + { + "auxiliary_loss_clip": 0.01168535, + "auxiliary_loss_mlp": 0.00747601, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.00054085, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.580804609114365, + "language_loss": 0.74831498, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.76747632, + "num_input_tokens_seen": 154576065, + "step": 7206, + "time_per_iteration": 2.626844644546509 + }, + { + "auxiliary_loss_clip": 0.01108231, + "auxiliary_loss_mlp": 0.01119367, + "balance_loss_clip": 1.00199401, + "balance_loss_mlp": 1.00063491, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 3.17710749689784, + "language_loss": 0.78919339, + "learning_rate": 2.522934161574342e-06, + "loss": 0.81146938, + "num_input_tokens_seen": 154595110, + "step": 7207, + "time_per_iteration": 2.699331760406494 + }, + { + "auxiliary_loss_clip": 0.01120959, + "auxiliary_loss_mlp": 0.01120082, + "balance_loss_clip": 1.00196528, + "balance_loss_mlp": 1.00058627, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.694652575489082, + "language_loss": 0.80929643, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83170688, + "num_input_tokens_seen": 154612255, + "step": 7208, + "time_per_iteration": 2.6497585773468018 + }, + { + "auxiliary_loss_clip": 0.01136968, + "auxiliary_loss_mlp": 0.01120102, + "balance_loss_clip": 1.00202203, + "balance_loss_mlp": 1.00060666, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.2974208813587773, + "language_loss": 0.70257932, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72514999, + "num_input_tokens_seen": 154630440, + "step": 7209, + "time_per_iteration": 2.6855831146240234 + }, + { + "auxiliary_loss_clip": 0.01153509, + "auxiliary_loss_mlp": 0.01120601, + "balance_loss_clip": 1.00208855, + "balance_loss_mlp": 1.00072455, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.4856859834836726, + "language_loss": 0.81407714, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83681822, + "num_input_tokens_seen": 154652515, + "step": 7210, + "time_per_iteration": 2.638115167617798 + }, + { + "auxiliary_loss_clip": 0.0113523, + "auxiliary_loss_mlp": 0.01119967, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00075746, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 2.2296372177654904, + "language_loss": 0.82037467, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84292662, + "num_input_tokens_seen": 154670965, + "step": 7211, + "time_per_iteration": 2.5987560749053955 + }, + { + "auxiliary_loss_clip": 0.01153472, + "auxiliary_loss_mlp": 0.01120152, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.000561, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 1.6885250437807175, + "language_loss": 0.74884748, + "learning_rate": 2.521054347790029e-06, + "loss": 0.77158368, + "num_input_tokens_seen": 154689980, + "step": 7212, + "time_per_iteration": 2.569690227508545 + }, + { + "auxiliary_loss_clip": 0.01141479, + "auxiliary_loss_mlp": 0.01120051, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00074601, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.7601978915214875, + "language_loss": 0.76752341, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.79013872, + "num_input_tokens_seen": 154706570, + "step": 7213, + "time_per_iteration": 2.579599618911743 + }, + { + "auxiliary_loss_clip": 0.01152316, + "auxiliary_loss_mlp": 0.01120234, + "balance_loss_clip": 1.00202096, + "balance_loss_mlp": 1.00064278, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.76423704513157, + "language_loss": 0.65101039, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67373586, + "num_input_tokens_seen": 154725210, + "step": 7214, + "time_per_iteration": 2.576036214828491 + }, + { + "auxiliary_loss_clip": 0.01136457, + "auxiliary_loss_mlp": 0.011195, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.00076747, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.602546628281969, + "language_loss": 0.71356273, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73612231, + "num_input_tokens_seen": 154745945, + "step": 7215, + "time_per_iteration": 2.6685376167297363 + }, + { + "auxiliary_loss_clip": 0.01136681, + "auxiliary_loss_mlp": 0.0112027, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00067914, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 1.9742972444450544, + "language_loss": 0.74736881, + "learning_rate": 2.519550141025255e-06, + "loss": 0.76993829, + "num_input_tokens_seen": 154763580, + "step": 7216, + "time_per_iteration": 2.5576980113983154 + }, + { + "auxiliary_loss_clip": 0.01137345, + "auxiliary_loss_mlp": 0.01121568, + "balance_loss_clip": 1.00196266, + "balance_loss_mlp": 1.00073719, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 3.5664430950895794, + "language_loss": 0.75810754, + "learning_rate": 2.519174040044927e-06, + "loss": 0.78069675, + "num_input_tokens_seen": 154776825, + "step": 7217, + "time_per_iteration": 2.597261905670166 + }, + { + "auxiliary_loss_clip": 0.01121995, + "auxiliary_loss_mlp": 0.01119966, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00066113, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 4.4062282082563105, + "language_loss": 0.7421065, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76452613, + "num_input_tokens_seen": 154794025, + "step": 7218, + "time_per_iteration": 2.6240062713623047 + }, + { + "auxiliary_loss_clip": 0.01141861, + "auxiliary_loss_mlp": 0.01120565, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.00068808, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.977064499645819, + "language_loss": 0.6858604, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.70848465, + "num_input_tokens_seen": 154813105, + "step": 7219, + "time_per_iteration": 2.59145188331604 + }, + { + "auxiliary_loss_clip": 0.01137204, + "auxiliary_loss_mlp": 0.01119452, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00071931, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.6688022387945018, + "language_loss": 0.77442712, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79699361, + "num_input_tokens_seen": 154833525, + "step": 7220, + "time_per_iteration": 2.6082208156585693 + }, + { + "auxiliary_loss_clip": 0.01087949, + "auxiliary_loss_mlp": 0.01120093, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00078869, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 1.9804672457785104, + "language_loss": 0.69764805, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71972847, + "num_input_tokens_seen": 154853090, + "step": 7221, + "time_per_iteration": 4.113835096359253 + }, + { + "auxiliary_loss_clip": 0.01153819, + "auxiliary_loss_mlp": 0.01120334, + "balance_loss_clip": 1.00208855, + "balance_loss_mlp": 1.00074363, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 1.876759441354635, + "language_loss": 0.64719802, + "learning_rate": 2.51729324012157e-06, + "loss": 0.66993958, + "num_input_tokens_seen": 154872055, + "step": 7222, + "time_per_iteration": 2.5855519771575928 + }, + { + "auxiliary_loss_clip": 0.0111973, + "auxiliary_loss_mlp": 0.01119787, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00057769, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 2.088481281069558, + "language_loss": 0.7307151, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.75311029, + "num_input_tokens_seen": 154886645, + "step": 7223, + "time_per_iteration": 2.631784200668335 + }, + { + "auxiliary_loss_clip": 0.0116875, + "auxiliary_loss_mlp": 0.01120862, + "balance_loss_clip": 1.00207615, + "balance_loss_mlp": 1.00069892, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.8821596231915618, + "language_loss": 0.93681508, + "learning_rate": 2.516540782741694e-06, + "loss": 0.95971119, + "num_input_tokens_seen": 154906775, + "step": 7224, + "time_per_iteration": 2.556464195251465 + }, + { + "auxiliary_loss_clip": 0.01121709, + "auxiliary_loss_mlp": 0.01119443, + "balance_loss_clip": 1.00203693, + "balance_loss_mlp": 1.00071073, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.5246322464853153, + "language_loss": 0.61490172, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63731331, + "num_input_tokens_seen": 154926990, + "step": 7225, + "time_per_iteration": 2.7324531078338623 + }, + { + "auxiliary_loss_clip": 0.01138841, + "auxiliary_loss_mlp": 0.00747636, + "balance_loss_clip": 1.00210679, + "balance_loss_mlp": 1.00049567, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 2.1641723594783393, + "language_loss": 0.7785337, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79739839, + "num_input_tokens_seen": 154946210, + "step": 7226, + "time_per_iteration": 4.1728856563568115 + }, + { + "auxiliary_loss_clip": 0.01152835, + "auxiliary_loss_mlp": 0.01119334, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00069678, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.533206141525805, + "language_loss": 0.84614468, + "learning_rate": 2.515411949802964e-06, + "loss": 0.86886632, + "num_input_tokens_seen": 154964995, + "step": 7227, + "time_per_iteration": 4.135713338851929 + }, + { + "auxiliary_loss_clip": 0.01153593, + "auxiliary_loss_mlp": 0.01119434, + "balance_loss_clip": 1.00210989, + "balance_loss_mlp": 1.00079679, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 2.515174306227333, + "language_loss": 0.76271015, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78544039, + "num_input_tokens_seen": 154984775, + "step": 7228, + "time_per_iteration": 2.614231824874878 + }, + { + "auxiliary_loss_clip": 0.01104192, + "auxiliary_loss_mlp": 0.01119905, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.0006007, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.4971561823367248, + "language_loss": 0.80363691, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.8258779, + "num_input_tokens_seen": 155008125, + "step": 7229, + "time_per_iteration": 2.785695791244507 + }, + { + "auxiliary_loss_clip": 0.01153731, + "auxiliary_loss_mlp": 0.01120213, + "balance_loss_clip": 1.00211048, + "balance_loss_mlp": 1.00081277, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 2.040215780696499, + "language_loss": 0.82132447, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84406394, + "num_input_tokens_seen": 155027885, + "step": 7230, + "time_per_iteration": 4.086455345153809 + }, + { + "auxiliary_loss_clip": 0.01154119, + "auxiliary_loss_mlp": 0.01120934, + "balance_loss_clip": 1.00222898, + "balance_loss_mlp": 1.00086665, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.071555372225241, + "language_loss": 0.76806414, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79081464, + "num_input_tokens_seen": 155043375, + "step": 7231, + "time_per_iteration": 2.5087437629699707 + }, + { + "auxiliary_loss_clip": 0.01126581, + "auxiliary_loss_mlp": 0.01119182, + "balance_loss_clip": 1.00197113, + "balance_loss_mlp": 1.000736, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.7041195852727398, + "language_loss": 0.689291, + "learning_rate": 2.513530170872575e-06, + "loss": 0.7117486, + "num_input_tokens_seen": 155062930, + "step": 7232, + "time_per_iteration": 2.655486583709717 + }, + { + "auxiliary_loss_clip": 0.01122285, + "auxiliary_loss_mlp": 0.01120697, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00072503, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.7041099995891877, + "language_loss": 0.72131121, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74374104, + "num_input_tokens_seen": 155084980, + "step": 7233, + "time_per_iteration": 2.7555220127105713 + }, + { + "auxiliary_loss_clip": 0.01087038, + "auxiliary_loss_mlp": 0.01119627, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00070381, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.5402069761103119, + "language_loss": 0.74759698, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.76966357, + "num_input_tokens_seen": 155107260, + "step": 7234, + "time_per_iteration": 2.81706166267395 + }, + { + "auxiliary_loss_clip": 0.01137337, + "auxiliary_loss_mlp": 0.01120973, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.00090504, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 3.1162953465527137, + "language_loss": 0.58887273, + "learning_rate": 2.512400869722782e-06, + "loss": 0.6114558, + "num_input_tokens_seen": 155126720, + "step": 7235, + "time_per_iteration": 2.663475751876831 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.01120609, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00063694, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.7691106836558788, + "language_loss": 0.77657443, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79885292, + "num_input_tokens_seen": 155148640, + "step": 7236, + "time_per_iteration": 2.7655231952667236 + }, + { + "auxiliary_loss_clip": 0.01168583, + "auxiliary_loss_mlp": 0.01119587, + "balance_loss_clip": 1.00209904, + "balance_loss_mlp": 1.00075912, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 2.1524703018978126, + "language_loss": 0.81294656, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83582819, + "num_input_tokens_seen": 155165870, + "step": 7237, + "time_per_iteration": 2.5138800144195557 + }, + { + "auxiliary_loss_clip": 0.01153687, + "auxiliary_loss_mlp": 0.01119452, + "balance_loss_clip": 1.00207603, + "balance_loss_mlp": 1.00071943, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 1.507422715264705, + "language_loss": 0.63115811, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65388948, + "num_input_tokens_seen": 155185315, + "step": 7238, + "time_per_iteration": 2.5523149967193604 + }, + { + "auxiliary_loss_clip": 0.01118991, + "auxiliary_loss_mlp": 0.00747745, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00058079, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.7197975452419765, + "language_loss": 0.85912651, + "learning_rate": 2.510894862898928e-06, + "loss": 0.87779391, + "num_input_tokens_seen": 155205790, + "step": 7239, + "time_per_iteration": 2.6704559326171875 + }, + { + "auxiliary_loss_clip": 0.0113719, + "auxiliary_loss_mlp": 0.01120213, + "balance_loss_clip": 1.00212204, + "balance_loss_mlp": 1.00071728, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.5077681815027975, + "language_loss": 0.72605366, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74862766, + "num_input_tokens_seen": 155226475, + "step": 7240, + "time_per_iteration": 2.599893569946289 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.01119981, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.0005815, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 1.803304104347839, + "language_loss": 0.81739008, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.8397758, + "num_input_tokens_seen": 155247110, + "step": 7241, + "time_per_iteration": 2.649829626083374 + }, + { + "auxiliary_loss_clip": 0.01118849, + "auxiliary_loss_mlp": 0.00747762, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00046921, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 2.480405527433949, + "language_loss": 0.79590666, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81457269, + "num_input_tokens_seen": 155261335, + "step": 7242, + "time_per_iteration": 2.666473388671875 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01120207, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00052071, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 2.223242755549089, + "language_loss": 0.68084264, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70342672, + "num_input_tokens_seen": 155278510, + "step": 7243, + "time_per_iteration": 2.6092164516448975 + }, + { + "auxiliary_loss_clip": 0.01110333, + "auxiliary_loss_mlp": 0.0111923, + "balance_loss_clip": 1.00201964, + "balance_loss_mlp": 1.00059247, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 2.253224284967606, + "language_loss": 0.80934912, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83164477, + "num_input_tokens_seen": 155296450, + "step": 7244, + "time_per_iteration": 2.7148237228393555 + }, + { + "auxiliary_loss_clip": 0.01088632, + "auxiliary_loss_mlp": 0.01119598, + "balance_loss_clip": 1.00179732, + "balance_loss_mlp": 1.00067449, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.7740812181787022, + "language_loss": 0.73487091, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75695312, + "num_input_tokens_seen": 155316080, + "step": 7245, + "time_per_iteration": 2.7587924003601074 + }, + { + "auxiliary_loss_clip": 0.01103924, + "auxiliary_loss_mlp": 0.01120014, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00070941, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.647760811621204, + "language_loss": 0.77095741, + "learning_rate": 2.508258605639389e-06, + "loss": 0.7931968, + "num_input_tokens_seen": 155336765, + "step": 7246, + "time_per_iteration": 2.744900703430176 + }, + { + "auxiliary_loss_clip": 0.01153891, + "auxiliary_loss_mlp": 0.01120871, + "balance_loss_clip": 1.00219178, + "balance_loss_mlp": 1.00080311, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 2.044829743430677, + "language_loss": 0.85444474, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87719238, + "num_input_tokens_seen": 155356440, + "step": 7247, + "time_per_iteration": 2.618485450744629 + }, + { + "auxiliary_loss_clip": 0.0116864, + "auxiliary_loss_mlp": 0.01119922, + "balance_loss_clip": 1.00213134, + "balance_loss_mlp": 1.00071287, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.7122402706716722, + "language_loss": 0.72639382, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74927938, + "num_input_tokens_seen": 155377070, + "step": 7248, + "time_per_iteration": 2.589393138885498 + }, + { + "auxiliary_loss_clip": 0.01151986, + "auxiliary_loss_mlp": 0.01119963, + "balance_loss_clip": 1.00208724, + "balance_loss_mlp": 1.00065863, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.7366082557814146, + "language_loss": 0.87103891, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89375842, + "num_input_tokens_seen": 155398415, + "step": 7249, + "time_per_iteration": 2.5956926345825195 + }, + { + "auxiliary_loss_clip": 0.01136512, + "auxiliary_loss_mlp": 0.0111979, + "balance_loss_clip": 1.00200582, + "balance_loss_mlp": 1.0009619, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.772142616527651, + "language_loss": 0.82011062, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84267366, + "num_input_tokens_seen": 155415625, + "step": 7250, + "time_per_iteration": 2.617341995239258 + }, + { + "auxiliary_loss_clip": 0.01152127, + "auxiliary_loss_mlp": 0.01120453, + "balance_loss_clip": 1.0022285, + "balance_loss_mlp": 1.00076711, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 2.633875971595595, + "language_loss": 0.8468377, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.86956358, + "num_input_tokens_seen": 155435505, + "step": 7251, + "time_per_iteration": 2.63319730758667 + }, + { + "auxiliary_loss_clip": 0.0115384, + "auxiliary_loss_mlp": 0.01118961, + "balance_loss_clip": 1.00211632, + "balance_loss_mlp": 1.00080025, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 2.013099139272956, + "language_loss": 0.69072139, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71344948, + "num_input_tokens_seen": 155455425, + "step": 7252, + "time_per_iteration": 2.6444666385650635 + }, + { + "auxiliary_loss_clip": 0.01136704, + "auxiliary_loss_mlp": 0.01119746, + "balance_loss_clip": 1.00201738, + "balance_loss_mlp": 1.00063252, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.5563070679169497, + "language_loss": 0.8371501, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85971463, + "num_input_tokens_seen": 155474250, + "step": 7253, + "time_per_iteration": 2.64508056640625 + }, + { + "auxiliary_loss_clip": 0.01151636, + "auxiliary_loss_mlp": 0.01119695, + "balance_loss_clip": 1.00214839, + "balance_loss_mlp": 1.00067675, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.6597304195125755, + "language_loss": 0.70480514, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72751844, + "num_input_tokens_seen": 155494685, + "step": 7254, + "time_per_iteration": 2.586468458175659 + }, + { + "auxiliary_loss_clip": 0.01135226, + "auxiliary_loss_mlp": 0.01119576, + "balance_loss_clip": 1.00203776, + "balance_loss_mlp": 1.00065327, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 1.800446841323464, + "language_loss": 0.81161714, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.83416522, + "num_input_tokens_seen": 155513040, + "step": 7255, + "time_per_iteration": 2.604212522506714 + }, + { + "auxiliary_loss_clip": 0.01168683, + "auxiliary_loss_mlp": 0.01120433, + "balance_loss_clip": 1.00216007, + "balance_loss_mlp": 1.00084221, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.8364505161802152, + "language_loss": 0.77576661, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79865777, + "num_input_tokens_seen": 155530100, + "step": 7256, + "time_per_iteration": 2.4960360527038574 + }, + { + "auxiliary_loss_clip": 0.01168577, + "auxiliary_loss_mlp": 0.0111917, + "balance_loss_clip": 1.00215471, + "balance_loss_mlp": 1.00043762, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 3.371581231567848, + "language_loss": 0.76339722, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78627473, + "num_input_tokens_seen": 155549375, + "step": 7257, + "time_per_iteration": 2.5165953636169434 + }, + { + "auxiliary_loss_clip": 0.01151911, + "auxiliary_loss_mlp": 0.01119056, + "balance_loss_clip": 1.00197399, + "balance_loss_mlp": 1.00051391, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 3.1777315913513076, + "language_loss": 0.73206502, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75477469, + "num_input_tokens_seen": 155569395, + "step": 7258, + "time_per_iteration": 2.5599215030670166 + }, + { + "auxiliary_loss_clip": 0.01134786, + "auxiliary_loss_mlp": 0.0111992, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00061524, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 2.030611012364442, + "language_loss": 0.76726019, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78980726, + "num_input_tokens_seen": 155589090, + "step": 7259, + "time_per_iteration": 3.985295295715332 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.01100636, + "balance_loss_clip": 1.00161052, + "balance_loss_mlp": 1.00069094, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7583353507602674, + "language_loss": 0.57005465, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59237349, + "num_input_tokens_seen": 155648660, + "step": 7260, + "time_per_iteration": 3.141029119491577 + }, + { + "auxiliary_loss_clip": 0.01138177, + "auxiliary_loss_mlp": 0.01120676, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00060797, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 2.1190319917000155, + "language_loss": 0.70729244, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.72988093, + "num_input_tokens_seen": 155669945, + "step": 7261, + "time_per_iteration": 2.686187505722046 + }, + { + "auxiliary_loss_clip": 0.01104929, + "auxiliary_loss_mlp": 0.01120124, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00062871, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 4.219145104548845, + "language_loss": 0.69530094, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71755153, + "num_input_tokens_seen": 155688555, + "step": 7262, + "time_per_iteration": 2.668710231781006 + }, + { + "auxiliary_loss_clip": 0.01090732, + "auxiliary_loss_mlp": 0.0111814, + "balance_loss_clip": 1.00185192, + "balance_loss_mlp": 1.00074244, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.859863718604544, + "language_loss": 0.79466259, + "learning_rate": 2.501852344559726e-06, + "loss": 0.8167513, + "num_input_tokens_seen": 155705370, + "step": 7263, + "time_per_iteration": 2.707829236984253 + }, + { + "auxiliary_loss_clip": 0.01119571, + "auxiliary_loss_mlp": 0.01119954, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00093555, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.713360353104764, + "language_loss": 0.75520003, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77759528, + "num_input_tokens_seen": 155721890, + "step": 7264, + "time_per_iteration": 4.135430335998535 + }, + { + "auxiliary_loss_clip": 0.01105288, + "auxiliary_loss_mlp": 0.01119203, + "balance_loss_clip": 1.00182724, + "balance_loss_mlp": 1.00056553, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 2.2420387220219267, + "language_loss": 0.61368215, + "learning_rate": 2.501098303852298e-06, + "loss": 0.63592708, + "num_input_tokens_seen": 155743970, + "step": 7265, + "time_per_iteration": 4.150288105010986 + }, + { + "auxiliary_loss_clip": 0.01135605, + "auxiliary_loss_mlp": 0.0111882, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.0005641, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 1.8638630473412674, + "language_loss": 0.72604054, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.74858487, + "num_input_tokens_seen": 155761830, + "step": 7266, + "time_per_iteration": 2.592221260070801 + }, + { + "auxiliary_loss_clip": 0.01135066, + "auxiliary_loss_mlp": 0.01119555, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00072694, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 4.530264959771188, + "language_loss": 0.81849152, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.84103775, + "num_input_tokens_seen": 155779610, + "step": 7267, + "time_per_iteration": 3.979477643966675 + }, + { + "auxiliary_loss_clip": 0.01168481, + "auxiliary_loss_mlp": 0.01118712, + "balance_loss_clip": 1.0021255, + "balance_loss_mlp": 1.0005517, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 1.8567995241529403, + "language_loss": 0.74359983, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76647174, + "num_input_tokens_seen": 155798765, + "step": 7268, + "time_per_iteration": 2.5754024982452393 + }, + { + "auxiliary_loss_clip": 0.01168709, + "auxiliary_loss_mlp": 0.01119321, + "balance_loss_clip": 1.00217056, + "balance_loss_mlp": 1.0005883, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 2.0902568590296506, + "language_loss": 0.7972368, + "learning_rate": 2.499589994531454e-06, + "loss": 0.82011706, + "num_input_tokens_seen": 155817750, + "step": 7269, + "time_per_iteration": 2.5323691368103027 + }, + { + "auxiliary_loss_clip": 0.0113619, + "auxiliary_loss_mlp": 0.0111943, + "balance_loss_clip": 1.00208426, + "balance_loss_mlp": 1.00069749, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 3.17862423072224, + "language_loss": 0.74993789, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77249408, + "num_input_tokens_seen": 155836490, + "step": 7270, + "time_per_iteration": 2.6336722373962402 + }, + { + "auxiliary_loss_clip": 0.01089302, + "auxiliary_loss_mlp": 0.01119542, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.00052333, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 2.00640318545614, + "language_loss": 0.79206914, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81415761, + "num_input_tokens_seen": 155856225, + "step": 7271, + "time_per_iteration": 2.8277628421783447 + }, + { + "auxiliary_loss_clip": 0.01147389, + "auxiliary_loss_mlp": 0.01099244, + "balance_loss_clip": 1.00155973, + "balance_loss_mlp": 1.00006187, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6917996948051657, + "language_loss": 0.5489915, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.57145786, + "num_input_tokens_seen": 155916770, + "step": 7272, + "time_per_iteration": 3.203888177871704 + }, + { + "auxiliary_loss_clip": 0.01168698, + "auxiliary_loss_mlp": 0.01120313, + "balance_loss_clip": 1.00212336, + "balance_loss_mlp": 1.00072241, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 1.616340023049232, + "language_loss": 0.69972539, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72261548, + "num_input_tokens_seen": 155936490, + "step": 7273, + "time_per_iteration": 2.5346338748931885 + }, + { + "auxiliary_loss_clip": 0.01137907, + "auxiliary_loss_mlp": 0.01120026, + "balance_loss_clip": 1.00196469, + "balance_loss_mlp": 1.00062633, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.7254662163279761, + "language_loss": 0.75065386, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77323323, + "num_input_tokens_seen": 155957595, + "step": 7274, + "time_per_iteration": 2.7443416118621826 + }, + { + "auxiliary_loss_clip": 0.01152065, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_clip": 1.00197291, + "balance_loss_mlp": 1.00059032, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.7900901270954177, + "language_loss": 0.80077136, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82347661, + "num_input_tokens_seen": 155975710, + "step": 7275, + "time_per_iteration": 2.5504510402679443 + }, + { + "auxiliary_loss_clip": 0.01135254, + "auxiliary_loss_mlp": 0.0111884, + "balance_loss_clip": 1.00215268, + "balance_loss_mlp": 1.00067997, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.1635372263016506, + "language_loss": 0.80406201, + "learning_rate": 2.496949724407266e-06, + "loss": 0.82660294, + "num_input_tokens_seen": 155993090, + "step": 7276, + "time_per_iteration": 2.566922426223755 + }, + { + "auxiliary_loss_clip": 0.01143437, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.0022372, + "balance_loss_mlp": 1.00061524, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 3.120039391924473, + "language_loss": 0.73301226, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75564957, + "num_input_tokens_seen": 156013685, + "step": 7277, + "time_per_iteration": 2.676180839538574 + }, + { + "auxiliary_loss_clip": 0.01134779, + "auxiliary_loss_mlp": 0.00747559, + "balance_loss_clip": 1.00207448, + "balance_loss_mlp": 1.00047827, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.9764216361062144, + "language_loss": 0.73011202, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.74893534, + "num_input_tokens_seen": 156034300, + "step": 7278, + "time_per_iteration": 2.6737139225006104 + }, + { + "auxiliary_loss_clip": 0.01125908, + "auxiliary_loss_mlp": 0.01118832, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.00076735, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.5251420764629955, + "language_loss": 0.66044319, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68289065, + "num_input_tokens_seen": 156053805, + "step": 7279, + "time_per_iteration": 2.629074811935425 + }, + { + "auxiliary_loss_clip": 0.01168646, + "auxiliary_loss_mlp": 0.0111955, + "balance_loss_clip": 1.00213552, + "balance_loss_mlp": 1.0006268, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.9554477347931447, + "language_loss": 0.82059008, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.843472, + "num_input_tokens_seen": 156073295, + "step": 7280, + "time_per_iteration": 2.5310635566711426 + }, + { + "auxiliary_loss_clip": 0.01136833, + "auxiliary_loss_mlp": 0.01118721, + "balance_loss_clip": 1.00204802, + "balance_loss_mlp": 1.00065589, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.6361610601868075, + "language_loss": 0.76791722, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.79047275, + "num_input_tokens_seen": 156094540, + "step": 7281, + "time_per_iteration": 2.6074347496032715 + }, + { + "auxiliary_loss_clip": 0.01136928, + "auxiliary_loss_mlp": 0.01119301, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00066423, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.7878333012615688, + "language_loss": 0.75752485, + "learning_rate": 2.494685900612569e-06, + "loss": 0.78008711, + "num_input_tokens_seen": 156114070, + "step": 7282, + "time_per_iteration": 2.589331865310669 + }, + { + "auxiliary_loss_clip": 0.01120229, + "auxiliary_loss_mlp": 0.0111931, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.00057721, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.739764802286618, + "language_loss": 0.84966296, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87205839, + "num_input_tokens_seen": 156132130, + "step": 7283, + "time_per_iteration": 2.641305923461914 + }, + { + "auxiliary_loss_clip": 0.01141598, + "auxiliary_loss_mlp": 0.01120225, + "balance_loss_clip": 1.00200522, + "balance_loss_mlp": 1.00063396, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 2.1037063988344658, + "language_loss": 0.8045885, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82720673, + "num_input_tokens_seen": 156150820, + "step": 7284, + "time_per_iteration": 2.6050004959106445 + }, + { + "auxiliary_loss_clip": 0.0115364, + "auxiliary_loss_mlp": 0.01118865, + "balance_loss_clip": 1.00211382, + "balance_loss_mlp": 1.00080049, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 4.490874247109245, + "language_loss": 0.80082321, + "learning_rate": 2.493553735281787e-06, + "loss": 0.8235482, + "num_input_tokens_seen": 156170125, + "step": 7285, + "time_per_iteration": 2.589836359024048 + }, + { + "auxiliary_loss_clip": 0.01153582, + "auxiliary_loss_mlp": 0.01118888, + "balance_loss_clip": 1.002033, + "balance_loss_mlp": 1.00044143, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 1.8248481789215059, + "language_loss": 0.75226104, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77498573, + "num_input_tokens_seen": 156187320, + "step": 7286, + "time_per_iteration": 2.542879104614258 + }, + { + "auxiliary_loss_clip": 0.01119729, + "auxiliary_loss_mlp": 0.0111904, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00049782, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.4808415464873481, + "language_loss": 0.7333374, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75572503, + "num_input_tokens_seen": 156207455, + "step": 7287, + "time_per_iteration": 2.701967239379883 + }, + { + "auxiliary_loss_clip": 0.0113525, + "auxiliary_loss_mlp": 0.01119534, + "balance_loss_clip": 1.00195575, + "balance_loss_mlp": 1.00080156, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.6650490441447552, + "language_loss": 0.82415849, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84670627, + "num_input_tokens_seen": 156226560, + "step": 7288, + "time_per_iteration": 2.5895192623138428 + }, + { + "auxiliary_loss_clip": 0.01120438, + "auxiliary_loss_mlp": 0.01119235, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00059807, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.4795871104614238, + "language_loss": 0.841295, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86369169, + "num_input_tokens_seen": 156246740, + "step": 7289, + "time_per_iteration": 2.672218084335327 + }, + { + "auxiliary_loss_clip": 0.01138717, + "auxiliary_loss_mlp": 0.01119907, + "balance_loss_clip": 1.00206268, + "balance_loss_mlp": 1.00069785, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.9223618946431265, + "language_loss": 0.78107357, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80365974, + "num_input_tokens_seen": 156266440, + "step": 7290, + "time_per_iteration": 2.6956710815429688 + }, + { + "auxiliary_loss_clip": 0.01168565, + "auxiliary_loss_mlp": 0.01119442, + "balance_loss_clip": 1.00214469, + "balance_loss_mlp": 1.00070906, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 2.314123419391339, + "language_loss": 0.77872884, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80160892, + "num_input_tokens_seen": 156286900, + "step": 7291, + "time_per_iteration": 2.615314483642578 + }, + { + "auxiliary_loss_clip": 0.01120158, + "auxiliary_loss_mlp": 0.01118867, + "balance_loss_clip": 1.00201488, + "balance_loss_mlp": 1.00061154, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.8086007800705044, + "language_loss": 0.65007305, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.6724633, + "num_input_tokens_seen": 156307690, + "step": 7292, + "time_per_iteration": 2.773033618927002 + }, + { + "auxiliary_loss_clip": 0.0115341, + "auxiliary_loss_mlp": 0.01120045, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00054944, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.676613004692736, + "language_loss": 0.74021536, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76294982, + "num_input_tokens_seen": 156326620, + "step": 7293, + "time_per_iteration": 2.5772950649261475 + }, + { + "auxiliary_loss_clip": 0.01126504, + "auxiliary_loss_mlp": 0.01119714, + "balance_loss_clip": 1.00214314, + "balance_loss_mlp": 1.0007906, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 1.9933111438663729, + "language_loss": 0.78600979, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80847204, + "num_input_tokens_seen": 156345495, + "step": 7294, + "time_per_iteration": 2.6066761016845703 + }, + { + "auxiliary_loss_clip": 0.01120549, + "auxiliary_loss_mlp": 0.01119547, + "balance_loss_clip": 1.0020746, + "balance_loss_mlp": 1.00081468, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.6006944296589811, + "language_loss": 0.72927552, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.7516765, + "num_input_tokens_seen": 156363155, + "step": 7295, + "time_per_iteration": 2.6196084022521973 + }, + { + "auxiliary_loss_clip": 0.01120098, + "auxiliary_loss_mlp": 0.01120119, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.0006237, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 2.6363310829924953, + "language_loss": 0.75694817, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77935034, + "num_input_tokens_seen": 156380940, + "step": 7296, + "time_per_iteration": 4.0217626094818115 + }, + { + "auxiliary_loss_clip": 0.01151911, + "auxiliary_loss_mlp": 0.0111902, + "balance_loss_clip": 1.0021292, + "balance_loss_mlp": 1.00066912, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 2.745144554599619, + "language_loss": 0.69253808, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71524739, + "num_input_tokens_seen": 156400415, + "step": 7297, + "time_per_iteration": 2.580162286758423 + }, + { + "auxiliary_loss_clip": 0.01153802, + "auxiliary_loss_mlp": 0.01118945, + "balance_loss_clip": 1.00212979, + "balance_loss_mlp": 1.0005939, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 2.768671848660941, + "language_loss": 0.70287001, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72559744, + "num_input_tokens_seen": 156421120, + "step": 7298, + "time_per_iteration": 2.5990793704986572 + }, + { + "auxiliary_loss_clip": 0.01151266, + "auxiliary_loss_mlp": 0.01118317, + "balance_loss_clip": 1.00216472, + "balance_loss_mlp": 1.00053811, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.4694051093976908, + "language_loss": 0.72226751, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74496329, + "num_input_tokens_seen": 156441535, + "step": 7299, + "time_per_iteration": 2.607222080230713 + }, + { + "auxiliary_loss_clip": 0.01143491, + "auxiliary_loss_mlp": 0.0074771, + "balance_loss_clip": 1.00218403, + "balance_loss_mlp": 1.00049591, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 2.1874466219788236, + "language_loss": 0.76692998, + "learning_rate": 2.487890389750719e-06, + "loss": 0.78584194, + "num_input_tokens_seen": 156462015, + "step": 7300, + "time_per_iteration": 2.664855480194092 + }, + { + "auxiliary_loss_clip": 0.0113504, + "auxiliary_loss_mlp": 0.01118295, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00070679, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.921738776283662, + "language_loss": 0.70171577, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.72424912, + "num_input_tokens_seen": 156482165, + "step": 7301, + "time_per_iteration": 2.650221824645996 + }, + { + "auxiliary_loss_clip": 0.0110615, + "auxiliary_loss_mlp": 0.01120619, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00064671, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 2.2749615424630774, + "language_loss": 0.70621675, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72848439, + "num_input_tokens_seen": 156503170, + "step": 7302, + "time_per_iteration": 4.140049695968628 + }, + { + "auxiliary_loss_clip": 0.0113504, + "auxiliary_loss_mlp": 0.01119509, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00077653, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 2.5524725295523796, + "language_loss": 0.82003486, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84258032, + "num_input_tokens_seen": 156523005, + "step": 7303, + "time_per_iteration": 4.083894491195679 + }, + { + "auxiliary_loss_clip": 0.01153933, + "auxiliary_loss_mlp": 0.01120761, + "balance_loss_clip": 1.00220823, + "balance_loss_mlp": 1.00069308, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 2.77894639732357, + "language_loss": 0.68806767, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.7108146, + "num_input_tokens_seen": 156544440, + "step": 7304, + "time_per_iteration": 2.6718311309814453 + }, + { + "auxiliary_loss_clip": 0.01138044, + "auxiliary_loss_mlp": 0.00747502, + "balance_loss_clip": 1.00207305, + "balance_loss_mlp": 1.00042832, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 2.4075814996912133, + "language_loss": 0.78136498, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80022043, + "num_input_tokens_seen": 156565410, + "step": 7305, + "time_per_iteration": 4.12078332901001 + }, + { + "auxiliary_loss_clip": 0.01134953, + "auxiliary_loss_mlp": 0.01119039, + "balance_loss_clip": 1.00200236, + "balance_loss_mlp": 1.00068808, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 2.075774977940039, + "language_loss": 0.68902063, + "learning_rate": 2.485623883278308e-06, + "loss": 0.71156049, + "num_input_tokens_seen": 156584210, + "step": 7306, + "time_per_iteration": 2.615403175354004 + }, + { + "auxiliary_loss_clip": 0.01120505, + "auxiliary_loss_mlp": 0.01119087, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.000736, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.5026292330418278, + "language_loss": 0.62634468, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64874065, + "num_input_tokens_seen": 156602730, + "step": 7307, + "time_per_iteration": 2.6514434814453125 + }, + { + "auxiliary_loss_clip": 0.01168725, + "auxiliary_loss_mlp": 0.01120369, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00068247, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 1.9697199304929527, + "language_loss": 0.72160387, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.7444948, + "num_input_tokens_seen": 156619405, + "step": 7308, + "time_per_iteration": 2.492659091949463 + }, + { + "auxiliary_loss_clip": 0.01136598, + "auxiliary_loss_mlp": 0.01119276, + "balance_loss_clip": 1.00197923, + "balance_loss_mlp": 1.0006392, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.9210585136459863, + "language_loss": 0.76671779, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78927648, + "num_input_tokens_seen": 156638165, + "step": 7309, + "time_per_iteration": 2.632066488265991 + }, + { + "auxiliary_loss_clip": 0.01151693, + "auxiliary_loss_mlp": 0.01118198, + "balance_loss_clip": 1.00210214, + "balance_loss_mlp": 1.00041938, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.5847850755764696, + "language_loss": 0.70545113, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72815001, + "num_input_tokens_seen": 156658845, + "step": 7310, + "time_per_iteration": 2.607949733734131 + }, + { + "auxiliary_loss_clip": 0.01137123, + "auxiliary_loss_mlp": 0.00747365, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.00033283, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.2156010367763836, + "language_loss": 0.76358247, + "learning_rate": 2.483734621343429e-06, + "loss": 0.78242731, + "num_input_tokens_seen": 156677275, + "step": 7311, + "time_per_iteration": 2.6187915802001953 + }, + { + "auxiliary_loss_clip": 0.01152151, + "auxiliary_loss_mlp": 0.01120024, + "balance_loss_clip": 1.0020895, + "balance_loss_mlp": 1.00081456, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 2.2576774066684795, + "language_loss": 0.81768394, + "learning_rate": 2.483356713869341e-06, + "loss": 0.8404057, + "num_input_tokens_seen": 156695815, + "step": 7312, + "time_per_iteration": 2.6374690532684326 + }, + { + "auxiliary_loss_clip": 0.01121461, + "auxiliary_loss_mlp": 0.01118704, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00054407, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 1.8759574392282836, + "language_loss": 0.8524549, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87485659, + "num_input_tokens_seen": 156714385, + "step": 7313, + "time_per_iteration": 2.708054542541504 + }, + { + "auxiliary_loss_clip": 0.01136736, + "auxiliary_loss_mlp": 0.01119604, + "balance_loss_clip": 1.00195599, + "balance_loss_mlp": 1.00068092, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 1.927068332142989, + "language_loss": 0.67397058, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69653404, + "num_input_tokens_seen": 156732615, + "step": 7314, + "time_per_iteration": 2.5703024864196777 + }, + { + "auxiliary_loss_clip": 0.01136983, + "auxiliary_loss_mlp": 0.01119647, + "balance_loss_clip": 1.00201893, + "balance_loss_mlp": 1.00062847, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 2.089906972295117, + "language_loss": 0.76757193, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79013824, + "num_input_tokens_seen": 156750920, + "step": 7315, + "time_per_iteration": 2.5812549591064453 + }, + { + "auxiliary_loss_clip": 0.01136676, + "auxiliary_loss_mlp": 0.01118608, + "balance_loss_clip": 1.00203276, + "balance_loss_mlp": 1.00073373, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.069204455810946, + "language_loss": 0.74301922, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76557213, + "num_input_tokens_seen": 156768520, + "step": 7316, + "time_per_iteration": 2.6255381107330322 + }, + { + "auxiliary_loss_clip": 0.01119617, + "auxiliary_loss_mlp": 0.01118734, + "balance_loss_clip": 1.00201976, + "balance_loss_mlp": 1.00066912, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.6605279632468184, + "language_loss": 0.65116501, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67354846, + "num_input_tokens_seen": 156788700, + "step": 7317, + "time_per_iteration": 2.630796194076538 + }, + { + "auxiliary_loss_clip": 0.01137301, + "auxiliary_loss_mlp": 0.01119943, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.00073385, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.757223944027095, + "language_loss": 0.7963444, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.8189168, + "num_input_tokens_seen": 156806470, + "step": 7318, + "time_per_iteration": 2.5746021270751953 + }, + { + "auxiliary_loss_clip": 0.01123417, + "auxiliary_loss_mlp": 0.01120007, + "balance_loss_clip": 1.00216234, + "balance_loss_mlp": 1.00079727, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.8612555481114024, + "language_loss": 0.79616153, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81859577, + "num_input_tokens_seen": 156825895, + "step": 7319, + "time_per_iteration": 2.636448621749878 + }, + { + "auxiliary_loss_clip": 0.01153458, + "auxiliary_loss_mlp": 0.01119536, + "balance_loss_clip": 1.00216699, + "balance_loss_mlp": 1.00070834, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.629757998600902, + "language_loss": 0.79188836, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.81461835, + "num_input_tokens_seen": 156845990, + "step": 7320, + "time_per_iteration": 2.604006767272949 + }, + { + "auxiliary_loss_clip": 0.01118286, + "auxiliary_loss_mlp": 0.01118924, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00076425, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.5819241333998535, + "language_loss": 0.69891143, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.7212835, + "num_input_tokens_seen": 156866685, + "step": 7321, + "time_per_iteration": 2.6743054389953613 + }, + { + "auxiliary_loss_clip": 0.01100882, + "auxiliary_loss_mlp": 0.0109918, + "balance_loss_clip": 1.00144529, + "balance_loss_mlp": 0.99999803, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.87512550933945, + "language_loss": 0.56951022, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.59151083, + "num_input_tokens_seen": 156923450, + "step": 7322, + "time_per_iteration": 3.3175458908081055 + }, + { + "auxiliary_loss_clip": 0.01107896, + "auxiliary_loss_mlp": 0.01119287, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00074565, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.5557204061461063, + "language_loss": 0.75977618, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78204805, + "num_input_tokens_seen": 156944795, + "step": 7323, + "time_per_iteration": 2.726897954940796 + }, + { + "auxiliary_loss_clip": 0.01152894, + "auxiliary_loss_mlp": 0.01119186, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00073946, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 2.3854850449637066, + "language_loss": 0.80884558, + "learning_rate": 2.478820398622511e-06, + "loss": 0.83156633, + "num_input_tokens_seen": 156962755, + "step": 7324, + "time_per_iteration": 2.606099843978882 + }, + { + "auxiliary_loss_clip": 0.01119585, + "auxiliary_loss_mlp": 0.01099218, + "balance_loss_clip": 1.00159049, + "balance_loss_mlp": 1.00003552, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.8386260214835027, + "language_loss": 0.54575795, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56794596, + "num_input_tokens_seen": 157028095, + "step": 7325, + "time_per_iteration": 3.2071797847747803 + }, + { + "auxiliary_loss_clip": 0.01168517, + "auxiliary_loss_mlp": 0.01118991, + "balance_loss_clip": 1.00222611, + "balance_loss_mlp": 1.00054419, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.4226258453955527, + "language_loss": 0.6936546, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71652967, + "num_input_tokens_seen": 157048365, + "step": 7326, + "time_per_iteration": 2.5303611755371094 + }, + { + "auxiliary_loss_clip": 0.01119835, + "auxiliary_loss_mlp": 0.01118979, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00053287, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.4785590525434462, + "language_loss": 0.76248968, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78487784, + "num_input_tokens_seen": 157069130, + "step": 7327, + "time_per_iteration": 2.6721925735473633 + }, + { + "auxiliary_loss_clip": 0.01136848, + "auxiliary_loss_mlp": 0.01119553, + "balance_loss_clip": 1.00203514, + "balance_loss_mlp": 1.00072479, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 2.101621869769424, + "language_loss": 0.84391522, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86647922, + "num_input_tokens_seen": 157084940, + "step": 7328, + "time_per_iteration": 2.5741848945617676 + }, + { + "auxiliary_loss_clip": 0.01135115, + "auxiliary_loss_mlp": 0.01118877, + "balance_loss_clip": 1.00195539, + "balance_loss_mlp": 1.00052619, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.411019348984773, + "language_loss": 0.77823937, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.80077922, + "num_input_tokens_seen": 157102770, + "step": 7329, + "time_per_iteration": 2.630229949951172 + }, + { + "auxiliary_loss_clip": 0.01153547, + "auxiliary_loss_mlp": 0.01119519, + "balance_loss_clip": 1.00223374, + "balance_loss_mlp": 1.00069141, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.623533505786951, + "language_loss": 0.73124826, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75397897, + "num_input_tokens_seen": 157122035, + "step": 7330, + "time_per_iteration": 2.5692672729492188 + }, + { + "auxiliary_loss_clip": 0.01135246, + "auxiliary_loss_mlp": 0.01119067, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00071549, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.8647257819461187, + "language_loss": 0.74394834, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.76649153, + "num_input_tokens_seen": 157142800, + "step": 7331, + "time_per_iteration": 2.620576858520508 + }, + { + "auxiliary_loss_clip": 0.0110518, + "auxiliary_loss_mlp": 0.01118434, + "balance_loss_clip": 1.00191295, + "balance_loss_mlp": 1.00055957, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.542432707374821, + "language_loss": 0.76528978, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78752589, + "num_input_tokens_seen": 157163295, + "step": 7332, + "time_per_iteration": 2.703063488006592 + }, + { + "auxiliary_loss_clip": 0.01134937, + "auxiliary_loss_mlp": 0.01118616, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00064659, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.8603103323243257, + "language_loss": 0.73243278, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75496829, + "num_input_tokens_seen": 157180890, + "step": 7333, + "time_per_iteration": 2.6011769771575928 + }, + { + "auxiliary_loss_clip": 0.01119572, + "auxiliary_loss_mlp": 0.0111864, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.00067079, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.6139806315107352, + "language_loss": 0.79323828, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81562048, + "num_input_tokens_seen": 157200580, + "step": 7334, + "time_per_iteration": 2.6787102222442627 + }, + { + "auxiliary_loss_clip": 0.01119955, + "auxiliary_loss_mlp": 0.01120815, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00055647, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.490967930686438, + "language_loss": 0.75582975, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77823746, + "num_input_tokens_seen": 157218345, + "step": 7335, + "time_per_iteration": 4.06264591217041 + }, + { + "auxiliary_loss_clip": 0.01141544, + "auxiliary_loss_mlp": 0.01119616, + "balance_loss_clip": 1.00225508, + "balance_loss_mlp": 1.00078773, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 1.7986762449915865, + "language_loss": 0.7252748, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74788642, + "num_input_tokens_seen": 157234395, + "step": 7336, + "time_per_iteration": 2.550989866256714 + }, + { + "auxiliary_loss_clip": 0.01153732, + "auxiliary_loss_mlp": 0.01120072, + "balance_loss_clip": 1.00209165, + "balance_loss_mlp": 1.00076783, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 2.177842551111617, + "language_loss": 0.63189137, + "learning_rate": 2.473903107384165e-06, + "loss": 0.65462935, + "num_input_tokens_seen": 157254805, + "step": 7337, + "time_per_iteration": 2.5658915042877197 + }, + { + "auxiliary_loss_clip": 0.01133811, + "auxiliary_loss_mlp": 0.00746346, + "balance_loss_clip": 1.00155413, + "balance_loss_mlp": 0.999901, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7358893947925101, + "language_loss": 0.52634984, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54515141, + "num_input_tokens_seen": 157317870, + "step": 7338, + "time_per_iteration": 3.1948273181915283 + }, + { + "auxiliary_loss_clip": 0.01137113, + "auxiliary_loss_mlp": 0.01120824, + "balance_loss_clip": 1.00204742, + "balance_loss_mlp": 1.00085139, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 1.8403007482517042, + "language_loss": 0.7096231, + "learning_rate": 2.473146330693997e-06, + "loss": 0.73220247, + "num_input_tokens_seen": 157336505, + "step": 7339, + "time_per_iteration": 4.044001579284668 + }, + { + "auxiliary_loss_clip": 0.01088169, + "auxiliary_loss_mlp": 0.01118285, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00079274, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 3.616992196115125, + "language_loss": 0.70012254, + "learning_rate": 2.472767915429105e-06, + "loss": 0.72218704, + "num_input_tokens_seen": 157354995, + "step": 7340, + "time_per_iteration": 4.031643629074097 + }, + { + "auxiliary_loss_clip": 0.0113046, + "auxiliary_loss_mlp": 0.01099193, + "balance_loss_clip": 1.00146723, + "balance_loss_mlp": 1.00001109, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.9136934934504582, + "language_loss": 0.64021504, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66251159, + "num_input_tokens_seen": 157404260, + "step": 7341, + "time_per_iteration": 2.9715054035186768 + }, + { + "auxiliary_loss_clip": 0.01119856, + "auxiliary_loss_mlp": 0.01119457, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00062954, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.101997285735128, + "language_loss": 0.73811889, + "learning_rate": 2.47201103113145e-06, + "loss": 0.76051205, + "num_input_tokens_seen": 157423045, + "step": 7342, + "time_per_iteration": 4.054593563079834 + }, + { + "auxiliary_loss_clip": 0.01168444, + "auxiliary_loss_mlp": 0.01118936, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.0005852, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 1.8598227274972994, + "language_loss": 0.79567873, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81855255, + "num_input_tokens_seen": 157441815, + "step": 7343, + "time_per_iteration": 2.5486695766448975 + }, + { + "auxiliary_loss_clip": 0.01119392, + "auxiliary_loss_mlp": 0.01119181, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.0006392, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 1.799174512300696, + "language_loss": 0.7657665, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.78815228, + "num_input_tokens_seen": 157460470, + "step": 7344, + "time_per_iteration": 2.6478488445281982 + }, + { + "auxiliary_loss_clip": 0.01134411, + "auxiliary_loss_mlp": 0.01099194, + "balance_loss_clip": 1.00152969, + "balance_loss_mlp": 1.0000124, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7889294522937735, + "language_loss": 0.6379658, + "learning_rate": 2.470875570480556e-06, + "loss": 0.66030192, + "num_input_tokens_seen": 157512655, + "step": 7345, + "time_per_iteration": 2.9461264610290527 + }, + { + "auxiliary_loss_clip": 0.01168462, + "auxiliary_loss_mlp": 0.01118711, + "balance_loss_clip": 1.00215483, + "balance_loss_mlp": 1.00045562, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 3.4355525086508263, + "language_loss": 0.85837603, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88124776, + "num_input_tokens_seen": 157533700, + "step": 7346, + "time_per_iteration": 2.627060651779175 + }, + { + "auxiliary_loss_clip": 0.01153043, + "auxiliary_loss_mlp": 0.01119664, + "balance_loss_clip": 1.00204682, + "balance_loss_mlp": 1.00074053, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.6248032306879054, + "language_loss": 0.80264425, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82537127, + "num_input_tokens_seen": 157551105, + "step": 7347, + "time_per_iteration": 2.593346118927002 + }, + { + "auxiliary_loss_clip": 0.01135379, + "auxiliary_loss_mlp": 0.01119161, + "balance_loss_clip": 1.00192726, + "balance_loss_mlp": 1.00061941, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.848271253080811, + "language_loss": 0.82579648, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.84834188, + "num_input_tokens_seen": 157568285, + "step": 7348, + "time_per_iteration": 2.5968170166015625 + }, + { + "auxiliary_loss_clip": 0.01151903, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_clip": 1.00209212, + "balance_loss_mlp": 1.00059593, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 1.840217471546471, + "language_loss": 0.70076263, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72348642, + "num_input_tokens_seen": 157590405, + "step": 7349, + "time_per_iteration": 2.6380226612091064 + }, + { + "auxiliary_loss_clip": 0.01143096, + "auxiliary_loss_mlp": 0.01119091, + "balance_loss_clip": 1.00226855, + "balance_loss_mlp": 1.00054908, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.6763864856566155, + "language_loss": 0.74584973, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76847154, + "num_input_tokens_seen": 157607420, + "step": 7350, + "time_per_iteration": 2.6285107135772705 + }, + { + "auxiliary_loss_clip": 0.01168403, + "auxiliary_loss_mlp": 0.01119461, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.00072861, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 4.511280733406913, + "language_loss": 0.81243789, + "learning_rate": 2.468604167463827e-06, + "loss": 0.83531654, + "num_input_tokens_seen": 157624990, + "step": 7351, + "time_per_iteration": 2.5060617923736572 + }, + { + "auxiliary_loss_clip": 0.01127737, + "auxiliary_loss_mlp": 0.0074721, + "balance_loss_clip": 1.00215411, + "balance_loss_mlp": 1.00021982, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.6407841327824297, + "language_loss": 0.73435432, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75310379, + "num_input_tokens_seen": 157645300, + "step": 7352, + "time_per_iteration": 2.68320894241333 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01119563, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00063944, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 1.7544508300281003, + "language_loss": 0.8702023, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89275301, + "num_input_tokens_seen": 157664060, + "step": 7353, + "time_per_iteration": 2.618088960647583 + }, + { + "auxiliary_loss_clip": 0.01168544, + "auxiliary_loss_mlp": 0.01119049, + "balance_loss_clip": 1.00218034, + "balance_loss_mlp": 1.00060236, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 2.3859166034909562, + "language_loss": 0.75842774, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.7813037, + "num_input_tokens_seen": 157680905, + "step": 7354, + "time_per_iteration": 2.511906385421753 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.01118298, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00071037, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 2.377288364535944, + "language_loss": 0.64979601, + "learning_rate": 2.467089543204268e-06, + "loss": 0.67216498, + "num_input_tokens_seen": 157701980, + "step": 7355, + "time_per_iteration": 2.853788137435913 + }, + { + "auxiliary_loss_clip": 0.01168548, + "auxiliary_loss_mlp": 0.01119786, + "balance_loss_clip": 1.002074, + "balance_loss_mlp": 1.00057685, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 2.5294694290641497, + "language_loss": 0.78253865, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80542207, + "num_input_tokens_seen": 157720555, + "step": 7356, + "time_per_iteration": 2.5647079944610596 + }, + { + "auxiliary_loss_clip": 0.01136284, + "auxiliary_loss_mlp": 0.0074746, + "balance_loss_clip": 1.0020597, + "balance_loss_mlp": 1.00031304, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.6120067776417435, + "language_loss": 0.77423149, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79306889, + "num_input_tokens_seen": 157739160, + "step": 7357, + "time_per_iteration": 2.5818567276000977 + }, + { + "auxiliary_loss_clip": 0.01136383, + "auxiliary_loss_mlp": 0.01119252, + "balance_loss_clip": 1.00213003, + "balance_loss_mlp": 1.0006144, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.5529393588281755, + "language_loss": 0.7349577, + "learning_rate": 2.465953388982481e-06, + "loss": 0.757514, + "num_input_tokens_seen": 157760020, + "step": 7358, + "time_per_iteration": 2.639805793762207 + }, + { + "auxiliary_loss_clip": 0.01134943, + "auxiliary_loss_mlp": 0.01119447, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.0006187, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.7229698108042197, + "language_loss": 0.75713181, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77967572, + "num_input_tokens_seen": 157780435, + "step": 7359, + "time_per_iteration": 2.6725120544433594 + }, + { + "auxiliary_loss_clip": 0.0113663, + "auxiliary_loss_mlp": 0.01119802, + "balance_loss_clip": 1.00204682, + "balance_loss_mlp": 1.00078344, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 2.666693278156118, + "language_loss": 0.69926631, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72183061, + "num_input_tokens_seen": 157799420, + "step": 7360, + "time_per_iteration": 2.6370458602905273 + }, + { + "auxiliary_loss_clip": 0.01135406, + "auxiliary_loss_mlp": 0.01119385, + "balance_loss_clip": 1.00195646, + "balance_loss_mlp": 1.00065291, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.3727256614405916, + "language_loss": 0.69436681, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71691477, + "num_input_tokens_seen": 157817025, + "step": 7361, + "time_per_iteration": 2.605834484100342 + }, + { + "auxiliary_loss_clip": 0.01136814, + "auxiliary_loss_mlp": 0.01119358, + "balance_loss_clip": 1.00203729, + "balance_loss_mlp": 1.00062549, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 1.9735921219171382, + "language_loss": 0.82590556, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84846723, + "num_input_tokens_seen": 157834345, + "step": 7362, + "time_per_iteration": 2.6505706310272217 + }, + { + "auxiliary_loss_clip": 0.01119766, + "auxiliary_loss_mlp": 0.01120661, + "balance_loss_clip": 1.00201762, + "balance_loss_mlp": 1.00068843, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.773376166419736, + "language_loss": 0.74032664, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76273096, + "num_input_tokens_seen": 157852290, + "step": 7363, + "time_per_iteration": 2.6362359523773193 + }, + { + "auxiliary_loss_clip": 0.01096811, + "auxiliary_loss_mlp": 0.01098529, + "balance_loss_clip": 1.00138044, + "balance_loss_mlp": 1.00010991, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.679461925630602, + "language_loss": 0.55670077, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57865417, + "num_input_tokens_seen": 157923060, + "step": 7364, + "time_per_iteration": 3.3213717937469482 + }, + { + "auxiliary_loss_clip": 0.0113768, + "auxiliary_loss_mlp": 0.01118542, + "balance_loss_clip": 1.00211501, + "balance_loss_mlp": 1.00066829, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 2.39250930053907, + "language_loss": 0.73777723, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76033944, + "num_input_tokens_seen": 157944110, + "step": 7365, + "time_per_iteration": 2.6407623291015625 + }, + { + "auxiliary_loss_clip": 0.01136899, + "auxiliary_loss_mlp": 0.01118516, + "balance_loss_clip": 1.00200105, + "balance_loss_mlp": 1.0006417, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 2.0832062338855617, + "language_loss": 0.73785055, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76040477, + "num_input_tokens_seen": 157964295, + "step": 7366, + "time_per_iteration": 2.595165967941284 + }, + { + "auxiliary_loss_clip": 0.01136317, + "auxiliary_loss_mlp": 0.01118607, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00044632, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 4.244336563870927, + "language_loss": 0.73669916, + "learning_rate": 2.46254397374245e-06, + "loss": 0.75924838, + "num_input_tokens_seen": 157983970, + "step": 7367, + "time_per_iteration": 2.654001235961914 + }, + { + "auxiliary_loss_clip": 0.01168416, + "auxiliary_loss_mlp": 0.01118747, + "balance_loss_clip": 1.00213504, + "balance_loss_mlp": 1.00068212, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 2.0695330518739654, + "language_loss": 0.74112052, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76399219, + "num_input_tokens_seen": 158006515, + "step": 7368, + "time_per_iteration": 2.6422171592712402 + }, + { + "auxiliary_loss_clip": 0.01136739, + "auxiliary_loss_mlp": 0.0111872, + "balance_loss_clip": 1.00217247, + "balance_loss_mlp": 1.00055981, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 2.1257938127148055, + "language_loss": 0.79715556, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.81971014, + "num_input_tokens_seen": 158025565, + "step": 7369, + "time_per_iteration": 2.6069037914276123 + }, + { + "auxiliary_loss_clip": 0.01120908, + "auxiliary_loss_mlp": 0.011179, + "balance_loss_clip": 1.0019027, + "balance_loss_mlp": 1.00050235, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 2.4971131305723517, + "language_loss": 0.71905768, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74144566, + "num_input_tokens_seen": 158045620, + "step": 7370, + "time_per_iteration": 2.687838554382324 + }, + { + "auxiliary_loss_clip": 0.01168238, + "auxiliary_loss_mlp": 0.01118756, + "balance_loss_clip": 1.00207174, + "balance_loss_mlp": 1.00059533, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.6876461062758892, + "language_loss": 0.69550169, + "learning_rate": 2.461028221425126e-06, + "loss": 0.71837163, + "num_input_tokens_seen": 158063505, + "step": 7371, + "time_per_iteration": 2.52781081199646 + }, + { + "auxiliary_loss_clip": 0.01153417, + "auxiliary_loss_mlp": 0.01117588, + "balance_loss_clip": 1.00210571, + "balance_loss_mlp": 1.00047696, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.2248576388515326, + "language_loss": 0.67490101, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.69761103, + "num_input_tokens_seen": 158080335, + "step": 7372, + "time_per_iteration": 2.572500228881836 + }, + { + "auxiliary_loss_clip": 0.01121162, + "auxiliary_loss_mlp": 0.01118631, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00047112, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 2.5944523531973025, + "language_loss": 0.83253288, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85493082, + "num_input_tokens_seen": 158098955, + "step": 7373, + "time_per_iteration": 4.035140752792358 + }, + { + "auxiliary_loss_clip": 0.01149132, + "auxiliary_loss_mlp": 0.01097884, + "balance_loss_clip": 1.00163555, + "balance_loss_mlp": 1.00022793, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.7571506506560447, + "language_loss": 0.5524292, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57489932, + "num_input_tokens_seen": 158164110, + "step": 7374, + "time_per_iteration": 3.230055332183838 + }, + { + "auxiliary_loss_clip": 0.01101423, + "auxiliary_loss_mlp": 0.01118057, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00075471, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 7.685420970656737, + "language_loss": 0.82137954, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.8435744, + "num_input_tokens_seen": 158179850, + "step": 7375, + "time_per_iteration": 2.687953472137451 + }, + { + "auxiliary_loss_clip": 0.01168491, + "auxiliary_loss_mlp": 0.01118425, + "balance_loss_clip": 1.00214112, + "balance_loss_mlp": 1.00055122, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 2.0273772779215067, + "language_loss": 0.84024417, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.8631134, + "num_input_tokens_seen": 158196590, + "step": 7376, + "time_per_iteration": 2.491847276687622 + }, + { + "auxiliary_loss_clip": 0.01134954, + "auxiliary_loss_mlp": 0.01118766, + "balance_loss_clip": 1.00189233, + "balance_loss_mlp": 1.00070083, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.9783501176600207, + "language_loss": 0.76816118, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79069841, + "num_input_tokens_seen": 158216355, + "step": 7377, + "time_per_iteration": 4.011488437652588 + }, + { + "auxiliary_loss_clip": 0.01151484, + "auxiliary_loss_mlp": 0.01118015, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00052214, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.8624311215677183, + "language_loss": 0.7636683, + "learning_rate": 2.458374982357057e-06, + "loss": 0.7863633, + "num_input_tokens_seen": 158235825, + "step": 7378, + "time_per_iteration": 3.9359724521636963 + }, + { + "auxiliary_loss_clip": 0.01137468, + "auxiliary_loss_mlp": 0.0111843, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00065136, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 2.015758639754949, + "language_loss": 0.69034225, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71290123, + "num_input_tokens_seen": 158254230, + "step": 7379, + "time_per_iteration": 2.5909264087677 + }, + { + "auxiliary_loss_clip": 0.01090274, + "auxiliary_loss_mlp": 0.01118459, + "balance_loss_clip": 1.00202084, + "balance_loss_mlp": 1.00058436, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.869129488933463, + "language_loss": 0.72865117, + "learning_rate": 2.457616757401656e-06, + "loss": 0.7507385, + "num_input_tokens_seen": 158273400, + "step": 7380, + "time_per_iteration": 4.088131666183472 + }, + { + "auxiliary_loss_clip": 0.01136075, + "auxiliary_loss_mlp": 0.01118642, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00057745, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 2.0660608638543914, + "language_loss": 0.64731884, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66986609, + "num_input_tokens_seen": 158296840, + "step": 7381, + "time_per_iteration": 2.7061386108398438 + }, + { + "auxiliary_loss_clip": 0.01151859, + "auxiliary_loss_mlp": 0.01118456, + "balance_loss_clip": 1.00216174, + "balance_loss_mlp": 1.00067711, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.046462221942345, + "language_loss": 0.79433489, + "learning_rate": 2.456858463034763e-06, + "loss": 0.81703806, + "num_input_tokens_seen": 158314935, + "step": 7382, + "time_per_iteration": 2.527113676071167 + }, + { + "auxiliary_loss_clip": 0.01153439, + "auxiliary_loss_mlp": 0.0111902, + "balance_loss_clip": 1.00218892, + "balance_loss_mlp": 1.00076461, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 2.14323980803545, + "language_loss": 0.64960396, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67232847, + "num_input_tokens_seen": 158334620, + "step": 7383, + "time_per_iteration": 2.646206855773926 + }, + { + "auxiliary_loss_clip": 0.01136243, + "auxiliary_loss_mlp": 0.01119285, + "balance_loss_clip": 1.00216269, + "balance_loss_mlp": 1.00064814, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 2.6099000193123296, + "language_loss": 0.75447488, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77703017, + "num_input_tokens_seen": 158350550, + "step": 7384, + "time_per_iteration": 2.6001803874969482 + }, + { + "auxiliary_loss_clip": 0.01168429, + "auxiliary_loss_mlp": 0.01118634, + "balance_loss_clip": 1.00216293, + "balance_loss_mlp": 1.00076032, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.848528715717117, + "language_loss": 0.80919766, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83206832, + "num_input_tokens_seen": 158369555, + "step": 7385, + "time_per_iteration": 2.5382089614868164 + }, + { + "auxiliary_loss_clip": 0.0110424, + "auxiliary_loss_mlp": 0.01118073, + "balance_loss_clip": 1.00176394, + "balance_loss_mlp": 1.00067544, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.7206177910297002, + "language_loss": 0.81313324, + "learning_rate": 2.455341666526582e-06, + "loss": 0.83535635, + "num_input_tokens_seen": 158388045, + "step": 7386, + "time_per_iteration": 2.6885087490081787 + }, + { + "auxiliary_loss_clip": 0.01120441, + "auxiliary_loss_mlp": 0.01119829, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.00061965, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.8104867803779825, + "language_loss": 0.69978917, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.72219187, + "num_input_tokens_seen": 158410115, + "step": 7387, + "time_per_iteration": 2.8041927814483643 + }, + { + "auxiliary_loss_clip": 0.01084933, + "auxiliary_loss_mlp": 0.01118422, + "balance_loss_clip": 1.00165153, + "balance_loss_mlp": 1.00083447, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 2.013581875333741, + "language_loss": 0.71750867, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.73954225, + "num_input_tokens_seen": 158427765, + "step": 7388, + "time_per_iteration": 2.6861891746520996 + }, + { + "auxiliary_loss_clip": 0.01151644, + "auxiliary_loss_mlp": 0.0111833, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00055087, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.4712188199879976, + "language_loss": 0.69034171, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.71304142, + "num_input_tokens_seen": 158446375, + "step": 7389, + "time_per_iteration": 2.6023900508880615 + }, + { + "auxiliary_loss_clip": 0.01151981, + "auxiliary_loss_mlp": 0.01118232, + "balance_loss_clip": 1.0020206, + "balance_loss_mlp": 1.00054908, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 2.017888280485654, + "language_loss": 0.75089109, + "learning_rate": 2.453824593752788e-06, + "loss": 0.77359325, + "num_input_tokens_seen": 158467260, + "step": 7390, + "time_per_iteration": 2.723179817199707 + }, + { + "auxiliary_loss_clip": 0.01153074, + "auxiliary_loss_mlp": 0.01117579, + "balance_loss_clip": 1.00210428, + "balance_loss_mlp": 1.00056314, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 3.010886749794816, + "language_loss": 0.81361878, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83632529, + "num_input_tokens_seen": 158486720, + "step": 7391, + "time_per_iteration": 2.562647819519043 + }, + { + "auxiliary_loss_clip": 0.01136531, + "auxiliary_loss_mlp": 0.01117898, + "balance_loss_clip": 1.00217366, + "balance_loss_mlp": 1.00059605, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.7904257457011092, + "language_loss": 0.73958755, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.76213193, + "num_input_tokens_seen": 158502530, + "step": 7392, + "time_per_iteration": 2.6550185680389404 + }, + { + "auxiliary_loss_clip": 0.0115159, + "auxiliary_loss_mlp": 0.01117736, + "balance_loss_clip": 1.00192845, + "balance_loss_mlp": 1.00062478, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.4580103970093792, + "language_loss": 0.79641581, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81910908, + "num_input_tokens_seen": 158522715, + "step": 7393, + "time_per_iteration": 2.81272029876709 + }, + { + "auxiliary_loss_clip": 0.0115344, + "auxiliary_loss_mlp": 0.01119035, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00058889, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.8824002587019906, + "language_loss": 0.80806029, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83078504, + "num_input_tokens_seen": 158543615, + "step": 7394, + "time_per_iteration": 2.6923866271972656 + }, + { + "auxiliary_loss_clip": 0.01136677, + "auxiliary_loss_mlp": 0.01118437, + "balance_loss_clip": 1.00201309, + "balance_loss_mlp": 1.00084853, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 2.5850739090186603, + "language_loss": 0.79477322, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81732446, + "num_input_tokens_seen": 158560330, + "step": 7395, + "time_per_iteration": 2.594975709915161 + }, + { + "auxiliary_loss_clip": 0.01136407, + "auxiliary_loss_mlp": 0.01118165, + "balance_loss_clip": 1.00209689, + "balance_loss_mlp": 1.00067234, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 2.4403337910989293, + "language_loss": 0.6812672, + "learning_rate": 2.451548468607584e-06, + "loss": 0.7038129, + "num_input_tokens_seen": 158579735, + "step": 7396, + "time_per_iteration": 2.601628303527832 + }, + { + "auxiliary_loss_clip": 0.01153465, + "auxiliary_loss_mlp": 0.00747365, + "balance_loss_clip": 1.0022682, + "balance_loss_mlp": 1.00029778, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 2.1498507113233143, + "language_loss": 0.80768037, + "learning_rate": 2.451169054403126e-06, + "loss": 0.82668865, + "num_input_tokens_seen": 158597075, + "step": 7397, + "time_per_iteration": 2.55903697013855 + }, + { + "auxiliary_loss_clip": 0.01152927, + "auxiliary_loss_mlp": 0.01118252, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00056899, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.6176079301252042, + "language_loss": 0.67595673, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69866848, + "num_input_tokens_seen": 158616650, + "step": 7398, + "time_per_iteration": 2.621187210083008 + }, + { + "auxiliary_loss_clip": 0.01121189, + "auxiliary_loss_mlp": 0.01118425, + "balance_loss_clip": 1.00203693, + "balance_loss_mlp": 1.00083685, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.7243407722293365, + "language_loss": 0.696684, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71908015, + "num_input_tokens_seen": 158634515, + "step": 7399, + "time_per_iteration": 2.6216022968292236 + }, + { + "auxiliary_loss_clip": 0.01121449, + "auxiliary_loss_mlp": 0.01117645, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00053382, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 1.8033825572907909, + "language_loss": 0.7286275, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.7510184, + "num_input_tokens_seen": 158653760, + "step": 7400, + "time_per_iteration": 2.634495496749878 + }, + { + "auxiliary_loss_clip": 0.01101441, + "auxiliary_loss_mlp": 0.00747061, + "balance_loss_clip": 1.00178099, + "balance_loss_mlp": 1.0001812, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.696622110899181, + "language_loss": 0.8479445, + "learning_rate": 2.449651226645422e-06, + "loss": 0.86642951, + "num_input_tokens_seen": 158672190, + "step": 7401, + "time_per_iteration": 2.688619375228882 + }, + { + "auxiliary_loss_clip": 0.01134783, + "auxiliary_loss_mlp": 0.01117572, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.0006516, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 2.1362238640900566, + "language_loss": 0.83375108, + "learning_rate": 2.449271727042973e-06, + "loss": 0.8562746, + "num_input_tokens_seen": 158694115, + "step": 7402, + "time_per_iteration": 2.635864019393921 + }, + { + "auxiliary_loss_clip": 0.01134627, + "auxiliary_loss_mlp": 0.01118419, + "balance_loss_clip": 1.00207424, + "balance_loss_mlp": 1.00064051, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.996236309906368, + "language_loss": 0.76877689, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.79130733, + "num_input_tokens_seen": 158711000, + "step": 7403, + "time_per_iteration": 2.5795655250549316 + }, + { + "auxiliary_loss_clip": 0.01131041, + "auxiliary_loss_mlp": 0.01097339, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.0000639, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7444701450000757, + "language_loss": 0.60021496, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62249875, + "num_input_tokens_seen": 158769675, + "step": 7404, + "time_per_iteration": 3.1387362480163574 + }, + { + "auxiliary_loss_clip": 0.011384, + "auxiliary_loss_mlp": 0.01118836, + "balance_loss_clip": 1.00205576, + "balance_loss_mlp": 1.00058019, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 3.4912272257206487, + "language_loss": 0.82186139, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84443367, + "num_input_tokens_seen": 158788215, + "step": 7405, + "time_per_iteration": 2.583979606628418 + }, + { + "auxiliary_loss_clip": 0.0113489, + "auxiliary_loss_mlp": 0.01117679, + "balance_loss_clip": 1.00192964, + "balance_loss_mlp": 1.00056744, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 3.2568667254248806, + "language_loss": 0.74987209, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77239776, + "num_input_tokens_seen": 158809090, + "step": 7406, + "time_per_iteration": 2.6466565132141113 + }, + { + "auxiliary_loss_clip": 0.01119175, + "auxiliary_loss_mlp": 0.0111679, + "balance_loss_clip": 1.00195539, + "balance_loss_mlp": 1.00063288, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.8013726196456996, + "language_loss": 0.65691674, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67927635, + "num_input_tokens_seen": 158828320, + "step": 7407, + "time_per_iteration": 2.710965871810913 + }, + { + "auxiliary_loss_clip": 0.01136716, + "auxiliary_loss_mlp": 0.01118091, + "balance_loss_clip": 1.00220752, + "balance_loss_mlp": 1.00069416, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.5644576828510346, + "language_loss": 0.67968571, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.70223379, + "num_input_tokens_seen": 158847040, + "step": 7408, + "time_per_iteration": 2.628040075302124 + }, + { + "auxiliary_loss_clip": 0.01168327, + "auxiliary_loss_mlp": 0.01118107, + "balance_loss_clip": 1.00211477, + "balance_loss_mlp": 1.00070953, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.5363086243661472, + "language_loss": 0.71685827, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.73972261, + "num_input_tokens_seen": 158870490, + "step": 7409, + "time_per_iteration": 2.681509017944336 + }, + { + "auxiliary_loss_clip": 0.01134938, + "auxiliary_loss_mlp": 0.01118423, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00054908, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 1.7722750980831652, + "language_loss": 0.64538169, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.66791534, + "num_input_tokens_seen": 158889920, + "step": 7410, + "time_per_iteration": 2.6230251789093018 + }, + { + "auxiliary_loss_clip": 0.01137103, + "auxiliary_loss_mlp": 0.01119015, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00075924, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 1.9911216516525256, + "language_loss": 0.74146956, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76403081, + "num_input_tokens_seen": 158909580, + "step": 7411, + "time_per_iteration": 4.007925987243652 + }, + { + "auxiliary_loss_clip": 0.01086748, + "auxiliary_loss_mlp": 0.01117854, + "balance_loss_clip": 1.00179875, + "balance_loss_mlp": 1.00074315, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.7763066298191525, + "language_loss": 0.78827035, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81031632, + "num_input_tokens_seen": 158924600, + "step": 7412, + "time_per_iteration": 2.702233076095581 + }, + { + "auxiliary_loss_clip": 0.01137276, + "auxiliary_loss_mlp": 0.01118731, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00057101, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 2.0301245366891005, + "language_loss": 0.80552161, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82808167, + "num_input_tokens_seen": 158939345, + "step": 7413, + "time_per_iteration": 2.5675926208496094 + }, + { + "auxiliary_loss_clip": 0.01151774, + "auxiliary_loss_mlp": 0.01117482, + "balance_loss_clip": 1.00207686, + "balance_loss_mlp": 1.00046563, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 2.0837558662174938, + "language_loss": 0.76021534, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.7829079, + "num_input_tokens_seen": 158955855, + "step": 7414, + "time_per_iteration": 2.561753034591675 + }, + { + "auxiliary_loss_clip": 0.0113624, + "auxiliary_loss_mlp": 0.01117937, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00063467, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.7626142477895181, + "language_loss": 0.83332145, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85586315, + "num_input_tokens_seen": 158976315, + "step": 7415, + "time_per_iteration": 4.06007194519043 + }, + { + "auxiliary_loss_clip": 0.0116815, + "auxiliary_loss_mlp": 0.01117727, + "balance_loss_clip": 1.00200796, + "balance_loss_mlp": 1.00080657, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.527961171169649, + "language_loss": 0.83944958, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86230838, + "num_input_tokens_seen": 158996725, + "step": 7416, + "time_per_iteration": 3.9724273681640625 + }, + { + "auxiliary_loss_clip": 0.01121572, + "auxiliary_loss_mlp": 0.01118379, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00079107, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.7741083027979356, + "language_loss": 0.81129014, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83368969, + "num_input_tokens_seen": 159017255, + "step": 7417, + "time_per_iteration": 2.6748626232147217 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01117896, + "balance_loss_clip": 1.00197339, + "balance_loss_mlp": 1.00097573, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 1.9281503765179309, + "language_loss": 0.80774426, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83029068, + "num_input_tokens_seen": 159035010, + "step": 7418, + "time_per_iteration": 3.988320827484131 + }, + { + "auxiliary_loss_clip": 0.01152799, + "auxiliary_loss_mlp": 0.0074748, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00028801, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.626310361395724, + "language_loss": 0.77197158, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79097444, + "num_input_tokens_seen": 159055345, + "step": 7419, + "time_per_iteration": 2.588872194290161 + }, + { + "auxiliary_loss_clip": 0.01121754, + "auxiliary_loss_mlp": 0.01118728, + "balance_loss_clip": 1.00193882, + "balance_loss_mlp": 1.0006634, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.5615379924329174, + "language_loss": 0.72072458, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74312937, + "num_input_tokens_seen": 159074225, + "step": 7420, + "time_per_iteration": 2.6105198860168457 + }, + { + "auxiliary_loss_clip": 0.01137842, + "auxiliary_loss_mlp": 0.01117473, + "balance_loss_clip": 1.00201559, + "balance_loss_mlp": 1.00055289, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.7875754508563861, + "language_loss": 0.75212556, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77467871, + "num_input_tokens_seen": 159095415, + "step": 7421, + "time_per_iteration": 2.6511800289154053 + }, + { + "auxiliary_loss_clip": 0.01089564, + "auxiliary_loss_mlp": 0.01117625, + "balance_loss_clip": 1.00195467, + "balance_loss_mlp": 1.00089526, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.9408731511670025, + "language_loss": 0.7606647, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78273666, + "num_input_tokens_seen": 159114615, + "step": 7422, + "time_per_iteration": 2.728522777557373 + }, + { + "auxiliary_loss_clip": 0.01168269, + "auxiliary_loss_mlp": 0.01119152, + "balance_loss_clip": 1.00207949, + "balance_loss_mlp": 1.00080085, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.440973978279129, + "language_loss": 0.65466309, + "learning_rate": 2.441298322143784e-06, + "loss": 0.6775372, + "num_input_tokens_seen": 159134370, + "step": 7423, + "time_per_iteration": 2.569880723953247 + }, + { + "auxiliary_loss_clip": 0.01134693, + "auxiliary_loss_mlp": 0.01117253, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00061846, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.5200233006024781, + "language_loss": 0.78976494, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81228441, + "num_input_tokens_seen": 159152540, + "step": 7424, + "time_per_iteration": 2.5773191452026367 + }, + { + "auxiliary_loss_clip": 0.01151248, + "auxiliary_loss_mlp": 0.01116493, + "balance_loss_clip": 1.00204968, + "balance_loss_mlp": 1.00052619, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.4292432055827973, + "language_loss": 0.80612481, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82880223, + "num_input_tokens_seen": 159173425, + "step": 7425, + "time_per_iteration": 2.6145741939544678 + }, + { + "auxiliary_loss_clip": 0.01151729, + "auxiliary_loss_mlp": 0.01117473, + "balance_loss_clip": 1.00204337, + "balance_loss_mlp": 1.00055218, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 1.6399183159821538, + "language_loss": 0.77380013, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79649222, + "num_input_tokens_seen": 159191210, + "step": 7426, + "time_per_iteration": 2.5582168102264404 + }, + { + "auxiliary_loss_clip": 0.01135302, + "auxiliary_loss_mlp": 0.00747437, + "balance_loss_clip": 1.00189888, + "balance_loss_mlp": 1.00027108, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.8135804117869405, + "language_loss": 0.64755362, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66638094, + "num_input_tokens_seen": 159211755, + "step": 7427, + "time_per_iteration": 2.6705219745635986 + }, + { + "auxiliary_loss_clip": 0.01158142, + "auxiliary_loss_mlp": 0.01118024, + "balance_loss_clip": 1.00258303, + "balance_loss_mlp": 1.00062609, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.804038342890239, + "language_loss": 0.75427371, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77703542, + "num_input_tokens_seen": 159230315, + "step": 7428, + "time_per_iteration": 2.5736443996429443 + }, + { + "auxiliary_loss_clip": 0.01137046, + "auxiliary_loss_mlp": 0.01117769, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.0006578, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.7347036242104032, + "language_loss": 0.77478814, + "learning_rate": 2.439018845165806e-06, + "loss": 0.79733634, + "num_input_tokens_seen": 159249810, + "step": 7429, + "time_per_iteration": 2.6082534790039062 + }, + { + "auxiliary_loss_clip": 0.01152209, + "auxiliary_loss_mlp": 0.01118332, + "balance_loss_clip": 1.00217867, + "balance_loss_mlp": 1.00064838, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 2.3233009831796974, + "language_loss": 0.91289973, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93560517, + "num_input_tokens_seen": 159271715, + "step": 7430, + "time_per_iteration": 2.608043670654297 + }, + { + "auxiliary_loss_clip": 0.01134992, + "auxiliary_loss_mlp": 0.0074757, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.0002929, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 2.1733956360624846, + "language_loss": 0.79784095, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.8166666, + "num_input_tokens_seen": 159290690, + "step": 7431, + "time_per_iteration": 2.617955446243286 + }, + { + "auxiliary_loss_clip": 0.01136034, + "auxiliary_loss_mlp": 0.01118261, + "balance_loss_clip": 1.00208461, + "balance_loss_mlp": 1.00057769, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 1.8134859166284514, + "language_loss": 0.79575908, + "learning_rate": 2.437878881739204e-06, + "loss": 0.81830204, + "num_input_tokens_seen": 159309400, + "step": 7432, + "time_per_iteration": 2.5792133808135986 + }, + { + "auxiliary_loss_clip": 0.01118363, + "auxiliary_loss_mlp": 0.01118003, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00070119, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 1.8252820637973977, + "language_loss": 0.77058107, + "learning_rate": 2.437498860702301e-06, + "loss": 0.79294467, + "num_input_tokens_seen": 159327425, + "step": 7433, + "time_per_iteration": 2.735426664352417 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01117094, + "balance_loss_clip": 1.00193214, + "balance_loss_mlp": 1.00065053, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.8030234241420142, + "language_loss": 0.77536243, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79804587, + "num_input_tokens_seen": 159345805, + "step": 7434, + "time_per_iteration": 2.6579127311706543 + }, + { + "auxiliary_loss_clip": 0.0115187, + "auxiliary_loss_mlp": 0.01118394, + "balance_loss_clip": 1.00211608, + "balance_loss_mlp": 1.00061548, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.829778778748331, + "language_loss": 0.6463306, + "learning_rate": 2.436738768872905e-06, + "loss": 0.66903329, + "num_input_tokens_seen": 159364595, + "step": 7435, + "time_per_iteration": 2.582261085510254 + }, + { + "auxiliary_loss_clip": 0.01136765, + "auxiliary_loss_mlp": 0.01117983, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00058627, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.6053256231172779, + "language_loss": 0.83395755, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.85650504, + "num_input_tokens_seen": 159385265, + "step": 7436, + "time_per_iteration": 2.6319329738616943 + }, + { + "auxiliary_loss_clip": 0.01107058, + "auxiliary_loss_mlp": 0.01119398, + "balance_loss_clip": 1.00206554, + "balance_loss_mlp": 1.00057018, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.5895343631502585, + "language_loss": 0.79613036, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81839496, + "num_input_tokens_seen": 159405080, + "step": 7437, + "time_per_iteration": 2.6975510120391846 + }, + { + "auxiliary_loss_clip": 0.01101428, + "auxiliary_loss_mlp": 0.01118422, + "balance_loss_clip": 1.00175738, + "balance_loss_mlp": 1.00064361, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.664086319926087, + "language_loss": 0.71792984, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74012834, + "num_input_tokens_seen": 159424595, + "step": 7438, + "time_per_iteration": 2.7165369987487793 + }, + { + "auxiliary_loss_clip": 0.01110758, + "auxiliary_loss_mlp": 0.01118706, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00064158, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.69820583061643, + "language_loss": 0.6725598, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69485438, + "num_input_tokens_seen": 159443865, + "step": 7439, + "time_per_iteration": 2.7527334690093994 + }, + { + "auxiliary_loss_clip": 0.01138616, + "auxiliary_loss_mlp": 0.01119371, + "balance_loss_clip": 1.00217986, + "balance_loss_mlp": 1.00082898, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.5751996808019704, + "language_loss": 0.73914331, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.76172316, + "num_input_tokens_seen": 159464525, + "step": 7440, + "time_per_iteration": 2.6325178146362305 + }, + { + "auxiliary_loss_clip": 0.01105522, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00065565, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 2.6707831510114413, + "language_loss": 0.73688066, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.75911832, + "num_input_tokens_seen": 159486385, + "step": 7441, + "time_per_iteration": 2.7460060119628906 + }, + { + "auxiliary_loss_clip": 0.01119507, + "auxiliary_loss_mlp": 0.01119348, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00080657, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 1.7982536991616591, + "language_loss": 0.74896479, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.77135336, + "num_input_tokens_seen": 159503880, + "step": 7442, + "time_per_iteration": 2.6679155826568604 + }, + { + "auxiliary_loss_clip": 0.0116841, + "auxiliary_loss_mlp": 0.01118692, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.00062692, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.6887135289862125, + "language_loss": 0.74375552, + "learning_rate": 2.433697740261273e-06, + "loss": 0.7666266, + "num_input_tokens_seen": 159522980, + "step": 7443, + "time_per_iteration": 2.6110076904296875 + }, + { + "auxiliary_loss_clip": 0.01138447, + "auxiliary_loss_mlp": 0.0111858, + "balance_loss_clip": 1.00207973, + "balance_loss_mlp": 1.00070548, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.5642076682050206, + "language_loss": 0.7756151, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79818535, + "num_input_tokens_seen": 159543340, + "step": 7444, + "time_per_iteration": 2.6019155979156494 + }, + { + "auxiliary_loss_clip": 0.01152022, + "auxiliary_loss_mlp": 0.01118621, + "balance_loss_clip": 1.00214696, + "balance_loss_mlp": 1.00055659, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 5.6604464352178665, + "language_loss": 0.84685111, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.86955756, + "num_input_tokens_seen": 159558210, + "step": 7445, + "time_per_iteration": 2.619281530380249 + }, + { + "auxiliary_loss_clip": 0.0112162, + "auxiliary_loss_mlp": 0.0111953, + "balance_loss_clip": 1.00209689, + "balance_loss_mlp": 1.00060678, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 2.051909576723001, + "language_loss": 0.64097941, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66339087, + "num_input_tokens_seen": 159577920, + "step": 7446, + "time_per_iteration": 2.66144061088562 + }, + { + "auxiliary_loss_clip": 0.01147434, + "auxiliary_loss_mlp": 0.01098606, + "balance_loss_clip": 1.00196612, + "balance_loss_mlp": 1.00018692, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7352730349736318, + "language_loss": 0.50241143, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52487177, + "num_input_tokens_seen": 159632295, + "step": 7447, + "time_per_iteration": 2.994046211242676 + }, + { + "auxiliary_loss_clip": 0.01164042, + "auxiliary_loss_mlp": 0.0109785, + "balance_loss_clip": 1.00201678, + "balance_loss_mlp": 1.00019383, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7747594701816676, + "language_loss": 0.59348863, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61610758, + "num_input_tokens_seen": 159698435, + "step": 7448, + "time_per_iteration": 4.514643669128418 + }, + { + "auxiliary_loss_clip": 0.01119959, + "auxiliary_loss_mlp": 0.01117704, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00078368, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.568355430537893, + "language_loss": 0.5915302, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61390674, + "num_input_tokens_seen": 159722150, + "step": 7449, + "time_per_iteration": 2.87722110748291 + }, + { + "auxiliary_loss_clip": 0.01136767, + "auxiliary_loss_mlp": 0.0111888, + "balance_loss_clip": 1.00215042, + "balance_loss_mlp": 1.0006249, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 1.8188673043010404, + "language_loss": 0.8013531, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.82390952, + "num_input_tokens_seen": 159740550, + "step": 7450, + "time_per_iteration": 2.6052868366241455 + }, + { + "auxiliary_loss_clip": 0.01168444, + "auxiliary_loss_mlp": 0.01118345, + "balance_loss_clip": 1.00214887, + "balance_loss_mlp": 1.00085258, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.311992880546761, + "language_loss": 0.79656541, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81943327, + "num_input_tokens_seen": 159758245, + "step": 7451, + "time_per_iteration": 2.509850025177002 + }, + { + "auxiliary_loss_clip": 0.01099035, + "auxiliary_loss_mlp": 0.01097801, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.0001452, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.8314397203991434, + "language_loss": 0.62808621, + "learning_rate": 2.430275325332681e-06, + "loss": 0.65005457, + "num_input_tokens_seen": 159826790, + "step": 7452, + "time_per_iteration": 4.817967414855957 + }, + { + "auxiliary_loss_clip": 0.01168343, + "auxiliary_loss_mlp": 0.01118559, + "balance_loss_clip": 1.00217247, + "balance_loss_mlp": 1.00068486, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.8153473613143538, + "language_loss": 0.62170839, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64457738, + "num_input_tokens_seen": 159845805, + "step": 7453, + "time_per_iteration": 3.8621621131896973 + }, + { + "auxiliary_loss_clip": 0.01153852, + "auxiliary_loss_mlp": 0.01097842, + "balance_loss_clip": 1.00216126, + "balance_loss_mlp": 1.00018585, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.769969845278583, + "language_loss": 0.57101578, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59353262, + "num_input_tokens_seen": 159898860, + "step": 7454, + "time_per_iteration": 3.0199365615844727 + }, + { + "auxiliary_loss_clip": 0.0113778, + "auxiliary_loss_mlp": 0.01118433, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00055909, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.0313183496264147, + "language_loss": 0.75030601, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7728681, + "num_input_tokens_seen": 159911555, + "step": 7455, + "time_per_iteration": 2.554868698120117 + }, + { + "auxiliary_loss_clip": 0.01135158, + "auxiliary_loss_mlp": 0.01118178, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00068533, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.5907256177551408, + "language_loss": 0.75799626, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78052962, + "num_input_tokens_seen": 159931470, + "step": 7456, + "time_per_iteration": 4.072336673736572 + }, + { + "auxiliary_loss_clip": 0.01168397, + "auxiliary_loss_mlp": 0.01118026, + "balance_loss_clip": 1.00232089, + "balance_loss_mlp": 1.00062895, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 3.1806496606247476, + "language_loss": 0.76295006, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78581429, + "num_input_tokens_seen": 159946115, + "step": 7457, + "time_per_iteration": 2.551372766494751 + }, + { + "auxiliary_loss_clip": 0.01153443, + "auxiliary_loss_mlp": 0.01118952, + "balance_loss_clip": 1.00221241, + "balance_loss_mlp": 1.00060129, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 36.9048034071992, + "language_loss": 0.68058383, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.70330775, + "num_input_tokens_seen": 159963915, + "step": 7458, + "time_per_iteration": 2.5512192249298096 + }, + { + "auxiliary_loss_clip": 0.01119857, + "auxiliary_loss_mlp": 0.01118862, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00060606, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.5931374226234334, + "language_loss": 0.71739542, + "learning_rate": 2.427612532815961e-06, + "loss": 0.73978263, + "num_input_tokens_seen": 159982140, + "step": 7459, + "time_per_iteration": 2.6333489418029785 + }, + { + "auxiliary_loss_clip": 0.01153499, + "auxiliary_loss_mlp": 0.01117821, + "balance_loss_clip": 1.00214708, + "balance_loss_mlp": 1.00071025, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.5878979157013615, + "language_loss": 0.69497716, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71769035, + "num_input_tokens_seen": 160002280, + "step": 7460, + "time_per_iteration": 2.567542791366577 + }, + { + "auxiliary_loss_clip": 0.01168383, + "auxiliary_loss_mlp": 0.01118032, + "balance_loss_clip": 1.00211418, + "balance_loss_mlp": 1.00063503, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 2.3884537872394542, + "language_loss": 0.7713055, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79416966, + "num_input_tokens_seen": 160020260, + "step": 7461, + "time_per_iteration": 2.4983584880828857 + }, + { + "auxiliary_loss_clip": 0.01168442, + "auxiliary_loss_mlp": 0.01118481, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.00070214, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.9749545851874815, + "language_loss": 0.67770362, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70057291, + "num_input_tokens_seen": 160040240, + "step": 7462, + "time_per_iteration": 2.5467793941497803 + }, + { + "auxiliary_loss_clip": 0.01163844, + "auxiliary_loss_mlp": 0.01097319, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00004435, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7464158341350315, + "language_loss": 0.5441637, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56677544, + "num_input_tokens_seen": 160093865, + "step": 7463, + "time_per_iteration": 3.1010730266571045 + }, + { + "auxiliary_loss_clip": 0.01152853, + "auxiliary_loss_mlp": 0.01117574, + "balance_loss_clip": 1.0021317, + "balance_loss_mlp": 1.00065351, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 3.0755530798061033, + "language_loss": 0.7580598, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.7807641, + "num_input_tokens_seen": 160113590, + "step": 7464, + "time_per_iteration": 2.619180679321289 + }, + { + "auxiliary_loss_clip": 0.01151603, + "auxiliary_loss_mlp": 0.01118208, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.00062048, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.8255560449788677, + "language_loss": 0.7403096, + "learning_rate": 2.425329506653441e-06, + "loss": 0.76300776, + "num_input_tokens_seen": 160131795, + "step": 7465, + "time_per_iteration": 2.5462710857391357 + }, + { + "auxiliary_loss_clip": 0.01141516, + "auxiliary_loss_mlp": 0.01118997, + "balance_loss_clip": 1.00244021, + "balance_loss_mlp": 1.00055075, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 1.8303474990471147, + "language_loss": 0.79945624, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82206142, + "num_input_tokens_seen": 160150635, + "step": 7466, + "time_per_iteration": 2.659351110458374 + }, + { + "auxiliary_loss_clip": 0.01136347, + "auxiliary_loss_mlp": 0.0111831, + "balance_loss_clip": 1.00212264, + "balance_loss_mlp": 1.0005312, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 3.252685223815305, + "language_loss": 0.80364263, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82618916, + "num_input_tokens_seen": 160168615, + "step": 7467, + "time_per_iteration": 2.5900990962982178 + }, + { + "auxiliary_loss_clip": 0.01101637, + "auxiliary_loss_mlp": 0.01117486, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00075603, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.6625623466949346, + "language_loss": 0.74981356, + "learning_rate": 2.424187775642129e-06, + "loss": 0.77200472, + "num_input_tokens_seen": 160187295, + "step": 7468, + "time_per_iteration": 2.6791117191314697 + }, + { + "auxiliary_loss_clip": 0.01137022, + "auxiliary_loss_mlp": 0.01117551, + "balance_loss_clip": 1.00212753, + "balance_loss_mlp": 1.00053537, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 2.359635057596549, + "language_loss": 0.70543641, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.7279821, + "num_input_tokens_seen": 160205115, + "step": 7469, + "time_per_iteration": 2.63907527923584 + }, + { + "auxiliary_loss_clip": 0.01152023, + "auxiliary_loss_mlp": 0.01118741, + "balance_loss_clip": 1.00210094, + "balance_loss_mlp": 1.00067639, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 2.2022127846470396, + "language_loss": 0.71711653, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.73982418, + "num_input_tokens_seen": 160222580, + "step": 7470, + "time_per_iteration": 2.5517067909240723 + }, + { + "auxiliary_loss_clip": 0.01122065, + "auxiliary_loss_mlp": 0.01118119, + "balance_loss_clip": 1.00182235, + "balance_loss_mlp": 1.00072122, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 1.7276015581202637, + "language_loss": 0.77034754, + "learning_rate": 2.423045899863634e-06, + "loss": 0.7927494, + "num_input_tokens_seen": 160241520, + "step": 7471, + "time_per_iteration": 2.6418421268463135 + }, + { + "auxiliary_loss_clip": 0.01168291, + "auxiliary_loss_mlp": 0.01118456, + "balance_loss_clip": 1.0022397, + "balance_loss_mlp": 1.00077319, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.6945914099458803, + "language_loss": 0.6987713, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72163874, + "num_input_tokens_seen": 160261815, + "step": 7472, + "time_per_iteration": 2.6060009002685547 + }, + { + "auxiliary_loss_clip": 0.01149039, + "auxiliary_loss_mlp": 0.01096985, + "balance_loss_clip": 1.00173581, + "balance_loss_mlp": 1.00009215, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7377978020092978, + "language_loss": 0.61671668, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63917696, + "num_input_tokens_seen": 160317070, + "step": 7473, + "time_per_iteration": 3.109046220779419 + }, + { + "auxiliary_loss_clip": 0.01168291, + "auxiliary_loss_mlp": 0.00747458, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00029349, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 2.083044129777701, + "language_loss": 0.77376252, + "learning_rate": 2.421903879707657e-06, + "loss": 0.79291999, + "num_input_tokens_seen": 160334980, + "step": 7474, + "time_per_iteration": 2.482311487197876 + }, + { + "auxiliary_loss_clip": 0.01110993, + "auxiliary_loss_mlp": 0.01117279, + "balance_loss_clip": 1.00226521, + "balance_loss_mlp": 1.00064421, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.7891958844534006, + "language_loss": 0.72078037, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74306309, + "num_input_tokens_seen": 160354500, + "step": 7475, + "time_per_iteration": 2.6764116287231445 + }, + { + "auxiliary_loss_clip": 0.01104475, + "auxiliary_loss_mlp": 0.01117915, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00051737, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 2.3240706384879024, + "language_loss": 0.76725459, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.78947848, + "num_input_tokens_seen": 160373650, + "step": 7476, + "time_per_iteration": 2.7035419940948486 + }, + { + "auxiliary_loss_clip": 0.01153073, + "auxiliary_loss_mlp": 0.00747408, + "balance_loss_clip": 1.00212765, + "balance_loss_mlp": 1.00030077, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.244105702829505, + "language_loss": 0.71756816, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73657292, + "num_input_tokens_seen": 160393430, + "step": 7477, + "time_per_iteration": 2.5828609466552734 + }, + { + "auxiliary_loss_clip": 0.01138182, + "auxiliary_loss_mlp": 0.01118571, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00060189, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 3.8461327913173147, + "language_loss": 0.67881846, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70138597, + "num_input_tokens_seen": 160410545, + "step": 7478, + "time_per_iteration": 2.591944932937622 + }, + { + "auxiliary_loss_clip": 0.0113614, + "auxiliary_loss_mlp": 0.01116845, + "balance_loss_clip": 1.00207317, + "balance_loss_mlp": 1.00068772, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 2.003016735418087, + "language_loss": 0.89657557, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91910541, + "num_input_tokens_seen": 160428105, + "step": 7479, + "time_per_iteration": 2.572315216064453 + }, + { + "auxiliary_loss_clip": 0.01093306, + "auxiliary_loss_mlp": 0.01118049, + "balance_loss_clip": 1.00233757, + "balance_loss_mlp": 1.00074744, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 2.3793627111262596, + "language_loss": 0.75544584, + "learning_rate": 2.419619407822302e-06, + "loss": 0.7775594, + "num_input_tokens_seen": 160448815, + "step": 7480, + "time_per_iteration": 2.782149076461792 + }, + { + "auxiliary_loss_clip": 0.01121174, + "auxiliary_loss_mlp": 0.01118699, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00063396, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.200607806413051, + "language_loss": 0.79769361, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82009232, + "num_input_tokens_seen": 160465940, + "step": 7481, + "time_per_iteration": 2.6418261528015137 + }, + { + "auxiliary_loss_clip": 0.01136626, + "auxiliary_loss_mlp": 0.01117395, + "balance_loss_clip": 1.00217462, + "balance_loss_mlp": 1.0005697, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.6545706331106258, + "language_loss": 0.68584627, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70838648, + "num_input_tokens_seen": 160486710, + "step": 7482, + "time_per_iteration": 2.701643228530884 + }, + { + "auxiliary_loss_clip": 0.01151793, + "auxiliary_loss_mlp": 0.01119074, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00072348, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.9249693867675846, + "language_loss": 0.85065436, + "learning_rate": 2.418476956872571e-06, + "loss": 0.87336302, + "num_input_tokens_seen": 160503405, + "step": 7483, + "time_per_iteration": 2.533176898956299 + }, + { + "auxiliary_loss_clip": 0.0113843, + "auxiliary_loss_mlp": 0.01118526, + "balance_loss_clip": 1.0022229, + "balance_loss_mlp": 1.00084269, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.630690944992477, + "language_loss": 0.8047812, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82735074, + "num_input_tokens_seen": 160525080, + "step": 7484, + "time_per_iteration": 2.671434164047241 + }, + { + "auxiliary_loss_clip": 0.01105892, + "auxiliary_loss_mlp": 0.01119005, + "balance_loss_clip": 1.00212419, + "balance_loss_mlp": 1.00046337, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 3.5495288216618555, + "language_loss": 0.74213207, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.76438105, + "num_input_tokens_seen": 160540895, + "step": 7485, + "time_per_iteration": 2.692058563232422 + }, + { + "auxiliary_loss_clip": 0.01133574, + "auxiliary_loss_mlp": 0.0109727, + "balance_loss_clip": 1.00167084, + "balance_loss_mlp": 0.99999565, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7848182751560203, + "language_loss": 0.58611989, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.6084283, + "num_input_tokens_seen": 160598270, + "step": 7486, + "time_per_iteration": 4.670121192932129 + }, + { + "auxiliary_loss_clip": 0.01153407, + "auxiliary_loss_mlp": 0.01118476, + "balance_loss_clip": 1.00225413, + "balance_loss_mlp": 1.00060225, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.558062399334985, + "language_loss": 0.83756232, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.86028111, + "num_input_tokens_seen": 160614720, + "step": 7487, + "time_per_iteration": 2.596984624862671 + }, + { + "auxiliary_loss_clip": 0.01168199, + "auxiliary_loss_mlp": 0.01117274, + "balance_loss_clip": 1.00213373, + "balance_loss_mlp": 1.00073516, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.7090093489441214, + "language_loss": 0.77126318, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79411793, + "num_input_tokens_seen": 160635170, + "step": 7488, + "time_per_iteration": 2.5744590759277344 + }, + { + "auxiliary_loss_clip": 0.01153753, + "auxiliary_loss_mlp": 0.01119522, + "balance_loss_clip": 1.00222945, + "balance_loss_mlp": 1.00078905, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 1.8854243262926198, + "language_loss": 0.71826935, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.74100214, + "num_input_tokens_seen": 160654490, + "step": 7489, + "time_per_iteration": 2.6133029460906982 + }, + { + "auxiliary_loss_clip": 0.01137763, + "auxiliary_loss_mlp": 0.01118936, + "balance_loss_clip": 1.00225914, + "balance_loss_mlp": 1.00067997, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.319794597086377, + "language_loss": 0.69658273, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71914971, + "num_input_tokens_seen": 160669400, + "step": 7490, + "time_per_iteration": 2.5696609020233154 + }, + { + "auxiliary_loss_clip": 0.01132727, + "auxiliary_loss_mlp": 0.01097891, + "balance_loss_clip": 1.00179923, + "balance_loss_mlp": 1.00023484, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7885377120954948, + "language_loss": 0.56765062, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58995682, + "num_input_tokens_seen": 160733820, + "step": 7491, + "time_per_iteration": 5.915446519851685 + }, + { + "auxiliary_loss_clip": 0.01151641, + "auxiliary_loss_mlp": 0.01118013, + "balance_loss_clip": 1.00212598, + "balance_loss_mlp": 1.00071061, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 2.156486433737275, + "language_loss": 0.79405463, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81675118, + "num_input_tokens_seen": 160753175, + "step": 7492, + "time_per_iteration": 2.6148903369903564 + }, + { + "auxiliary_loss_clip": 0.01121815, + "auxiliary_loss_mlp": 0.00747448, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.00031137, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.1897739119196276, + "language_loss": 0.93101752, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.94971019, + "num_input_tokens_seen": 160768310, + "step": 7493, + "time_per_iteration": 3.966110944747925 + }, + { + "auxiliary_loss_clip": 0.01147388, + "auxiliary_loss_mlp": 0.01097411, + "balance_loss_clip": 1.00167942, + "balance_loss_mlp": 1.00013626, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.9062778033896066, + "language_loss": 0.62949169, + "learning_rate": 2.4142867511336e-06, + "loss": 0.65193975, + "num_input_tokens_seen": 160827370, + "step": 7494, + "time_per_iteration": 3.160907745361328 + }, + { + "auxiliary_loss_clip": 0.01168301, + "auxiliary_loss_mlp": 0.01117273, + "balance_loss_clip": 1.00222492, + "balance_loss_mlp": 1.00073361, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.5169017589567368, + "language_loss": 0.81996703, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84282279, + "num_input_tokens_seen": 160849140, + "step": 7495, + "time_per_iteration": 2.5390639305114746 + }, + { + "auxiliary_loss_clip": 0.01153303, + "auxiliary_loss_mlp": 0.01118931, + "balance_loss_clip": 1.00227666, + "balance_loss_mlp": 1.00067544, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.9637983193900264, + "language_loss": 0.85603881, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87876111, + "num_input_tokens_seen": 160871280, + "step": 7496, + "time_per_iteration": 2.7005910873413086 + }, + { + "auxiliary_loss_clip": 0.01168364, + "auxiliary_loss_mlp": 0.01117889, + "balance_loss_clip": 1.00222194, + "balance_loss_mlp": 1.00068212, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 3.6594383429213013, + "language_loss": 0.76204485, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.7849074, + "num_input_tokens_seen": 160888625, + "step": 7497, + "time_per_iteration": 2.514509677886963 + }, + { + "auxiliary_loss_clip": 0.01120332, + "auxiliary_loss_mlp": 0.01118549, + "balance_loss_clip": 1.00199831, + "balance_loss_mlp": 1.00067472, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 2.6013526663285464, + "language_loss": 0.75408155, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77647042, + "num_input_tokens_seen": 160907040, + "step": 7498, + "time_per_iteration": 2.621237277984619 + }, + { + "auxiliary_loss_clip": 0.01168527, + "auxiliary_loss_mlp": 0.01118156, + "balance_loss_clip": 1.00227308, + "balance_loss_mlp": 1.00066376, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 1.933555984071475, + "language_loss": 0.70252043, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72538722, + "num_input_tokens_seen": 160927115, + "step": 7499, + "time_per_iteration": 2.517880439758301 + }, + { + "auxiliary_loss_clip": 0.01104596, + "auxiliary_loss_mlp": 0.01118136, + "balance_loss_clip": 1.00197756, + "balance_loss_mlp": 1.00064373, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 2.065264130172462, + "language_loss": 0.77075404, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79298133, + "num_input_tokens_seen": 160944405, + "step": 7500, + "time_per_iteration": 2.662466049194336 + }, + { + "auxiliary_loss_clip": 0.01103661, + "auxiliary_loss_mlp": 0.01118075, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00058198, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.7584207976883237, + "language_loss": 0.62351429, + "learning_rate": 2.411619265641992e-06, + "loss": 0.64573157, + "num_input_tokens_seen": 160961345, + "step": 7501, + "time_per_iteration": 2.666297197341919 + }, + { + "auxiliary_loss_clip": 0.01168507, + "auxiliary_loss_mlp": 0.01119163, + "balance_loss_clip": 1.00222588, + "balance_loss_mlp": 1.00071621, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 1.8912705369492206, + "language_loss": 0.84395456, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86683124, + "num_input_tokens_seen": 160977330, + "step": 7502, + "time_per_iteration": 2.4795889854431152 + }, + { + "auxiliary_loss_clip": 0.01134546, + "auxiliary_loss_mlp": 0.01117235, + "balance_loss_clip": 1.0021224, + "balance_loss_mlp": 1.00079131, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.337273265055739, + "language_loss": 0.79436648, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81688428, + "num_input_tokens_seen": 160997280, + "step": 7503, + "time_per_iteration": 2.5866761207580566 + }, + { + "auxiliary_loss_clip": 0.01141306, + "auxiliary_loss_mlp": 0.01118258, + "balance_loss_clip": 1.00237441, + "balance_loss_mlp": 1.00067043, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 2.011525958212853, + "language_loss": 0.81060547, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83320117, + "num_input_tokens_seen": 161014235, + "step": 7504, + "time_per_iteration": 2.570248603820801 + }, + { + "auxiliary_loss_clip": 0.01126414, + "auxiliary_loss_mlp": 0.01117575, + "balance_loss_clip": 1.00219417, + "balance_loss_mlp": 1.0007503, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 2.153536935758408, + "language_loss": 0.63282061, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.65526056, + "num_input_tokens_seen": 161032360, + "step": 7505, + "time_per_iteration": 2.647193431854248 + }, + { + "auxiliary_loss_clip": 0.01115541, + "auxiliary_loss_mlp": 0.01097937, + "balance_loss_clip": 1.00165534, + "balance_loss_mlp": 1.00028062, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8327153809013448, + "language_loss": 0.58906996, + "learning_rate": 2.409713450313968e-06, + "loss": 0.61120474, + "num_input_tokens_seen": 161091360, + "step": 7506, + "time_per_iteration": 3.2485568523406982 + }, + { + "auxiliary_loss_clip": 0.01103007, + "auxiliary_loss_mlp": 0.0111781, + "balance_loss_clip": 1.00202799, + "balance_loss_mlp": 1.00079417, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 2.649038823549898, + "language_loss": 0.79152417, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81373227, + "num_input_tokens_seen": 161110825, + "step": 7507, + "time_per_iteration": 2.690887451171875 + }, + { + "auxiliary_loss_clip": 0.01120255, + "auxiliary_loss_mlp": 0.01118055, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.0006572, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.576414625173907, + "language_loss": 0.74070841, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76309156, + "num_input_tokens_seen": 161130685, + "step": 7508, + "time_per_iteration": 2.7018399238586426 + }, + { + "auxiliary_loss_clip": 0.01151683, + "auxiliary_loss_mlp": 0.01117455, + "balance_loss_clip": 1.00219285, + "balance_loss_mlp": 1.00072551, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.8573031784432381, + "language_loss": 0.79104912, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81374049, + "num_input_tokens_seen": 161147555, + "step": 7509, + "time_per_iteration": 2.588392972946167 + }, + { + "auxiliary_loss_clip": 0.01168369, + "auxiliary_loss_mlp": 0.0111751, + "balance_loss_clip": 1.00227785, + "balance_loss_mlp": 1.00078034, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.937459302430831, + "language_loss": 0.72892761, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75178647, + "num_input_tokens_seen": 161166255, + "step": 7510, + "time_per_iteration": 2.6304845809936523 + }, + { + "auxiliary_loss_clip": 0.01168385, + "auxiliary_loss_mlp": 0.01118008, + "balance_loss_clip": 1.00218225, + "balance_loss_mlp": 1.00051582, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 2.006758500125429, + "language_loss": 0.77053291, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79339683, + "num_input_tokens_seen": 161184720, + "step": 7511, + "time_per_iteration": 2.550652265548706 + }, + { + "auxiliary_loss_clip": 0.01151765, + "auxiliary_loss_mlp": 0.01117968, + "balance_loss_clip": 1.0020988, + "balance_loss_mlp": 1.00085664, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.6456800056314624, + "language_loss": 0.78417891, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.8068763, + "num_input_tokens_seen": 161204360, + "step": 7512, + "time_per_iteration": 2.6370997428894043 + }, + { + "auxiliary_loss_clip": 0.01120163, + "auxiliary_loss_mlp": 0.0111861, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00064027, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 2.142536376719447, + "language_loss": 0.86856151, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89094919, + "num_input_tokens_seen": 161223575, + "step": 7513, + "time_per_iteration": 2.683250665664673 + }, + { + "auxiliary_loss_clip": 0.01151745, + "auxiliary_loss_mlp": 0.01116187, + "balance_loss_clip": 1.00228047, + "balance_loss_mlp": 1.00050592, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 2.0735428442009174, + "language_loss": 0.67828691, + "learning_rate": 2.406663338649419e-06, + "loss": 0.70096618, + "num_input_tokens_seen": 161243805, + "step": 7514, + "time_per_iteration": 2.6860387325286865 + }, + { + "auxiliary_loss_clip": 0.01152074, + "auxiliary_loss_mlp": 0.01117967, + "balance_loss_clip": 1.00216007, + "balance_loss_mlp": 1.00047445, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 1.9557499294062537, + "language_loss": 0.69488317, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71758366, + "num_input_tokens_seen": 161261450, + "step": 7515, + "time_per_iteration": 2.589613914489746 + }, + { + "auxiliary_loss_clip": 0.01153344, + "auxiliary_loss_mlp": 0.01118073, + "balance_loss_clip": 1.00222874, + "balance_loss_mlp": 1.00067544, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.3434995716023073, + "language_loss": 0.81181192, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83452612, + "num_input_tokens_seen": 161276965, + "step": 7516, + "time_per_iteration": 2.5399045944213867 + }, + { + "auxiliary_loss_clip": 0.01168432, + "auxiliary_loss_mlp": 0.01117889, + "balance_loss_clip": 1.00232983, + "balance_loss_mlp": 1.00058746, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.8581960828838116, + "language_loss": 0.66178918, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.68465245, + "num_input_tokens_seen": 161295375, + "step": 7517, + "time_per_iteration": 2.505222797393799 + }, + { + "auxiliary_loss_clip": 0.01121, + "auxiliary_loss_mlp": 0.01117668, + "balance_loss_clip": 1.00211143, + "balance_loss_mlp": 1.00065219, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.8546806402856724, + "language_loss": 0.62705231, + "learning_rate": 2.405137912257333e-06, + "loss": 0.64943904, + "num_input_tokens_seen": 161313010, + "step": 7518, + "time_per_iteration": 2.6236894130706787 + }, + { + "auxiliary_loss_clip": 0.01151778, + "auxiliary_loss_mlp": 0.01117868, + "balance_loss_clip": 1.0022465, + "balance_loss_mlp": 1.00075698, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.3663863407490442, + "language_loss": 0.59537613, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61807257, + "num_input_tokens_seen": 161336690, + "step": 7519, + "time_per_iteration": 2.802140712738037 + }, + { + "auxiliary_loss_clip": 0.01151727, + "auxiliary_loss_mlp": 0.01118386, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00089312, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.6305324527135727, + "language_loss": 0.72542989, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74813104, + "num_input_tokens_seen": 161357845, + "step": 7520, + "time_per_iteration": 2.594406843185425 + }, + { + "auxiliary_loss_clip": 0.01136786, + "auxiliary_loss_mlp": 0.01117913, + "balance_loss_clip": 1.00219762, + "balance_loss_mlp": 1.00070632, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 2.4421674433825418, + "language_loss": 0.7539258, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77647281, + "num_input_tokens_seen": 161375160, + "step": 7521, + "time_per_iteration": 2.627031087875366 + }, + { + "auxiliary_loss_clip": 0.01136107, + "auxiliary_loss_mlp": 0.01118193, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00079536, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.7238823539484813, + "language_loss": 0.68089473, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.70343775, + "num_input_tokens_seen": 161393690, + "step": 7522, + "time_per_iteration": 2.6343812942504883 + }, + { + "auxiliary_loss_clip": 0.01151626, + "auxiliary_loss_mlp": 0.01117762, + "balance_loss_clip": 1.00217819, + "balance_loss_mlp": 1.00065064, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.6051411550832695, + "language_loss": 0.60581374, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62850761, + "num_input_tokens_seen": 161415015, + "step": 7523, + "time_per_iteration": 2.6326868534088135 + }, + { + "auxiliary_loss_clip": 0.01152115, + "auxiliary_loss_mlp": 0.01118607, + "balance_loss_clip": 1.00209188, + "balance_loss_mlp": 1.00063801, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 2.2341342693613226, + "language_loss": 0.78309643, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80580366, + "num_input_tokens_seen": 161432940, + "step": 7524, + "time_per_iteration": 3.9436943531036377 + }, + { + "auxiliary_loss_clip": 0.01106674, + "auxiliary_loss_mlp": 0.01118113, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00062025, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.6303767965740825, + "language_loss": 0.6412096, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.66345745, + "num_input_tokens_seen": 161452215, + "step": 7525, + "time_per_iteration": 2.719966173171997 + }, + { + "auxiliary_loss_clip": 0.01152736, + "auxiliary_loss_mlp": 0.01117764, + "balance_loss_clip": 1.00222087, + "balance_loss_mlp": 1.00074804, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.5822925956966933, + "language_loss": 0.7862137, + "learning_rate": 2.402086322981083e-06, + "loss": 0.80891871, + "num_input_tokens_seen": 161469520, + "step": 7526, + "time_per_iteration": 2.563575029373169 + }, + { + "auxiliary_loss_clip": 0.01136579, + "auxiliary_loss_mlp": 0.01117924, + "balance_loss_clip": 1.00211084, + "balance_loss_mlp": 1.00052667, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.9252915056897302, + "language_loss": 0.80751568, + "learning_rate": 2.40170480555747e-06, + "loss": 0.83006072, + "num_input_tokens_seen": 161487335, + "step": 7527, + "time_per_iteration": 2.6451773643493652 + }, + { + "auxiliary_loss_clip": 0.01125579, + "auxiliary_loss_mlp": 0.01117663, + "balance_loss_clip": 1.00225782, + "balance_loss_mlp": 1.00045657, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.4617858670182722, + "language_loss": 0.65540016, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67783266, + "num_input_tokens_seen": 161510095, + "step": 7528, + "time_per_iteration": 5.653846502304077 + }, + { + "auxiliary_loss_clip": 0.0113847, + "auxiliary_loss_mlp": 0.01117592, + "balance_loss_clip": 1.00217438, + "balance_loss_mlp": 1.00067139, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.553373267427012, + "language_loss": 0.75140154, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.7739622, + "num_input_tokens_seen": 161528725, + "step": 7529, + "time_per_iteration": 2.6202430725097656 + }, + { + "auxiliary_loss_clip": 0.01168474, + "auxiliary_loss_mlp": 0.01117485, + "balance_loss_clip": 1.00233722, + "balance_loss_mlp": 1.00066018, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 1.831565060918566, + "language_loss": 0.72714329, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75000286, + "num_input_tokens_seen": 161547195, + "step": 7530, + "time_per_iteration": 3.891688346862793 + }, + { + "auxiliary_loss_clip": 0.01121043, + "auxiliary_loss_mlp": 0.01118438, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00065923, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.5938830157240027, + "language_loss": 0.76119792, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78359276, + "num_input_tokens_seen": 161565565, + "step": 7531, + "time_per_iteration": 2.8056952953338623 + }, + { + "auxiliary_loss_clip": 0.01168247, + "auxiliary_loss_mlp": 0.01117621, + "balance_loss_clip": 1.00220346, + "balance_loss_mlp": 1.00070107, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.4654701496970444, + "language_loss": 0.66983652, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.6926952, + "num_input_tokens_seen": 161586630, + "step": 7532, + "time_per_iteration": 2.5976715087890625 + }, + { + "auxiliary_loss_clip": 0.01151736, + "auxiliary_loss_mlp": 0.01117323, + "balance_loss_clip": 1.00221992, + "balance_loss_mlp": 1.0006882, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.626280157518129, + "language_loss": 0.78724349, + "learning_rate": 2.399415381635768e-06, + "loss": 0.80993414, + "num_input_tokens_seen": 161603815, + "step": 7533, + "time_per_iteration": 2.584240436553955 + }, + { + "auxiliary_loss_clip": 0.01120179, + "auxiliary_loss_mlp": 0.01119366, + "balance_loss_clip": 1.00203168, + "balance_loss_mlp": 1.00053787, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.8281531963835753, + "language_loss": 0.82828623, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85068166, + "num_input_tokens_seen": 161622900, + "step": 7534, + "time_per_iteration": 2.652106523513794 + }, + { + "auxiliary_loss_clip": 0.01135186, + "auxiliary_loss_mlp": 0.01118222, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00053859, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 2.4516179058410397, + "language_loss": 0.76550853, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78804255, + "num_input_tokens_seen": 161641700, + "step": 7535, + "time_per_iteration": 2.6199872493743896 + }, + { + "auxiliary_loss_clip": 0.01118139, + "auxiliary_loss_mlp": 0.01117595, + "balance_loss_clip": 1.00203741, + "balance_loss_mlp": 1.000579, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5397540511443917, + "language_loss": 0.80364054, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82599795, + "num_input_tokens_seen": 161661955, + "step": 7536, + "time_per_iteration": 2.629132032394409 + }, + { + "auxiliary_loss_clip": 0.01136988, + "auxiliary_loss_mlp": 0.01117935, + "balance_loss_clip": 1.00202394, + "balance_loss_mlp": 1.0005374, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 2.4949823470503922, + "language_loss": 0.7603749, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78292418, + "num_input_tokens_seen": 161679245, + "step": 7537, + "time_per_iteration": 2.595726490020752 + }, + { + "auxiliary_loss_clip": 0.01153565, + "auxiliary_loss_mlp": 0.0111762, + "balance_loss_clip": 1.00240612, + "balance_loss_mlp": 1.00050879, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 1.8363535385776395, + "language_loss": 0.76006997, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78278184, + "num_input_tokens_seen": 161698795, + "step": 7538, + "time_per_iteration": 2.5734715461730957 + }, + { + "auxiliary_loss_clip": 0.01148783, + "auxiliary_loss_mlp": 0.01096815, + "balance_loss_clip": 1.00156748, + "balance_loss_mlp": 0.99992162, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7872285448412929, + "language_loss": 0.62389731, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64635324, + "num_input_tokens_seen": 161761980, + "step": 7539, + "time_per_iteration": 3.203181743621826 + }, + { + "auxiliary_loss_clip": 0.0116837, + "auxiliary_loss_mlp": 0.01118433, + "balance_loss_clip": 1.00231194, + "balance_loss_mlp": 1.00084496, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.8695090703843749, + "language_loss": 0.65655494, + "learning_rate": 2.396743698142872e-06, + "loss": 0.67942297, + "num_input_tokens_seen": 161779455, + "step": 7540, + "time_per_iteration": 2.5048909187316895 + }, + { + "auxiliary_loss_clip": 0.01137753, + "auxiliary_loss_mlp": 0.0111866, + "balance_loss_clip": 1.00223517, + "balance_loss_mlp": 1.0006907, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.8998525894639544, + "language_loss": 0.84667969, + "learning_rate": 2.396361968778424e-06, + "loss": 0.86924386, + "num_input_tokens_seen": 161798980, + "step": 7541, + "time_per_iteration": 2.63073992729187 + }, + { + "auxiliary_loss_clip": 0.01136131, + "auxiliary_loss_mlp": 0.01118156, + "balance_loss_clip": 1.00212526, + "balance_loss_mlp": 1.00056791, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 2.0006820716609086, + "language_loss": 0.76646417, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78900707, + "num_input_tokens_seen": 161819745, + "step": 7542, + "time_per_iteration": 2.6986029148101807 + }, + { + "auxiliary_loss_clip": 0.0113529, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_clip": 1.0019486, + "balance_loss_mlp": 1.00046754, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.4655166195840406, + "language_loss": 0.80189359, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82442522, + "num_input_tokens_seen": 161838575, + "step": 7543, + "time_per_iteration": 2.6725261211395264 + }, + { + "auxiliary_loss_clip": 0.01153442, + "auxiliary_loss_mlp": 0.00747587, + "balance_loss_clip": 1.00224733, + "balance_loss_mlp": 1.00036788, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 3.534957017917814, + "language_loss": 0.76024741, + "learning_rate": 2.395216690562469e-06, + "loss": 0.77925766, + "num_input_tokens_seen": 161858590, + "step": 7544, + "time_per_iteration": 2.623002290725708 + }, + { + "auxiliary_loss_clip": 0.01120151, + "auxiliary_loss_mlp": 0.01118702, + "balance_loss_clip": 1.00205684, + "balance_loss_mlp": 1.00082755, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 2.601159576901033, + "language_loss": 0.75305271, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77544123, + "num_input_tokens_seen": 161878390, + "step": 7545, + "time_per_iteration": 2.7344353199005127 + }, + { + "auxiliary_loss_clip": 0.01136456, + "auxiliary_loss_mlp": 0.0111759, + "balance_loss_clip": 1.00222659, + "balance_loss_mlp": 1.00057459, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.5765670542982961, + "language_loss": 0.72213733, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74467778, + "num_input_tokens_seen": 161898610, + "step": 7546, + "time_per_iteration": 2.730027914047241 + }, + { + "auxiliary_loss_clip": 0.011359, + "auxiliary_loss_mlp": 0.01118693, + "balance_loss_clip": 1.00204396, + "balance_loss_mlp": 1.00053298, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.9057336089633359, + "language_loss": 0.75827277, + "learning_rate": 2.394071277466609e-06, + "loss": 0.7808187, + "num_input_tokens_seen": 161918210, + "step": 7547, + "time_per_iteration": 2.6698923110961914 + }, + { + "auxiliary_loss_clip": 0.01153477, + "auxiliary_loss_mlp": 0.01118029, + "balance_loss_clip": 1.00212765, + "balance_loss_mlp": 1.00063217, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 3.431517564229629, + "language_loss": 0.69796467, + "learning_rate": 2.393689443195573e-06, + "loss": 0.7206797, + "num_input_tokens_seen": 161936950, + "step": 7548, + "time_per_iteration": 2.5570507049560547 + }, + { + "auxiliary_loss_clip": 0.01168294, + "auxiliary_loss_mlp": 0.01117554, + "balance_loss_clip": 1.00218058, + "balance_loss_mlp": 1.00072885, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.149476021029459, + "language_loss": 0.72302455, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74588299, + "num_input_tokens_seen": 161955550, + "step": 7549, + "time_per_iteration": 2.5729517936706543 + }, + { + "auxiliary_loss_clip": 0.01119655, + "auxiliary_loss_mlp": 0.01117488, + "balance_loss_clip": 1.00197637, + "balance_loss_mlp": 1.00047231, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.4658826621461698, + "language_loss": 0.64953482, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67190623, + "num_input_tokens_seen": 161976760, + "step": 7550, + "time_per_iteration": 2.7134921550750732 + }, + { + "auxiliary_loss_clip": 0.01152155, + "auxiliary_loss_mlp": 0.01117414, + "balance_loss_clip": 1.00232077, + "balance_loss_mlp": 1.00068474, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.8822366777119863, + "language_loss": 0.68750066, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.71019632, + "num_input_tokens_seen": 161996120, + "step": 7551, + "time_per_iteration": 2.617478132247925 + }, + { + "auxiliary_loss_clip": 0.01153558, + "auxiliary_loss_mlp": 0.01118183, + "balance_loss_clip": 1.00215375, + "balance_loss_mlp": 1.00059509, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.8236344516750642, + "language_loss": 0.79522634, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81794375, + "num_input_tokens_seen": 162011125, + "step": 7552, + "time_per_iteration": 2.5472564697265625 + }, + { + "auxiliary_loss_clip": 0.01146967, + "auxiliary_loss_mlp": 0.01097217, + "balance_loss_clip": 1.00161529, + "balance_loss_mlp": 0.99994242, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8114431623976773, + "language_loss": 0.57835317, + "learning_rate": 2.39178004819885e-06, + "loss": 0.60079491, + "num_input_tokens_seen": 162068705, + "step": 7553, + "time_per_iteration": 3.119037628173828 + }, + { + "auxiliary_loss_clip": 0.01090668, + "auxiliary_loss_mlp": 0.0111689, + "balance_loss_clip": 1.00219679, + "balance_loss_mlp": 1.00063729, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.4700575256724682, + "language_loss": 0.76577353, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78784907, + "num_input_tokens_seen": 162089655, + "step": 7554, + "time_per_iteration": 2.7684571743011475 + }, + { + "auxiliary_loss_clip": 0.01135108, + "auxiliary_loss_mlp": 0.01118798, + "balance_loss_clip": 1.00209844, + "balance_loss_mlp": 1.00054252, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 3.2096716033719126, + "language_loss": 0.76567578, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.78821492, + "num_input_tokens_seen": 162108465, + "step": 7555, + "time_per_iteration": 2.60251784324646 + }, + { + "auxiliary_loss_clip": 0.01071085, + "auxiliary_loss_mlp": 0.01116953, + "balance_loss_clip": 1.00165772, + "balance_loss_mlp": 1.00060463, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.3965318548974854, + "language_loss": 0.72568589, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74756628, + "num_input_tokens_seen": 162129910, + "step": 7556, + "time_per_iteration": 2.805039405822754 + }, + { + "auxiliary_loss_clip": 0.01168469, + "auxiliary_loss_mlp": 0.01118766, + "balance_loss_clip": 1.00218248, + "balance_loss_mlp": 1.00070071, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 1.9695760166416285, + "language_loss": 0.63286889, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65574121, + "num_input_tokens_seen": 162148840, + "step": 7557, + "time_per_iteration": 2.5397369861602783 + }, + { + "auxiliary_loss_clip": 0.01132343, + "auxiliary_loss_mlp": 0.01097272, + "balance_loss_clip": 1.00155997, + "balance_loss_mlp": 0.99999684, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6811114899751455, + "language_loss": 0.5762862, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59858239, + "num_input_tokens_seen": 162208500, + "step": 7558, + "time_per_iteration": 3.0778183937072754 + }, + { + "auxiliary_loss_clip": 0.01151785, + "auxiliary_loss_mlp": 0.01118548, + "balance_loss_clip": 1.00218964, + "balance_loss_mlp": 1.00067353, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 2.2330878567976655, + "language_loss": 0.55774063, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58044398, + "num_input_tokens_seen": 162224650, + "step": 7559, + "time_per_iteration": 2.5869925022125244 + }, + { + "auxiliary_loss_clip": 0.011519, + "auxiliary_loss_mlp": 0.00747387, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00026298, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 2.4874506545631556, + "language_loss": 0.720433, + "learning_rate": 2.389106271642792e-06, + "loss": 0.7394259, + "num_input_tokens_seen": 162242930, + "step": 7560, + "time_per_iteration": 2.5612733364105225 + }, + { + "auxiliary_loss_clip": 0.01078417, + "auxiliary_loss_mlp": 0.0111827, + "balance_loss_clip": 1.00212038, + "balance_loss_mlp": 1.00077701, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 2.5661213923307375, + "language_loss": 0.69077557, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.71274245, + "num_input_tokens_seen": 162261455, + "step": 7561, + "time_per_iteration": 2.7475857734680176 + }, + { + "auxiliary_loss_clip": 0.01134989, + "auxiliary_loss_mlp": 0.011172, + "balance_loss_clip": 1.00203717, + "balance_loss_mlp": 1.00056541, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.5787238001709722, + "language_loss": 0.84932321, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.87184513, + "num_input_tokens_seen": 162279725, + "step": 7562, + "time_per_iteration": 3.977005958557129 + }, + { + "auxiliary_loss_clip": 0.01152871, + "auxiliary_loss_mlp": 0.01117377, + "balance_loss_clip": 1.00225949, + "balance_loss_mlp": 1.00064683, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.9057811049408744, + "language_loss": 0.89791048, + "learning_rate": 2.38796014579055e-06, + "loss": 0.92061293, + "num_input_tokens_seen": 162297865, + "step": 7563, + "time_per_iteration": 2.578310012817383 + }, + { + "auxiliary_loss_clip": 0.01168259, + "auxiliary_loss_mlp": 0.00747472, + "balance_loss_clip": 1.00212383, + "balance_loss_mlp": 1.00024199, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.9402638838382682, + "language_loss": 0.71489954, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73405683, + "num_input_tokens_seen": 162316010, + "step": 7564, + "time_per_iteration": 2.526853084564209 + }, + { + "auxiliary_loss_clip": 0.0115336, + "auxiliary_loss_mlp": 0.01118496, + "balance_loss_clip": 1.00205529, + "balance_loss_mlp": 1.00062215, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 3.1916226159770353, + "language_loss": 0.68164998, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70436859, + "num_input_tokens_seen": 162336115, + "step": 7565, + "time_per_iteration": 2.5738158226013184 + }, + { + "auxiliary_loss_clip": 0.01104689, + "auxiliary_loss_mlp": 0.01117925, + "balance_loss_clip": 1.00183547, + "balance_loss_mlp": 1.00062299, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 2.1011765725203926, + "language_loss": 0.79834771, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82057393, + "num_input_tokens_seen": 162355705, + "step": 7566, + "time_per_iteration": 5.588924169540405 + }, + { + "auxiliary_loss_clip": 0.0111989, + "auxiliary_loss_mlp": 0.01118287, + "balance_loss_clip": 1.00214124, + "balance_loss_mlp": 1.0006032, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.8352644263893603, + "language_loss": 0.73782682, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.76020861, + "num_input_tokens_seen": 162374055, + "step": 7567, + "time_per_iteration": 2.6705429553985596 + }, + { + "auxiliary_loss_clip": 0.01120246, + "auxiliary_loss_mlp": 0.01118572, + "balance_loss_clip": 1.00213873, + "balance_loss_mlp": 1.00069785, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.446573767263747, + "language_loss": 0.80993366, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83232188, + "num_input_tokens_seen": 162393560, + "step": 7568, + "time_per_iteration": 4.162456750869751 + }, + { + "auxiliary_loss_clip": 0.01152782, + "auxiliary_loss_mlp": 0.01119405, + "balance_loss_clip": 1.00214481, + "balance_loss_mlp": 1.00086284, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 3.0361446239501007, + "language_loss": 0.79814458, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.82086647, + "num_input_tokens_seen": 162413170, + "step": 7569, + "time_per_iteration": 2.588818311691284 + }, + { + "auxiliary_loss_clip": 0.0115185, + "auxiliary_loss_mlp": 0.0111845, + "balance_loss_clip": 1.00214231, + "balance_loss_mlp": 1.00057578, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.3443983597288647, + "language_loss": 0.74735785, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77006084, + "num_input_tokens_seen": 162434080, + "step": 7570, + "time_per_iteration": 2.624755859375 + }, + { + "auxiliary_loss_clip": 0.01135913, + "auxiliary_loss_mlp": 0.01118622, + "balance_loss_clip": 1.0021224, + "balance_loss_mlp": 1.00074792, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.7995728772174326, + "language_loss": 0.74728084, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76982617, + "num_input_tokens_seen": 162455445, + "step": 7571, + "time_per_iteration": 2.7273826599121094 + }, + { + "auxiliary_loss_clip": 0.01152027, + "auxiliary_loss_mlp": 0.01117286, + "balance_loss_clip": 1.00220108, + "balance_loss_mlp": 1.0006516, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 2.5882609050640055, + "language_loss": 0.81303215, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83572531, + "num_input_tokens_seen": 162474940, + "step": 7572, + "time_per_iteration": 2.544996976852417 + }, + { + "auxiliary_loss_clip": 0.0113862, + "auxiliary_loss_mlp": 0.01119979, + "balance_loss_clip": 1.0021714, + "balance_loss_mlp": 1.00076962, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 2.775576658596022, + "language_loss": 0.72882462, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.7514106, + "num_input_tokens_seen": 162493340, + "step": 7573, + "time_per_iteration": 2.6572086811065674 + }, + { + "auxiliary_loss_clip": 0.01151772, + "auxiliary_loss_mlp": 0.01119477, + "balance_loss_clip": 1.00215828, + "balance_loss_mlp": 1.00064874, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 2.1798304057613045, + "language_loss": 0.7436384, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76635087, + "num_input_tokens_seen": 162514360, + "step": 7574, + "time_per_iteration": 2.654125213623047 + }, + { + "auxiliary_loss_clip": 0.01151751, + "auxiliary_loss_mlp": 0.01118767, + "balance_loss_clip": 1.00216103, + "balance_loss_mlp": 1.00070214, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.497923167049493, + "language_loss": 0.71328831, + "learning_rate": 2.383374322259915e-06, + "loss": 0.7359935, + "num_input_tokens_seen": 162535240, + "step": 7575, + "time_per_iteration": 2.6233949661254883 + }, + { + "auxiliary_loss_clip": 0.0113512, + "auxiliary_loss_mlp": 0.01118477, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00069821, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 2.0509069836332463, + "language_loss": 0.72979653, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75233245, + "num_input_tokens_seen": 162553880, + "step": 7576, + "time_per_iteration": 2.5909976959228516 + }, + { + "auxiliary_loss_clip": 0.01168271, + "auxiliary_loss_mlp": 0.01117502, + "balance_loss_clip": 1.00222397, + "balance_loss_mlp": 1.00067663, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.6934880528241167, + "language_loss": 0.66378736, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68664515, + "num_input_tokens_seen": 162574485, + "step": 7577, + "time_per_iteration": 2.540188789367676 + }, + { + "auxiliary_loss_clip": 0.0113651, + "auxiliary_loss_mlp": 0.01118556, + "balance_loss_clip": 1.00210869, + "balance_loss_mlp": 1.00077713, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 3.6055851773875847, + "language_loss": 0.74197567, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76452631, + "num_input_tokens_seen": 162595130, + "step": 7578, + "time_per_iteration": 2.705505847930908 + }, + { + "auxiliary_loss_clip": 0.01102536, + "auxiliary_loss_mlp": 0.0074753, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.00024772, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.924649112090599, + "language_loss": 0.70431685, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72281754, + "num_input_tokens_seen": 162615720, + "step": 7579, + "time_per_iteration": 2.7553298473358154 + }, + { + "auxiliary_loss_clip": 0.01153114, + "auxiliary_loss_mlp": 0.01117563, + "balance_loss_clip": 1.0020802, + "balance_loss_mlp": 1.00073814, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.677315943485385, + "language_loss": 0.78615838, + "learning_rate": 2.381462943170627e-06, + "loss": 0.80886519, + "num_input_tokens_seen": 162635825, + "step": 7580, + "time_per_iteration": 2.5983150005340576 + }, + { + "auxiliary_loss_clip": 0.01168399, + "auxiliary_loss_mlp": 0.01119087, + "balance_loss_clip": 1.00230384, + "balance_loss_mlp": 1.00073647, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 2.207878396448917, + "language_loss": 0.69221008, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71508491, + "num_input_tokens_seen": 162659130, + "step": 7581, + "time_per_iteration": 2.675306558609009 + }, + { + "auxiliary_loss_clip": 0.01153185, + "auxiliary_loss_mlp": 0.01118437, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.00065827, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.8945646724847902, + "language_loss": 0.73200887, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.7547251, + "num_input_tokens_seen": 162681665, + "step": 7582, + "time_per_iteration": 2.6251442432403564 + }, + { + "auxiliary_loss_clip": 0.0116851, + "auxiliary_loss_mlp": 0.01118998, + "balance_loss_clip": 1.00226533, + "balance_loss_mlp": 1.00083768, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.7904744646929731, + "language_loss": 0.72403556, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74691069, + "num_input_tokens_seen": 162702040, + "step": 7583, + "time_per_iteration": 2.5643413066864014 + }, + { + "auxiliary_loss_clip": 0.01152188, + "auxiliary_loss_mlp": 0.01119167, + "balance_loss_clip": 1.00218809, + "balance_loss_mlp": 1.00072098, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.6183776190478758, + "language_loss": 0.72682297, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74953651, + "num_input_tokens_seen": 162722375, + "step": 7584, + "time_per_iteration": 2.6193490028381348 + }, + { + "auxiliary_loss_clip": 0.01121422, + "auxiliary_loss_mlp": 0.01118325, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.00064123, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.8133263333466976, + "language_loss": 0.68019879, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70259625, + "num_input_tokens_seen": 162746095, + "step": 7585, + "time_per_iteration": 2.721780776977539 + }, + { + "auxiliary_loss_clip": 0.01168418, + "auxiliary_loss_mlp": 0.01117576, + "balance_loss_clip": 1.00221002, + "balance_loss_mlp": 1.00065601, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.4115951163689306, + "language_loss": 0.76098508, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78384507, + "num_input_tokens_seen": 162766330, + "step": 7586, + "time_per_iteration": 2.5199079513549805 + }, + { + "auxiliary_loss_clip": 0.01138001, + "auxiliary_loss_mlp": 0.01117528, + "balance_loss_clip": 1.00220942, + "balance_loss_mlp": 1.00051188, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.6242159908091192, + "language_loss": 0.78340042, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80595577, + "num_input_tokens_seen": 162784755, + "step": 7587, + "time_per_iteration": 2.6766653060913086 + }, + { + "auxiliary_loss_clip": 0.0113832, + "auxiliary_loss_mlp": 0.01119075, + "balance_loss_clip": 1.00213826, + "balance_loss_mlp": 1.00091493, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 1.8478729297217515, + "language_loss": 0.69228488, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71485889, + "num_input_tokens_seen": 162803850, + "step": 7588, + "time_per_iteration": 2.596613645553589 + }, + { + "auxiliary_loss_clip": 0.01151423, + "auxiliary_loss_mlp": 0.01117988, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00068641, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.6008970129983628, + "language_loss": 0.7909146, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81360877, + "num_input_tokens_seen": 162820775, + "step": 7589, + "time_per_iteration": 2.5456974506378174 + }, + { + "auxiliary_loss_clip": 0.01153619, + "auxiliary_loss_mlp": 0.01118475, + "balance_loss_clip": 1.00231934, + "balance_loss_mlp": 1.0006963, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 3.1338839048457663, + "language_loss": 0.62516749, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64788842, + "num_input_tokens_seen": 162839695, + "step": 7590, + "time_per_iteration": 2.620887517929077 + }, + { + "auxiliary_loss_clip": 0.01136803, + "auxiliary_loss_mlp": 0.01118008, + "balance_loss_clip": 1.00222862, + "balance_loss_mlp": 1.00070572, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 1.9740220620987883, + "language_loss": 0.72724712, + "learning_rate": 2.377256638796135e-06, + "loss": 0.74979526, + "num_input_tokens_seen": 162856095, + "step": 7591, + "time_per_iteration": 2.579463005065918 + }, + { + "auxiliary_loss_clip": 0.01136286, + "auxiliary_loss_mlp": 0.01118916, + "balance_loss_clip": 1.00222933, + "balance_loss_mlp": 1.00075531, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 1.9764492911970042, + "language_loss": 0.76411879, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.7866708, + "num_input_tokens_seen": 162874070, + "step": 7592, + "time_per_iteration": 2.5879509449005127 + }, + { + "auxiliary_loss_clip": 0.01138353, + "auxiliary_loss_mlp": 0.01117893, + "balance_loss_clip": 1.00223088, + "balance_loss_mlp": 1.00068676, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 2.1689895384186184, + "language_loss": 0.69605058, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71861303, + "num_input_tokens_seen": 162891000, + "step": 7593, + "time_per_iteration": 2.599762439727783 + }, + { + "auxiliary_loss_clip": 0.01153198, + "auxiliary_loss_mlp": 0.01117747, + "balance_loss_clip": 1.00215816, + "balance_loss_mlp": 1.00073171, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 1.8286593334349712, + "language_loss": 0.8403759, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86308539, + "num_input_tokens_seen": 162910120, + "step": 7594, + "time_per_iteration": 2.566882371902466 + }, + { + "auxiliary_loss_clip": 0.01148568, + "auxiliary_loss_mlp": 0.00746349, + "balance_loss_clip": 1.00151873, + "balance_loss_mlp": 0.99989015, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7937762376078945, + "language_loss": 0.52704358, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54599279, + "num_input_tokens_seen": 162963720, + "step": 7595, + "time_per_iteration": 3.140746831893921 + }, + { + "auxiliary_loss_clip": 0.01120504, + "auxiliary_loss_mlp": 0.01118263, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00058007, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.5866804361678106, + "language_loss": 0.87310982, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89549744, + "num_input_tokens_seen": 162975760, + "step": 7596, + "time_per_iteration": 2.587355136871338 + }, + { + "auxiliary_loss_clip": 0.01153578, + "auxiliary_loss_mlp": 0.01118909, + "balance_loss_clip": 1.00223136, + "balance_loss_mlp": 1.00093997, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 1.6433662798202846, + "language_loss": 0.7762531, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79897797, + "num_input_tokens_seen": 162994865, + "step": 7597, + "time_per_iteration": 2.5507829189300537 + }, + { + "auxiliary_loss_clip": 0.01151693, + "auxiliary_loss_mlp": 0.01117447, + "balance_loss_clip": 1.00205112, + "balance_loss_mlp": 1.00052643, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.809559859236931, + "language_loss": 0.78349215, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80618358, + "num_input_tokens_seen": 163014730, + "step": 7598, + "time_per_iteration": 2.584695816040039 + }, + { + "auxiliary_loss_clip": 0.01168347, + "auxiliary_loss_mlp": 0.01116908, + "balance_loss_clip": 1.00228763, + "balance_loss_mlp": 1.00065517, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.2259651698359764, + "language_loss": 0.71612155, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73897409, + "num_input_tokens_seen": 163033405, + "step": 7599, + "time_per_iteration": 3.9497976303100586 + }, + { + "auxiliary_loss_clip": 0.01119594, + "auxiliary_loss_mlp": 0.01117374, + "balance_loss_clip": 1.00194585, + "balance_loss_mlp": 1.00083542, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 2.2485420579267554, + "language_loss": 0.69703043, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71940005, + "num_input_tokens_seen": 163051400, + "step": 7600, + "time_per_iteration": 2.6283793449401855 + }, + { + "auxiliary_loss_clip": 0.01086911, + "auxiliary_loss_mlp": 0.01118493, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00080919, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.699297586276763, + "language_loss": 0.78447795, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80653191, + "num_input_tokens_seen": 163069250, + "step": 7601, + "time_per_iteration": 2.690375328063965 + }, + { + "auxiliary_loss_clip": 0.01137039, + "auxiliary_loss_mlp": 0.01118742, + "balance_loss_clip": 1.00221157, + "balance_loss_mlp": 1.00086737, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 2.93979598549894, + "language_loss": 0.71457046, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73712826, + "num_input_tokens_seen": 163091755, + "step": 7602, + "time_per_iteration": 2.7455925941467285 + }, + { + "auxiliary_loss_clip": 0.01153539, + "auxiliary_loss_mlp": 0.01118994, + "balance_loss_clip": 1.00232267, + "balance_loss_mlp": 1.00064325, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 10.341152975941231, + "language_loss": 0.72907662, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75180191, + "num_input_tokens_seen": 163111600, + "step": 7603, + "time_per_iteration": 3.9689860343933105 + }, + { + "auxiliary_loss_clip": 0.01151192, + "auxiliary_loss_mlp": 0.01117111, + "balance_loss_clip": 1.00211811, + "balance_loss_mlp": 1.00076294, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.7951432120810982, + "language_loss": 0.83259684, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85527992, + "num_input_tokens_seen": 163127350, + "step": 7604, + "time_per_iteration": 4.016721963882446 + }, + { + "auxiliary_loss_clip": 0.01135692, + "auxiliary_loss_mlp": 0.01118701, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00082707, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 2.0890466011797724, + "language_loss": 0.86366498, + "learning_rate": 2.371900659559016e-06, + "loss": 0.88620895, + "num_input_tokens_seen": 163145855, + "step": 7605, + "time_per_iteration": 2.613847255706787 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01118018, + "balance_loss_clip": 1.0019486, + "balance_loss_mlp": 1.00062108, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.8525662070886575, + "language_loss": 0.73837501, + "learning_rate": 2.371517983373138e-06, + "loss": 0.76059127, + "num_input_tokens_seen": 163163830, + "step": 7606, + "time_per_iteration": 4.100329160690308 + }, + { + "auxiliary_loss_clip": 0.01120263, + "auxiliary_loss_mlp": 0.01118463, + "balance_loss_clip": 1.00205517, + "balance_loss_mlp": 1.00087476, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 2.4719621072155333, + "language_loss": 0.80267239, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82505965, + "num_input_tokens_seen": 163180700, + "step": 7607, + "time_per_iteration": 2.6164867877960205 + }, + { + "auxiliary_loss_clip": 0.01124792, + "auxiliary_loss_mlp": 0.01118508, + "balance_loss_clip": 1.00227606, + "balance_loss_mlp": 1.00082433, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 1.8117028640246444, + "language_loss": 0.80937767, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83181059, + "num_input_tokens_seen": 163199450, + "step": 7608, + "time_per_iteration": 2.6698687076568604 + }, + { + "auxiliary_loss_clip": 0.01136532, + "auxiliary_loss_mlp": 0.01117729, + "balance_loss_clip": 1.00204778, + "balance_loss_mlp": 1.00071287, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.6118437590324621, + "language_loss": 0.68200886, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70455152, + "num_input_tokens_seen": 163217875, + "step": 7609, + "time_per_iteration": 2.653679132461548 + }, + { + "auxiliary_loss_clip": 0.01135251, + "auxiliary_loss_mlp": 0.01118604, + "balance_loss_clip": 1.00208569, + "balance_loss_mlp": 1.00082493, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 2.4142590633282306, + "language_loss": 0.80867934, + "learning_rate": 2.369987137894757e-06, + "loss": 0.83121789, + "num_input_tokens_seen": 163237430, + "step": 7610, + "time_per_iteration": 2.6340649127960205 + }, + { + "auxiliary_loss_clip": 0.01151942, + "auxiliary_loss_mlp": 0.01118192, + "balance_loss_clip": 1.00215769, + "balance_loss_mlp": 1.00079513, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 2.087974338437635, + "language_loss": 0.82473254, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84743392, + "num_input_tokens_seen": 163253905, + "step": 7611, + "time_per_iteration": 2.560312271118164 + }, + { + "auxiliary_loss_clip": 0.01152732, + "auxiliary_loss_mlp": 0.01117446, + "balance_loss_clip": 1.0022186, + "balance_loss_mlp": 1.00052571, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 1.657812134285317, + "language_loss": 0.73721159, + "learning_rate": 2.369221630917819e-06, + "loss": 0.75991333, + "num_input_tokens_seen": 163274285, + "step": 7612, + "time_per_iteration": 2.699359178543091 + }, + { + "auxiliary_loss_clip": 0.01136496, + "auxiliary_loss_mlp": 0.01117269, + "balance_loss_clip": 1.00205112, + "balance_loss_mlp": 1.00063443, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.6686796267546973, + "language_loss": 0.84855199, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87108964, + "num_input_tokens_seen": 163293150, + "step": 7613, + "time_per_iteration": 2.6141281127929688 + }, + { + "auxiliary_loss_clip": 0.01120121, + "auxiliary_loss_mlp": 0.01118171, + "balance_loss_clip": 1.00207353, + "balance_loss_mlp": 1.00058329, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 3.103610631498435, + "language_loss": 0.75767958, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.78006256, + "num_input_tokens_seen": 163310065, + "step": 7614, + "time_per_iteration": 2.6248366832733154 + }, + { + "auxiliary_loss_clip": 0.01168311, + "auxiliary_loss_mlp": 0.01117387, + "balance_loss_clip": 1.00223315, + "balance_loss_mlp": 1.00075293, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.6307169426352073, + "language_loss": 0.74572629, + "learning_rate": 2.368073265481791e-06, + "loss": 0.7685833, + "num_input_tokens_seen": 163329415, + "step": 7615, + "time_per_iteration": 2.536024570465088 + }, + { + "auxiliary_loss_clip": 0.01131298, + "auxiliary_loss_mlp": 0.01096917, + "balance_loss_clip": 1.00155234, + "balance_loss_mlp": 1.00002408, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7759278096331517, + "language_loss": 0.57650012, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.5987823, + "num_input_tokens_seen": 163385875, + "step": 7616, + "time_per_iteration": 3.108839750289917 + }, + { + "auxiliary_loss_clip": 0.0113672, + "auxiliary_loss_mlp": 0.0074738, + "balance_loss_clip": 1.00206113, + "balance_loss_mlp": 1.00023818, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.7041674937146691, + "language_loss": 0.71286196, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73170298, + "num_input_tokens_seen": 163405170, + "step": 7617, + "time_per_iteration": 2.592677354812622 + }, + { + "auxiliary_loss_clip": 0.01168471, + "auxiliary_loss_mlp": 0.01117633, + "balance_loss_clip": 1.00232565, + "balance_loss_mlp": 1.00071299, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 1.8200941301863194, + "language_loss": 0.76400381, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78686488, + "num_input_tokens_seen": 163423155, + "step": 7618, + "time_per_iteration": 2.5096426010131836 + }, + { + "auxiliary_loss_clip": 0.01120209, + "auxiliary_loss_mlp": 0.01118273, + "balance_loss_clip": 1.00198448, + "balance_loss_mlp": 1.00087619, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.6251813751999105, + "language_loss": 0.77159959, + "learning_rate": 2.366541916231585e-06, + "loss": 0.79398441, + "num_input_tokens_seen": 163442450, + "step": 7619, + "time_per_iteration": 2.619861364364624 + }, + { + "auxiliary_loss_clip": 0.0116845, + "auxiliary_loss_mlp": 0.01117294, + "balance_loss_clip": 1.00236523, + "balance_loss_mlp": 1.00085044, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 3.2071325061036893, + "language_loss": 0.72015035, + "learning_rate": 2.366159044134473e-06, + "loss": 0.74300784, + "num_input_tokens_seen": 163459810, + "step": 7620, + "time_per_iteration": 2.5282673835754395 + }, + { + "auxiliary_loss_clip": 0.01135691, + "auxiliary_loss_mlp": 0.01116551, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00058424, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 1.526065508547547, + "language_loss": 0.78190851, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.8044309, + "num_input_tokens_seen": 163482970, + "step": 7621, + "time_per_iteration": 2.8672287464141846 + }, + { + "auxiliary_loss_clip": 0.01148801, + "auxiliary_loss_mlp": 0.01096881, + "balance_loss_clip": 1.00158453, + "balance_loss_mlp": 0.99998802, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.793827889057613, + "language_loss": 0.64972085, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67217761, + "num_input_tokens_seen": 163545330, + "step": 7622, + "time_per_iteration": 3.1192409992218018 + }, + { + "auxiliary_loss_clip": 0.01151855, + "auxiliary_loss_mlp": 0.01117842, + "balance_loss_clip": 1.00208342, + "balance_loss_mlp": 1.00053954, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.6291772444360457, + "language_loss": 0.7911799, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81387687, + "num_input_tokens_seen": 163564620, + "step": 7623, + "time_per_iteration": 2.6052520275115967 + }, + { + "auxiliary_loss_clip": 0.01105181, + "auxiliary_loss_mlp": 0.01118218, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.0006299, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 2.055250052157434, + "language_loss": 0.70677662, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72901058, + "num_input_tokens_seen": 163581010, + "step": 7624, + "time_per_iteration": 2.6594350337982178 + }, + { + "auxiliary_loss_clip": 0.01136768, + "auxiliary_loss_mlp": 0.01116701, + "balance_loss_clip": 1.00205028, + "balance_loss_mlp": 1.00063896, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 1.760035212955975, + "language_loss": 0.72871083, + "learning_rate": 2.364244475667491e-06, + "loss": 0.7512455, + "num_input_tokens_seen": 163599955, + "step": 7625, + "time_per_iteration": 2.710618257522583 + }, + { + "auxiliary_loss_clip": 0.01158017, + "auxiliary_loss_mlp": 0.01116826, + "balance_loss_clip": 1.00252652, + "balance_loss_mlp": 1.00066876, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 1.9223591514178322, + "language_loss": 0.78237075, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80511916, + "num_input_tokens_seen": 163618545, + "step": 7626, + "time_per_iteration": 2.592958688735962 + }, + { + "auxiliary_loss_clip": 0.01168497, + "auxiliary_loss_mlp": 0.0111883, + "balance_loss_clip": 1.00229478, + "balance_loss_mlp": 1.00066972, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.7587906521564007, + "language_loss": 0.84676266, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.86963594, + "num_input_tokens_seen": 163636055, + "step": 7627, + "time_per_iteration": 2.5274593830108643 + }, + { + "auxiliary_loss_clip": 0.01168641, + "auxiliary_loss_mlp": 0.01118606, + "balance_loss_clip": 1.00232303, + "balance_loss_mlp": 1.00063694, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.9841754867296608, + "language_loss": 0.6949169, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71778941, + "num_input_tokens_seen": 163657485, + "step": 7628, + "time_per_iteration": 2.5867414474487305 + }, + { + "auxiliary_loss_clip": 0.01153126, + "auxiliary_loss_mlp": 0.0111731, + "balance_loss_clip": 1.00210631, + "balance_loss_mlp": 1.00048435, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.662906385323543, + "language_loss": 0.78595781, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80866218, + "num_input_tokens_seen": 163676030, + "step": 7629, + "time_per_iteration": 2.556792736053467 + }, + { + "auxiliary_loss_clip": 0.01136241, + "auxiliary_loss_mlp": 0.01117946, + "balance_loss_clip": 1.00207996, + "balance_loss_mlp": 1.00064397, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.0664414250815604, + "language_loss": 0.79326606, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81580794, + "num_input_tokens_seen": 163694490, + "step": 7630, + "time_per_iteration": 2.600895881652832 + }, + { + "auxiliary_loss_clip": 0.01134999, + "auxiliary_loss_mlp": 0.01118618, + "balance_loss_clip": 1.00202143, + "balance_loss_mlp": 1.00074434, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.7056293821533963, + "language_loss": 0.71982974, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.7423659, + "num_input_tokens_seen": 163717035, + "step": 7631, + "time_per_iteration": 2.696381092071533 + }, + { + "auxiliary_loss_clip": 0.01119613, + "auxiliary_loss_mlp": 0.01118432, + "balance_loss_clip": 1.00214052, + "balance_loss_mlp": 1.00084424, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 4.5005518887892775, + "language_loss": 0.71662772, + "learning_rate": 2.361563500108531e-06, + "loss": 0.73900819, + "num_input_tokens_seen": 163734525, + "step": 7632, + "time_per_iteration": 2.6346142292022705 + }, + { + "auxiliary_loss_clip": 0.01104946, + "auxiliary_loss_mlp": 0.00747544, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00024176, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 2.8214638774371323, + "language_loss": 0.68821371, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.70673859, + "num_input_tokens_seen": 163752860, + "step": 7633, + "time_per_iteration": 2.657686710357666 + }, + { + "auxiliary_loss_clip": 0.01151634, + "auxiliary_loss_mlp": 0.0111793, + "balance_loss_clip": 1.00209773, + "balance_loss_mlp": 1.0007236, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.8392997584863853, + "language_loss": 0.80724859, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82994425, + "num_input_tokens_seen": 163772495, + "step": 7634, + "time_per_iteration": 2.6184866428375244 + }, + { + "auxiliary_loss_clip": 0.01152217, + "auxiliary_loss_mlp": 0.00747442, + "balance_loss_clip": 1.00215065, + "balance_loss_mlp": 1.00025415, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.8343583021098055, + "language_loss": 0.81258428, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83158088, + "num_input_tokens_seen": 163791475, + "step": 7635, + "time_per_iteration": 2.588365077972412 + }, + { + "auxiliary_loss_clip": 0.01135046, + "auxiliary_loss_mlp": 0.01117387, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.0008477, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.5472907119154413, + "language_loss": 0.64532268, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.66784704, + "num_input_tokens_seen": 163812995, + "step": 7636, + "time_per_iteration": 2.7320356369018555 + }, + { + "auxiliary_loss_clip": 0.01151949, + "auxiliary_loss_mlp": 0.01117466, + "balance_loss_clip": 1.00218916, + "balance_loss_mlp": 1.00064135, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.514703289449381, + "language_loss": 0.80556607, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82826018, + "num_input_tokens_seen": 163833945, + "step": 7637, + "time_per_iteration": 4.021937370300293 + }, + { + "auxiliary_loss_clip": 0.01138725, + "auxiliary_loss_mlp": 0.01118544, + "balance_loss_clip": 1.00225008, + "balance_loss_mlp": 1.00086081, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.374761597894506, + "language_loss": 0.75465715, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77722979, + "num_input_tokens_seen": 163853885, + "step": 7638, + "time_per_iteration": 2.6200647354125977 + }, + { + "auxiliary_loss_clip": 0.01151626, + "auxiliary_loss_mlp": 0.01117174, + "balance_loss_clip": 1.00218582, + "balance_loss_mlp": 1.00063539, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 2.6163715075662926, + "language_loss": 0.74047899, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76316702, + "num_input_tokens_seen": 163871855, + "step": 7639, + "time_per_iteration": 2.5547382831573486 + }, + { + "auxiliary_loss_clip": 0.01168486, + "auxiliary_loss_mlp": 0.01117739, + "balance_loss_clip": 1.00233984, + "balance_loss_mlp": 1.0006274, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 1.6882350918837723, + "language_loss": 0.68216014, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70502239, + "num_input_tokens_seen": 163891450, + "step": 7640, + "time_per_iteration": 2.534010887145996 + }, + { + "auxiliary_loss_clip": 0.01143232, + "auxiliary_loss_mlp": 0.01118268, + "balance_loss_clip": 1.00236189, + "balance_loss_mlp": 1.00058436, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 1.6814266200179804, + "language_loss": 0.75182307, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77443802, + "num_input_tokens_seen": 163909345, + "step": 7641, + "time_per_iteration": 3.956239700317383 + }, + { + "auxiliary_loss_clip": 0.01136499, + "auxiliary_loss_mlp": 0.01118537, + "balance_loss_clip": 1.00207043, + "balance_loss_mlp": 1.00056756, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 3.382907992171491, + "language_loss": 0.7493366, + "learning_rate": 2.357732370864668e-06, + "loss": 0.771887, + "num_input_tokens_seen": 163926940, + "step": 7642, + "time_per_iteration": 4.007248401641846 + }, + { + "auxiliary_loss_clip": 0.01147239, + "auxiliary_loss_mlp": 0.01096898, + "balance_loss_clip": 1.00154996, + "balance_loss_mlp": 1.00000477, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8439832656366223, + "language_loss": 0.58300889, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60545027, + "num_input_tokens_seen": 163977785, + "step": 7643, + "time_per_iteration": 4.2999255657196045 + }, + { + "auxiliary_loss_clip": 0.0115185, + "auxiliary_loss_mlp": 0.01118212, + "balance_loss_clip": 1.00210071, + "balance_loss_mlp": 1.00062418, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 2.4503013913015956, + "language_loss": 0.93019849, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.9528991, + "num_input_tokens_seen": 163996630, + "step": 7644, + "time_per_iteration": 2.580134630203247 + }, + { + "auxiliary_loss_clip": 0.01151962, + "auxiliary_loss_mlp": 0.01118063, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00076079, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 2.0433745872497737, + "language_loss": 0.82301998, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.84572017, + "num_input_tokens_seen": 164013190, + "step": 7645, + "time_per_iteration": 2.549666166305542 + }, + { + "auxiliary_loss_clip": 0.0112182, + "auxiliary_loss_mlp": 0.01096861, + "balance_loss_clip": 1.00160718, + "balance_loss_mlp": 0.99996775, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7538616050379732, + "language_loss": 0.59843481, + "learning_rate": 2.356199538526593e-06, + "loss": 0.62062156, + "num_input_tokens_seen": 164074030, + "step": 7646, + "time_per_iteration": 3.1427481174468994 + }, + { + "auxiliary_loss_clip": 0.01153522, + "auxiliary_loss_mlp": 0.01117606, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00058985, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.5371279283158332, + "language_loss": 0.72649419, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74920541, + "num_input_tokens_seen": 164095515, + "step": 7647, + "time_per_iteration": 2.6409289836883545 + }, + { + "auxiliary_loss_clip": 0.0112002, + "auxiliary_loss_mlp": 0.01118968, + "balance_loss_clip": 1.00210166, + "balance_loss_mlp": 1.00061703, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 2.5893535174727678, + "language_loss": 0.66707945, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.68946934, + "num_input_tokens_seen": 164117270, + "step": 7648, + "time_per_iteration": 2.7504241466522217 + }, + { + "auxiliary_loss_clip": 0.01153266, + "auxiliary_loss_mlp": 0.01117929, + "balance_loss_clip": 1.00222194, + "balance_loss_mlp": 1.00062692, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.8612237637443254, + "language_loss": 0.78537261, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80808455, + "num_input_tokens_seen": 164137850, + "step": 7649, + "time_per_iteration": 2.6190576553344727 + }, + { + "auxiliary_loss_clip": 0.01087391, + "auxiliary_loss_mlp": 0.01117652, + "balance_loss_clip": 1.00177217, + "balance_loss_mlp": 1.00063634, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 1.6965048572844024, + "language_loss": 0.6918292, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.71387964, + "num_input_tokens_seen": 164157960, + "step": 7650, + "time_per_iteration": 2.77519154548645 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01118495, + "balance_loss_clip": 1.00217247, + "balance_loss_mlp": 1.0008111, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 2.102412523119185, + "language_loss": 0.83765531, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86035728, + "num_input_tokens_seen": 164174590, + "step": 7651, + "time_per_iteration": 2.559375047683716 + }, + { + "auxiliary_loss_clip": 0.01137709, + "auxiliary_loss_mlp": 0.00747505, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00027955, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 1.890916720951614, + "language_loss": 0.75104457, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.76989675, + "num_input_tokens_seen": 164192935, + "step": 7652, + "time_per_iteration": 2.5959744453430176 + }, + { + "auxiliary_loss_clip": 0.01103333, + "auxiliary_loss_mlp": 0.01117449, + "balance_loss_clip": 1.00192058, + "balance_loss_mlp": 1.00071955, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.6727765874847509, + "language_loss": 0.76295948, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78516728, + "num_input_tokens_seen": 164213160, + "step": 7653, + "time_per_iteration": 2.803985595703125 + }, + { + "auxiliary_loss_clip": 0.01103452, + "auxiliary_loss_mlp": 0.01119508, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00067997, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 3.260819016133206, + "language_loss": 0.6585474, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68077695, + "num_input_tokens_seen": 164229330, + "step": 7654, + "time_per_iteration": 2.698439598083496 + }, + { + "auxiliary_loss_clip": 0.01136561, + "auxiliary_loss_mlp": 0.01117413, + "balance_loss_clip": 1.00198448, + "balance_loss_mlp": 1.00058842, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.6189652206377318, + "language_loss": 0.79387939, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81641918, + "num_input_tokens_seen": 164248240, + "step": 7655, + "time_per_iteration": 2.659635066986084 + }, + { + "auxiliary_loss_clip": 0.01120077, + "auxiliary_loss_mlp": 0.01117382, + "balance_loss_clip": 1.00191498, + "balance_loss_mlp": 1.00055695, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 1.5760573663807722, + "language_loss": 0.67586434, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69823891, + "num_input_tokens_seen": 164268020, + "step": 7656, + "time_per_iteration": 2.6843583583831787 + }, + { + "auxiliary_loss_clip": 0.01138416, + "auxiliary_loss_mlp": 0.01117601, + "balance_loss_clip": 1.00207281, + "balance_loss_mlp": 1.00058508, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.7204666444046877, + "language_loss": 0.81143117, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83399129, + "num_input_tokens_seen": 164287305, + "step": 7657, + "time_per_iteration": 2.669029712677002 + }, + { + "auxiliary_loss_clip": 0.01168465, + "auxiliary_loss_mlp": 0.00747431, + "balance_loss_clip": 1.00237238, + "balance_loss_mlp": 1.00019634, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 3.531221942202233, + "language_loss": 0.70344907, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72260797, + "num_input_tokens_seen": 164306835, + "step": 7658, + "time_per_iteration": 2.5743980407714844 + }, + { + "auxiliary_loss_clip": 0.01148838, + "auxiliary_loss_mlp": 0.01096232, + "balance_loss_clip": 1.00164473, + "balance_loss_mlp": 1.00010204, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9474172325977548, + "language_loss": 0.62146729, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64391798, + "num_input_tokens_seen": 164367095, + "step": 7659, + "time_per_iteration": 3.228163003921509 + }, + { + "auxiliary_loss_clip": 0.01106602, + "auxiliary_loss_mlp": 0.01117303, + "balance_loss_clip": 1.00202954, + "balance_loss_mlp": 1.0005734, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 2.382412830167889, + "language_loss": 0.68207395, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70431304, + "num_input_tokens_seen": 164388895, + "step": 7660, + "time_per_iteration": 2.830554485321045 + }, + { + "auxiliary_loss_clip": 0.01153387, + "auxiliary_loss_mlp": 0.011177, + "balance_loss_clip": 1.00215697, + "balance_loss_mlp": 1.00077939, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.801223839098794, + "language_loss": 0.77062559, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79333651, + "num_input_tokens_seen": 164409080, + "step": 7661, + "time_per_iteration": 2.5898542404174805 + }, + { + "auxiliary_loss_clip": 0.01153231, + "auxiliary_loss_mlp": 0.01117564, + "balance_loss_clip": 1.0023073, + "balance_loss_mlp": 1.00073886, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 2.284503504416794, + "language_loss": 0.75334233, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77605033, + "num_input_tokens_seen": 164427585, + "step": 7662, + "time_per_iteration": 2.626236915588379 + }, + { + "auxiliary_loss_clip": 0.01137087, + "auxiliary_loss_mlp": 0.0111869, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00072098, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 3.650392660244471, + "language_loss": 0.7965973, + "learning_rate": 2.349682601310998e-06, + "loss": 0.8191551, + "num_input_tokens_seen": 164438455, + "step": 7663, + "time_per_iteration": 2.5650134086608887 + }, + { + "auxiliary_loss_clip": 0.01151669, + "auxiliary_loss_mlp": 0.01116395, + "balance_loss_clip": 1.00225902, + "balance_loss_mlp": 1.00052357, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 1.9487997661727858, + "language_loss": 0.73649037, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75917101, + "num_input_tokens_seen": 164456830, + "step": 7664, + "time_per_iteration": 2.584181070327759 + }, + { + "auxiliary_loss_clip": 0.01119679, + "auxiliary_loss_mlp": 0.01117639, + "balance_loss_clip": 1.00215435, + "balance_loss_mlp": 1.00062346, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 2.163979390030008, + "language_loss": 0.72301859, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74539179, + "num_input_tokens_seen": 164475375, + "step": 7665, + "time_per_iteration": 2.6383025646209717 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.01117149, + "balance_loss_clip": 1.00205243, + "balance_loss_mlp": 1.00070524, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 2.0700014655115275, + "language_loss": 0.7809366, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80332142, + "num_input_tokens_seen": 164492040, + "step": 7666, + "time_per_iteration": 2.6481692790985107 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.01118265, + "balance_loss_clip": 1.002177, + "balance_loss_mlp": 1.00077212, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.459579717525898, + "language_loss": 0.74247241, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76485503, + "num_input_tokens_seen": 164513665, + "step": 7667, + "time_per_iteration": 2.755631685256958 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01116743, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.0006808, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.4876154268504485, + "language_loss": 0.76209331, + "learning_rate": 2.347765122572676e-06, + "loss": 0.7842803, + "num_input_tokens_seen": 164533890, + "step": 7668, + "time_per_iteration": 2.6894466876983643 + }, + { + "auxiliary_loss_clip": 0.01103127, + "auxiliary_loss_mlp": 0.0111773, + "balance_loss_clip": 1.00225067, + "balance_loss_mlp": 1.0006187, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 1.5821566092230466, + "language_loss": 0.78060257, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80281121, + "num_input_tokens_seen": 164553815, + "step": 7669, + "time_per_iteration": 2.7126247882843018 + }, + { + "auxiliary_loss_clip": 0.01153141, + "auxiliary_loss_mlp": 0.01117325, + "balance_loss_clip": 1.00213981, + "balance_loss_mlp": 1.00050032, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.7961916621768415, + "language_loss": 0.82438421, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84708893, + "num_input_tokens_seen": 164573125, + "step": 7670, + "time_per_iteration": 2.605149030685425 + }, + { + "auxiliary_loss_clip": 0.01152839, + "auxiliary_loss_mlp": 0.01117166, + "balance_loss_clip": 1.00228465, + "balance_loss_mlp": 1.00072253, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6763192425024789, + "language_loss": 0.63389742, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.6565975, + "num_input_tokens_seen": 164592575, + "step": 7671, + "time_per_iteration": 2.6035115718841553 + }, + { + "auxiliary_loss_clip": 0.01134319, + "auxiliary_loss_mlp": 0.01096515, + "balance_loss_clip": 1.00173545, + "balance_loss_mlp": 1.00000322, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.694332890536514, + "language_loss": 0.5596866, + "learning_rate": 2.346230902123583e-06, + "loss": 0.58199495, + "num_input_tokens_seen": 164659795, + "step": 7672, + "time_per_iteration": 3.3232498168945312 + }, + { + "auxiliary_loss_clip": 0.01151757, + "auxiliary_loss_mlp": 0.01117228, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00068927, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.8749358462428978, + "language_loss": 0.70988995, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73257983, + "num_input_tokens_seen": 164678735, + "step": 7673, + "time_per_iteration": 2.6056437492370605 + }, + { + "auxiliary_loss_clip": 0.01138464, + "auxiliary_loss_mlp": 0.01117395, + "balance_loss_clip": 1.00227928, + "balance_loss_mlp": 1.00076032, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.6678247328458684, + "language_loss": 0.7075302, + "learning_rate": 2.345463713066195e-06, + "loss": 0.73008877, + "num_input_tokens_seen": 164700885, + "step": 7674, + "time_per_iteration": 2.7703311443328857 + }, + { + "auxiliary_loss_clip": 0.01134459, + "auxiliary_loss_mlp": 0.01117945, + "balance_loss_clip": 1.00204885, + "balance_loss_mlp": 1.00092936, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 14.084097981885266, + "language_loss": 0.65593088, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67845488, + "num_input_tokens_seen": 164726960, + "step": 7675, + "time_per_iteration": 4.181057929992676 + }, + { + "auxiliary_loss_clip": 0.01163662, + "auxiliary_loss_mlp": 0.01096213, + "balance_loss_clip": 1.00176322, + "balance_loss_mlp": 1.00008285, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7238287475921555, + "language_loss": 0.58685219, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60945094, + "num_input_tokens_seen": 164788525, + "step": 7676, + "time_per_iteration": 3.1552181243896484 + }, + { + "auxiliary_loss_clip": 0.0111593, + "auxiliary_loss_mlp": 0.01096716, + "balance_loss_clip": 1.00165939, + "balance_loss_mlp": 1.00020432, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7877421683041581, + "language_loss": 0.62741226, + "learning_rate": 2.344312831266341e-06, + "loss": 0.6495387, + "num_input_tokens_seen": 164843525, + "step": 7677, + "time_per_iteration": 3.0396766662597656 + }, + { + "auxiliary_loss_clip": 0.01134854, + "auxiliary_loss_mlp": 0.01117175, + "balance_loss_clip": 1.00212336, + "balance_loss_mlp": 1.00054097, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.2433006856642654, + "language_loss": 0.76567835, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78819865, + "num_input_tokens_seen": 164859895, + "step": 7678, + "time_per_iteration": 2.591488838195801 + }, + { + "auxiliary_loss_clip": 0.01168607, + "auxiliary_loss_mlp": 0.01117643, + "balance_loss_clip": 1.00234926, + "balance_loss_mlp": 1.00053227, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 2.2666150678334454, + "language_loss": 0.66482961, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68769217, + "num_input_tokens_seen": 164878030, + "step": 7679, + "time_per_iteration": 3.9253244400024414 + }, + { + "auxiliary_loss_clip": 0.01119516, + "auxiliary_loss_mlp": 0.01117405, + "balance_loss_clip": 1.00214148, + "balance_loss_mlp": 1.00077081, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 2.6618352731467034, + "language_loss": 0.69305396, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.71542311, + "num_input_tokens_seen": 164895710, + "step": 7680, + "time_per_iteration": 4.110935688018799 + }, + { + "auxiliary_loss_clip": 0.01168447, + "auxiliary_loss_mlp": 0.01118132, + "balance_loss_clip": 1.00234783, + "balance_loss_mlp": 1.00073421, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 1.8275304428665033, + "language_loss": 0.63472939, + "learning_rate": 2.342778139478487e-06, + "loss": 0.6575951, + "num_input_tokens_seen": 164913365, + "step": 7681, + "time_per_iteration": 3.9757513999938965 + }, + { + "auxiliary_loss_clip": 0.01152838, + "auxiliary_loss_mlp": 0.01116899, + "balance_loss_clip": 1.00221348, + "balance_loss_mlp": 1.00055039, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 2.6258008426884873, + "language_loss": 0.67105269, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69375008, + "num_input_tokens_seen": 164931620, + "step": 7682, + "time_per_iteration": 2.570582389831543 + }, + { + "auxiliary_loss_clip": 0.01120298, + "auxiliary_loss_mlp": 0.01117811, + "balance_loss_clip": 1.00218129, + "balance_loss_mlp": 1.00060463, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.139215492676256, + "language_loss": 0.74083608, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76321721, + "num_input_tokens_seen": 164950905, + "step": 7683, + "time_per_iteration": 2.7086684703826904 + }, + { + "auxiliary_loss_clip": 0.01168367, + "auxiliary_loss_mlp": 0.01117099, + "balance_loss_clip": 1.00236464, + "balance_loss_mlp": 1.00075102, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.8282603075026562, + "language_loss": 0.76537293, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78822762, + "num_input_tokens_seen": 164970950, + "step": 7684, + "time_per_iteration": 2.575190305709839 + }, + { + "auxiliary_loss_clip": 0.01168495, + "auxiliary_loss_mlp": 0.01118237, + "balance_loss_clip": 1.00234795, + "balance_loss_mlp": 1.0006485, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 2.0486781115099353, + "language_loss": 0.79608059, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.81894791, + "num_input_tokens_seen": 164989855, + "step": 7685, + "time_per_iteration": 2.6031875610351562 + }, + { + "auxiliary_loss_clip": 0.01120164, + "auxiliary_loss_mlp": 0.01117631, + "balance_loss_clip": 1.0021354, + "balance_loss_mlp": 1.00071061, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 1.5949731704787933, + "language_loss": 0.6686784, + "learning_rate": 2.340859482393731e-06, + "loss": 0.69105637, + "num_input_tokens_seen": 165012290, + "step": 7686, + "time_per_iteration": 2.757657289505005 + }, + { + "auxiliary_loss_clip": 0.01135031, + "auxiliary_loss_mlp": 0.00747451, + "balance_loss_clip": 1.00212705, + "balance_loss_mlp": 1.00027776, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.315952410710465, + "language_loss": 0.74118572, + "learning_rate": 2.340475712142296e-06, + "loss": 0.76001054, + "num_input_tokens_seen": 165030810, + "step": 7687, + "time_per_iteration": 2.6499764919281006 + }, + { + "auxiliary_loss_clip": 0.01087528, + "auxiliary_loss_mlp": 0.01116832, + "balance_loss_clip": 1.0018903, + "balance_loss_mlp": 1.00057888, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.1000041650291155, + "language_loss": 0.74619913, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76824272, + "num_input_tokens_seen": 165050205, + "step": 7688, + "time_per_iteration": 2.7370638847351074 + }, + { + "auxiliary_loss_clip": 0.01103287, + "auxiliary_loss_mlp": 0.00747435, + "balance_loss_clip": 1.00209284, + "balance_loss_mlp": 1.00026655, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.8478750932565997, + "language_loss": 0.78629971, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80480695, + "num_input_tokens_seen": 165069370, + "step": 7689, + "time_per_iteration": 2.7188847064971924 + }, + { + "auxiliary_loss_clip": 0.01153526, + "auxiliary_loss_mlp": 0.01117848, + "balance_loss_clip": 1.00223231, + "balance_loss_mlp": 1.00073683, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 2.252290430797493, + "language_loss": 0.5738185, + "learning_rate": 2.339324323980964e-06, + "loss": 0.59653223, + "num_input_tokens_seen": 165089610, + "step": 7690, + "time_per_iteration": 2.618985652923584 + }, + { + "auxiliary_loss_clip": 0.0115168, + "auxiliary_loss_mlp": 0.01116739, + "balance_loss_clip": 1.00224042, + "balance_loss_mlp": 1.00067663, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 3.1786286579815144, + "language_loss": 0.82951689, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.85220104, + "num_input_tokens_seen": 165109050, + "step": 7691, + "time_per_iteration": 2.586333990097046 + }, + { + "auxiliary_loss_clip": 0.01136399, + "auxiliary_loss_mlp": 0.0111722, + "balance_loss_clip": 1.00219202, + "balance_loss_mlp": 1.00058579, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.412538608929144, + "language_loss": 0.75743842, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77997464, + "num_input_tokens_seen": 165130130, + "step": 7692, + "time_per_iteration": 2.6617486476898193 + }, + { + "auxiliary_loss_clip": 0.01124888, + "auxiliary_loss_mlp": 0.01117632, + "balance_loss_clip": 1.00267613, + "balance_loss_mlp": 1.00071156, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 1.55534827459578, + "language_loss": 0.74259937, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76502454, + "num_input_tokens_seen": 165152685, + "step": 7693, + "time_per_iteration": 2.784668445587158 + }, + { + "auxiliary_loss_clip": 0.01120184, + "auxiliary_loss_mlp": 0.01118392, + "balance_loss_clip": 1.00224328, + "balance_loss_mlp": 1.00070882, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.4972365059977946, + "language_loss": 0.85668504, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87907082, + "num_input_tokens_seen": 165173315, + "step": 7694, + "time_per_iteration": 2.6907272338867188 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.01117601, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.00077617, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 1.9181382229033452, + "language_loss": 0.78842461, + "learning_rate": 2.337405086561902e-06, + "loss": 0.81096292, + "num_input_tokens_seen": 165192395, + "step": 7695, + "time_per_iteration": 2.6603517532348633 + }, + { + "auxiliary_loss_clip": 0.01151535, + "auxiliary_loss_mlp": 0.01117237, + "balance_loss_clip": 1.00220716, + "balance_loss_mlp": 1.00060272, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.6878170236758854, + "language_loss": 0.7219981, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74468583, + "num_input_tokens_seen": 165211355, + "step": 7696, + "time_per_iteration": 2.5806350708007812 + }, + { + "auxiliary_loss_clip": 0.01136286, + "auxiliary_loss_mlp": 0.01118115, + "balance_loss_clip": 1.00221729, + "balance_loss_mlp": 1.00071764, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.476551906390626, + "language_loss": 0.69119585, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71373981, + "num_input_tokens_seen": 165229380, + "step": 7697, + "time_per_iteration": 2.625710964202881 + }, + { + "auxiliary_loss_clip": 0.01168483, + "auxiliary_loss_mlp": 0.0111747, + "balance_loss_clip": 1.00238919, + "balance_loss_mlp": 1.00074053, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.9136546936906091, + "language_loss": 0.84816742, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.87102693, + "num_input_tokens_seen": 165247200, + "step": 7698, + "time_per_iteration": 2.52608585357666 + }, + { + "auxiliary_loss_clip": 0.01168352, + "auxiliary_loss_mlp": 0.01117418, + "balance_loss_clip": 1.00234056, + "balance_loss_mlp": 1.00068843, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 1.836341603141498, + "language_loss": 0.71297264, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73583037, + "num_input_tokens_seen": 165265825, + "step": 7699, + "time_per_iteration": 2.5255520343780518 + }, + { + "auxiliary_loss_clip": 0.01093286, + "auxiliary_loss_mlp": 0.01118055, + "balance_loss_clip": 1.0025934, + "balance_loss_mlp": 1.00065744, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.6671946030558837, + "language_loss": 0.71469182, + "learning_rate": 2.335485529281996e-06, + "loss": 0.7368052, + "num_input_tokens_seen": 165284380, + "step": 7700, + "time_per_iteration": 2.748662233352661 + }, + { + "auxiliary_loss_clip": 0.01168349, + "auxiliary_loss_mlp": 0.00747412, + "balance_loss_clip": 1.00236261, + "balance_loss_mlp": 1.00033045, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 1.9783516256248093, + "language_loss": 0.72545052, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74460804, + "num_input_tokens_seen": 165300320, + "step": 7701, + "time_per_iteration": 2.528400421142578 + }, + { + "auxiliary_loss_clip": 0.01122297, + "auxiliary_loss_mlp": 0.01117754, + "balance_loss_clip": 1.00208354, + "balance_loss_mlp": 1.0007385, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 2.0758024975865452, + "language_loss": 0.64960253, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67200303, + "num_input_tokens_seen": 165318130, + "step": 7702, + "time_per_iteration": 2.7713043689727783 + }, + { + "auxiliary_loss_clip": 0.01134919, + "auxiliary_loss_mlp": 0.01116783, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00053048, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 2.0306038795893215, + "language_loss": 0.732113, + "learning_rate": 2.33433364213785e-06, + "loss": 0.75462997, + "num_input_tokens_seen": 165336225, + "step": 7703, + "time_per_iteration": 2.6003310680389404 + }, + { + "auxiliary_loss_clip": 0.01136949, + "auxiliary_loss_mlp": 0.01118345, + "balance_loss_clip": 1.00223899, + "balance_loss_mlp": 1.0007565, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.574229515316003, + "language_loss": 0.69041032, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.71296322, + "num_input_tokens_seen": 165355005, + "step": 7704, + "time_per_iteration": 2.678771495819092 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01117264, + "balance_loss_clip": 1.00223219, + "balance_loss_mlp": 1.00053418, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 2.369434051416109, + "language_loss": 0.81074715, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83343595, + "num_input_tokens_seen": 165374910, + "step": 7705, + "time_per_iteration": 2.595932960510254 + }, + { + "auxiliary_loss_clip": 0.01158214, + "auxiliary_loss_mlp": 0.01117828, + "balance_loss_clip": 1.00248814, + "balance_loss_mlp": 1.00062191, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 2.177334040771864, + "language_loss": 0.77579784, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79855824, + "num_input_tokens_seen": 165392590, + "step": 7706, + "time_per_iteration": 2.560708522796631 + }, + { + "auxiliary_loss_clip": 0.01134547, + "auxiliary_loss_mlp": 0.01116983, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.0007304, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.8196923459537104, + "language_loss": 0.69630826, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.71882361, + "num_input_tokens_seen": 165411195, + "step": 7707, + "time_per_iteration": 2.609853744506836 + }, + { + "auxiliary_loss_clip": 0.01136411, + "auxiliary_loss_mlp": 0.01117917, + "balance_loss_clip": 1.00219059, + "balance_loss_mlp": 1.00061488, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 6.179036382146904, + "language_loss": 0.61128205, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63382536, + "num_input_tokens_seen": 165430150, + "step": 7708, + "time_per_iteration": 2.7851805686950684 + }, + { + "auxiliary_loss_clip": 0.01119584, + "auxiliary_loss_mlp": 0.01117025, + "balance_loss_clip": 1.00201941, + "balance_loss_mlp": 1.00067699, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 3.276871261812145, + "language_loss": 0.77763385, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.79999995, + "num_input_tokens_seen": 165450595, + "step": 7709, + "time_per_iteration": 2.7591986656188965 + }, + { + "auxiliary_loss_clip": 0.01168411, + "auxiliary_loss_mlp": 0.01117558, + "balance_loss_clip": 1.00231147, + "balance_loss_mlp": 1.00082779, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 2.005679350120934, + "language_loss": 0.772578, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79543769, + "num_input_tokens_seen": 165469515, + "step": 7710, + "time_per_iteration": 2.557724714279175 + }, + { + "auxiliary_loss_clip": 0.01153587, + "auxiliary_loss_mlp": 0.01118568, + "balance_loss_clip": 1.00236249, + "balance_loss_mlp": 1.00059867, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 1.987341881381631, + "language_loss": 0.7332902, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75601178, + "num_input_tokens_seen": 165488125, + "step": 7711, + "time_per_iteration": 2.6165077686309814 + }, + { + "auxiliary_loss_clip": 0.01141205, + "auxiliary_loss_mlp": 0.01117728, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00080776, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.3059627833858085, + "language_loss": 0.71977258, + "learning_rate": 2.33087729766797e-06, + "loss": 0.7423619, + "num_input_tokens_seen": 165509225, + "step": 7712, + "time_per_iteration": 4.077153205871582 + }, + { + "auxiliary_loss_clip": 0.01136997, + "auxiliary_loss_mlp": 0.01119162, + "balance_loss_clip": 1.0021013, + "balance_loss_mlp": 1.00090647, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 1.7499641804726325, + "language_loss": 0.73186862, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75443023, + "num_input_tokens_seen": 165529945, + "step": 7713, + "time_per_iteration": 2.661947250366211 + }, + { + "auxiliary_loss_clip": 0.01121938, + "auxiliary_loss_mlp": 0.01118167, + "balance_loss_clip": 1.00216937, + "balance_loss_mlp": 1.00067389, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.3979077369836608, + "language_loss": 0.58332539, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60572642, + "num_input_tokens_seen": 165550690, + "step": 7714, + "time_per_iteration": 2.6700737476348877 + }, + { + "auxiliary_loss_clip": 0.01152131, + "auxiliary_loss_mlp": 0.01116882, + "balance_loss_clip": 1.00228763, + "balance_loss_mlp": 1.00072443, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 1.7002730448462766, + "language_loss": 0.70666778, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.7293579, + "num_input_tokens_seen": 165567775, + "step": 7715, + "time_per_iteration": 2.602688789367676 + }, + { + "auxiliary_loss_clip": 0.01168637, + "auxiliary_loss_mlp": 0.01118453, + "balance_loss_clip": 1.00231731, + "balance_loss_mlp": 1.00076938, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 18.652721096073133, + "language_loss": 0.68778849, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.71065938, + "num_input_tokens_seen": 165587010, + "step": 7716, + "time_per_iteration": 3.95133638381958 + }, + { + "auxiliary_loss_clip": 0.01168467, + "auxiliary_loss_mlp": 0.01118164, + "balance_loss_clip": 1.00227821, + "balance_loss_mlp": 1.00057554, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.7736375370424562, + "language_loss": 0.80915314, + "learning_rate": 2.328956666474691e-06, + "loss": 0.83201945, + "num_input_tokens_seen": 165607850, + "step": 7717, + "time_per_iteration": 2.552845001220703 + }, + { + "auxiliary_loss_clip": 0.01168445, + "auxiliary_loss_mlp": 0.01117835, + "balance_loss_clip": 1.00226903, + "balance_loss_mlp": 1.0007236, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.6743284210357188, + "language_loss": 0.7317403, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75460315, + "num_input_tokens_seen": 165627175, + "step": 7718, + "time_per_iteration": 5.453479766845703 + }, + { + "auxiliary_loss_clip": 0.01168288, + "auxiliary_loss_mlp": 0.00747381, + "balance_loss_clip": 1.00221014, + "balance_loss_mlp": 1.00019932, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.6141294600923872, + "language_loss": 0.70825207, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72740877, + "num_input_tokens_seen": 165648340, + "step": 7719, + "time_per_iteration": 2.637608766555786 + }, + { + "auxiliary_loss_clip": 0.0112309, + "auxiliary_loss_mlp": 0.01117776, + "balance_loss_clip": 1.00216842, + "balance_loss_mlp": 1.00075984, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 2.2514343396725063, + "language_loss": 0.86604393, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88845265, + "num_input_tokens_seen": 165667195, + "step": 7720, + "time_per_iteration": 2.64072847366333 + }, + { + "auxiliary_loss_clip": 0.01132565, + "auxiliary_loss_mlp": 0.01095821, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.00007236, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7181789033848558, + "language_loss": 0.55056036, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57284421, + "num_input_tokens_seen": 165726760, + "step": 7721, + "time_per_iteration": 3.2266602516174316 + }, + { + "auxiliary_loss_clip": 0.01136453, + "auxiliary_loss_mlp": 0.01117196, + "balance_loss_clip": 1.00220704, + "balance_loss_mlp": 1.00065684, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 2.186189025956084, + "language_loss": 0.79545617, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81799263, + "num_input_tokens_seen": 165745005, + "step": 7722, + "time_per_iteration": 2.662482500076294 + }, + { + "auxiliary_loss_clip": 0.01168486, + "auxiliary_loss_mlp": 0.01117807, + "balance_loss_clip": 1.00227952, + "balance_loss_mlp": 1.00050521, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.5748695325737692, + "language_loss": 0.77443117, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.79729408, + "num_input_tokens_seen": 165765750, + "step": 7723, + "time_per_iteration": 2.612934112548828 + }, + { + "auxiliary_loss_clip": 0.01037547, + "auxiliary_loss_mlp": 0.01116891, + "balance_loss_clip": 1.00166464, + "balance_loss_mlp": 1.0006386, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.7272852545754576, + "language_loss": 0.68406343, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70560777, + "num_input_tokens_seen": 165787515, + "step": 7724, + "time_per_iteration": 3.0576725006103516 + }, + { + "auxiliary_loss_clip": 0.01151797, + "auxiliary_loss_mlp": 0.01117715, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00060368, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 1.8004380264667497, + "language_loss": 0.66963458, + "learning_rate": 2.325883008671415e-06, + "loss": 0.6923297, + "num_input_tokens_seen": 165806675, + "step": 7725, + "time_per_iteration": 2.953355550765991 + }, + { + "auxiliary_loss_clip": 0.01151491, + "auxiliary_loss_mlp": 0.01115593, + "balance_loss_clip": 1.00230539, + "balance_loss_mlp": 1.00057983, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.9733281341327003, + "language_loss": 0.64921415, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67188495, + "num_input_tokens_seen": 165829835, + "step": 7726, + "time_per_iteration": 2.684220552444458 + }, + { + "auxiliary_loss_clip": 0.01138136, + "auxiliary_loss_mlp": 0.00747438, + "balance_loss_clip": 1.00249946, + "balance_loss_mlp": 1.00038421, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.965685327894766, + "language_loss": 0.74544054, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.76429629, + "num_input_tokens_seen": 165849380, + "step": 7727, + "time_per_iteration": 2.666665554046631 + }, + { + "auxiliary_loss_clip": 0.01134991, + "auxiliary_loss_mlp": 0.01117755, + "balance_loss_clip": 1.00218379, + "balance_loss_mlp": 1.00073862, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 1.9314579844480617, + "language_loss": 0.789294, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.81182146, + "num_input_tokens_seen": 165868620, + "step": 7728, + "time_per_iteration": 2.705289125442505 + }, + { + "auxiliary_loss_clip": 0.01121355, + "auxiliary_loss_mlp": 0.01117237, + "balance_loss_clip": 1.00230432, + "balance_loss_mlp": 1.0006032, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 1.915733602472038, + "language_loss": 0.75624859, + "learning_rate": 2.324345882723155e-06, + "loss": 0.77863455, + "num_input_tokens_seen": 165885915, + "step": 7729, + "time_per_iteration": 2.6732687950134277 + }, + { + "auxiliary_loss_clip": 0.01138279, + "auxiliary_loss_mlp": 0.01117168, + "balance_loss_clip": 1.00226355, + "balance_loss_mlp": 1.00062907, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 9.984614262080237, + "language_loss": 0.79566222, + "learning_rate": 2.323961570451588e-06, + "loss": 0.81821668, + "num_input_tokens_seen": 165905465, + "step": 7730, + "time_per_iteration": 2.6366114616394043 + }, + { + "auxiliary_loss_clip": 0.01168265, + "auxiliary_loss_mlp": 0.01117029, + "balance_loss_clip": 1.00230455, + "balance_loss_mlp": 1.00077581, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.6757568238648117, + "language_loss": 0.7738499, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79670286, + "num_input_tokens_seen": 165924640, + "step": 7731, + "time_per_iteration": 2.569723129272461 + }, + { + "auxiliary_loss_clip": 0.01120212, + "auxiliary_loss_mlp": 0.01116704, + "balance_loss_clip": 1.00214696, + "balance_loss_mlp": 1.0005461, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.925666705430909, + "language_loss": 0.65853143, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68090057, + "num_input_tokens_seen": 165945765, + "step": 7732, + "time_per_iteration": 2.7615840435028076 + }, + { + "auxiliary_loss_clip": 0.01137665, + "auxiliary_loss_mlp": 0.01117816, + "balance_loss_clip": 1.00219202, + "balance_loss_mlp": 1.00070524, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.3496077059647065, + "language_loss": 0.72745526, + "learning_rate": 2.32280855998725e-06, + "loss": 0.75001007, + "num_input_tokens_seen": 165964025, + "step": 7733, + "time_per_iteration": 2.6512670516967773 + }, + { + "auxiliary_loss_clip": 0.01164009, + "auxiliary_loss_mlp": 0.01095728, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 0.9999792, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2017959135396479, + "language_loss": 0.51867086, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54126823, + "num_input_tokens_seen": 166021950, + "step": 7734, + "time_per_iteration": 3.037729024887085 + }, + { + "auxiliary_loss_clip": 0.01135906, + "auxiliary_loss_mlp": 0.01117188, + "balance_loss_clip": 1.00215864, + "balance_loss_mlp": 1.000458, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 3.5573048943580856, + "language_loss": 0.75922942, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.78176033, + "num_input_tokens_seen": 166039675, + "step": 7735, + "time_per_iteration": 2.595184564590454 + }, + { + "auxiliary_loss_clip": 0.01120132, + "auxiliary_loss_mlp": 0.01116281, + "balance_loss_clip": 1.0022037, + "balance_loss_mlp": 1.00079131, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 2.052450948277691, + "language_loss": 0.70061785, + "learning_rate": 2.321655439354519e-06, + "loss": 0.72298199, + "num_input_tokens_seen": 166057745, + "step": 7736, + "time_per_iteration": 2.642712116241455 + }, + { + "auxiliary_loss_clip": 0.01168121, + "auxiliary_loss_mlp": 0.01115962, + "balance_loss_clip": 1.00231135, + "balance_loss_mlp": 1.00056791, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.7082583663423356, + "language_loss": 0.72500271, + "learning_rate": 2.321271041396427e-06, + "loss": 0.7478435, + "num_input_tokens_seen": 166076440, + "step": 7737, + "time_per_iteration": 2.5563254356384277 + }, + { + "auxiliary_loss_clip": 0.0114148, + "auxiliary_loss_mlp": 0.01117708, + "balance_loss_clip": 1.00271797, + "balance_loss_mlp": 1.00059724, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 2.101986718703526, + "language_loss": 0.84069002, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.86328185, + "num_input_tokens_seen": 166092520, + "step": 7738, + "time_per_iteration": 2.565380334854126 + }, + { + "auxiliary_loss_clip": 0.01147418, + "auxiliary_loss_mlp": 0.01095349, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 0.99998134, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7711520616111357, + "language_loss": 0.57868969, + "learning_rate": 2.320502208946932e-06, + "loss": 0.60111737, + "num_input_tokens_seen": 166156285, + "step": 7739, + "time_per_iteration": 3.18194317817688 + }, + { + "auxiliary_loss_clip": 0.01135029, + "auxiliary_loss_mlp": 0.01116797, + "balance_loss_clip": 1.00218868, + "balance_loss_mlp": 1.00083005, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.8911793065396145, + "language_loss": 0.85055292, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.87307119, + "num_input_tokens_seen": 166173455, + "step": 7740, + "time_per_iteration": 2.5929629802703857 + }, + { + "auxiliary_loss_clip": 0.01136279, + "auxiliary_loss_mlp": 0.0111725, + "balance_loss_clip": 1.00231457, + "balance_loss_mlp": 1.0006156, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 1.7744199937209857, + "language_loss": 0.75760227, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78013754, + "num_input_tokens_seen": 166194370, + "step": 7741, + "time_per_iteration": 2.6778764724731445 + }, + { + "auxiliary_loss_clip": 0.01120174, + "auxiliary_loss_mlp": 0.01117267, + "balance_loss_clip": 1.00210786, + "balance_loss_mlp": 1.00063229, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.6536014337934657, + "language_loss": 0.81208241, + "learning_rate": 2.319348869158064e-06, + "loss": 0.83445686, + "num_input_tokens_seen": 166213195, + "step": 7742, + "time_per_iteration": 2.6505730152130127 + }, + { + "auxiliary_loss_clip": 0.01135684, + "auxiliary_loss_mlp": 0.01117331, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00069618, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.6854214623318322, + "language_loss": 0.72356123, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74609137, + "num_input_tokens_seen": 166231350, + "step": 7743, + "time_per_iteration": 2.6129612922668457 + }, + { + "auxiliary_loss_clip": 0.01137421, + "auxiliary_loss_mlp": 0.01117226, + "balance_loss_clip": 1.00232553, + "balance_loss_mlp": 1.00059128, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.4211824612475423, + "language_loss": 0.71305454, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73560101, + "num_input_tokens_seen": 166250530, + "step": 7744, + "time_per_iteration": 2.597564458847046 + }, + { + "auxiliary_loss_clip": 0.01109185, + "auxiliary_loss_mlp": 0.01115437, + "balance_loss_clip": 1.00220573, + "balance_loss_mlp": 1.00051916, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.4715566576109542, + "language_loss": 0.85026211, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87250835, + "num_input_tokens_seen": 166272545, + "step": 7745, + "time_per_iteration": 2.8445000648498535 + }, + { + "auxiliary_loss_clip": 0.01152873, + "auxiliary_loss_mlp": 0.01116704, + "balance_loss_clip": 1.00229216, + "balance_loss_mlp": 1.00064158, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.3888019488233612, + "language_loss": 0.72954381, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75223953, + "num_input_tokens_seen": 166292135, + "step": 7746, + "time_per_iteration": 2.631570339202881 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.01115958, + "balance_loss_clip": 1.00217462, + "balance_loss_mlp": 1.00075436, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.585789937907805, + "language_loss": 0.69530755, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.71797961, + "num_input_tokens_seen": 166316710, + "step": 7747, + "time_per_iteration": 2.9441378116607666 + }, + { + "auxiliary_loss_clip": 0.01119583, + "auxiliary_loss_mlp": 0.0111745, + "balance_loss_clip": 1.00214446, + "balance_loss_mlp": 1.00062442, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.5126897128502914, + "language_loss": 0.67205954, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69442981, + "num_input_tokens_seen": 166338535, + "step": 7748, + "time_per_iteration": 2.760728120803833 + }, + { + "auxiliary_loss_clip": 0.01118169, + "auxiliary_loss_mlp": 0.01116973, + "balance_loss_clip": 1.00190604, + "balance_loss_mlp": 1.0006243, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.159853690673335, + "language_loss": 0.63618451, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.65853596, + "num_input_tokens_seen": 166355540, + "step": 7749, + "time_per_iteration": 2.6597092151641846 + }, + { + "auxiliary_loss_clip": 0.01153002, + "auxiliary_loss_mlp": 0.01118004, + "balance_loss_clip": 1.00229418, + "balance_loss_mlp": 1.00079751, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 3.888826517667341, + "language_loss": 0.74952704, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.77223706, + "num_input_tokens_seen": 166372635, + "step": 7750, + "time_per_iteration": 4.290260314941406 + }, + { + "auxiliary_loss_clip": 0.01142308, + "auxiliary_loss_mlp": 0.01117917, + "balance_loss_clip": 1.00220132, + "balance_loss_mlp": 1.00071073, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 1.8818193765194988, + "language_loss": 0.73578846, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.75839072, + "num_input_tokens_seen": 166393175, + "step": 7751, + "time_per_iteration": 2.9815759658813477 + }, + { + "auxiliary_loss_clip": 0.01119319, + "auxiliary_loss_mlp": 0.01116869, + "balance_loss_clip": 1.00209212, + "balance_loss_mlp": 1.00071192, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 1.825579838452634, + "language_loss": 0.73426467, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.75662661, + "num_input_tokens_seen": 166408630, + "step": 7752, + "time_per_iteration": 2.7183542251586914 + }, + { + "auxiliary_loss_clip": 0.01135389, + "auxiliary_loss_mlp": 0.01117424, + "balance_loss_clip": 1.00210941, + "balance_loss_mlp": 1.00069463, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.0127412125033963, + "language_loss": 0.69532031, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71784842, + "num_input_tokens_seen": 166428170, + "step": 7753, + "time_per_iteration": 2.6614022254943848 + }, + { + "auxiliary_loss_clip": 0.01134215, + "auxiliary_loss_mlp": 0.01116276, + "balance_loss_clip": 1.00214076, + "balance_loss_mlp": 1.0006907, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 1.7693957670751166, + "language_loss": 0.72820747, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.75071239, + "num_input_tokens_seen": 166446705, + "step": 7754, + "time_per_iteration": 4.093633413314819 + }, + { + "auxiliary_loss_clip": 0.01137151, + "auxiliary_loss_mlp": 0.01117064, + "balance_loss_clip": 1.00216365, + "balance_loss_mlp": 1.00062037, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.7614386067868653, + "language_loss": 0.78775048, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81029266, + "num_input_tokens_seen": 166466750, + "step": 7755, + "time_per_iteration": 2.626948118209839 + }, + { + "auxiliary_loss_clip": 0.0115156, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_clip": 1.00223327, + "balance_loss_mlp": 1.0005796, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.678573487664174, + "language_loss": 0.72428608, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74697292, + "num_input_tokens_seen": 166485400, + "step": 7756, + "time_per_iteration": 5.667840003967285 + }, + { + "auxiliary_loss_clip": 0.01151434, + "auxiliary_loss_mlp": 0.0111544, + "balance_loss_clip": 1.00209999, + "balance_loss_mlp": 1.0005219, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.9603157885933478, + "language_loss": 0.78532934, + "learning_rate": 2.313580543272274e-06, + "loss": 0.807998, + "num_input_tokens_seen": 166505730, + "step": 7757, + "time_per_iteration": 2.6072137355804443 + }, + { + "auxiliary_loss_clip": 0.01120038, + "auxiliary_loss_mlp": 0.01116947, + "balance_loss_clip": 1.00225842, + "balance_loss_mlp": 1.00050354, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 4.350263523256363, + "language_loss": 0.66568428, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68805414, + "num_input_tokens_seen": 166523770, + "step": 7758, + "time_per_iteration": 2.675278663635254 + }, + { + "auxiliary_loss_clip": 0.01136326, + "auxiliary_loss_mlp": 0.01116385, + "balance_loss_clip": 1.00223756, + "balance_loss_mlp": 1.00070453, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 2.1863230475048394, + "language_loss": 0.74654174, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.7690689, + "num_input_tokens_seen": 166542935, + "step": 7759, + "time_per_iteration": 2.5959064960479736 + }, + { + "auxiliary_loss_clip": 0.01136521, + "auxiliary_loss_mlp": 0.01117111, + "balance_loss_clip": 1.00220394, + "balance_loss_mlp": 1.00057232, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 3.9757334818241232, + "language_loss": 0.77653921, + "learning_rate": 2.312426555462893e-06, + "loss": 0.79907548, + "num_input_tokens_seen": 166563935, + "step": 7760, + "time_per_iteration": 2.6216225624084473 + }, + { + "auxiliary_loss_clip": 0.01135814, + "auxiliary_loss_mlp": 0.0111592, + "balance_loss_clip": 1.00216043, + "balance_loss_mlp": 1.00071573, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.7281642063373661, + "language_loss": 0.7397567, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76227403, + "num_input_tokens_seen": 166582175, + "step": 7761, + "time_per_iteration": 2.6221632957458496 + }, + { + "auxiliary_loss_clip": 0.0115153, + "auxiliary_loss_mlp": 0.01117886, + "balance_loss_clip": 1.00216532, + "balance_loss_mlp": 1.00067973, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.745132025967033, + "language_loss": 0.78743505, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.81012917, + "num_input_tokens_seen": 166601870, + "step": 7762, + "time_per_iteration": 2.570614814758301 + }, + { + "auxiliary_loss_clip": 0.01149078, + "auxiliary_loss_mlp": 0.01095367, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 0.99999982, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7956831325266986, + "language_loss": 0.59870696, + "learning_rate": 2.311272461028297e-06, + "loss": 0.62115145, + "num_input_tokens_seen": 166668960, + "step": 7763, + "time_per_iteration": 3.2630743980407715 + }, + { + "auxiliary_loss_clip": 0.01119654, + "auxiliary_loss_mlp": 0.01117947, + "balance_loss_clip": 1.00194371, + "balance_loss_mlp": 1.00074029, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 1.8295639669536896, + "language_loss": 0.78568137, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.80805743, + "num_input_tokens_seen": 166686110, + "step": 7764, + "time_per_iteration": 2.6862993240356445 + }, + { + "auxiliary_loss_clip": 0.01120904, + "auxiliary_loss_mlp": 0.01116529, + "balance_loss_clip": 1.00209188, + "balance_loss_mlp": 1.00065756, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 2.0396021555376658, + "language_loss": 0.72281563, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74519002, + "num_input_tokens_seen": 166703930, + "step": 7765, + "time_per_iteration": 2.6322622299194336 + }, + { + "auxiliary_loss_clip": 0.01118826, + "auxiliary_loss_mlp": 0.01118078, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00068116, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 2.9489272444458723, + "language_loss": 0.77809644, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.80046558, + "num_input_tokens_seen": 166719940, + "step": 7766, + "time_per_iteration": 2.6438143253326416 + }, + { + "auxiliary_loss_clip": 0.01153092, + "auxiliary_loss_mlp": 0.0111726, + "balance_loss_clip": 1.00225687, + "balance_loss_mlp": 1.00062573, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 2.4196671582343803, + "language_loss": 0.65089637, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67359984, + "num_input_tokens_seen": 166738285, + "step": 7767, + "time_per_iteration": 2.5654590129852295 + }, + { + "auxiliary_loss_clip": 0.01151679, + "auxiliary_loss_mlp": 0.0111695, + "balance_loss_clip": 1.00228059, + "balance_loss_mlp": 1.00069678, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 2.067073094708756, + "language_loss": 0.74153554, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76422179, + "num_input_tokens_seen": 166758170, + "step": 7768, + "time_per_iteration": 2.5944876670837402 + }, + { + "auxiliary_loss_clip": 0.01134857, + "auxiliary_loss_mlp": 0.01116108, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00052309, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.53463201624187, + "language_loss": 0.70906639, + "learning_rate": 2.308963953858982e-06, + "loss": 0.73157603, + "num_input_tokens_seen": 166775750, + "step": 7769, + "time_per_iteration": 2.5977911949157715 + }, + { + "auxiliary_loss_clip": 0.01168236, + "auxiliary_loss_mlp": 0.01116807, + "balance_loss_clip": 1.00217223, + "balance_loss_mlp": 1.00055432, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.8194585277772166, + "language_loss": 0.81267262, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83552313, + "num_input_tokens_seen": 166791720, + "step": 7770, + "time_per_iteration": 2.5253045558929443 + }, + { + "auxiliary_loss_clip": 0.01163801, + "auxiliary_loss_mlp": 0.01094588, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 0.99998313, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7940050102466281, + "language_loss": 0.55665922, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57924306, + "num_input_tokens_seen": 166856360, + "step": 7771, + "time_per_iteration": 3.119966983795166 + }, + { + "auxiliary_loss_clip": 0.01153292, + "auxiliary_loss_mlp": 0.00747436, + "balance_loss_clip": 1.00229406, + "balance_loss_mlp": 1.0003593, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.0350767853840934, + "language_loss": 0.659042, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.67804933, + "num_input_tokens_seen": 166875925, + "step": 7772, + "time_per_iteration": 2.6114912033081055 + }, + { + "auxiliary_loss_clip": 0.01151822, + "auxiliary_loss_mlp": 0.01116654, + "balance_loss_clip": 1.00219584, + "balance_loss_mlp": 1.0005914, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 1.8347427322151175, + "language_loss": 0.63970304, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.66238785, + "num_input_tokens_seen": 166896520, + "step": 7773, + "time_per_iteration": 2.7130162715911865 + }, + { + "auxiliary_loss_clip": 0.01136938, + "auxiliary_loss_mlp": 0.01116348, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00057173, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 2.111477544011243, + "language_loss": 0.7999357, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82246852, + "num_input_tokens_seen": 166915370, + "step": 7774, + "time_per_iteration": 2.612640142440796 + }, + { + "auxiliary_loss_clip": 0.01121793, + "auxiliary_loss_mlp": 0.0111579, + "balance_loss_clip": 1.0022099, + "balance_loss_mlp": 1.0003953, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.6692718058225264, + "language_loss": 0.77582669, + "learning_rate": 2.306655024915726e-06, + "loss": 0.79820246, + "num_input_tokens_seen": 166934875, + "step": 7775, + "time_per_iteration": 2.6797244548797607 + }, + { + "auxiliary_loss_clip": 0.01136973, + "auxiliary_loss_mlp": 0.01115806, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.00060189, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 2.19340774717859, + "language_loss": 0.69928133, + "learning_rate": 2.306270162640694e-06, + "loss": 0.72180909, + "num_input_tokens_seen": 166954285, + "step": 7776, + "time_per_iteration": 2.603492021560669 + }, + { + "auxiliary_loss_clip": 0.01151718, + "auxiliary_loss_mlp": 0.01115173, + "balance_loss_clip": 1.00212622, + "balance_loss_mlp": 1.00054097, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.9314103358412607, + "language_loss": 0.73576272, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75843167, + "num_input_tokens_seen": 166975975, + "step": 7777, + "time_per_iteration": 2.624246597290039 + }, + { + "auxiliary_loss_clip": 0.01151633, + "auxiliary_loss_mlp": 0.01116495, + "balance_loss_clip": 1.00217354, + "balance_loss_mlp": 1.00052857, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 3.67737288030771, + "language_loss": 0.69609797, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71877927, + "num_input_tokens_seen": 166996140, + "step": 7778, + "time_per_iteration": 2.582960844039917 + }, + { + "auxiliary_loss_clip": 0.01151544, + "auxiliary_loss_mlp": 0.01117635, + "balance_loss_clip": 1.00218654, + "balance_loss_mlp": 1.00061941, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 1.7131962956289393, + "language_loss": 0.73851776, + "learning_rate": 2.305115506191206e-06, + "loss": 0.76120961, + "num_input_tokens_seen": 167016105, + "step": 7779, + "time_per_iteration": 2.6073639392852783 + }, + { + "auxiliary_loss_clip": 0.01104878, + "auxiliary_loss_mlp": 0.01115166, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00053442, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.6149166691827532, + "language_loss": 0.72692537, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74912584, + "num_input_tokens_seen": 167036185, + "step": 7780, + "time_per_iteration": 2.692613124847412 + }, + { + "auxiliary_loss_clip": 0.01138136, + "auxiliary_loss_mlp": 0.0111807, + "balance_loss_clip": 1.00229311, + "balance_loss_mlp": 1.00057709, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 1.8409463193960847, + "language_loss": 0.74338067, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76594275, + "num_input_tokens_seen": 167054515, + "step": 7781, + "time_per_iteration": 2.675899028778076 + }, + { + "auxiliary_loss_clip": 0.01151464, + "auxiliary_loss_mlp": 0.01116927, + "balance_loss_clip": 1.00212157, + "balance_loss_mlp": 1.00067425, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.731860048951085, + "language_loss": 0.62413275, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.64681661, + "num_input_tokens_seen": 167077245, + "step": 7782, + "time_per_iteration": 2.6742169857025146 + }, + { + "auxiliary_loss_clip": 0.0113672, + "auxiliary_loss_mlp": 0.01116841, + "balance_loss_clip": 1.00215256, + "balance_loss_mlp": 1.00068355, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 2.2023127488826604, + "language_loss": 0.63367289, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65620852, + "num_input_tokens_seen": 167097235, + "step": 7783, + "time_per_iteration": 2.6652655601501465 + }, + { + "auxiliary_loss_clip": 0.01151687, + "auxiliary_loss_mlp": 0.0111765, + "balance_loss_clip": 1.0022428, + "balance_loss_mlp": 1.00063443, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.3866066671801507, + "language_loss": 0.68156576, + "learning_rate": 2.303190847569801e-06, + "loss": 0.70425916, + "num_input_tokens_seen": 167113155, + "step": 7784, + "time_per_iteration": 2.5660078525543213 + }, + { + "auxiliary_loss_clip": 0.01134961, + "auxiliary_loss_mlp": 0.01116192, + "balance_loss_clip": 1.00209069, + "balance_loss_mlp": 1.00060701, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 1.8445677116803394, + "language_loss": 0.83933443, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.86184597, + "num_input_tokens_seen": 167131765, + "step": 7785, + "time_per_iteration": 2.6049139499664307 + }, + { + "auxiliary_loss_clip": 0.01118483, + "auxiliary_loss_mlp": 0.01115676, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00056708, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 2.1015693555614003, + "language_loss": 0.77193522, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79427671, + "num_input_tokens_seen": 167149030, + "step": 7786, + "time_per_iteration": 2.6318540573120117 + }, + { + "auxiliary_loss_clip": 0.01151383, + "auxiliary_loss_mlp": 0.01114716, + "balance_loss_clip": 1.00217581, + "balance_loss_mlp": 1.00065637, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 1.9173381715086717, + "language_loss": 0.74019414, + "learning_rate": 2.302035914315856e-06, + "loss": 0.76285517, + "num_input_tokens_seen": 167167375, + "step": 7787, + "time_per_iteration": 2.5850863456726074 + }, + { + "auxiliary_loss_clip": 0.01136962, + "auxiliary_loss_mlp": 0.01116022, + "balance_loss_clip": 1.00226235, + "balance_loss_mlp": 1.00072312, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.9187452791941146, + "language_loss": 0.65404785, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67657769, + "num_input_tokens_seen": 167188065, + "step": 7788, + "time_per_iteration": 4.055980443954468 + }, + { + "auxiliary_loss_clip": 0.01151303, + "auxiliary_loss_mlp": 0.01115278, + "balance_loss_clip": 1.00209093, + "balance_loss_mlp": 1.00064659, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.6170150634790468, + "language_loss": 0.64324379, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.66590959, + "num_input_tokens_seen": 167209675, + "step": 7789, + "time_per_iteration": 2.6291980743408203 + }, + { + "auxiliary_loss_clip": 0.0114721, + "auxiliary_loss_mlp": 0.01094583, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 0.99997818, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.7033656306763728, + "language_loss": 0.61897397, + "learning_rate": 2.300880877982825e-06, + "loss": 0.64139187, + "num_input_tokens_seen": 167273940, + "step": 7790, + "time_per_iteration": 3.230943202972412 + }, + { + "auxiliary_loss_clip": 0.01121743, + "auxiliary_loss_mlp": 0.01115415, + "balance_loss_clip": 1.00213087, + "balance_loss_mlp": 1.00059247, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 2.0998504524249273, + "language_loss": 0.79591954, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81829107, + "num_input_tokens_seen": 167292730, + "step": 7791, + "time_per_iteration": 2.694355010986328 + }, + { + "auxiliary_loss_clip": 0.01151293, + "auxiliary_loss_mlp": 0.01115511, + "balance_loss_clip": 1.00226426, + "balance_loss_mlp": 1.00068843, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.639814222610456, + "language_loss": 0.75328535, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77595341, + "num_input_tokens_seen": 167313460, + "step": 7792, + "time_per_iteration": 4.049442529678345 + }, + { + "auxiliary_loss_clip": 0.01137916, + "auxiliary_loss_mlp": 0.0111556, + "balance_loss_clip": 1.00234103, + "balance_loss_mlp": 1.00073802, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.483648374648839, + "language_loss": 0.68365562, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70619035, + "num_input_tokens_seen": 167335385, + "step": 7793, + "time_per_iteration": 2.648928642272949 + }, + { + "auxiliary_loss_clip": 0.01151423, + "auxiliary_loss_mlp": 0.00747301, + "balance_loss_clip": 1.00235844, + "balance_loss_mlp": 1.00029647, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.589013237334802, + "language_loss": 0.73787749, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.75686479, + "num_input_tokens_seen": 167353625, + "step": 7794, + "time_per_iteration": 5.515044689178467 + }, + { + "auxiliary_loss_clip": 0.01120189, + "auxiliary_loss_mlp": 0.01116478, + "balance_loss_clip": 1.00217056, + "balance_loss_mlp": 1.00070214, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 2.110799085898629, + "language_loss": 0.63486123, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65722787, + "num_input_tokens_seen": 167374565, + "step": 7795, + "time_per_iteration": 2.6964073181152344 + }, + { + "auxiliary_loss_clip": 0.01123525, + "auxiliary_loss_mlp": 0.01116019, + "balance_loss_clip": 1.00227714, + "balance_loss_mlp": 1.00052905, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.6093698155621303, + "language_loss": 0.67949128, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70188677, + "num_input_tokens_seen": 167395010, + "step": 7796, + "time_per_iteration": 2.754263162612915 + }, + { + "auxiliary_loss_clip": 0.01168151, + "auxiliary_loss_mlp": 0.00747311, + "balance_loss_clip": 1.00228262, + "balance_loss_mlp": 1.00026631, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.8242129405002407, + "language_loss": 0.7006287, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.71978337, + "num_input_tokens_seen": 167415285, + "step": 7797, + "time_per_iteration": 2.57578182220459 + }, + { + "auxiliary_loss_clip": 0.01142503, + "auxiliary_loss_mlp": 0.01116576, + "balance_loss_clip": 1.00236905, + "balance_loss_mlp": 1.00060892, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 1.9000587799818291, + "language_loss": 0.67325282, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69584364, + "num_input_tokens_seen": 167432405, + "step": 7798, + "time_per_iteration": 2.5738017559051514 + }, + { + "auxiliary_loss_clip": 0.01149118, + "auxiliary_loss_mlp": 0.01094611, + "balance_loss_clip": 1.00192058, + "balance_loss_mlp": 1.0000062, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9195482354697527, + "language_loss": 0.64569485, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66813213, + "num_input_tokens_seen": 167499365, + "step": 7799, + "time_per_iteration": 3.3308398723602295 + }, + { + "auxiliary_loss_clip": 0.01120512, + "auxiliary_loss_mlp": 0.01115269, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00063765, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.4958021489499227, + "language_loss": 0.72213387, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.7444917, + "num_input_tokens_seen": 167520390, + "step": 7800, + "time_per_iteration": 2.727651834487915 + }, + { + "auxiliary_loss_clip": 0.01168098, + "auxiliary_loss_mlp": 0.01115142, + "balance_loss_clip": 1.00232649, + "balance_loss_mlp": 1.00060594, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 1.80006822530985, + "language_loss": 0.72296774, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74580014, + "num_input_tokens_seen": 167539865, + "step": 7801, + "time_per_iteration": 2.5712685585021973 + }, + { + "auxiliary_loss_clip": 0.0112156, + "auxiliary_loss_mlp": 0.01116289, + "balance_loss_clip": 1.00210917, + "balance_loss_mlp": 1.00060844, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 2.0049685812227804, + "language_loss": 0.62364721, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64602566, + "num_input_tokens_seen": 167558190, + "step": 7802, + "time_per_iteration": 2.640148401260376 + }, + { + "auxiliary_loss_clip": 0.0116817, + "auxiliary_loss_mlp": 0.01116799, + "balance_loss_clip": 1.00225437, + "balance_loss_mlp": 1.00073659, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 1.8527487322844445, + "language_loss": 0.73864532, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.76149499, + "num_input_tokens_seen": 167577685, + "step": 7803, + "time_per_iteration": 2.5693869590759277 + }, + { + "auxiliary_loss_clip": 0.01134673, + "auxiliary_loss_mlp": 0.00747319, + "balance_loss_clip": 1.00206971, + "balance_loss_mlp": 1.00025821, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 1.5384766186657621, + "language_loss": 0.77414721, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79296714, + "num_input_tokens_seen": 167596390, + "step": 7804, + "time_per_iteration": 2.614051103591919 + }, + { + "auxiliary_loss_clip": 0.01135202, + "auxiliary_loss_mlp": 0.0111652, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00055325, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.8344394395286434, + "language_loss": 0.77212691, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79464412, + "num_input_tokens_seen": 167614980, + "step": 7805, + "time_per_iteration": 2.645608901977539 + }, + { + "auxiliary_loss_clip": 0.01168288, + "auxiliary_loss_mlp": 0.01117557, + "balance_loss_clip": 1.0022471, + "balance_loss_mlp": 1.00092268, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.6177485149919186, + "language_loss": 0.82577586, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.84863436, + "num_input_tokens_seen": 167635895, + "step": 7806, + "time_per_iteration": 2.6271913051605225 + }, + { + "auxiliary_loss_clip": 0.01134961, + "auxiliary_loss_mlp": 0.011166, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00072849, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.9907081456724258, + "language_loss": 0.77301985, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79553545, + "num_input_tokens_seen": 167657440, + "step": 7807, + "time_per_iteration": 2.746816635131836 + }, + { + "auxiliary_loss_clip": 0.01135371, + "auxiliary_loss_mlp": 0.01117461, + "balance_loss_clip": 1.002092, + "balance_loss_mlp": 1.0006361, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 1.8335016292994473, + "language_loss": 0.5171513, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53967965, + "num_input_tokens_seen": 167675025, + "step": 7808, + "time_per_iteration": 2.5995662212371826 + }, + { + "auxiliary_loss_clip": 0.01098236, + "auxiliary_loss_mlp": 0.01095002, + "balance_loss_clip": 1.00163317, + "balance_loss_mlp": 1.00001621, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.839228595130534, + "language_loss": 0.57753569, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59946811, + "num_input_tokens_seen": 167729635, + "step": 7809, + "time_per_iteration": 3.1255290508270264 + }, + { + "auxiliary_loss_clip": 0.01109101, + "auxiliary_loss_mlp": 0.01116545, + "balance_loss_clip": 1.00225723, + "balance_loss_mlp": 1.00076878, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.4050447767302847, + "language_loss": 0.71557397, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.7378304, + "num_input_tokens_seen": 167745135, + "step": 7810, + "time_per_iteration": 2.7457711696624756 + }, + { + "auxiliary_loss_clip": 0.01151578, + "auxiliary_loss_mlp": 0.01116527, + "balance_loss_clip": 1.00225329, + "balance_loss_mlp": 1.00065541, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 1.751842973267171, + "language_loss": 0.81184179, + "learning_rate": 2.29279277055369e-06, + "loss": 0.83452284, + "num_input_tokens_seen": 167763875, + "step": 7811, + "time_per_iteration": 2.6270217895507812 + }, + { + "auxiliary_loss_clip": 0.01151942, + "auxiliary_loss_mlp": 0.01116882, + "balance_loss_clip": 1.00220299, + "balance_loss_mlp": 1.00072467, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 2.1072879255665837, + "language_loss": 0.80476028, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82744849, + "num_input_tokens_seen": 167784895, + "step": 7812, + "time_per_iteration": 2.605243444442749 + }, + { + "auxiliary_loss_clip": 0.01111276, + "auxiliary_loss_mlp": 0.01115233, + "balance_loss_clip": 1.00219965, + "balance_loss_mlp": 1.00079155, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 4.624445345020415, + "language_loss": 0.74400532, + "learning_rate": 2.292022217117477e-06, + "loss": 0.7662704, + "num_input_tokens_seen": 167803185, + "step": 7813, + "time_per_iteration": 2.681462049484253 + }, + { + "auxiliary_loss_clip": 0.01134953, + "auxiliary_loss_mlp": 0.01115809, + "balance_loss_clip": 1.00205743, + "balance_loss_mlp": 1.00060534, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.298488758089737, + "language_loss": 0.84507686, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86758447, + "num_input_tokens_seen": 167816550, + "step": 7814, + "time_per_iteration": 2.578181028366089 + }, + { + "auxiliary_loss_clip": 0.01138055, + "auxiliary_loss_mlp": 0.01115004, + "balance_loss_clip": 1.00212216, + "balance_loss_mlp": 1.00075412, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 2.013021879729851, + "language_loss": 0.81602216, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83855277, + "num_input_tokens_seen": 167831845, + "step": 7815, + "time_per_iteration": 2.5561249256134033 + }, + { + "auxiliary_loss_clip": 0.01101332, + "auxiliary_loss_mlp": 0.01115603, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00068474, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 1.8841232461541784, + "language_loss": 0.77707046, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79923975, + "num_input_tokens_seen": 167850360, + "step": 7816, + "time_per_iteration": 2.718264102935791 + }, + { + "auxiliary_loss_clip": 0.01163823, + "auxiliary_loss_mlp": 0.01094593, + "balance_loss_clip": 1.00201976, + "balance_loss_mlp": 0.99998826, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8609859867845711, + "language_loss": 0.59029895, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61288309, + "num_input_tokens_seen": 167908660, + "step": 7817, + "time_per_iteration": 3.1058928966522217 + }, + { + "auxiliary_loss_clip": 0.0113583, + "auxiliary_loss_mlp": 0.01115172, + "balance_loss_clip": 1.0021379, + "balance_loss_mlp": 1.00054061, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.652035123118468, + "language_loss": 0.79280555, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.8153156, + "num_input_tokens_seen": 167927905, + "step": 7818, + "time_per_iteration": 2.626485586166382 + }, + { + "auxiliary_loss_clip": 0.01168055, + "auxiliary_loss_mlp": 0.01116082, + "balance_loss_clip": 1.00217533, + "balance_loss_mlp": 1.00059152, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 2.0486166376183306, + "language_loss": 0.84027493, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86311632, + "num_input_tokens_seen": 167945995, + "step": 7819, + "time_per_iteration": 2.5282983779907227 + }, + { + "auxiliary_loss_clip": 0.01123356, + "auxiliary_loss_mlp": 0.01116425, + "balance_loss_clip": 1.00207412, + "balance_loss_mlp": 1.00055313, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.0582893841186225, + "language_loss": 0.75412601, + "learning_rate": 2.289324932042186e-06, + "loss": 0.77652383, + "num_input_tokens_seen": 167963380, + "step": 7820, + "time_per_iteration": 2.6384665966033936 + }, + { + "auxiliary_loss_clip": 0.01151535, + "auxiliary_loss_mlp": 0.01115837, + "balance_loss_clip": 1.00216007, + "balance_loss_mlp": 1.00063312, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 1.9277949699801313, + "language_loss": 0.73991275, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76258647, + "num_input_tokens_seen": 167981740, + "step": 7821, + "time_per_iteration": 2.5359647274017334 + }, + { + "auxiliary_loss_clip": 0.01167963, + "auxiliary_loss_mlp": 0.01115273, + "balance_loss_clip": 1.00227344, + "balance_loss_mlp": 1.00064194, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.689828160570216, + "language_loss": 0.88765132, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91048366, + "num_input_tokens_seen": 167999380, + "step": 7822, + "time_per_iteration": 2.550410509109497 + }, + { + "auxiliary_loss_clip": 0.01151182, + "auxiliary_loss_mlp": 0.01116155, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00056982, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.666494351134621, + "language_loss": 0.79517698, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.81785035, + "num_input_tokens_seen": 168018395, + "step": 7823, + "time_per_iteration": 2.5756354331970215 + }, + { + "auxiliary_loss_clip": 0.01131439, + "auxiliary_loss_mlp": 0.01094228, + "balance_loss_clip": 1.00201046, + "balance_loss_mlp": 1.00000536, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.6902124620625308, + "language_loss": 0.56620038, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58845699, + "num_input_tokens_seen": 168084080, + "step": 7824, + "time_per_iteration": 3.2605934143066406 + }, + { + "auxiliary_loss_clip": 0.01136449, + "auxiliary_loss_mlp": 0.011163, + "balance_loss_clip": 1.00213194, + "balance_loss_mlp": 1.00071502, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.9118898546286704, + "language_loss": 0.81315386, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83568132, + "num_input_tokens_seen": 168101555, + "step": 7825, + "time_per_iteration": 4.033329486846924 + }, + { + "auxiliary_loss_clip": 0.01136009, + "auxiliary_loss_mlp": 0.01115899, + "balance_loss_clip": 1.00215423, + "balance_loss_mlp": 1.00069499, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.6020628525519682, + "language_loss": 0.66802424, + "learning_rate": 2.287012545338324e-06, + "loss": 0.69054329, + "num_input_tokens_seen": 168121530, + "step": 7826, + "time_per_iteration": 2.6310672760009766 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01115388, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00066137, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 1.7793835023776903, + "language_loss": 0.8410306, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86354715, + "num_input_tokens_seen": 168140335, + "step": 7827, + "time_per_iteration": 2.613651990890503 + }, + { + "auxiliary_loss_clip": 0.01132615, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00006866, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.797448138779094, + "language_loss": 0.55724275, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57951176, + "num_input_tokens_seen": 168200535, + "step": 7828, + "time_per_iteration": 3.157431125640869 + }, + { + "auxiliary_loss_clip": 0.0116788, + "auxiliary_loss_mlp": 0.01115396, + "balance_loss_clip": 1.00220156, + "balance_loss_mlp": 1.00066876, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.8604887238016876, + "language_loss": 0.8056699, + "learning_rate": 2.285856204861245e-06, + "loss": 0.82850271, + "num_input_tokens_seen": 168219610, + "step": 7829, + "time_per_iteration": 2.5179531574249268 + }, + { + "auxiliary_loss_clip": 0.01168018, + "auxiliary_loss_mlp": 0.01114619, + "balance_loss_clip": 1.00234449, + "balance_loss_mlp": 1.0006547, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.3982614609839104, + "language_loss": 0.75923026, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78205663, + "num_input_tokens_seen": 168242505, + "step": 7830, + "time_per_iteration": 3.9877688884735107 + }, + { + "auxiliary_loss_clip": 0.01120802, + "auxiliary_loss_mlp": 0.01115615, + "balance_loss_clip": 1.00216758, + "balance_loss_mlp": 1.00069737, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 2.9444657483103223, + "language_loss": 0.7893706, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.8117348, + "num_input_tokens_seen": 168260220, + "step": 7831, + "time_per_iteration": 4.084066152572632 + }, + { + "auxiliary_loss_clip": 0.01127673, + "auxiliary_loss_mlp": 0.01116723, + "balance_loss_clip": 1.00242662, + "balance_loss_mlp": 1.00056577, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 2.2581182154278707, + "language_loss": 0.75578809, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.7782321, + "num_input_tokens_seen": 168277360, + "step": 7832, + "time_per_iteration": 4.130492925643921 + }, + { + "auxiliary_loss_clip": 0.01134485, + "auxiliary_loss_mlp": 0.01114195, + "balance_loss_clip": 1.00220942, + "balance_loss_mlp": 1.00051749, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 2.4784941846538984, + "language_loss": 0.7460469, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76853371, + "num_input_tokens_seen": 168296605, + "step": 7833, + "time_per_iteration": 2.6104753017425537 + }, + { + "auxiliary_loss_clip": 0.01151379, + "auxiliary_loss_mlp": 0.01115841, + "balance_loss_clip": 1.00224376, + "balance_loss_mlp": 1.00063765, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.648738716952697, + "language_loss": 0.75808036, + "learning_rate": 2.283928754133762e-06, + "loss": 0.78075254, + "num_input_tokens_seen": 168316205, + "step": 7834, + "time_per_iteration": 2.5734448432922363 + }, + { + "auxiliary_loss_clip": 0.01101547, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_clip": 1.00206029, + "balance_loss_mlp": 1.00068021, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.8366281313128896, + "language_loss": 0.65995955, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68213093, + "num_input_tokens_seen": 168338935, + "step": 7835, + "time_per_iteration": 2.8721072673797607 + }, + { + "auxiliary_loss_clip": 0.0114891, + "auxiliary_loss_mlp": 0.00745917, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 0.9997139, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8704823985097193, + "language_loss": 0.62183499, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64078325, + "num_input_tokens_seen": 168392800, + "step": 7836, + "time_per_iteration": 3.131505250930786 + }, + { + "auxiliary_loss_clip": 0.0112191, + "auxiliary_loss_mlp": 0.00747316, + "balance_loss_clip": 1.00217009, + "balance_loss_mlp": 1.00024247, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 2.1807254219121806, + "language_loss": 0.69668341, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71537566, + "num_input_tokens_seen": 168412940, + "step": 7837, + "time_per_iteration": 2.70430326461792 + }, + { + "auxiliary_loss_clip": 0.01157684, + "auxiliary_loss_mlp": 0.01115885, + "balance_loss_clip": 1.00232399, + "balance_loss_mlp": 1.00058591, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 2.525549611594877, + "language_loss": 0.66255057, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68528622, + "num_input_tokens_seen": 168431995, + "step": 7838, + "time_per_iteration": 2.562074899673462 + }, + { + "auxiliary_loss_clip": 0.0113624, + "auxiliary_loss_mlp": 0.01116684, + "balance_loss_clip": 1.00211716, + "balance_loss_mlp": 1.00052667, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.9027934025658404, + "language_loss": 0.77319515, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79572439, + "num_input_tokens_seen": 168454585, + "step": 7839, + "time_per_iteration": 2.646127700805664 + }, + { + "auxiliary_loss_clip": 0.01117394, + "auxiliary_loss_mlp": 0.01115012, + "balance_loss_clip": 1.00214064, + "balance_loss_mlp": 1.0006665, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 2.1001747564689413, + "language_loss": 0.72340226, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.74572635, + "num_input_tokens_seen": 168471265, + "step": 7840, + "time_per_iteration": 2.682328224182129 + }, + { + "auxiliary_loss_clip": 0.01134971, + "auxiliary_loss_mlp": 0.01115805, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00050592, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.6467510919033765, + "language_loss": 0.75400758, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77651542, + "num_input_tokens_seen": 168491360, + "step": 7841, + "time_per_iteration": 2.6361677646636963 + }, + { + "auxiliary_loss_clip": 0.01141182, + "auxiliary_loss_mlp": 0.01115431, + "balance_loss_clip": 1.00206113, + "balance_loss_mlp": 1.00070393, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.7822749058930152, + "language_loss": 0.7038793, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72644538, + "num_input_tokens_seen": 168511335, + "step": 7842, + "time_per_iteration": 2.6098012924194336 + }, + { + "auxiliary_loss_clip": 0.01152394, + "auxiliary_loss_mlp": 0.01115876, + "balance_loss_clip": 1.00234175, + "balance_loss_mlp": 1.00057638, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 4.185792145726555, + "language_loss": 0.78577995, + "learning_rate": 2.280458665756177e-06, + "loss": 0.80846262, + "num_input_tokens_seen": 168529920, + "step": 7843, + "time_per_iteration": 2.560457706451416 + }, + { + "auxiliary_loss_clip": 0.011576, + "auxiliary_loss_mlp": 0.01115707, + "balance_loss_clip": 1.00230145, + "balance_loss_mlp": 1.00050306, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.6330930308212541, + "language_loss": 0.74376059, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76649368, + "num_input_tokens_seen": 168550595, + "step": 7844, + "time_per_iteration": 2.615522623062134 + }, + { + "auxiliary_loss_clip": 0.01134284, + "auxiliary_loss_mlp": 0.01115401, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00086498, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.8747228132304754, + "language_loss": 0.78588825, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80838513, + "num_input_tokens_seen": 168569765, + "step": 7845, + "time_per_iteration": 2.591348171234131 + }, + { + "auxiliary_loss_clip": 0.01152377, + "auxiliary_loss_mlp": 0.01115272, + "balance_loss_clip": 1.00211155, + "balance_loss_mlp": 1.00064027, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.2822221793969684, + "language_loss": 0.73366868, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75634515, + "num_input_tokens_seen": 168591525, + "step": 7846, + "time_per_iteration": 2.6071393489837646 + }, + { + "auxiliary_loss_clip": 0.01151073, + "auxiliary_loss_mlp": 0.01114558, + "balance_loss_clip": 1.00210595, + "balance_loss_mlp": 1.00049877, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.2822060002706457, + "language_loss": 0.74154103, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76419735, + "num_input_tokens_seen": 168611235, + "step": 7847, + "time_per_iteration": 2.661052942276001 + }, + { + "auxiliary_loss_clip": 0.01102805, + "auxiliary_loss_mlp": 0.01115319, + "balance_loss_clip": 1.00210333, + "balance_loss_mlp": 1.00068712, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 2.1259085948046694, + "language_loss": 0.80794662, + "learning_rate": 2.278530465971703e-06, + "loss": 0.83012784, + "num_input_tokens_seen": 168628710, + "step": 7848, + "time_per_iteration": 2.6777608394622803 + }, + { + "auxiliary_loss_clip": 0.01153237, + "auxiliary_loss_mlp": 0.01117297, + "balance_loss_clip": 1.00241709, + "balance_loss_mlp": 1.0006628, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.9073286122124045, + "language_loss": 0.70526683, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72797215, + "num_input_tokens_seen": 168645645, + "step": 7849, + "time_per_iteration": 2.601562261581421 + }, + { + "auxiliary_loss_clip": 0.01120242, + "auxiliary_loss_mlp": 0.01116665, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.00069773, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.834403286726089, + "language_loss": 0.69395351, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71632254, + "num_input_tokens_seen": 168664165, + "step": 7850, + "time_per_iteration": 2.6416680812835693 + }, + { + "auxiliary_loss_clip": 0.01088092, + "auxiliary_loss_mlp": 0.01116134, + "balance_loss_clip": 1.00192916, + "balance_loss_mlp": 1.00073981, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 2.1594091481933444, + "language_loss": 0.74897933, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.7710216, + "num_input_tokens_seen": 168681940, + "step": 7851, + "time_per_iteration": 2.7472329139709473 + }, + { + "auxiliary_loss_clip": 0.01089594, + "auxiliary_loss_mlp": 0.0111589, + "balance_loss_clip": 1.0020932, + "balance_loss_mlp": 1.00059092, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 1.9031768859183422, + "language_loss": 0.76260662, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78466147, + "num_input_tokens_seen": 168698830, + "step": 7852, + "time_per_iteration": 2.7052392959594727 + }, + { + "auxiliary_loss_clip": 0.01119482, + "auxiliary_loss_mlp": 0.01115232, + "balance_loss_clip": 1.00208759, + "balance_loss_mlp": 1.0006001, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.7363737972531974, + "language_loss": 0.69236374, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71471089, + "num_input_tokens_seen": 168718305, + "step": 7853, + "time_per_iteration": 2.6558268070220947 + }, + { + "auxiliary_loss_clip": 0.01101832, + "auxiliary_loss_mlp": 0.0109421, + "balance_loss_clip": 1.00178778, + "balance_loss_mlp": 0.99998713, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.6892890626436653, + "language_loss": 0.50150788, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52346832, + "num_input_tokens_seen": 168782365, + "step": 7854, + "time_per_iteration": 3.4315547943115234 + }, + { + "auxiliary_loss_clip": 0.01151617, + "auxiliary_loss_mlp": 0.01116613, + "balance_loss_clip": 1.00213814, + "balance_loss_mlp": 1.00064611, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.9673593109918308, + "language_loss": 0.63633096, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.65901327, + "num_input_tokens_seen": 168800485, + "step": 7855, + "time_per_iteration": 2.582637310028076 + }, + { + "auxiliary_loss_clip": 0.01151215, + "auxiliary_loss_mlp": 0.0111533, + "balance_loss_clip": 1.00216722, + "balance_loss_mlp": 1.00060344, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.9467786981395832, + "language_loss": 0.75808334, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.78074872, + "num_input_tokens_seen": 168818965, + "step": 7856, + "time_per_iteration": 2.700438976287842 + }, + { + "auxiliary_loss_clip": 0.01134448, + "auxiliary_loss_mlp": 0.01115436, + "balance_loss_clip": 1.00211644, + "balance_loss_mlp": 1.00061393, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 10.56501469132353, + "language_loss": 0.74966228, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.77216113, + "num_input_tokens_seen": 168840355, + "step": 7857, + "time_per_iteration": 2.699472665786743 + }, + { + "auxiliary_loss_clip": 0.01135649, + "auxiliary_loss_mlp": 0.01114935, + "balance_loss_clip": 1.00215662, + "balance_loss_mlp": 1.00068462, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.538208343057256, + "language_loss": 0.64629185, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66879767, + "num_input_tokens_seen": 168861765, + "step": 7858, + "time_per_iteration": 2.6893982887268066 + }, + { + "auxiliary_loss_clip": 0.01152868, + "auxiliary_loss_mlp": 0.00747339, + "balance_loss_clip": 1.0022229, + "balance_loss_mlp": 1.00038981, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.9686206839371987, + "language_loss": 0.70327842, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.7222805, + "num_input_tokens_seen": 168881310, + "step": 7859, + "time_per_iteration": 2.6225874423980713 + }, + { + "auxiliary_loss_clip": 0.01168225, + "auxiliary_loss_mlp": 0.01115534, + "balance_loss_clip": 1.00234044, + "balance_loss_mlp": 1.00061584, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.760021665140894, + "language_loss": 0.61926919, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64210677, + "num_input_tokens_seen": 168899470, + "step": 7860, + "time_per_iteration": 2.531450033187866 + }, + { + "auxiliary_loss_clip": 0.01134183, + "auxiliary_loss_mlp": 0.01115798, + "balance_loss_clip": 1.00223041, + "balance_loss_mlp": 1.00078511, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.1708475037206965, + "language_loss": 0.7199018, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.7424016, + "num_input_tokens_seen": 168921495, + "step": 7861, + "time_per_iteration": 2.727260112762451 + }, + { + "auxiliary_loss_clip": 0.01136235, + "auxiliary_loss_mlp": 0.01115288, + "balance_loss_clip": 1.00229716, + "balance_loss_mlp": 1.00065649, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.91723833162623, + "language_loss": 0.84960079, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87211603, + "num_input_tokens_seen": 168940515, + "step": 7862, + "time_per_iteration": 2.606462240219116 + }, + { + "auxiliary_loss_clip": 0.01168152, + "auxiliary_loss_mlp": 0.01115507, + "balance_loss_clip": 1.00239277, + "balance_loss_mlp": 1.00049376, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 2.1765852005678408, + "language_loss": 0.84354633, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86638296, + "num_input_tokens_seen": 168958340, + "step": 7863, + "time_per_iteration": 4.068113327026367 + }, + { + "auxiliary_loss_clip": 0.01134484, + "auxiliary_loss_mlp": 0.01115724, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00071073, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.9913289497462932, + "language_loss": 0.65781724, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68031931, + "num_input_tokens_seen": 168974850, + "step": 7864, + "time_per_iteration": 2.606132745742798 + }, + { + "auxiliary_loss_clip": 0.01167977, + "auxiliary_loss_mlp": 0.01115505, + "balance_loss_clip": 1.00226367, + "balance_loss_mlp": 1.00058722, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 2.303354010490662, + "language_loss": 0.65495253, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67778742, + "num_input_tokens_seen": 168992860, + "step": 7865, + "time_per_iteration": 2.5099215507507324 + }, + { + "auxiliary_loss_clip": 0.01136045, + "auxiliary_loss_mlp": 0.00747387, + "balance_loss_clip": 1.00223637, + "balance_loss_mlp": 1.0004164, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 2.5664283404826342, + "language_loss": 0.74529529, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76412958, + "num_input_tokens_seen": 169010325, + "step": 7866, + "time_per_iteration": 2.6338694095611572 + }, + { + "auxiliary_loss_clip": 0.01168031, + "auxiliary_loss_mlp": 0.01115711, + "balance_loss_clip": 1.00226307, + "balance_loss_mlp": 1.00050676, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 2.250153845265494, + "language_loss": 0.82816046, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85099787, + "num_input_tokens_seen": 169029840, + "step": 7867, + "time_per_iteration": 3.922335624694824 + }, + { + "auxiliary_loss_clip": 0.01151029, + "auxiliary_loss_mlp": 0.01114939, + "balance_loss_clip": 1.0021224, + "balance_loss_mlp": 1.00059366, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.6370599016468372, + "language_loss": 0.79630494, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.8189646, + "num_input_tokens_seen": 169049975, + "step": 7868, + "time_per_iteration": 2.5722100734710693 + }, + { + "auxiliary_loss_clip": 0.01072702, + "auxiliary_loss_mlp": 0.01115547, + "balance_loss_clip": 1.00185978, + "balance_loss_mlp": 1.00062871, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 1.9365966593844481, + "language_loss": 0.75113142, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.77301383, + "num_input_tokens_seen": 169069540, + "step": 7869, + "time_per_iteration": 4.22061562538147 + }, + { + "auxiliary_loss_clip": 0.0113595, + "auxiliary_loss_mlp": 0.0111611, + "balance_loss_clip": 1.00216293, + "balance_loss_mlp": 1.00071585, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 4.069207862070512, + "language_loss": 0.73595625, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.75847685, + "num_input_tokens_seen": 169089940, + "step": 7870, + "time_per_iteration": 4.0966715812683105 + }, + { + "auxiliary_loss_clip": 0.01168138, + "auxiliary_loss_mlp": 0.01117336, + "balance_loss_clip": 1.00232363, + "balance_loss_mlp": 1.00070131, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 6.482642107294238, + "language_loss": 0.80993736, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83279216, + "num_input_tokens_seen": 169109650, + "step": 7871, + "time_per_iteration": 2.5515811443328857 + }, + { + "auxiliary_loss_clip": 0.01153038, + "auxiliary_loss_mlp": 0.01116132, + "balance_loss_clip": 1.00223327, + "balance_loss_mlp": 1.00054646, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.867308560326361, + "language_loss": 0.75976491, + "learning_rate": 2.269271463701879e-06, + "loss": 0.78245664, + "num_input_tokens_seen": 169128990, + "step": 7872, + "time_per_iteration": 2.583203077316284 + }, + { + "auxiliary_loss_clip": 0.01118826, + "auxiliary_loss_mlp": 0.0111553, + "balance_loss_clip": 1.00209975, + "balance_loss_mlp": 1.00061238, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 1.9143342057826933, + "language_loss": 0.67813325, + "learning_rate": 2.268885542903428e-06, + "loss": 0.70047677, + "num_input_tokens_seen": 169154645, + "step": 7873, + "time_per_iteration": 2.8321046829223633 + }, + { + "auxiliary_loss_clip": 0.01152534, + "auxiliary_loss_mlp": 0.01115377, + "balance_loss_clip": 1.0022409, + "balance_loss_mlp": 1.00045919, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.4808016372414712, + "language_loss": 0.72734737, + "learning_rate": 2.26849961190881e-06, + "loss": 0.75002646, + "num_input_tokens_seen": 169174995, + "step": 7874, + "time_per_iteration": 2.5796608924865723 + }, + { + "auxiliary_loss_clip": 0.01136991, + "auxiliary_loss_mlp": 0.01115435, + "balance_loss_clip": 1.00225306, + "balance_loss_mlp": 1.00070763, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.4432640141878044, + "language_loss": 0.65266931, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67519361, + "num_input_tokens_seen": 169191815, + "step": 7875, + "time_per_iteration": 2.573962450027466 + }, + { + "auxiliary_loss_clip": 0.01105633, + "auxiliary_loss_mlp": 0.01116459, + "balance_loss_clip": 1.00224006, + "balance_loss_mlp": 1.00068343, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.0440104776775594, + "language_loss": 0.81216711, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83438802, + "num_input_tokens_seen": 169210430, + "step": 7876, + "time_per_iteration": 2.7661292552948 + }, + { + "auxiliary_loss_clip": 0.01142437, + "auxiliary_loss_mlp": 0.01115054, + "balance_loss_clip": 1.00239611, + "balance_loss_mlp": 1.00061285, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.7840630847415422, + "language_loss": 0.78856599, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81114089, + "num_input_tokens_seen": 169229295, + "step": 7877, + "time_per_iteration": 2.575300693511963 + }, + { + "auxiliary_loss_clip": 0.01152458, + "auxiliary_loss_mlp": 0.0074744, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.00036502, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 1.7379438951885173, + "language_loss": 0.70681608, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72581512, + "num_input_tokens_seen": 169247855, + "step": 7878, + "time_per_iteration": 2.5778796672821045 + }, + { + "auxiliary_loss_clip": 0.01118457, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00071275, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.535327911033768, + "language_loss": 0.75201166, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77434206, + "num_input_tokens_seen": 169268860, + "step": 7879, + "time_per_iteration": 2.6792359352111816 + }, + { + "auxiliary_loss_clip": 0.01131213, + "auxiliary_loss_mlp": 0.0109385, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00000823, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7285517697870327, + "language_loss": 0.61375415, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63600481, + "num_input_tokens_seen": 169331855, + "step": 7880, + "time_per_iteration": 3.195340156555176 + }, + { + "auxiliary_loss_clip": 0.01136474, + "auxiliary_loss_mlp": 0.01114878, + "balance_loss_clip": 1.0021652, + "balance_loss_mlp": 1.00062788, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.5568668785595141, + "language_loss": 0.68215781, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70467138, + "num_input_tokens_seen": 169352175, + "step": 7881, + "time_per_iteration": 2.65028977394104 + }, + { + "auxiliary_loss_clip": 0.01069925, + "auxiliary_loss_mlp": 0.01115084, + "balance_loss_clip": 1.00181806, + "balance_loss_mlp": 1.00054789, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.788057652910685, + "language_loss": 0.77229208, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79414213, + "num_input_tokens_seen": 169371215, + "step": 7882, + "time_per_iteration": 2.8310184478759766 + }, + { + "auxiliary_loss_clip": 0.01151311, + "auxiliary_loss_mlp": 0.01115468, + "balance_loss_clip": 1.0022217, + "balance_loss_mlp": 1.0006454, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.432989940395987, + "language_loss": 0.76338506, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78605288, + "num_input_tokens_seen": 169391745, + "step": 7883, + "time_per_iteration": 2.603649616241455 + }, + { + "auxiliary_loss_clip": 0.0113551, + "auxiliary_loss_mlp": 0.01115175, + "balance_loss_clip": 1.00213015, + "balance_loss_mlp": 1.00054336, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 2.0525696926598553, + "language_loss": 0.72040844, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74291527, + "num_input_tokens_seen": 169409845, + "step": 7884, + "time_per_iteration": 2.606990098953247 + }, + { + "auxiliary_loss_clip": 0.01151253, + "auxiliary_loss_mlp": 0.0111678, + "balance_loss_clip": 1.00210893, + "balance_loss_mlp": 1.00052691, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 2.294055849098416, + "language_loss": 0.82416892, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.8468492, + "num_input_tokens_seen": 169426085, + "step": 7885, + "time_per_iteration": 2.525109052658081 + }, + { + "auxiliary_loss_clip": 0.01138122, + "auxiliary_loss_mlp": 0.01116372, + "balance_loss_clip": 1.00233555, + "balance_loss_mlp": 1.00078714, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.6303145632975888, + "language_loss": 0.73686707, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75941205, + "num_input_tokens_seen": 169444705, + "step": 7886, + "time_per_iteration": 2.5726420879364014 + }, + { + "auxiliary_loss_clip": 0.01135952, + "auxiliary_loss_mlp": 0.01117318, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00068319, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 1.8720846515171863, + "language_loss": 0.73666167, + "learning_rate": 2.263481587786849e-06, + "loss": 0.75919437, + "num_input_tokens_seen": 169460850, + "step": 7887, + "time_per_iteration": 2.5690431594848633 + }, + { + "auxiliary_loss_clip": 0.01153009, + "auxiliary_loss_mlp": 0.01115031, + "balance_loss_clip": 1.00223625, + "balance_loss_mlp": 1.00059032, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.9219024233700184, + "language_loss": 0.76990414, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79258454, + "num_input_tokens_seen": 169478890, + "step": 7888, + "time_per_iteration": 2.5549440383911133 + }, + { + "auxiliary_loss_clip": 0.01151324, + "auxiliary_loss_mlp": 0.01115671, + "balance_loss_clip": 1.00225687, + "balance_loss_mlp": 1.0006578, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.7075426709085728, + "language_loss": 0.72373903, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.74640894, + "num_input_tokens_seen": 169499690, + "step": 7889, + "time_per_iteration": 2.6155953407287598 + }, + { + "auxiliary_loss_clip": 0.01163315, + "auxiliary_loss_mlp": 0.01093445, + "balance_loss_clip": 1.00175524, + "balance_loss_mlp": 0.99998492, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.7175548184740674, + "language_loss": 0.56098497, + "learning_rate": 2.262323341259214e-06, + "loss": 0.5835526, + "num_input_tokens_seen": 169560475, + "step": 7890, + "time_per_iteration": 3.163015604019165 + }, + { + "auxiliary_loss_clip": 0.01151132, + "auxiliary_loss_mlp": 0.01116097, + "balance_loss_clip": 1.00219679, + "balance_loss_mlp": 1.00060725, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 1.8614497462583914, + "language_loss": 0.65424562, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67691797, + "num_input_tokens_seen": 169580110, + "step": 7891, + "time_per_iteration": 2.587657928466797 + }, + { + "auxiliary_loss_clip": 0.01168301, + "auxiliary_loss_mlp": 0.0111679, + "balance_loss_clip": 1.00240088, + "balance_loss_mlp": 1.00063252, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.9351257051608477, + "language_loss": 0.70194447, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72479534, + "num_input_tokens_seen": 169597510, + "step": 7892, + "time_per_iteration": 2.526195764541626 + }, + { + "auxiliary_loss_clip": 0.01130506, + "auxiliary_loss_mlp": 0.01093863, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00002122, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8164444654587897, + "language_loss": 0.58629888, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60854262, + "num_input_tokens_seen": 169660010, + "step": 7893, + "time_per_iteration": 3.2378478050231934 + }, + { + "auxiliary_loss_clip": 0.01151674, + "auxiliary_loss_mlp": 0.01114795, + "balance_loss_clip": 1.00227141, + "balance_loss_mlp": 1.00073576, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 1.8645155131635511, + "language_loss": 0.77817416, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.80083883, + "num_input_tokens_seen": 169678485, + "step": 7894, + "time_per_iteration": 2.6295459270477295 + }, + { + "auxiliary_loss_clip": 0.01152253, + "auxiliary_loss_mlp": 0.01115517, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.00069451, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 1.6973210212041026, + "language_loss": 0.74693137, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76960909, + "num_input_tokens_seen": 169697335, + "step": 7895, + "time_per_iteration": 2.57443904876709 + }, + { + "auxiliary_loss_clip": 0.01152912, + "auxiliary_loss_mlp": 0.01115457, + "balance_loss_clip": 1.00220895, + "balance_loss_mlp": 1.00063455, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 2.4979626845269043, + "language_loss": 0.82479644, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84748006, + "num_input_tokens_seen": 169715395, + "step": 7896, + "time_per_iteration": 2.6226723194122314 + }, + { + "auxiliary_loss_clip": 0.01151217, + "auxiliary_loss_mlp": 0.011167, + "balance_loss_clip": 1.00220704, + "balance_loss_mlp": 1.00054264, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 2.370203359632436, + "language_loss": 0.75279212, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77547133, + "num_input_tokens_seen": 169733755, + "step": 7897, + "time_per_iteration": 2.5908563137054443 + }, + { + "auxiliary_loss_clip": 0.01134734, + "auxiliary_loss_mlp": 0.01116037, + "balance_loss_clip": 1.00210571, + "balance_loss_mlp": 1.00083303, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 1.7339002440038913, + "language_loss": 0.63783175, + "learning_rate": 2.25923424724351e-06, + "loss": 0.66033947, + "num_input_tokens_seen": 169751390, + "step": 7898, + "time_per_iteration": 2.568925619125366 + }, + { + "auxiliary_loss_clip": 0.01121411, + "auxiliary_loss_mlp": 0.01115501, + "balance_loss_clip": 1.00207341, + "balance_loss_mlp": 1.00067878, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.170953246637204, + "language_loss": 0.6986261, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72099519, + "num_input_tokens_seen": 169769500, + "step": 7899, + "time_per_iteration": 2.6325576305389404 + }, + { + "auxiliary_loss_clip": 0.01151027, + "auxiliary_loss_mlp": 0.01115613, + "balance_loss_clip": 1.00226021, + "balance_loss_mlp": 1.00059986, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 2.4123114241809986, + "language_loss": 0.6853379, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70800436, + "num_input_tokens_seen": 169789215, + "step": 7900, + "time_per_iteration": 2.6173267364501953 + }, + { + "auxiliary_loss_clip": 0.01118273, + "auxiliary_loss_mlp": 0.01115907, + "balance_loss_clip": 1.00209308, + "balance_loss_mlp": 1.00060821, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 2.0410264016456123, + "language_loss": 0.70410293, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72644472, + "num_input_tokens_seen": 169808825, + "step": 7901, + "time_per_iteration": 4.0855512619018555 + }, + { + "auxiliary_loss_clip": 0.01135724, + "auxiliary_loss_mlp": 0.01115455, + "balance_loss_clip": 1.00221419, + "balance_loss_mlp": 1.0009191, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 2.0278404387408133, + "language_loss": 0.73606724, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75857902, + "num_input_tokens_seen": 169827590, + "step": 7902, + "time_per_iteration": 2.622929334640503 + }, + { + "auxiliary_loss_clip": 0.01119112, + "auxiliary_loss_mlp": 0.01115362, + "balance_loss_clip": 1.00204968, + "balance_loss_mlp": 1.00063491, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.944571164856913, + "language_loss": 0.68850136, + "learning_rate": 2.257303243526688e-06, + "loss": 0.71084607, + "num_input_tokens_seen": 169844925, + "step": 7903, + "time_per_iteration": 2.727536678314209 + }, + { + "auxiliary_loss_clip": 0.01138247, + "auxiliary_loss_mlp": 0.01113928, + "balance_loss_clip": 1.00222135, + "balance_loss_mlp": 1.00063133, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.5020751573170257, + "language_loss": 0.7184397, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74096149, + "num_input_tokens_seen": 169862705, + "step": 7904, + "time_per_iteration": 2.712446451187134 + }, + { + "auxiliary_loss_clip": 0.0109258, + "auxiliary_loss_mlp": 0.01115434, + "balance_loss_clip": 1.0020988, + "balance_loss_mlp": 1.00051618, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.5053126039133886, + "language_loss": 0.85993099, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88201112, + "num_input_tokens_seen": 169880155, + "step": 7905, + "time_per_iteration": 4.150814056396484 + }, + { + "auxiliary_loss_clip": 0.01152685, + "auxiliary_loss_mlp": 0.01113475, + "balance_loss_clip": 1.00222337, + "balance_loss_mlp": 1.00055981, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.525564288228761, + "language_loss": 0.82225454, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84491611, + "num_input_tokens_seen": 169901525, + "step": 7906, + "time_per_iteration": 4.184343338012695 + }, + { + "auxiliary_loss_clip": 0.01115973, + "auxiliary_loss_mlp": 0.01093637, + "balance_loss_clip": 1.00161624, + "balance_loss_mlp": 1.00017667, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6651715205119323, + "language_loss": 0.58953166, + "learning_rate": 2.255758264840002e-06, + "loss": 0.6116277, + "num_input_tokens_seen": 169970345, + "step": 7907, + "time_per_iteration": 4.850710868835449 + }, + { + "auxiliary_loss_clip": 0.01153033, + "auxiliary_loss_mlp": 0.01114945, + "balance_loss_clip": 1.00224197, + "balance_loss_mlp": 1.00059915, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 4.2037040630422755, + "language_loss": 0.81403989, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83671975, + "num_input_tokens_seen": 169986440, + "step": 7908, + "time_per_iteration": 2.5639889240264893 + }, + { + "auxiliary_loss_clip": 0.0115148, + "auxiliary_loss_mlp": 0.01115212, + "balance_loss_clip": 1.00223637, + "balance_loss_mlp": 1.00067556, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.6483970219626507, + "language_loss": 0.73600733, + "learning_rate": 2.254985717247797e-06, + "loss": 0.75867426, + "num_input_tokens_seen": 170005705, + "step": 7909, + "time_per_iteration": 2.629070997238159 + }, + { + "auxiliary_loss_clip": 0.0113616, + "auxiliary_loss_mlp": 0.01115528, + "balance_loss_clip": 1.00222945, + "balance_loss_mlp": 1.00080109, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.4783871771770543, + "language_loss": 0.75336409, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77588099, + "num_input_tokens_seen": 170023415, + "step": 7910, + "time_per_iteration": 2.606416702270508 + }, + { + "auxiliary_loss_clip": 0.01151349, + "auxiliary_loss_mlp": 0.01114364, + "balance_loss_clip": 1.00221729, + "balance_loss_mlp": 1.00059104, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.8667819703871251, + "language_loss": 0.79169875, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81435585, + "num_input_tokens_seen": 170042395, + "step": 7911, + "time_per_iteration": 2.5944578647613525 + }, + { + "auxiliary_loss_clip": 0.0113655, + "auxiliary_loss_mlp": 0.00747191, + "balance_loss_clip": 1.00214648, + "balance_loss_mlp": 1.0002594, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.6732681696376543, + "language_loss": 0.75510007, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77393746, + "num_input_tokens_seen": 170061610, + "step": 7912, + "time_per_iteration": 2.6056528091430664 + }, + { + "auxiliary_loss_clip": 0.01167933, + "auxiliary_loss_mlp": 0.01115796, + "balance_loss_clip": 1.00223386, + "balance_loss_mlp": 1.00078332, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.3810993651575165, + "language_loss": 0.74175501, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76459229, + "num_input_tokens_seen": 170083505, + "step": 7913, + "time_per_iteration": 2.5796217918395996 + }, + { + "auxiliary_loss_clip": 0.01136365, + "auxiliary_loss_mlp": 0.01114721, + "balance_loss_clip": 1.00227213, + "balance_loss_mlp": 1.00056648, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 2.3323867295737397, + "language_loss": 0.72317982, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74569064, + "num_input_tokens_seen": 170100690, + "step": 7914, + "time_per_iteration": 2.6126673221588135 + }, + { + "auxiliary_loss_clip": 0.01134932, + "auxiliary_loss_mlp": 0.01115881, + "balance_loss_clip": 1.00213611, + "balance_loss_mlp": 1.000772, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 2.0285131024137373, + "language_loss": 0.6501109, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.67261899, + "num_input_tokens_seen": 170119240, + "step": 7915, + "time_per_iteration": 2.6224076747894287 + }, + { + "auxiliary_loss_clip": 0.01167778, + "auxiliary_loss_mlp": 0.01114245, + "balance_loss_clip": 1.00215578, + "balance_loss_mlp": 1.00075793, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.7567283109307312, + "language_loss": 0.77062654, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.79344684, + "num_input_tokens_seen": 170136450, + "step": 7916, + "time_per_iteration": 2.495234727859497 + }, + { + "auxiliary_loss_clip": 0.01167789, + "auxiliary_loss_mlp": 0.01113963, + "balance_loss_clip": 1.00221562, + "balance_loss_mlp": 1.0006665, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.867977404685779, + "language_loss": 0.6451183, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66793579, + "num_input_tokens_seen": 170155295, + "step": 7917, + "time_per_iteration": 2.5390634536743164 + }, + { + "auxiliary_loss_clip": 0.01114836, + "auxiliary_loss_mlp": 0.0109316, + "balance_loss_clip": 1.00163078, + "balance_loss_mlp": 1.00008142, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8815919779472088, + "language_loss": 0.65732741, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67940736, + "num_input_tokens_seen": 170222325, + "step": 7918, + "time_per_iteration": 3.2240660190582275 + }, + { + "auxiliary_loss_clip": 0.01153098, + "auxiliary_loss_mlp": 0.00747323, + "balance_loss_clip": 1.00223374, + "balance_loss_mlp": 1.00032008, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.707893112511377, + "language_loss": 0.68916368, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70816779, + "num_input_tokens_seen": 170241625, + "step": 7919, + "time_per_iteration": 2.6131043434143066 + }, + { + "auxiliary_loss_clip": 0.01135764, + "auxiliary_loss_mlp": 0.01115678, + "balance_loss_clip": 1.00211883, + "balance_loss_mlp": 1.00056982, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 1.7223219775298817, + "language_loss": 0.7491439, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.7716583, + "num_input_tokens_seen": 170262470, + "step": 7920, + "time_per_iteration": 2.6752517223358154 + }, + { + "auxiliary_loss_clip": 0.01135108, + "auxiliary_loss_mlp": 0.01116616, + "balance_loss_clip": 1.00221658, + "balance_loss_mlp": 1.00064874, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.561786970912407, + "language_loss": 0.77465498, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79717219, + "num_input_tokens_seen": 170283460, + "step": 7921, + "time_per_iteration": 2.6728885173797607 + }, + { + "auxiliary_loss_clip": 0.01135865, + "auxiliary_loss_mlp": 0.0111629, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.00080037, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 2.722165552380805, + "language_loss": 0.77932394, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80184543, + "num_input_tokens_seen": 170304225, + "step": 7922, + "time_per_iteration": 2.6508312225341797 + }, + { + "auxiliary_loss_clip": 0.01117369, + "auxiliary_loss_mlp": 0.01115753, + "balance_loss_clip": 1.00202096, + "balance_loss_mlp": 1.00073993, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.717628226833725, + "language_loss": 0.72777593, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.75010717, + "num_input_tokens_seen": 170322110, + "step": 7923, + "time_per_iteration": 2.628159761428833 + }, + { + "auxiliary_loss_clip": 0.01119668, + "auxiliary_loss_mlp": 0.0111472, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00085115, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 1.7942316042527324, + "language_loss": 0.81758904, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.83993292, + "num_input_tokens_seen": 170340700, + "step": 7924, + "time_per_iteration": 2.657726526260376 + }, + { + "auxiliary_loss_clip": 0.01151446, + "auxiliary_loss_mlp": 0.0111589, + "balance_loss_clip": 1.00229871, + "balance_loss_mlp": 1.00068653, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.63563368571801, + "language_loss": 0.80209517, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82476848, + "num_input_tokens_seen": 170359780, + "step": 7925, + "time_per_iteration": 2.6544342041015625 + }, + { + "auxiliary_loss_clip": 0.01134235, + "auxiliary_loss_mlp": 0.01115431, + "balance_loss_clip": 1.0020318, + "balance_loss_mlp": 1.00070357, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 1.6930424436992857, + "language_loss": 0.72022623, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74272287, + "num_input_tokens_seen": 170381260, + "step": 7926, + "time_per_iteration": 2.7009902000427246 + }, + { + "auxiliary_loss_clip": 0.01151586, + "auxiliary_loss_mlp": 0.01117063, + "balance_loss_clip": 1.00235105, + "balance_loss_mlp": 1.00061989, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 2.591543794032605, + "language_loss": 0.68372625, + "learning_rate": 2.248031062546432e-06, + "loss": 0.70641279, + "num_input_tokens_seen": 170400595, + "step": 7927, + "time_per_iteration": 2.62740159034729 + }, + { + "auxiliary_loss_clip": 0.01119237, + "auxiliary_loss_mlp": 0.01115481, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.00065827, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.8914738464139897, + "language_loss": 0.68178815, + "learning_rate": 2.247644602701045e-06, + "loss": 0.7041353, + "num_input_tokens_seen": 170421110, + "step": 7928, + "time_per_iteration": 2.670644521713257 + }, + { + "auxiliary_loss_clip": 0.01167966, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_clip": 1.0022006, + "balance_loss_mlp": 1.00050759, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.2914147643468463, + "language_loss": 0.78650433, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.80934018, + "num_input_tokens_seen": 170436700, + "step": 7929, + "time_per_iteration": 2.52622389793396 + }, + { + "auxiliary_loss_clip": 0.01137509, + "auxiliary_loss_mlp": 0.01115314, + "balance_loss_clip": 1.00218487, + "balance_loss_mlp": 1.00087285, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.766679773234151, + "language_loss": 0.66715688, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68968511, + "num_input_tokens_seen": 170459555, + "step": 7930, + "time_per_iteration": 2.7766895294189453 + }, + { + "auxiliary_loss_clip": 0.01157565, + "auxiliary_loss_mlp": 0.01114346, + "balance_loss_clip": 1.00246429, + "balance_loss_mlp": 1.00057316, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 2.0698831319303457, + "language_loss": 0.79589832, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.81861746, + "num_input_tokens_seen": 170479175, + "step": 7931, + "time_per_iteration": 2.6180660724639893 + }, + { + "auxiliary_loss_clip": 0.01138252, + "auxiliary_loss_mlp": 0.01116188, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.00060225, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 2.032138342840271, + "language_loss": 0.76203454, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78457892, + "num_input_tokens_seen": 170498450, + "step": 7932, + "time_per_iteration": 2.617673635482788 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.00747219, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00050855, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 1.9569188326836198, + "language_loss": 0.79761356, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81644076, + "num_input_tokens_seen": 170516255, + "step": 7933, + "time_per_iteration": 2.6292314529418945 + }, + { + "auxiliary_loss_clip": 0.01152701, + "auxiliary_loss_mlp": 0.01116542, + "balance_loss_clip": 1.0022521, + "balance_loss_mlp": 1.00086176, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.9194645358712559, + "language_loss": 0.73250687, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.75519931, + "num_input_tokens_seen": 170532705, + "step": 7934, + "time_per_iteration": 2.521385431289673 + }, + { + "auxiliary_loss_clip": 0.01151291, + "auxiliary_loss_mlp": 0.01115826, + "balance_loss_clip": 1.00217867, + "balance_loss_mlp": 1.00062251, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.8860977809247708, + "language_loss": 0.80627823, + "learning_rate": 2.244939121664211e-06, + "loss": 0.82894945, + "num_input_tokens_seen": 170551925, + "step": 7935, + "time_per_iteration": 2.583721876144409 + }, + { + "auxiliary_loss_clip": 0.01120067, + "auxiliary_loss_mlp": 0.01116728, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.00076139, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 2.4903917120431243, + "language_loss": 0.71022308, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73259103, + "num_input_tokens_seen": 170572320, + "step": 7936, + "time_per_iteration": 2.7166671752929688 + }, + { + "auxiliary_loss_clip": 0.01167884, + "auxiliary_loss_mlp": 0.01116567, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.00069523, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 2.619781188712295, + "language_loss": 0.67899287, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.70183736, + "num_input_tokens_seen": 170589470, + "step": 7937, + "time_per_iteration": 2.5817699432373047 + }, + { + "auxiliary_loss_clip": 0.01146436, + "auxiliary_loss_mlp": 0.01093504, + "balance_loss_clip": 1.00150323, + "balance_loss_mlp": 1.00004351, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7315933812017114, + "language_loss": 0.56392318, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58632261, + "num_input_tokens_seen": 170662265, + "step": 7938, + "time_per_iteration": 4.691783666610718 + }, + { + "auxiliary_loss_clip": 0.01135601, + "auxiliary_loss_mlp": 0.01115013, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.0007627, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.634156098436902, + "language_loss": 0.88805443, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91056055, + "num_input_tokens_seen": 170679680, + "step": 7939, + "time_per_iteration": 2.616028070449829 + }, + { + "auxiliary_loss_clip": 0.01152498, + "auxiliary_loss_mlp": 0.01115351, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00062358, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 3.08282762287544, + "language_loss": 0.76765716, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79033566, + "num_input_tokens_seen": 170697340, + "step": 7940, + "time_per_iteration": 2.536494493484497 + }, + { + "auxiliary_loss_clip": 0.01134425, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_clip": 1.00220323, + "balance_loss_mlp": 1.00070047, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.6375092514362846, + "language_loss": 0.85037482, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.8728677, + "num_input_tokens_seen": 170714905, + "step": 7941, + "time_per_iteration": 2.6143341064453125 + }, + { + "auxiliary_loss_clip": 0.01136545, + "auxiliary_loss_mlp": 0.01116533, + "balance_loss_clip": 1.00229335, + "balance_loss_mlp": 1.00085258, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 1.7523003792031047, + "language_loss": 0.75388914, + "learning_rate": 2.24223318550976e-06, + "loss": 0.77642, + "num_input_tokens_seen": 170731810, + "step": 7942, + "time_per_iteration": 2.5792062282562256 + }, + { + "auxiliary_loss_clip": 0.01151678, + "auxiliary_loss_mlp": 0.01115211, + "balance_loss_clip": 1.00217271, + "balance_loss_mlp": 1.0005796, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.754142051895741, + "language_loss": 0.64883161, + "learning_rate": 2.241846586342682e-06, + "loss": 0.67150056, + "num_input_tokens_seen": 170750270, + "step": 7943, + "time_per_iteration": 4.017248153686523 + }, + { + "auxiliary_loss_clip": 0.01126187, + "auxiliary_loss_mlp": 0.01116158, + "balance_loss_clip": 1.00236118, + "balance_loss_mlp": 1.00066781, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.7152418437946635, + "language_loss": 0.73372275, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75614619, + "num_input_tokens_seen": 170769015, + "step": 7944, + "time_per_iteration": 4.0508763790130615 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_clip": 1.00214183, + "balance_loss_mlp": 1.0006032, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.891461039532753, + "language_loss": 0.68463731, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70730841, + "num_input_tokens_seen": 170785725, + "step": 7945, + "time_per_iteration": 3.9383511543273926 + }, + { + "auxiliary_loss_clip": 0.01120664, + "auxiliary_loss_mlp": 0.00747329, + "balance_loss_clip": 1.00200176, + "balance_loss_mlp": 1.00028574, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.7599318218984563, + "language_loss": 0.75284445, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77152431, + "num_input_tokens_seen": 170804600, + "step": 7946, + "time_per_iteration": 2.697688579559326 + }, + { + "auxiliary_loss_clip": 0.01142734, + "auxiliary_loss_mlp": 0.01115353, + "balance_loss_clip": 1.00233865, + "balance_loss_mlp": 1.00072098, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.764727184551954, + "language_loss": 0.79094052, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81352139, + "num_input_tokens_seen": 170824230, + "step": 7947, + "time_per_iteration": 2.658454656600952 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01115764, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00065541, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.9414175283438144, + "language_loss": 0.74028504, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.76278919, + "num_input_tokens_seen": 170843365, + "step": 7948, + "time_per_iteration": 2.665708065032959 + }, + { + "auxiliary_loss_clip": 0.01137669, + "auxiliary_loss_mlp": 0.01115276, + "balance_loss_clip": 1.00218868, + "balance_loss_mlp": 1.0005486, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.475792406563317, + "language_loss": 0.77802026, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80054963, + "num_input_tokens_seen": 170863515, + "step": 7949, + "time_per_iteration": 2.6219656467437744 + }, + { + "auxiliary_loss_clip": 0.01136135, + "auxiliary_loss_mlp": 0.01114764, + "balance_loss_clip": 1.00219333, + "balance_loss_mlp": 1.00051403, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.4913390071058554, + "language_loss": 0.73891848, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.7614274, + "num_input_tokens_seen": 170881245, + "step": 7950, + "time_per_iteration": 2.5705013275146484 + }, + { + "auxiliary_loss_clip": 0.01136366, + "auxiliary_loss_mlp": 0.01115073, + "balance_loss_clip": 1.00215828, + "balance_loss_mlp": 1.00082302, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.6996093374054355, + "language_loss": 0.73843706, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76095146, + "num_input_tokens_seen": 170901285, + "step": 7951, + "time_per_iteration": 2.7081735134124756 + }, + { + "auxiliary_loss_clip": 0.01121165, + "auxiliary_loss_mlp": 0.01115904, + "balance_loss_clip": 1.00207973, + "balance_loss_mlp": 1.00060487, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 2.2634328402215895, + "language_loss": 0.79759502, + "learning_rate": 2.238366782910174e-06, + "loss": 0.81996572, + "num_input_tokens_seen": 170919740, + "step": 7952, + "time_per_iteration": 2.6832752227783203 + }, + { + "auxiliary_loss_clip": 0.01136459, + "auxiliary_loss_mlp": 0.01116387, + "balance_loss_clip": 1.00222349, + "balance_loss_mlp": 1.00070596, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.7333106314711806, + "language_loss": 0.78225994, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80478841, + "num_input_tokens_seen": 170938510, + "step": 7953, + "time_per_iteration": 2.5938313007354736 + }, + { + "auxiliary_loss_clip": 0.0115134, + "auxiliary_loss_mlp": 0.01114974, + "balance_loss_clip": 1.00218725, + "balance_loss_mlp": 1.00062847, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.1399141191550277, + "language_loss": 0.84188759, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86455077, + "num_input_tokens_seen": 170951170, + "step": 7954, + "time_per_iteration": 2.543128252029419 + }, + { + "auxiliary_loss_clip": 0.01136292, + "auxiliary_loss_mlp": 0.01114856, + "balance_loss_clip": 1.00214028, + "balance_loss_mlp": 1.00070155, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.4256291661580363, + "language_loss": 0.70217544, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72468698, + "num_input_tokens_seen": 170970990, + "step": 7955, + "time_per_iteration": 2.6078712940216064 + }, + { + "auxiliary_loss_clip": 0.01140973, + "auxiliary_loss_mlp": 0.01114898, + "balance_loss_clip": 1.00219595, + "balance_loss_mlp": 1.00055242, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.8566744243271558, + "language_loss": 0.81819141, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.8407501, + "num_input_tokens_seen": 170991215, + "step": 7956, + "time_per_iteration": 2.647080421447754 + }, + { + "auxiliary_loss_clip": 0.01134646, + "auxiliary_loss_mlp": 0.0111515, + "balance_loss_clip": 1.00217485, + "balance_loss_mlp": 1.00061417, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 4.076173639031555, + "language_loss": 0.84536022, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.86785817, + "num_input_tokens_seen": 171007325, + "step": 7957, + "time_per_iteration": 2.602384090423584 + }, + { + "auxiliary_loss_clip": 0.01152736, + "auxiliary_loss_mlp": 0.01114666, + "balance_loss_clip": 1.00216734, + "balance_loss_mlp": 1.00070143, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.5673755031425458, + "language_loss": 0.79729235, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81996644, + "num_input_tokens_seen": 171025650, + "step": 7958, + "time_per_iteration": 2.549132823944092 + }, + { + "auxiliary_loss_clip": 0.01122237, + "auxiliary_loss_mlp": 0.00747265, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00028872, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 1.9356490624888547, + "language_loss": 0.82856262, + "learning_rate": 2.235659762404047e-06, + "loss": 0.84725761, + "num_input_tokens_seen": 171045045, + "step": 7959, + "time_per_iteration": 2.6897060871124268 + }, + { + "auxiliary_loss_clip": 0.01119361, + "auxiliary_loss_mlp": 0.01114482, + "balance_loss_clip": 1.00215888, + "balance_loss_mlp": 1.00070882, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.0446586342962756, + "language_loss": 0.72570735, + "learning_rate": 2.235273009326599e-06, + "loss": 0.7480458, + "num_input_tokens_seen": 171062910, + "step": 7960, + "time_per_iteration": 2.6814327239990234 + }, + { + "auxiliary_loss_clip": 0.01117658, + "auxiliary_loss_mlp": 0.01114556, + "balance_loss_clip": 1.00210381, + "balance_loss_mlp": 1.00078249, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.8486806545068692, + "language_loss": 0.77321458, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.79553676, + "num_input_tokens_seen": 171080875, + "step": 7961, + "time_per_iteration": 2.6686313152313232 + }, + { + "auxiliary_loss_clip": 0.01118142, + "auxiliary_loss_mlp": 0.01114591, + "balance_loss_clip": 1.00203788, + "balance_loss_mlp": 1.00062704, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.7987656652337989, + "language_loss": 0.78473389, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.80706125, + "num_input_tokens_seen": 171099190, + "step": 7962, + "time_per_iteration": 2.6783881187438965 + }, + { + "auxiliary_loss_clip": 0.01136252, + "auxiliary_loss_mlp": 0.01114882, + "balance_loss_clip": 1.00217056, + "balance_loss_mlp": 1.00063193, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.9011138625980812, + "language_loss": 0.64990377, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67241514, + "num_input_tokens_seen": 171119060, + "step": 7963, + "time_per_iteration": 2.665038585662842 + }, + { + "auxiliary_loss_clip": 0.0115279, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_clip": 1.00220108, + "balance_loss_mlp": 1.00060463, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.6995906933974974, + "language_loss": 0.77642274, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.79909819, + "num_input_tokens_seen": 171141900, + "step": 7964, + "time_per_iteration": 2.7858798503875732 + }, + { + "auxiliary_loss_clip": 0.011515, + "auxiliary_loss_mlp": 0.0111619, + "balance_loss_clip": 1.002177, + "balance_loss_mlp": 1.00060487, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 8.250668372817032, + "language_loss": 0.76172543, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78440237, + "num_input_tokens_seen": 171161045, + "step": 7965, + "time_per_iteration": 2.583055257797241 + }, + { + "auxiliary_loss_clip": 0.01111189, + "auxiliary_loss_mlp": 0.01114718, + "balance_loss_clip": 1.00226223, + "balance_loss_mlp": 1.00075412, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 1.7912596718956937, + "language_loss": 0.74856383, + "learning_rate": 2.232952304022137e-06, + "loss": 0.77082288, + "num_input_tokens_seen": 171179675, + "step": 7966, + "time_per_iteration": 2.7132022380828857 + }, + { + "auxiliary_loss_clip": 0.01136533, + "auxiliary_loss_mlp": 0.0111493, + "balance_loss_clip": 1.00210071, + "balance_loss_mlp": 1.0005846, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.6012259832635845, + "language_loss": 0.73486668, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75738132, + "num_input_tokens_seen": 171201175, + "step": 7967, + "time_per_iteration": 2.6618783473968506 + }, + { + "auxiliary_loss_clip": 0.01137848, + "auxiliary_loss_mlp": 0.01114619, + "balance_loss_clip": 1.00220466, + "balance_loss_mlp": 1.00055921, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 1.656478977738784, + "language_loss": 0.79250801, + "learning_rate": 2.232178664762267e-06, + "loss": 0.8150326, + "num_input_tokens_seen": 171221750, + "step": 7968, + "time_per_iteration": 2.6614654064178467 + }, + { + "auxiliary_loss_clip": 0.01116461, + "auxiliary_loss_mlp": 0.01094017, + "balance_loss_clip": 1.00140262, + "balance_loss_mlp": 1.000175, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.755054968012337, + "language_loss": 0.6226514, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.6447562, + "num_input_tokens_seen": 171292235, + "step": 7969, + "time_per_iteration": 3.3881030082702637 + }, + { + "auxiliary_loss_clip": 0.01116992, + "auxiliary_loss_mlp": 0.01113926, + "balance_loss_clip": 1.00201619, + "balance_loss_mlp": 1.00062919, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.5647054806903342, + "language_loss": 0.77263254, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79494172, + "num_input_tokens_seen": 171312215, + "step": 7970, + "time_per_iteration": 2.7036683559417725 + }, + { + "auxiliary_loss_clip": 0.01151097, + "auxiliary_loss_mlp": 0.01114174, + "balance_loss_clip": 1.0021317, + "balance_loss_mlp": 1.00059199, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.59908173487416, + "language_loss": 0.70189071, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72454345, + "num_input_tokens_seen": 171332975, + "step": 7971, + "time_per_iteration": 2.633413553237915 + }, + { + "auxiliary_loss_clip": 0.01111013, + "auxiliary_loss_mlp": 0.01115015, + "balance_loss_clip": 1.00202882, + "balance_loss_mlp": 1.00047874, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.3888035601154283, + "language_loss": 0.79779887, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82005912, + "num_input_tokens_seen": 171353880, + "step": 7972, + "time_per_iteration": 2.730724334716797 + }, + { + "auxiliary_loss_clip": 0.01151271, + "auxiliary_loss_mlp": 0.01115029, + "balance_loss_clip": 1.00220692, + "balance_loss_mlp": 1.0005877, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.183398819045631, + "language_loss": 0.70154142, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.72420442, + "num_input_tokens_seen": 171370930, + "step": 7973, + "time_per_iteration": 2.5791683197021484 + }, + { + "auxiliary_loss_clip": 0.01152047, + "auxiliary_loss_mlp": 0.0111361, + "balance_loss_clip": 1.00226021, + "balance_loss_mlp": 1.00079036, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.841857398880129, + "language_loss": 0.78988874, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.8125453, + "num_input_tokens_seen": 171387575, + "step": 7974, + "time_per_iteration": 2.5916030406951904 + }, + { + "auxiliary_loss_clip": 0.01131424, + "auxiliary_loss_mlp": 0.01093586, + "balance_loss_clip": 1.00138795, + "balance_loss_mlp": 1.00012612, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7502820910760045, + "language_loss": 0.54106337, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56331348, + "num_input_tokens_seen": 171449980, + "step": 7975, + "time_per_iteration": 3.208221912384033 + }, + { + "auxiliary_loss_clip": 0.01137301, + "auxiliary_loss_mlp": 0.01116995, + "balance_loss_clip": 1.00216341, + "balance_loss_mlp": 1.00074172, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.1550821803115254, + "language_loss": 0.89775765, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.9203006, + "num_input_tokens_seen": 171465290, + "step": 7976, + "time_per_iteration": 4.068279981613159 + }, + { + "auxiliary_loss_clip": 0.01168037, + "auxiliary_loss_mlp": 0.0111675, + "balance_loss_clip": 1.00229907, + "balance_loss_mlp": 1.0008781, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.2786294348839555, + "language_loss": 0.73430347, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75715137, + "num_input_tokens_seen": 171481130, + "step": 7977, + "time_per_iteration": 2.4863157272338867 + }, + { + "auxiliary_loss_clip": 0.01157414, + "auxiliary_loss_mlp": 0.00747379, + "balance_loss_clip": 1.00225246, + "balance_loss_mlp": 1.000319, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.6342632941135764, + "language_loss": 0.78471184, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80375975, + "num_input_tokens_seen": 171501140, + "step": 7978, + "time_per_iteration": 2.5917515754699707 + }, + { + "auxiliary_loss_clip": 0.01134763, + "auxiliary_loss_mlp": 0.01115809, + "balance_loss_clip": 1.00224578, + "balance_loss_mlp": 1.00070024, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.7113345143958123, + "language_loss": 0.89448333, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91698897, + "num_input_tokens_seen": 171519835, + "step": 7979, + "time_per_iteration": 2.662217378616333 + }, + { + "auxiliary_loss_clip": 0.01153192, + "auxiliary_loss_mlp": 0.01115759, + "balance_loss_clip": 1.00229359, + "balance_loss_mlp": 1.00074565, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.5076091147299855, + "language_loss": 0.7733438, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79603326, + "num_input_tokens_seen": 171540980, + "step": 7980, + "time_per_iteration": 2.6231720447540283 + }, + { + "auxiliary_loss_clip": 0.01121528, + "auxiliary_loss_mlp": 0.01116396, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.00062001, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.5672303749299068, + "language_loss": 0.71670854, + "learning_rate": 2.227149156404295e-06, + "loss": 0.73908782, + "num_input_tokens_seen": 171563600, + "step": 7981, + "time_per_iteration": 4.179912567138672 + }, + { + "auxiliary_loss_clip": 0.01167905, + "auxiliary_loss_mlp": 0.01114884, + "balance_loss_clip": 1.00231361, + "balance_loss_mlp": 1.00072944, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.7046362811942388, + "language_loss": 0.70430589, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72713381, + "num_input_tokens_seen": 171580700, + "step": 7982, + "time_per_iteration": 3.875756025314331 + }, + { + "auxiliary_loss_clip": 0.01136147, + "auxiliary_loss_mlp": 0.01114039, + "balance_loss_clip": 1.00209272, + "balance_loss_mlp": 1.0007422, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 1.5244170234241028, + "language_loss": 0.71037424, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73287606, + "num_input_tokens_seen": 171602035, + "step": 7983, + "time_per_iteration": 4.0856359004974365 + }, + { + "auxiliary_loss_clip": 0.01147693, + "auxiliary_loss_mlp": 0.00746196, + "balance_loss_clip": 1.00134969, + "balance_loss_mlp": 0.99992132, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.8694333690132932, + "language_loss": 0.59421986, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61315876, + "num_input_tokens_seen": 171659215, + "step": 7984, + "time_per_iteration": 3.0841498374938965 + }, + { + "auxiliary_loss_clip": 0.01106245, + "auxiliary_loss_mlp": 0.01114053, + "balance_loss_clip": 1.00196719, + "balance_loss_mlp": 1.00085163, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.5378088429121366, + "language_loss": 0.6709708, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.69317377, + "num_input_tokens_seen": 171675710, + "step": 7985, + "time_per_iteration": 2.665637493133545 + }, + { + "auxiliary_loss_clip": 0.01138218, + "auxiliary_loss_mlp": 0.01115205, + "balance_loss_clip": 1.00217044, + "balance_loss_mlp": 1.00066853, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.773519328038918, + "language_loss": 0.70024645, + "learning_rate": 2.225214340743835e-06, + "loss": 0.7227807, + "num_input_tokens_seen": 171692510, + "step": 7986, + "time_per_iteration": 2.615131139755249 + }, + { + "auxiliary_loss_clip": 0.01117639, + "auxiliary_loss_mlp": 0.01115397, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00076532, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 2.1376697814551826, + "language_loss": 0.78945178, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.81178212, + "num_input_tokens_seen": 171710235, + "step": 7987, + "time_per_iteration": 2.6272473335266113 + }, + { + "auxiliary_loss_clip": 0.01104363, + "auxiliary_loss_mlp": 0.01114574, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00080061, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 2.135684924256881, + "language_loss": 0.75488245, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77707177, + "num_input_tokens_seen": 171726715, + "step": 7988, + "time_per_iteration": 2.7047617435455322 + }, + { + "auxiliary_loss_clip": 0.01119607, + "auxiliary_loss_mlp": 0.01114329, + "balance_loss_clip": 1.00218248, + "balance_loss_mlp": 1.00055587, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 2.1302942576014208, + "language_loss": 0.78524745, + "learning_rate": 2.224053348748365e-06, + "loss": 0.80758679, + "num_input_tokens_seen": 171743605, + "step": 7989, + "time_per_iteration": 2.6473870277404785 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01115983, + "balance_loss_clip": 1.00207734, + "balance_loss_mlp": 1.00068355, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.5769891121488522, + "language_loss": 0.73780906, + "learning_rate": 2.223666334404724e-06, + "loss": 0.76032889, + "num_input_tokens_seen": 171765445, + "step": 7990, + "time_per_iteration": 2.7365734577178955 + }, + { + "auxiliary_loss_clip": 0.01147754, + "auxiliary_loss_mlp": 0.00746205, + "balance_loss_clip": 1.00139701, + "balance_loss_mlp": 0.99997807, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.7674901658679787, + "language_loss": 0.5911783, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61011791, + "num_input_tokens_seen": 171830115, + "step": 7991, + "time_per_iteration": 3.2246477603912354 + }, + { + "auxiliary_loss_clip": 0.01152973, + "auxiliary_loss_mlp": 0.0074743, + "balance_loss_clip": 1.00216198, + "balance_loss_mlp": 1.00036621, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 1.9164085084127094, + "language_loss": 0.66985559, + "learning_rate": 2.222892280287768e-06, + "loss": 0.68885958, + "num_input_tokens_seen": 171849135, + "step": 7992, + "time_per_iteration": 2.658992290496826 + }, + { + "auxiliary_loss_clip": 0.01136215, + "auxiliary_loss_mlp": 0.01115144, + "balance_loss_clip": 1.00197864, + "balance_loss_mlp": 1.00079846, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.9108286618428239, + "language_loss": 0.76240879, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78492236, + "num_input_tokens_seen": 171868880, + "step": 7993, + "time_per_iteration": 2.664736270904541 + }, + { + "auxiliary_loss_clip": 0.01110061, + "auxiliary_loss_mlp": 0.01115292, + "balance_loss_clip": 1.00212502, + "balance_loss_mlp": 1.00066042, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 2.1773067307909058, + "language_loss": 0.78369796, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80595148, + "num_input_tokens_seen": 171889455, + "step": 7994, + "time_per_iteration": 2.7554798126220703 + }, + { + "auxiliary_loss_clip": 0.01137058, + "auxiliary_loss_mlp": 0.01114663, + "balance_loss_clip": 1.00215185, + "balance_loss_mlp": 1.00060391, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 2.1940557987980465, + "language_loss": 0.7930913, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.8156085, + "num_input_tokens_seen": 171906070, + "step": 7995, + "time_per_iteration": 2.6382641792297363 + }, + { + "auxiliary_loss_clip": 0.01109211, + "auxiliary_loss_mlp": 0.01115577, + "balance_loss_clip": 1.00197971, + "balance_loss_mlp": 1.0005641, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.6163972413588064, + "language_loss": 0.8281827, + "learning_rate": 2.2213440707461e-06, + "loss": 0.85043055, + "num_input_tokens_seen": 171926515, + "step": 7996, + "time_per_iteration": 2.708749294281006 + }, + { + "auxiliary_loss_clip": 0.01089053, + "auxiliary_loss_mlp": 0.01114853, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00069833, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.9295563500810486, + "language_loss": 0.80636287, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82840192, + "num_input_tokens_seen": 171943845, + "step": 7997, + "time_per_iteration": 2.7055752277374268 + }, + { + "auxiliary_loss_clip": 0.01102811, + "auxiliary_loss_mlp": 0.01115435, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00061226, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.8889697595540362, + "language_loss": 0.72675681, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74893922, + "num_input_tokens_seen": 171964970, + "step": 7998, + "time_per_iteration": 2.7343838214874268 + }, + { + "auxiliary_loss_clip": 0.01167831, + "auxiliary_loss_mlp": 0.01114852, + "balance_loss_clip": 1.00223994, + "balance_loss_mlp": 1.00060225, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.8100434643723367, + "language_loss": 0.70985162, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73267847, + "num_input_tokens_seen": 171986340, + "step": 7999, + "time_per_iteration": 2.614246368408203 + }, + { + "auxiliary_loss_clip": 0.01152918, + "auxiliary_loss_mlp": 0.01115629, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00080705, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 1.6421858829789222, + "language_loss": 0.71182203, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73450756, + "num_input_tokens_seen": 172007300, + "step": 8000, + "time_per_iteration": 2.620882987976074 + }, + { + "auxiliary_loss_clip": 0.01151209, + "auxiliary_loss_mlp": 0.01115225, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00087941, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 5.388465222636088, + "language_loss": 0.74645042, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.76911473, + "num_input_tokens_seen": 172029585, + "step": 8001, + "time_per_iteration": 2.7164218425750732 + }, + { + "auxiliary_loss_clip": 0.0115133, + "auxiliary_loss_mlp": 0.01116684, + "balance_loss_clip": 1.00222111, + "balance_loss_mlp": 1.00081205, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.679594547122992, + "language_loss": 0.81954265, + "learning_rate": 2.219021504925493e-06, + "loss": 0.84222281, + "num_input_tokens_seen": 172047495, + "step": 8002, + "time_per_iteration": 2.65384840965271 + }, + { + "auxiliary_loss_clip": 0.01152331, + "auxiliary_loss_mlp": 0.01115876, + "balance_loss_clip": 1.0021472, + "balance_loss_mlp": 1.00048137, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.6683016166079483, + "language_loss": 0.71465063, + "learning_rate": 2.218634381467819e-06, + "loss": 0.7373327, + "num_input_tokens_seen": 172067625, + "step": 8003, + "time_per_iteration": 2.6276893615722656 + }, + { + "auxiliary_loss_clip": 0.01152572, + "auxiliary_loss_mlp": 0.0111361, + "balance_loss_clip": 1.00216556, + "balance_loss_mlp": 1.00088549, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.7748476363206573, + "language_loss": 0.82240093, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84506273, + "num_input_tokens_seen": 172087885, + "step": 8004, + "time_per_iteration": 2.564500093460083 + }, + { + "auxiliary_loss_clip": 0.01136361, + "auxiliary_loss_mlp": 0.01117513, + "balance_loss_clip": 1.00222635, + "balance_loss_mlp": 1.00087893, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.1509994374860755, + "language_loss": 0.77868009, + "learning_rate": 2.217860109695239e-06, + "loss": 0.80121881, + "num_input_tokens_seen": 172105815, + "step": 8005, + "time_per_iteration": 2.585524082183838 + }, + { + "auxiliary_loss_clip": 0.0115158, + "auxiliary_loss_mlp": 0.01116001, + "balance_loss_clip": 1.00209641, + "balance_loss_mlp": 1.00060618, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 3.8590867427535662, + "language_loss": 0.70576841, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72844422, + "num_input_tokens_seen": 172126125, + "step": 8006, + "time_per_iteration": 2.5799431800842285 + }, + { + "auxiliary_loss_clip": 0.01138111, + "auxiliary_loss_mlp": 0.01115321, + "balance_loss_clip": 1.00216079, + "balance_loss_mlp": 1.00068963, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.791461543392667, + "language_loss": 0.70539021, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72792459, + "num_input_tokens_seen": 172141945, + "step": 8007, + "time_per_iteration": 2.6511781215667725 + }, + { + "auxiliary_loss_clip": 0.01167981, + "auxiliary_loss_mlp": 0.01116027, + "balance_loss_clip": 1.00217175, + "balance_loss_mlp": 1.00072777, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.452904565102548, + "language_loss": 0.71407306, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.73691314, + "num_input_tokens_seen": 172161095, + "step": 8008, + "time_per_iteration": 2.5485241413116455 + }, + { + "auxiliary_loss_clip": 0.011244, + "auxiliary_loss_mlp": 0.01116761, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00098491, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.9034502573572716, + "language_loss": 0.61288476, + "learning_rate": 2.216311467132199e-06, + "loss": 0.6352964, + "num_input_tokens_seen": 172178750, + "step": 8009, + "time_per_iteration": 2.644808292388916 + }, + { + "auxiliary_loss_clip": 0.01136002, + "auxiliary_loss_mlp": 0.0109305, + "balance_loss_clip": 1.00142097, + "balance_loss_mlp": 0.99997121, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8598482529945022, + "language_loss": 0.61379695, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63608742, + "num_input_tokens_seen": 172240235, + "step": 8010, + "time_per_iteration": 3.180266857147217 + }, + { + "auxiliary_loss_clip": 0.01151458, + "auxiliary_loss_mlp": 0.01115606, + "balance_loss_clip": 1.00225258, + "balance_loss_mlp": 1.00087917, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 1.6256548683246725, + "language_loss": 0.73547447, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75814509, + "num_input_tokens_seen": 172259875, + "step": 8011, + "time_per_iteration": 2.582047939300537 + }, + { + "auxiliary_loss_clip": 0.01137489, + "auxiliary_loss_mlp": 0.01115256, + "balance_loss_clip": 1.00211346, + "balance_loss_mlp": 1.00071955, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 2.022104253460244, + "language_loss": 0.79202402, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81455147, + "num_input_tokens_seen": 172280150, + "step": 8012, + "time_per_iteration": 2.6199071407318115 + }, + { + "auxiliary_loss_clip": 0.01118077, + "auxiliary_loss_mlp": 0.0111557, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00084329, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 2.2448500842424512, + "language_loss": 0.73626924, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75860572, + "num_input_tokens_seen": 172300810, + "step": 8013, + "time_per_iteration": 4.0928168296813965 + }, + { + "auxiliary_loss_clip": 0.01135258, + "auxiliary_loss_mlp": 0.01115569, + "balance_loss_clip": 1.00221276, + "balance_loss_mlp": 1.00065064, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 1.888305642756549, + "language_loss": 0.90256882, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92507708, + "num_input_tokens_seen": 172317930, + "step": 8014, + "time_per_iteration": 2.590146064758301 + }, + { + "auxiliary_loss_clip": 0.01167958, + "auxiliary_loss_mlp": 0.01115204, + "balance_loss_clip": 1.00212097, + "balance_loss_mlp": 1.00085855, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 3.3120852034690964, + "language_loss": 0.74594998, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76878166, + "num_input_tokens_seen": 172336340, + "step": 8015, + "time_per_iteration": 2.4978365898132324 + }, + { + "auxiliary_loss_clip": 0.0113609, + "auxiliary_loss_mlp": 0.01115869, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.0005697, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.011635710425387, + "language_loss": 0.80727673, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82979631, + "num_input_tokens_seen": 172354315, + "step": 8016, + "time_per_iteration": 2.619138479232788 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01113768, + "balance_loss_clip": 1.00212884, + "balance_loss_mlp": 1.00066185, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 10.09114800012113, + "language_loss": 0.77490091, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.79754609, + "num_input_tokens_seen": 172372695, + "step": 8017, + "time_per_iteration": 2.5659334659576416 + }, + { + "auxiliary_loss_clip": 0.01151093, + "auxiliary_loss_mlp": 0.01113588, + "balance_loss_clip": 1.00218773, + "balance_loss_mlp": 1.00057769, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 1.8712857738716162, + "language_loss": 0.80101985, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82366669, + "num_input_tokens_seen": 172390905, + "step": 8018, + "time_per_iteration": 4.024756908416748 + }, + { + "auxiliary_loss_clip": 0.01120667, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_clip": 1.00201631, + "balance_loss_mlp": 1.00050545, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.806859331372964, + "language_loss": 0.76244593, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78481638, + "num_input_tokens_seen": 172412295, + "step": 8019, + "time_per_iteration": 4.075536251068115 + }, + { + "auxiliary_loss_clip": 0.01118111, + "auxiliary_loss_mlp": 0.01115112, + "balance_loss_clip": 1.00180948, + "balance_loss_mlp": 1.0007664, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.6916565670650174, + "language_loss": 0.78884923, + "learning_rate": 2.212052026199701e-06, + "loss": 0.81118143, + "num_input_tokens_seen": 172432625, + "step": 8020, + "time_per_iteration": 2.6708483695983887 + }, + { + "auxiliary_loss_clip": 0.01167601, + "auxiliary_loss_mlp": 0.0111528, + "balance_loss_clip": 1.00217617, + "balance_loss_mlp": 1.00074399, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 2.4147690900269856, + "language_loss": 0.69549191, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71832067, + "num_input_tokens_seen": 172450010, + "step": 8021, + "time_per_iteration": 3.9197938442230225 + }, + { + "auxiliary_loss_clip": 0.01136773, + "auxiliary_loss_mlp": 0.01116602, + "balance_loss_clip": 1.00223255, + "balance_loss_mlp": 1.00063527, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 2.011223240721051, + "language_loss": 0.63140297, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.65393674, + "num_input_tokens_seen": 172469080, + "step": 8022, + "time_per_iteration": 2.6163477897644043 + }, + { + "auxiliary_loss_clip": 0.01134885, + "auxiliary_loss_mlp": 0.00747323, + "balance_loss_clip": 1.00209165, + "balance_loss_mlp": 1.00030541, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 3.5231049976470743, + "language_loss": 0.66228735, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68110943, + "num_input_tokens_seen": 172484850, + "step": 8023, + "time_per_iteration": 2.612614393234253 + }, + { + "auxiliary_loss_clip": 0.01074498, + "auxiliary_loss_mlp": 0.01115608, + "balance_loss_clip": 1.0018028, + "balance_loss_mlp": 1.00059462, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 1.8828622072724321, + "language_loss": 0.76242596, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78432703, + "num_input_tokens_seen": 172503525, + "step": 8024, + "time_per_iteration": 2.8070573806762695 + }, + { + "auxiliary_loss_clip": 0.01134892, + "auxiliary_loss_mlp": 0.01115421, + "balance_loss_clip": 1.00212383, + "balance_loss_mlp": 1.00059843, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.5562732090900593, + "language_loss": 0.75312066, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.7756238, + "num_input_tokens_seen": 172524360, + "step": 8025, + "time_per_iteration": 2.618873357772827 + }, + { + "auxiliary_loss_clip": 0.01167899, + "auxiliary_loss_mlp": 0.0111526, + "balance_loss_clip": 1.00215423, + "balance_loss_mlp": 1.00062823, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.8477775311329871, + "language_loss": 0.70884252, + "learning_rate": 2.209728283441112e-06, + "loss": 0.73167413, + "num_input_tokens_seen": 172541480, + "step": 8026, + "time_per_iteration": 2.5189766883850098 + }, + { + "auxiliary_loss_clip": 0.01152867, + "auxiliary_loss_mlp": 0.01116526, + "balance_loss_clip": 1.00226295, + "balance_loss_mlp": 1.00084567, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.3563602982126626, + "language_loss": 0.75253278, + "learning_rate": 2.209340965060465e-06, + "loss": 0.77522671, + "num_input_tokens_seen": 172559005, + "step": 8027, + "time_per_iteration": 2.54599666595459 + }, + { + "auxiliary_loss_clip": 0.01134423, + "auxiliary_loss_mlp": 0.0111656, + "balance_loss_clip": 1.00205433, + "balance_loss_mlp": 1.00068903, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.8514067245324493, + "language_loss": 0.67176324, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69427305, + "num_input_tokens_seen": 172578435, + "step": 8028, + "time_per_iteration": 2.618072509765625 + }, + { + "auxiliary_loss_clip": 0.01135791, + "auxiliary_loss_mlp": 0.0111558, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.00075817, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 2.7452788832217876, + "language_loss": 0.73142004, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75393373, + "num_input_tokens_seen": 172596095, + "step": 8029, + "time_per_iteration": 2.587482452392578 + }, + { + "auxiliary_loss_clip": 0.0113674, + "auxiliary_loss_mlp": 0.01115395, + "balance_loss_clip": 1.00211501, + "balance_loss_mlp": 1.00057304, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 2.156000056607274, + "language_loss": 0.84820825, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87072957, + "num_input_tokens_seen": 172615255, + "step": 8030, + "time_per_iteration": 2.624058723449707 + }, + { + "auxiliary_loss_clip": 0.01136256, + "auxiliary_loss_mlp": 0.0111486, + "balance_loss_clip": 1.0020982, + "balance_loss_mlp": 1.00061023, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 1.7822429549469994, + "language_loss": 0.73817301, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76068425, + "num_input_tokens_seen": 172633185, + "step": 8031, + "time_per_iteration": 2.598151683807373 + }, + { + "auxiliary_loss_clip": 0.01136363, + "auxiliary_loss_mlp": 0.01116905, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.00084257, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 2.125958669347001, + "language_loss": 0.71532589, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.73785853, + "num_input_tokens_seen": 172654280, + "step": 8032, + "time_per_iteration": 2.662527561187744 + }, + { + "auxiliary_loss_clip": 0.01152568, + "auxiliary_loss_mlp": 0.01115835, + "balance_loss_clip": 1.00205624, + "balance_loss_mlp": 1.00072682, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.566846448703337, + "language_loss": 0.73910105, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76178509, + "num_input_tokens_seen": 172675545, + "step": 8033, + "time_per_iteration": 2.595517158508301 + }, + { + "auxiliary_loss_clip": 0.01086686, + "auxiliary_loss_mlp": 0.01116406, + "balance_loss_clip": 1.00177455, + "balance_loss_mlp": 1.0008204, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.6425045358837342, + "language_loss": 0.83582425, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.8578552, + "num_input_tokens_seen": 172696455, + "step": 8034, + "time_per_iteration": 2.76239013671875 + }, + { + "auxiliary_loss_clip": 0.01119407, + "auxiliary_loss_mlp": 0.01114523, + "balance_loss_clip": 1.00204408, + "balance_loss_mlp": 1.00055861, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 1.7378756142774168, + "language_loss": 0.79530656, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81764591, + "num_input_tokens_seen": 172716720, + "step": 8035, + "time_per_iteration": 2.6719555854797363 + }, + { + "auxiliary_loss_clip": 0.01134417, + "auxiliary_loss_mlp": 0.00747491, + "balance_loss_clip": 1.00204527, + "balance_loss_mlp": 1.00028896, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.870235159634672, + "language_loss": 0.69733393, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71615303, + "num_input_tokens_seen": 172737435, + "step": 8036, + "time_per_iteration": 2.7843515872955322 + }, + { + "auxiliary_loss_clip": 0.01151035, + "auxiliary_loss_mlp": 0.01115678, + "balance_loss_clip": 1.00203848, + "balance_loss_mlp": 1.00056934, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 2.1711268809077686, + "language_loss": 0.72646719, + "learning_rate": 2.205467347074847e-06, + "loss": 0.7491343, + "num_input_tokens_seen": 172755700, + "step": 8037, + "time_per_iteration": 2.550276517868042 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01117371, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00054598, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 3.2816100233186574, + "language_loss": 0.69163388, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71385205, + "num_input_tokens_seen": 172775185, + "step": 8038, + "time_per_iteration": 2.6789321899414062 + }, + { + "auxiliary_loss_clip": 0.0111712, + "auxiliary_loss_mlp": 0.01115147, + "balance_loss_clip": 1.00197005, + "balance_loss_mlp": 1.00061083, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.664050670778788, + "language_loss": 0.79461884, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8169415, + "num_input_tokens_seen": 172796990, + "step": 8039, + "time_per_iteration": 2.742136240005493 + }, + { + "auxiliary_loss_clip": 0.01151006, + "auxiliary_loss_mlp": 0.01115091, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00055456, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 3.887646343016898, + "language_loss": 0.77257776, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79523873, + "num_input_tokens_seen": 172814915, + "step": 8040, + "time_per_iteration": 2.555860996246338 + }, + { + "auxiliary_loss_clip": 0.01151245, + "auxiliary_loss_mlp": 0.01116038, + "balance_loss_clip": 1.0021106, + "balance_loss_mlp": 1.00064301, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.7764790293866906, + "language_loss": 0.75729692, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77996969, + "num_input_tokens_seen": 172837060, + "step": 8041, + "time_per_iteration": 2.6786935329437256 + }, + { + "auxiliary_loss_clip": 0.01119449, + "auxiliary_loss_mlp": 0.01114566, + "balance_loss_clip": 1.00207639, + "balance_loss_mlp": 1.00060153, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.6704367419792778, + "language_loss": 0.66578418, + "learning_rate": 2.203530244988624e-06, + "loss": 0.6881243, + "num_input_tokens_seen": 172856545, + "step": 8042, + "time_per_iteration": 2.8130760192871094 + }, + { + "auxiliary_loss_clip": 0.01131557, + "auxiliary_loss_mlp": 0.01093425, + "balance_loss_clip": 1.00132108, + "balance_loss_mlp": 0.99996531, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.6871324332389181, + "language_loss": 0.58563435, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60788417, + "num_input_tokens_seen": 172923055, + "step": 8043, + "time_per_iteration": 3.267469882965088 + }, + { + "auxiliary_loss_clip": 0.01137939, + "auxiliary_loss_mlp": 0.01116553, + "balance_loss_clip": 1.00215006, + "balance_loss_mlp": 1.00058627, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 2.366126799148269, + "language_loss": 0.72417343, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.74671841, + "num_input_tokens_seen": 172940700, + "step": 8044, + "time_per_iteration": 2.587900161743164 + }, + { + "auxiliary_loss_clip": 0.01110911, + "auxiliary_loss_mlp": 0.01115676, + "balance_loss_clip": 1.00258231, + "balance_loss_mlp": 1.00056767, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.3019536063382122, + "language_loss": 0.75928092, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78154677, + "num_input_tokens_seen": 172961125, + "step": 8045, + "time_per_iteration": 2.729722261428833 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.01116059, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00066447, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.5715341139863712, + "language_loss": 0.6922158, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71438551, + "num_input_tokens_seen": 172980405, + "step": 8046, + "time_per_iteration": 2.681119918823242 + }, + { + "auxiliary_loss_clip": 0.01167804, + "auxiliary_loss_mlp": 0.01114656, + "balance_loss_clip": 1.00212526, + "balance_loss_mlp": 1.00069165, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 1.8236777846561734, + "language_loss": 0.82307261, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.8458972, + "num_input_tokens_seen": 172999105, + "step": 8047, + "time_per_iteration": 2.5979502201080322 + }, + { + "auxiliary_loss_clip": 0.0113625, + "auxiliary_loss_mlp": 0.01114891, + "balance_loss_clip": 1.00206256, + "balance_loss_mlp": 1.00064111, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.571075410471984, + "language_loss": 0.80928653, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.83179796, + "num_input_tokens_seen": 173019935, + "step": 8048, + "time_per_iteration": 2.6682400703430176 + }, + { + "auxiliary_loss_clip": 0.01151333, + "auxiliary_loss_mlp": 0.01116047, + "balance_loss_clip": 1.00212073, + "balance_loss_mlp": 1.00055707, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.869327029144736, + "language_loss": 0.81590676, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83858061, + "num_input_tokens_seen": 173039700, + "step": 8049, + "time_per_iteration": 2.5994505882263184 + }, + { + "auxiliary_loss_clip": 0.01142592, + "auxiliary_loss_mlp": 0.01114736, + "balance_loss_clip": 1.002563, + "balance_loss_mlp": 1.00067663, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.5529078984440463, + "language_loss": 0.72508228, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74765557, + "num_input_tokens_seen": 173059170, + "step": 8050, + "time_per_iteration": 2.5971035957336426 + }, + { + "auxiliary_loss_clip": 0.01148262, + "auxiliary_loss_mlp": 0.00746105, + "balance_loss_clip": 1.00142622, + "balance_loss_mlp": 0.99988383, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.697123885291636, + "language_loss": 0.56402445, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58296812, + "num_input_tokens_seen": 173119000, + "step": 8051, + "time_per_iteration": 4.521960020065308 + }, + { + "auxiliary_loss_clip": 0.01119304, + "auxiliary_loss_mlp": 0.0111622, + "balance_loss_clip": 1.00214016, + "balance_loss_mlp": 1.00053978, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 1.8660134562965998, + "language_loss": 0.75064349, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77299875, + "num_input_tokens_seen": 173137570, + "step": 8052, + "time_per_iteration": 2.6379706859588623 + }, + { + "auxiliary_loss_clip": 0.01151183, + "auxiliary_loss_mlp": 0.01115633, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.0005244, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 2.091901312526198, + "language_loss": 0.66269028, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68535841, + "num_input_tokens_seen": 173154355, + "step": 8053, + "time_per_iteration": 2.554547071456909 + }, + { + "auxiliary_loss_clip": 0.01151332, + "auxiliary_loss_mlp": 0.01114965, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00052416, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.804275905933712, + "language_loss": 0.69197625, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71463919, + "num_input_tokens_seen": 173174845, + "step": 8054, + "time_per_iteration": 2.6415631771087646 + }, + { + "auxiliary_loss_clip": 0.01078775, + "auxiliary_loss_mlp": 0.01114083, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00059593, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 2.33149228035172, + "language_loss": 0.69659144, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71852005, + "num_input_tokens_seen": 173195025, + "step": 8055, + "time_per_iteration": 2.7810614109039307 + }, + { + "auxiliary_loss_clip": 0.01151219, + "auxiliary_loss_mlp": 0.01115874, + "balance_loss_clip": 1.00226843, + "balance_loss_mlp": 1.00067019, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.0239381490112907, + "language_loss": 0.63304853, + "learning_rate": 2.198105338530685e-06, + "loss": 0.6557194, + "num_input_tokens_seen": 173213065, + "step": 8056, + "time_per_iteration": 3.940404176712036 + }, + { + "auxiliary_loss_clip": 0.01151338, + "auxiliary_loss_mlp": 0.01115259, + "balance_loss_clip": 1.00226283, + "balance_loss_mlp": 1.00053155, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.6816804342592782, + "language_loss": 0.67521101, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69787699, + "num_input_tokens_seen": 173234545, + "step": 8057, + "time_per_iteration": 3.996434211730957 + }, + { + "auxiliary_loss_clip": 0.01122639, + "auxiliary_loss_mlp": 0.01114328, + "balance_loss_clip": 1.00189698, + "balance_loss_mlp": 1.00055444, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.5419192749945678, + "language_loss": 0.81764102, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.84001076, + "num_input_tokens_seen": 173252175, + "step": 8058, + "time_per_iteration": 4.019608020782471 + }, + { + "auxiliary_loss_clip": 0.01152864, + "auxiliary_loss_mlp": 0.01116041, + "balance_loss_clip": 1.00231051, + "balance_loss_mlp": 1.00064659, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.6324024838398807, + "language_loss": 0.80009353, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82278258, + "num_input_tokens_seen": 173268790, + "step": 8059, + "time_per_iteration": 2.5633387565612793 + }, + { + "auxiliary_loss_clip": 0.011681, + "auxiliary_loss_mlp": 0.01116352, + "balance_loss_clip": 1.00226128, + "balance_loss_mlp": 1.00076675, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 1.874767617664204, + "language_loss": 0.66938734, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69223183, + "num_input_tokens_seen": 173288030, + "step": 8060, + "time_per_iteration": 2.6614460945129395 + }, + { + "auxiliary_loss_clip": 0.01151618, + "auxiliary_loss_mlp": 0.0111587, + "balance_loss_clip": 1.00217676, + "balance_loss_mlp": 1.00076151, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 2.3055494581034974, + "language_loss": 0.67100155, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69367641, + "num_input_tokens_seen": 173305965, + "step": 8061, + "time_per_iteration": 2.556130886077881 + }, + { + "auxiliary_loss_clip": 0.01135863, + "auxiliary_loss_mlp": 0.01115974, + "balance_loss_clip": 1.00220323, + "balance_loss_mlp": 1.00057971, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 2.067752577031868, + "language_loss": 0.8238076, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84632593, + "num_input_tokens_seen": 173321985, + "step": 8062, + "time_per_iteration": 2.571141481399536 + }, + { + "auxiliary_loss_clip": 0.01089273, + "auxiliary_loss_mlp": 0.01115558, + "balance_loss_clip": 1.00209427, + "balance_loss_mlp": 1.00083065, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.5901403483782157, + "language_loss": 0.74724472, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76929301, + "num_input_tokens_seen": 173341315, + "step": 8063, + "time_per_iteration": 2.742668390274048 + }, + { + "auxiliary_loss_clip": 0.01136106, + "auxiliary_loss_mlp": 0.01116174, + "balance_loss_clip": 1.00206161, + "balance_loss_mlp": 1.00077891, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 2.26425982213626, + "language_loss": 0.79045665, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.81297946, + "num_input_tokens_seen": 173361055, + "step": 8064, + "time_per_iteration": 2.6933083534240723 + }, + { + "auxiliary_loss_clip": 0.01167778, + "auxiliary_loss_mlp": 0.00747332, + "balance_loss_clip": 1.00235486, + "balance_loss_mlp": 1.00042748, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.9429234669162, + "language_loss": 0.79268229, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81183338, + "num_input_tokens_seen": 173379255, + "step": 8065, + "time_per_iteration": 2.5328643321990967 + }, + { + "auxiliary_loss_clip": 0.01152718, + "auxiliary_loss_mlp": 0.007474, + "balance_loss_clip": 1.00217819, + "balance_loss_mlp": 1.00040662, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 1.5173873396168263, + "language_loss": 0.76403654, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78303772, + "num_input_tokens_seen": 173398370, + "step": 8066, + "time_per_iteration": 2.615267515182495 + }, + { + "auxiliary_loss_clip": 0.01167814, + "auxiliary_loss_mlp": 0.01115476, + "balance_loss_clip": 1.00235105, + "balance_loss_mlp": 1.00055861, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.3666959446971225, + "language_loss": 0.71912706, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74195993, + "num_input_tokens_seen": 173419595, + "step": 8067, + "time_per_iteration": 2.605821371078491 + }, + { + "auxiliary_loss_clip": 0.01084713, + "auxiliary_loss_mlp": 0.01114646, + "balance_loss_clip": 1.00171804, + "balance_loss_mlp": 1.00068235, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.277106189527224, + "language_loss": 0.79104662, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.8130402, + "num_input_tokens_seen": 173435390, + "step": 8068, + "time_per_iteration": 2.7226738929748535 + }, + { + "auxiliary_loss_clip": 0.01135752, + "auxiliary_loss_mlp": 0.01114259, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00077128, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.470879888558324, + "language_loss": 0.84410632, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86660635, + "num_input_tokens_seen": 173454095, + "step": 8069, + "time_per_iteration": 2.6013882160186768 + }, + { + "auxiliary_loss_clip": 0.01119019, + "auxiliary_loss_mlp": 0.01114441, + "balance_loss_clip": 1.00206995, + "balance_loss_mlp": 1.00066781, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 4.15004312312003, + "language_loss": 0.77657712, + "learning_rate": 2.192678959687493e-06, + "loss": 0.79891169, + "num_input_tokens_seen": 173475300, + "step": 8070, + "time_per_iteration": 2.6779327392578125 + }, + { + "auxiliary_loss_clip": 0.01087324, + "auxiliary_loss_mlp": 0.01114523, + "balance_loss_clip": 1.00200808, + "balance_loss_mlp": 1.00046337, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 2.1102434130116503, + "language_loss": 0.78055251, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80257106, + "num_input_tokens_seen": 173492005, + "step": 8071, + "time_per_iteration": 2.6891653537750244 + }, + { + "auxiliary_loss_clip": 0.01085985, + "auxiliary_loss_mlp": 0.01115011, + "balance_loss_clip": 1.00175941, + "balance_loss_mlp": 1.00076079, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 2.795017446236418, + "language_loss": 0.72081566, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74282569, + "num_input_tokens_seen": 173511995, + "step": 8072, + "time_per_iteration": 2.7858662605285645 + }, + { + "auxiliary_loss_clip": 0.01109431, + "auxiliary_loss_mlp": 0.01115778, + "balance_loss_clip": 1.00237787, + "balance_loss_mlp": 1.0007652, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 1.7775644610038315, + "language_loss": 0.87460029, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.89685237, + "num_input_tokens_seen": 173530215, + "step": 8073, + "time_per_iteration": 2.6530754566192627 + }, + { + "auxiliary_loss_clip": 0.0112101, + "auxiliary_loss_mlp": 0.01114964, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.0006187, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.7486886683208436, + "language_loss": 0.60660112, + "learning_rate": 2.19112830093786e-06, + "loss": 0.62896085, + "num_input_tokens_seen": 173550920, + "step": 8074, + "time_per_iteration": 2.712557077407837 + }, + { + "auxiliary_loss_clip": 0.011215, + "auxiliary_loss_mlp": 0.00747521, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00038493, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.6385684709755013, + "language_loss": 0.73304594, + "learning_rate": 2.19074061809469e-06, + "loss": 0.7517361, + "num_input_tokens_seen": 173569065, + "step": 8075, + "time_per_iteration": 2.667454719543457 + }, + { + "auxiliary_loss_clip": 0.01167594, + "auxiliary_loss_mlp": 0.01113551, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00063562, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.7179303620601785, + "language_loss": 0.81687868, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.83969015, + "num_input_tokens_seen": 173596085, + "step": 8076, + "time_per_iteration": 2.9297666549682617 + }, + { + "auxiliary_loss_clip": 0.01137038, + "auxiliary_loss_mlp": 0.01114567, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00069821, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 2.5522388782465444, + "language_loss": 0.86241484, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88493091, + "num_input_tokens_seen": 173613900, + "step": 8077, + "time_per_iteration": 2.605875253677368 + }, + { + "auxiliary_loss_clip": 0.01120465, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_clip": 1.00153363, + "balance_loss_mlp": 1.00011659, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9048441652892717, + "language_loss": 0.58510828, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60724103, + "num_input_tokens_seen": 173671305, + "step": 8078, + "time_per_iteration": 3.163219451904297 + }, + { + "auxiliary_loss_clip": 0.01167896, + "auxiliary_loss_mlp": 0.01115949, + "balance_loss_clip": 1.00225222, + "balance_loss_mlp": 1.00065005, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.584976566512085, + "language_loss": 0.72533953, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74817801, + "num_input_tokens_seen": 173692070, + "step": 8079, + "time_per_iteration": 2.6657145023345947 + }, + { + "auxiliary_loss_clip": 0.01118, + "auxiliary_loss_mlp": 0.01115396, + "balance_loss_clip": 1.00211847, + "balance_loss_mlp": 1.00076389, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 5.984897514533729, + "language_loss": 0.79569197, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81802595, + "num_input_tokens_seen": 173709785, + "step": 8080, + "time_per_iteration": 2.6417195796966553 + }, + { + "auxiliary_loss_clip": 0.01136207, + "auxiliary_loss_mlp": 0.01114945, + "balance_loss_clip": 1.00222135, + "balance_loss_mlp": 1.00059974, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 2.1400219078690412, + "language_loss": 0.84303683, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86554837, + "num_input_tokens_seen": 173728770, + "step": 8081, + "time_per_iteration": 2.6450085639953613 + }, + { + "auxiliary_loss_clip": 0.01152769, + "auxiliary_loss_mlp": 0.0111482, + "balance_loss_clip": 1.00210047, + "balance_loss_mlp": 1.00076032, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.5695559460636697, + "language_loss": 0.82954836, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85222423, + "num_input_tokens_seen": 173747355, + "step": 8082, + "time_per_iteration": 2.56597900390625 + }, + { + "auxiliary_loss_clip": 0.01137194, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_clip": 1.00226915, + "balance_loss_mlp": 1.00060213, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 1.932096175558116, + "language_loss": 0.86833656, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89084268, + "num_input_tokens_seen": 173764825, + "step": 8083, + "time_per_iteration": 2.607356071472168 + }, + { + "auxiliary_loss_clip": 0.01101093, + "auxiliary_loss_mlp": 0.01113769, + "balance_loss_clip": 1.00187862, + "balance_loss_mlp": 1.00094891, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.5438955386984214, + "language_loss": 0.80813789, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.8302865, + "num_input_tokens_seen": 173783215, + "step": 8084, + "time_per_iteration": 2.660885810852051 + }, + { + "auxiliary_loss_clip": 0.01151112, + "auxiliary_loss_mlp": 0.01115481, + "balance_loss_clip": 1.00226128, + "balance_loss_mlp": 1.0007534, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 2.2521168758061, + "language_loss": 0.68643177, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70909768, + "num_input_tokens_seen": 173801905, + "step": 8085, + "time_per_iteration": 2.642026424407959 + }, + { + "auxiliary_loss_clip": 0.01151236, + "auxiliary_loss_mlp": 0.01113923, + "balance_loss_clip": 1.00219572, + "balance_loss_mlp": 1.00081754, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.4347066213371853, + "language_loss": 0.77615356, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79880512, + "num_input_tokens_seen": 173824690, + "step": 8086, + "time_per_iteration": 2.6465847492218018 + }, + { + "auxiliary_loss_clip": 0.01167682, + "auxiliary_loss_mlp": 0.0111437, + "balance_loss_clip": 1.002249, + "balance_loss_mlp": 1.00069261, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 2.0318920526430317, + "language_loss": 0.69558412, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.71840471, + "num_input_tokens_seen": 173844450, + "step": 8087, + "time_per_iteration": 2.633056879043579 + }, + { + "auxiliary_loss_clip": 0.01150951, + "auxiliary_loss_mlp": 0.01116208, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00071836, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.6978635216175968, + "language_loss": 0.72767389, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.75034547, + "num_input_tokens_seen": 173864975, + "step": 8088, + "time_per_iteration": 4.119424104690552 + }, + { + "auxiliary_loss_clip": 0.01135865, + "auxiliary_loss_mlp": 0.01114526, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00075233, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.6697138151070328, + "language_loss": 0.75143266, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77393663, + "num_input_tokens_seen": 173883805, + "step": 8089, + "time_per_iteration": 2.6018364429473877 + }, + { + "auxiliary_loss_clip": 0.0111785, + "auxiliary_loss_mlp": 0.01116017, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00071764, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.9507659700424198, + "language_loss": 0.84146386, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86380249, + "num_input_tokens_seen": 173903520, + "step": 8090, + "time_per_iteration": 2.6401805877685547 + }, + { + "auxiliary_loss_clip": 0.01167568, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_clip": 1.00218856, + "balance_loss_mlp": 1.00063586, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.6081779053350653, + "language_loss": 0.759112, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78193176, + "num_input_tokens_seen": 173924255, + "step": 8091, + "time_per_iteration": 2.546604633331299 + }, + { + "auxiliary_loss_clip": 0.01152926, + "auxiliary_loss_mlp": 0.0111357, + "balance_loss_clip": 1.00219786, + "balance_loss_mlp": 1.00065517, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.6233224622282736, + "language_loss": 0.80572373, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82838869, + "num_input_tokens_seen": 173943285, + "step": 8092, + "time_per_iteration": 2.619093656539917 + }, + { + "auxiliary_loss_clip": 0.01135855, + "auxiliary_loss_mlp": 0.00747364, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00027418, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.4270727354614452, + "language_loss": 0.71727246, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73610473, + "num_input_tokens_seen": 173962205, + "step": 8093, + "time_per_iteration": 3.968170166015625 + }, + { + "auxiliary_loss_clip": 0.01167656, + "auxiliary_loss_mlp": 0.01114671, + "balance_loss_clip": 1.00217879, + "balance_loss_mlp": 1.00061142, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 2.9310021410578733, + "language_loss": 0.68034494, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.70316821, + "num_input_tokens_seen": 173980945, + "step": 8094, + "time_per_iteration": 2.5466156005859375 + }, + { + "auxiliary_loss_clip": 0.01134518, + "auxiliary_loss_mlp": 0.01116102, + "balance_loss_clip": 1.00218129, + "balance_loss_mlp": 1.00061202, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 2.4798618566985895, + "language_loss": 0.67067945, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.69318569, + "num_input_tokens_seen": 173998860, + "step": 8095, + "time_per_iteration": 3.917142629623413 + }, + { + "auxiliary_loss_clip": 0.01153179, + "auxiliary_loss_mlp": 0.01115407, + "balance_loss_clip": 1.00229084, + "balance_loss_mlp": 1.00068021, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.941119879159365, + "language_loss": 0.78258967, + "learning_rate": 2.182597630229345e-06, + "loss": 0.80527556, + "num_input_tokens_seen": 174016665, + "step": 8096, + "time_per_iteration": 3.955781936645508 + }, + { + "auxiliary_loss_clip": 0.01137885, + "auxiliary_loss_mlp": 0.01114671, + "balance_loss_clip": 1.00218165, + "balance_loss_mlp": 1.00070643, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.998028347014867, + "language_loss": 0.67572761, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69825321, + "num_input_tokens_seen": 174034800, + "step": 8097, + "time_per_iteration": 2.610464572906494 + }, + { + "auxiliary_loss_clip": 0.01137372, + "auxiliary_loss_mlp": 0.01115335, + "balance_loss_clip": 1.0021317, + "balance_loss_mlp": 1.00079834, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.5593983377899288, + "language_loss": 0.71643293, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73895997, + "num_input_tokens_seen": 174054445, + "step": 8098, + "time_per_iteration": 2.6174380779266357 + }, + { + "auxiliary_loss_clip": 0.01151181, + "auxiliary_loss_mlp": 0.01116006, + "balance_loss_clip": 1.00211358, + "balance_loss_mlp": 1.00061178, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 2.4826615403540404, + "language_loss": 0.6614241, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68409598, + "num_input_tokens_seen": 174077890, + "step": 8099, + "time_per_iteration": 2.7453322410583496 + }, + { + "auxiliary_loss_clip": 0.0110363, + "auxiliary_loss_mlp": 0.0111492, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00076509, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.7036814208798874, + "language_loss": 0.66744673, + "learning_rate": 2.181046234549138e-06, + "loss": 0.6896323, + "num_input_tokens_seen": 174097460, + "step": 8100, + "time_per_iteration": 2.6923928260803223 + }, + { + "auxiliary_loss_clip": 0.01118718, + "auxiliary_loss_mlp": 0.01114263, + "balance_loss_clip": 1.00198722, + "balance_loss_mlp": 1.00058484, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.3920986957417771, + "language_loss": 0.76720774, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78953755, + "num_input_tokens_seen": 174120775, + "step": 8101, + "time_per_iteration": 2.7432079315185547 + }, + { + "auxiliary_loss_clip": 0.0116275, + "auxiliary_loss_mlp": 0.01092785, + "balance_loss_clip": 1.00141668, + "balance_loss_mlp": 1.00008798, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.7020921329709108, + "language_loss": 0.52315992, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54571527, + "num_input_tokens_seen": 174189135, + "step": 8102, + "time_per_iteration": 3.24904465675354 + }, + { + "auxiliary_loss_clip": 0.01135113, + "auxiliary_loss_mlp": 0.01114552, + "balance_loss_clip": 1.00203359, + "balance_loss_mlp": 1.0005877, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 1.993703086751408, + "language_loss": 0.73491192, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75740862, + "num_input_tokens_seen": 174203250, + "step": 8103, + "time_per_iteration": 2.583794116973877 + }, + { + "auxiliary_loss_clip": 0.01151291, + "auxiliary_loss_mlp": 0.01115396, + "balance_loss_clip": 1.00218117, + "balance_loss_mlp": 1.00105071, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 1.8845668814961976, + "language_loss": 0.62870562, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65137243, + "num_input_tokens_seen": 174224145, + "step": 8104, + "time_per_iteration": 2.614285469055176 + }, + { + "auxiliary_loss_clip": 0.01167662, + "auxiliary_loss_mlp": 0.01114807, + "balance_loss_clip": 1.00213909, + "balance_loss_mlp": 1.00065184, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 1.5745512757612696, + "language_loss": 0.69190061, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71472532, + "num_input_tokens_seen": 174244435, + "step": 8105, + "time_per_iteration": 2.62045955657959 + }, + { + "auxiliary_loss_clip": 0.01124267, + "auxiliary_loss_mlp": 0.01114081, + "balance_loss_clip": 1.00269663, + "balance_loss_mlp": 1.00049877, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.7422912220762024, + "language_loss": 0.73438001, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75676346, + "num_input_tokens_seen": 174262710, + "step": 8106, + "time_per_iteration": 2.650325298309326 + }, + { + "auxiliary_loss_clip": 0.0113406, + "auxiliary_loss_mlp": 0.00747481, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.0003376, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 2.1920418942581423, + "language_loss": 0.76982296, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.78863835, + "num_input_tokens_seen": 174281545, + "step": 8107, + "time_per_iteration": 2.6828694343566895 + }, + { + "auxiliary_loss_clip": 0.01104025, + "auxiliary_loss_mlp": 0.01114537, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00057244, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 1.6185851558315458, + "language_loss": 0.75377274, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.7759583, + "num_input_tokens_seen": 174300290, + "step": 8108, + "time_per_iteration": 2.6806249618530273 + }, + { + "auxiliary_loss_clip": 0.01150926, + "auxiliary_loss_mlp": 0.01113365, + "balance_loss_clip": 1.00221801, + "balance_loss_mlp": 1.00054502, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.7290538439756238, + "language_loss": 0.73692024, + "learning_rate": 2.177555194083212e-06, + "loss": 0.75956321, + "num_input_tokens_seen": 174318490, + "step": 8109, + "time_per_iteration": 2.5483858585357666 + }, + { + "auxiliary_loss_clip": 0.01151051, + "auxiliary_loss_mlp": 0.01114112, + "balance_loss_clip": 1.00213075, + "balance_loss_mlp": 1.00062513, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.8108752925030622, + "language_loss": 0.78325236, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80590403, + "num_input_tokens_seen": 174335505, + "step": 8110, + "time_per_iteration": 2.5554070472717285 + }, + { + "auxiliary_loss_clip": 0.01150957, + "auxiliary_loss_mlp": 0.01115124, + "balance_loss_clip": 1.00218713, + "balance_loss_mlp": 1.00077868, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 2.170251356392201, + "language_loss": 0.7242111, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74687195, + "num_input_tokens_seen": 174353990, + "step": 8111, + "time_per_iteration": 2.5619332790374756 + }, + { + "auxiliary_loss_clip": 0.01151235, + "auxiliary_loss_mlp": 0.01114892, + "balance_loss_clip": 1.0022701, + "balance_loss_mlp": 1.00064182, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.6285006632395462, + "language_loss": 0.76088548, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78354675, + "num_input_tokens_seen": 174373425, + "step": 8112, + "time_per_iteration": 2.5930533409118652 + }, + { + "auxiliary_loss_clip": 0.01151208, + "auxiliary_loss_mlp": 0.01115947, + "balance_loss_clip": 1.0021708, + "balance_loss_mlp": 1.0008384, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.5140251232484878, + "language_loss": 0.75318414, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77585566, + "num_input_tokens_seen": 174393070, + "step": 8113, + "time_per_iteration": 2.5763635635375977 + }, + { + "auxiliary_loss_clip": 0.01131601, + "auxiliary_loss_mlp": 0.00746289, + "balance_loss_clip": 1.00139606, + "balance_loss_mlp": 1.00000024, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.7804826378032714, + "language_loss": 0.48895931, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50773823, + "num_input_tokens_seen": 174446880, + "step": 8114, + "time_per_iteration": 3.054457426071167 + }, + { + "auxiliary_loss_clip": 0.01126384, + "auxiliary_loss_mlp": 0.01115414, + "balance_loss_clip": 1.0021944, + "balance_loss_mlp": 1.00078261, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.3672678071391173, + "language_loss": 0.76747704, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78989506, + "num_input_tokens_seen": 174468485, + "step": 8115, + "time_per_iteration": 2.6599650382995605 + }, + { + "auxiliary_loss_clip": 0.01134226, + "auxiliary_loss_mlp": 0.01115615, + "balance_loss_clip": 1.00226271, + "balance_loss_mlp": 1.00069678, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.0145846733926165, + "language_loss": 0.7189703, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74146867, + "num_input_tokens_seen": 174486360, + "step": 8116, + "time_per_iteration": 2.598599910736084 + }, + { + "auxiliary_loss_clip": 0.0112061, + "auxiliary_loss_mlp": 0.01114812, + "balance_loss_clip": 1.00207615, + "balance_loss_mlp": 1.00075269, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 2.0057167450677715, + "language_loss": 0.6329428, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65529704, + "num_input_tokens_seen": 174505075, + "step": 8117, + "time_per_iteration": 2.6596877574920654 + }, + { + "auxiliary_loss_clip": 0.01135616, + "auxiliary_loss_mlp": 0.01115264, + "balance_loss_clip": 1.00203323, + "balance_loss_mlp": 1.00063217, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.6452297141270236, + "language_loss": 0.78904736, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.8115561, + "num_input_tokens_seen": 174523385, + "step": 8118, + "time_per_iteration": 2.5929601192474365 + }, + { + "auxiliary_loss_clip": 0.01133895, + "auxiliary_loss_mlp": 0.0111534, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00070882, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 1.8055586697933916, + "language_loss": 0.63612747, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65861976, + "num_input_tokens_seen": 174542200, + "step": 8119, + "time_per_iteration": 2.586596965789795 + }, + { + "auxiliary_loss_clip": 0.01092077, + "auxiliary_loss_mlp": 0.0074739, + "balance_loss_clip": 1.0021019, + "balance_loss_mlp": 1.00028956, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.9177059558898228, + "language_loss": 0.721259, + "learning_rate": 2.173287627305878e-06, + "loss": 0.73965371, + "num_input_tokens_seen": 174563620, + "step": 8120, + "time_per_iteration": 2.7454702854156494 + }, + { + "auxiliary_loss_clip": 0.01152451, + "auxiliary_loss_mlp": 0.01115284, + "balance_loss_clip": 1.00222445, + "balance_loss_mlp": 1.00065196, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.7474259231906608, + "language_loss": 0.63846725, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.66114461, + "num_input_tokens_seen": 174586465, + "step": 8121, + "time_per_iteration": 2.7026827335357666 + }, + { + "auxiliary_loss_clip": 0.01153086, + "auxiliary_loss_mlp": 0.01116122, + "balance_loss_clip": 1.00226486, + "balance_loss_mlp": 1.00072718, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.872378205448449, + "language_loss": 0.827981, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.85067308, + "num_input_tokens_seen": 174604035, + "step": 8122, + "time_per_iteration": 2.565009832382202 + }, + { + "auxiliary_loss_clip": 0.01152935, + "auxiliary_loss_mlp": 0.01116068, + "balance_loss_clip": 1.00237203, + "balance_loss_mlp": 1.00076878, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 2.2970740857111034, + "language_loss": 0.85580271, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87849277, + "num_input_tokens_seen": 174621715, + "step": 8123, + "time_per_iteration": 2.5533273220062256 + }, + { + "auxiliary_loss_clip": 0.01119287, + "auxiliary_loss_mlp": 0.01116126, + "balance_loss_clip": 1.001912, + "balance_loss_mlp": 1.00063634, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 1.6623200426067748, + "language_loss": 0.85686904, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87922323, + "num_input_tokens_seen": 174643835, + "step": 8124, + "time_per_iteration": 2.709688186645508 + }, + { + "auxiliary_loss_clip": 0.01136285, + "auxiliary_loss_mlp": 0.01115457, + "balance_loss_clip": 1.00210059, + "balance_loss_mlp": 1.00063431, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 2.030252449441466, + "language_loss": 0.79069889, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81321633, + "num_input_tokens_seen": 174660955, + "step": 8125, + "time_per_iteration": 2.599571466445923 + }, + { + "auxiliary_loss_clip": 0.01103526, + "auxiliary_loss_mlp": 0.01115212, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00058055, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.0826550863043916, + "language_loss": 0.72495353, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74714088, + "num_input_tokens_seen": 174678270, + "step": 8126, + "time_per_iteration": 4.096534967422485 + }, + { + "auxiliary_loss_clip": 0.01152916, + "auxiliary_loss_mlp": 0.0111527, + "balance_loss_clip": 1.00220251, + "balance_loss_mlp": 1.00073338, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 6.848737509789602, + "language_loss": 0.68660414, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70928603, + "num_input_tokens_seen": 174698360, + "step": 8127, + "time_per_iteration": 2.666353464126587 + }, + { + "auxiliary_loss_clip": 0.0116786, + "auxiliary_loss_mlp": 0.01115499, + "balance_loss_clip": 1.00217342, + "balance_loss_mlp": 1.00058115, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 1.8801908017568632, + "language_loss": 0.75975204, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78258568, + "num_input_tokens_seen": 174716755, + "step": 8128, + "time_per_iteration": 2.5275676250457764 + }, + { + "auxiliary_loss_clip": 0.0116776, + "auxiliary_loss_mlp": 0.01114504, + "balance_loss_clip": 1.00223088, + "balance_loss_mlp": 1.00063515, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.5960062016644265, + "language_loss": 0.76035827, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78318095, + "num_input_tokens_seen": 174735560, + "step": 8129, + "time_per_iteration": 2.5175678730010986 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01115893, + "balance_loss_clip": 1.00208783, + "balance_loss_mlp": 1.00068903, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.3962074321050713, + "language_loss": 0.65128386, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67395329, + "num_input_tokens_seen": 174752730, + "step": 8130, + "time_per_iteration": 3.928032159805298 + }, + { + "auxiliary_loss_clip": 0.01120756, + "auxiliary_loss_mlp": 0.01114779, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.0006243, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 1.9575109953517587, + "language_loss": 0.7231642, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74551952, + "num_input_tokens_seen": 174772520, + "step": 8131, + "time_per_iteration": 2.6649651527404785 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01115251, + "balance_loss_clip": 1.00223589, + "balance_loss_mlp": 1.00080967, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.512233661145687, + "language_loss": 0.69478303, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71744847, + "num_input_tokens_seen": 174796540, + "step": 8132, + "time_per_iteration": 4.076133728027344 + }, + { + "auxiliary_loss_clip": 0.01157215, + "auxiliary_loss_mlp": 0.01115164, + "balance_loss_clip": 1.00225413, + "balance_loss_mlp": 1.00053239, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.5296477741120975, + "language_loss": 0.70120704, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72393084, + "num_input_tokens_seen": 174817840, + "step": 8133, + "time_per_iteration": 4.047436714172363 + }, + { + "auxiliary_loss_clip": 0.01106768, + "auxiliary_loss_mlp": 0.01114042, + "balance_loss_clip": 1.00198686, + "balance_loss_mlp": 1.00084138, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 3.020258402916455, + "language_loss": 0.70684576, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72905385, + "num_input_tokens_seen": 174837885, + "step": 8134, + "time_per_iteration": 2.777745485305786 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.01116017, + "balance_loss_clip": 1.00204682, + "balance_loss_mlp": 1.00071788, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 1.8875713325772372, + "language_loss": 0.8035419, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82572687, + "num_input_tokens_seen": 174855240, + "step": 8135, + "time_per_iteration": 2.7045698165893555 + }, + { + "auxiliary_loss_clip": 0.01167536, + "auxiliary_loss_mlp": 0.01113751, + "balance_loss_clip": 1.00211132, + "balance_loss_mlp": 1.00074053, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.6320764082536423, + "language_loss": 0.74246681, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76527965, + "num_input_tokens_seen": 174875145, + "step": 8136, + "time_per_iteration": 2.533726692199707 + }, + { + "auxiliary_loss_clip": 0.01135818, + "auxiliary_loss_mlp": 0.01114779, + "balance_loss_clip": 1.0021162, + "balance_loss_mlp": 1.00062418, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 1.374520589218751, + "language_loss": 0.7349726, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75747859, + "num_input_tokens_seen": 174894770, + "step": 8137, + "time_per_iteration": 2.599360466003418 + }, + { + "auxiliary_loss_clip": 0.01069221, + "auxiliary_loss_mlp": 0.01115396, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00066853, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 2.0684196820180936, + "language_loss": 0.75467229, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.7765184, + "num_input_tokens_seen": 174912780, + "step": 8138, + "time_per_iteration": 2.7414944171905518 + }, + { + "auxiliary_loss_clip": 0.01119731, + "auxiliary_loss_mlp": 0.01113936, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.0006392, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.6460356462516255, + "language_loss": 0.74535912, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76769578, + "num_input_tokens_seen": 174931250, + "step": 8139, + "time_per_iteration": 2.66831374168396 + }, + { + "auxiliary_loss_clip": 0.01152721, + "auxiliary_loss_mlp": 0.01115829, + "balance_loss_clip": 1.00215364, + "balance_loss_mlp": 1.0007205, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.8108573326871522, + "language_loss": 0.61687773, + "learning_rate": 2.165526391632255e-06, + "loss": 0.6395632, + "num_input_tokens_seen": 174951105, + "step": 8140, + "time_per_iteration": 2.58154559135437 + }, + { + "auxiliary_loss_clip": 0.01117732, + "auxiliary_loss_mlp": 0.01116219, + "balance_loss_clip": 1.00199378, + "balance_loss_mlp": 1.00082469, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 1.8255317725242575, + "language_loss": 0.82646215, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84880161, + "num_input_tokens_seen": 174969120, + "step": 8141, + "time_per_iteration": 2.6467270851135254 + }, + { + "auxiliary_loss_clip": 0.0112105, + "auxiliary_loss_mlp": 0.01115795, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00059128, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.774163579368964, + "language_loss": 0.7269358, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74930429, + "num_input_tokens_seen": 174991295, + "step": 8142, + "time_per_iteration": 2.716460943222046 + }, + { + "auxiliary_loss_clip": 0.01167758, + "auxiliary_loss_mlp": 0.01113748, + "balance_loss_clip": 1.00220776, + "balance_loss_mlp": 1.00064254, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.6784560164423779, + "language_loss": 0.67209983, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.69491494, + "num_input_tokens_seen": 175012830, + "step": 8143, + "time_per_iteration": 2.632906913757324 + }, + { + "auxiliary_loss_clip": 0.01152043, + "auxiliary_loss_mlp": 0.00747286, + "balance_loss_clip": 1.00212264, + "balance_loss_mlp": 1.00020885, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 4.078006936444268, + "language_loss": 0.75089324, + "learning_rate": 2.163973839444793e-06, + "loss": 0.76988661, + "num_input_tokens_seen": 175035695, + "step": 8144, + "time_per_iteration": 2.6770613193511963 + }, + { + "auxiliary_loss_clip": 0.01137354, + "auxiliary_loss_mlp": 0.0111445, + "balance_loss_clip": 1.0022049, + "balance_loss_mlp": 1.00067616, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 2.1887906274715676, + "language_loss": 0.75761557, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.78013361, + "num_input_tokens_seen": 175056425, + "step": 8145, + "time_per_iteration": 2.650845527648926 + }, + { + "auxiliary_loss_clip": 0.01136049, + "auxiliary_loss_mlp": 0.00747479, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.00022924, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.6591057317276015, + "language_loss": 0.79882759, + "learning_rate": 2.163197525984761e-06, + "loss": 0.81766284, + "num_input_tokens_seen": 175074800, + "step": 8146, + "time_per_iteration": 2.6119697093963623 + }, + { + "auxiliary_loss_clip": 0.01150885, + "auxiliary_loss_mlp": 0.01114275, + "balance_loss_clip": 1.00212026, + "balance_loss_mlp": 1.00059652, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.824908914456037, + "language_loss": 0.74471253, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76736408, + "num_input_tokens_seen": 175094500, + "step": 8147, + "time_per_iteration": 2.57692813873291 + }, + { + "auxiliary_loss_clip": 0.01142625, + "auxiliary_loss_mlp": 0.01114189, + "balance_loss_clip": 1.00231838, + "balance_loss_mlp": 1.0006063, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.13418969688832, + "language_loss": 0.82781243, + "learning_rate": 2.162421187770864e-06, + "loss": 0.85038054, + "num_input_tokens_seen": 175112920, + "step": 8148, + "time_per_iteration": 2.587101936340332 + }, + { + "auxiliary_loss_clip": 0.01121909, + "auxiliary_loss_mlp": 0.01113936, + "balance_loss_clip": 1.00207949, + "balance_loss_mlp": 1.00063968, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.9043920233728235, + "language_loss": 0.73938107, + "learning_rate": 2.162033009418015e-06, + "loss": 0.76173961, + "num_input_tokens_seen": 175129910, + "step": 8149, + "time_per_iteration": 2.6074044704437256 + }, + { + "auxiliary_loss_clip": 0.01167972, + "auxiliary_loss_mlp": 0.01115773, + "balance_loss_clip": 1.002321, + "balance_loss_mlp": 1.00056911, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 1.605296211731316, + "language_loss": 0.761805, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78464246, + "num_input_tokens_seen": 175148705, + "step": 8150, + "time_per_iteration": 2.556307792663574 + }, + { + "auxiliary_loss_clip": 0.01134425, + "auxiliary_loss_mlp": 0.01115007, + "balance_loss_clip": 1.00212908, + "balance_loss_mlp": 1.0007565, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 2.2436149701295416, + "language_loss": 0.72642225, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.74891663, + "num_input_tokens_seen": 175167425, + "step": 8151, + "time_per_iteration": 2.579374313354492 + }, + { + "auxiliary_loss_clip": 0.01121221, + "auxiliary_loss_mlp": 0.01093087, + "balance_loss_clip": 1.00138962, + "balance_loss_mlp": 1.00000799, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8196529452948138, + "language_loss": 0.54342359, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56556672, + "num_input_tokens_seen": 175227985, + "step": 8152, + "time_per_iteration": 3.1582107543945312 + }, + { + "auxiliary_loss_clip": 0.0110961, + "auxiliary_loss_mlp": 0.01114982, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00063658, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.9984106856516106, + "language_loss": 0.61306727, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.63531315, + "num_input_tokens_seen": 175251895, + "step": 8153, + "time_per_iteration": 2.8598008155822754 + }, + { + "auxiliary_loss_clip": 0.01118204, + "auxiliary_loss_mlp": 0.01113928, + "balance_loss_clip": 1.00188553, + "balance_loss_mlp": 1.00082231, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.653887009782555, + "language_loss": 0.7672776, + "learning_rate": 2.160092025783549e-06, + "loss": 0.78959894, + "num_input_tokens_seen": 175272770, + "step": 8154, + "time_per_iteration": 2.696321725845337 + }, + { + "auxiliary_loss_clip": 0.01131315, + "auxiliary_loss_mlp": 0.01092744, + "balance_loss_clip": 1.00129938, + "balance_loss_mlp": 1.00004673, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9675747368675853, + "language_loss": 0.67075455, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69299507, + "num_input_tokens_seen": 175336320, + "step": 8155, + "time_per_iteration": 3.227219581604004 + }, + { + "auxiliary_loss_clip": 0.01167772, + "auxiliary_loss_mlp": 0.01113626, + "balance_loss_clip": 1.00227678, + "balance_loss_mlp": 1.00061584, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 1.8008184209633997, + "language_loss": 0.76620245, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.78901637, + "num_input_tokens_seen": 175353540, + "step": 8156, + "time_per_iteration": 2.5270235538482666 + }, + { + "auxiliary_loss_clip": 0.01151201, + "auxiliary_loss_mlp": 0.01114708, + "balance_loss_clip": 1.00227046, + "balance_loss_mlp": 1.00064909, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.0587674333550092, + "language_loss": 0.83758718, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.8602463, + "num_input_tokens_seen": 175370445, + "step": 8157, + "time_per_iteration": 2.565826177597046 + }, + { + "auxiliary_loss_clip": 0.01152309, + "auxiliary_loss_mlp": 0.01113712, + "balance_loss_clip": 1.00221515, + "balance_loss_mlp": 1.0006063, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 5.288795078295083, + "language_loss": 0.79962754, + "learning_rate": 2.158539129514956e-06, + "loss": 0.8222878, + "num_input_tokens_seen": 175389020, + "step": 8158, + "time_per_iteration": 2.577850103378296 + }, + { + "auxiliary_loss_clip": 0.01167893, + "auxiliary_loss_mlp": 0.01114993, + "balance_loss_clip": 1.00226426, + "balance_loss_mlp": 1.0006479, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 1.6058562715263278, + "language_loss": 0.69250679, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71533567, + "num_input_tokens_seen": 175409545, + "step": 8159, + "time_per_iteration": 2.5506179332733154 + }, + { + "auxiliary_loss_clip": 0.01152505, + "auxiliary_loss_mlp": 0.01114303, + "balance_loss_clip": 1.0021714, + "balance_loss_mlp": 1.00062537, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 2.122766732080215, + "language_loss": 0.73070973, + "learning_rate": 2.157762645250854e-06, + "loss": 0.7533778, + "num_input_tokens_seen": 175429335, + "step": 8160, + "time_per_iteration": 2.5504376888275146 + }, + { + "auxiliary_loss_clip": 0.01151469, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_clip": 1.00213146, + "balance_loss_mlp": 1.0007602, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 1.8981211862220257, + "language_loss": 0.71439654, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73706424, + "num_input_tokens_seen": 175446955, + "step": 8161, + "time_per_iteration": 2.5449626445770264 + }, + { + "auxiliary_loss_clip": 0.01104243, + "auxiliary_loss_mlp": 0.01113814, + "balance_loss_clip": 1.00209665, + "balance_loss_mlp": 1.00061262, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.9834443339757755, + "language_loss": 0.68558621, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70776677, + "num_input_tokens_seen": 175468195, + "step": 8162, + "time_per_iteration": 2.727691650390625 + }, + { + "auxiliary_loss_clip": 0.01151508, + "auxiliary_loss_mlp": 0.01116024, + "balance_loss_clip": 1.00220859, + "balance_loss_mlp": 1.00062943, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 2.342476154302021, + "language_loss": 0.63398349, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65665889, + "num_input_tokens_seen": 175487455, + "step": 8163, + "time_per_iteration": 2.5547103881835938 + }, + { + "auxiliary_loss_clip": 0.01119963, + "auxiliary_loss_mlp": 0.01113673, + "balance_loss_clip": 1.00206983, + "balance_loss_mlp": 1.00066304, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 2.9036618748024354, + "language_loss": 0.76342809, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.78576446, + "num_input_tokens_seen": 175504450, + "step": 8164, + "time_per_iteration": 4.038739204406738 + }, + { + "auxiliary_loss_clip": 0.01152589, + "auxiliary_loss_mlp": 0.0111524, + "balance_loss_clip": 1.00211811, + "balance_loss_mlp": 1.00051332, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.6944972413628823, + "language_loss": 0.76408684, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78676516, + "num_input_tokens_seen": 175523600, + "step": 8165, + "time_per_iteration": 2.5488908290863037 + }, + { + "auxiliary_loss_clip": 0.01135383, + "auxiliary_loss_mlp": 0.0111487, + "balance_loss_clip": 1.00209677, + "balance_loss_mlp": 1.00062013, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.6093888004868657, + "language_loss": 0.77354193, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79604447, + "num_input_tokens_seen": 175542720, + "step": 8166, + "time_per_iteration": 2.611543655395508 + }, + { + "auxiliary_loss_clip": 0.01147608, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_clip": 1.00139713, + "balance_loss_mlp": 0.99995726, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.8035497321279745, + "language_loss": 0.54311198, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56551462, + "num_input_tokens_seen": 175598640, + "step": 8167, + "time_per_iteration": 3.150541067123413 + }, + { + "auxiliary_loss_clip": 0.01101366, + "auxiliary_loss_mlp": 0.01113805, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00050831, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 1.8968599951407301, + "language_loss": 0.85804409, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.88019586, + "num_input_tokens_seen": 175615675, + "step": 8168, + "time_per_iteration": 4.126108646392822 + }, + { + "auxiliary_loss_clip": 0.01152422, + "auxiliary_loss_mlp": 0.01113302, + "balance_loss_clip": 1.00222206, + "balance_loss_mlp": 1.00067282, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6336864715753592, + "language_loss": 0.7332505, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75590777, + "num_input_tokens_seen": 175632255, + "step": 8169, + "time_per_iteration": 2.5423543453216553 + }, + { + "auxiliary_loss_clip": 0.01150922, + "auxiliary_loss_mlp": 0.0111383, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00053382, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.6586983424217747, + "language_loss": 0.78192228, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80456978, + "num_input_tokens_seen": 175651625, + "step": 8170, + "time_per_iteration": 3.9524827003479004 + }, + { + "auxiliary_loss_clip": 0.01138139, + "auxiliary_loss_mlp": 0.01113994, + "balance_loss_clip": 1.00215268, + "balance_loss_mlp": 1.00060248, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 2.3714570886976083, + "language_loss": 0.76162744, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78414875, + "num_input_tokens_seen": 175669265, + "step": 8171, + "time_per_iteration": 4.0143046379089355 + }, + { + "auxiliary_loss_clip": 0.01136119, + "auxiliary_loss_mlp": 0.01114853, + "balance_loss_clip": 1.00203001, + "balance_loss_mlp": 1.0007937, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 2.4255874872864767, + "language_loss": 0.81826305, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.84077275, + "num_input_tokens_seen": 175686065, + "step": 8172, + "time_per_iteration": 2.5800023078918457 + }, + { + "auxiliary_loss_clip": 0.01146455, + "auxiliary_loss_mlp": 0.01092657, + "balance_loss_clip": 1.00137055, + "balance_loss_mlp": 0.99996012, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.7138860420691788, + "language_loss": 0.53263688, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55502796, + "num_input_tokens_seen": 175748595, + "step": 8173, + "time_per_iteration": 3.128122329711914 + }, + { + "auxiliary_loss_clip": 0.01152397, + "auxiliary_loss_mlp": 0.00747425, + "balance_loss_clip": 1.00217247, + "balance_loss_mlp": 1.0003159, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.6772892303195406, + "language_loss": 0.62865478, + "learning_rate": 2.152326591972107e-06, + "loss": 0.64765298, + "num_input_tokens_seen": 175766770, + "step": 8174, + "time_per_iteration": 2.5540733337402344 + }, + { + "auxiliary_loss_clip": 0.01119266, + "auxiliary_loss_mlp": 0.01114789, + "balance_loss_clip": 1.00221157, + "balance_loss_mlp": 1.00072908, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 2.208665756299682, + "language_loss": 0.69324058, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71558112, + "num_input_tokens_seen": 175783605, + "step": 8175, + "time_per_iteration": 2.6212098598480225 + }, + { + "auxiliary_loss_clip": 0.01152673, + "auxiliary_loss_mlp": 0.01114037, + "balance_loss_clip": 1.00204313, + "balance_loss_mlp": 1.00055003, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.6729861765880814, + "language_loss": 0.74877334, + "learning_rate": 2.151549919570068e-06, + "loss": 0.77144045, + "num_input_tokens_seen": 175801390, + "step": 8176, + "time_per_iteration": 2.5636749267578125 + }, + { + "auxiliary_loss_clip": 0.0115295, + "auxiliary_loss_mlp": 0.01114896, + "balance_loss_clip": 1.00222373, + "balance_loss_mlp": 1.00074124, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.9026373298063182, + "language_loss": 0.69929647, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.72197497, + "num_input_tokens_seen": 175819830, + "step": 8177, + "time_per_iteration": 2.552610397338867 + }, + { + "auxiliary_loss_clip": 0.01129737, + "auxiliary_loss_mlp": 0.00746212, + "balance_loss_clip": 1.00135493, + "balance_loss_mlp": 0.99999964, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6868747605740068, + "language_loss": 0.4621529, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48091239, + "num_input_tokens_seen": 175881765, + "step": 8178, + "time_per_iteration": 3.1341137886047363 + }, + { + "auxiliary_loss_clip": 0.0116793, + "auxiliary_loss_mlp": 0.01115095, + "balance_loss_clip": 1.00228858, + "balance_loss_mlp": 1.00074935, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.7041973893394395, + "language_loss": 0.65822661, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68105686, + "num_input_tokens_seen": 175901795, + "step": 8179, + "time_per_iteration": 2.558008909225464 + }, + { + "auxiliary_loss_clip": 0.01061741, + "auxiliary_loss_mlp": 0.01114917, + "balance_loss_clip": 1.00237703, + "balance_loss_mlp": 1.00057113, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 2.162877303371075, + "language_loss": 0.70245969, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72422624, + "num_input_tokens_seen": 175917770, + "step": 8180, + "time_per_iteration": 3.010488510131836 + }, + { + "auxiliary_loss_clip": 0.01134403, + "auxiliary_loss_mlp": 0.01114528, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.0006597, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 2.441560671776837, + "language_loss": 0.84352767, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86601698, + "num_input_tokens_seen": 175937000, + "step": 8181, + "time_per_iteration": 2.937760829925537 + }, + { + "auxiliary_loss_clip": 0.01167589, + "auxiliary_loss_mlp": 0.01113239, + "balance_loss_clip": 1.00225818, + "balance_loss_mlp": 1.00061023, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.9702198034110014, + "language_loss": 0.73011839, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.75292671, + "num_input_tokens_seen": 175955170, + "step": 8182, + "time_per_iteration": 2.517205238342285 + }, + { + "auxiliary_loss_clip": 0.01117754, + "auxiliary_loss_mlp": 0.01113288, + "balance_loss_clip": 1.00189602, + "balance_loss_mlp": 1.00075436, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.3699885708624673, + "language_loss": 0.72938114, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.75169152, + "num_input_tokens_seen": 175973725, + "step": 8183, + "time_per_iteration": 2.666785955429077 + }, + { + "auxiliary_loss_clip": 0.01087879, + "auxiliary_loss_mlp": 0.01115774, + "balance_loss_clip": 1.00201368, + "balance_loss_mlp": 1.00047481, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 2.213775808985013, + "language_loss": 0.77122247, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79325902, + "num_input_tokens_seen": 175993885, + "step": 8184, + "time_per_iteration": 2.7447006702423096 + }, + { + "auxiliary_loss_clip": 0.01136258, + "auxiliary_loss_mlp": 0.01113661, + "balance_loss_clip": 1.00217581, + "balance_loss_mlp": 1.0005548, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.7972515895715044, + "language_loss": 0.70505476, + "learning_rate": 2.148054610995789e-06, + "loss": 0.72755396, + "num_input_tokens_seen": 176014210, + "step": 8185, + "time_per_iteration": 2.6233279705047607 + }, + { + "auxiliary_loss_clip": 0.01136284, + "auxiliary_loss_mlp": 0.0111535, + "balance_loss_clip": 1.00216389, + "balance_loss_mlp": 1.00062299, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 1.9560053074581418, + "language_loss": 0.75109249, + "learning_rate": 2.147666215108831e-06, + "loss": 0.7736088, + "num_input_tokens_seen": 176033890, + "step": 8186, + "time_per_iteration": 2.625234603881836 + }, + { + "auxiliary_loss_clip": 0.01150844, + "auxiliary_loss_mlp": 0.01115032, + "balance_loss_clip": 1.00220346, + "balance_loss_mlp": 1.00068641, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.1993392613502913, + "language_loss": 0.68194973, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.70460856, + "num_input_tokens_seen": 176052720, + "step": 8187, + "time_per_iteration": 2.56121826171875 + }, + { + "auxiliary_loss_clip": 0.01104195, + "auxiliary_loss_mlp": 0.01114668, + "balance_loss_clip": 1.00201809, + "balance_loss_mlp": 1.00070429, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.653703625922272, + "language_loss": 0.6657064, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.687895, + "num_input_tokens_seen": 176072545, + "step": 8188, + "time_per_iteration": 2.704481840133667 + }, + { + "auxiliary_loss_clip": 0.01151164, + "auxiliary_loss_mlp": 0.01113957, + "balance_loss_clip": 1.00223041, + "balance_loss_mlp": 1.00066066, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.7414392846665265, + "language_loss": 0.74637592, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76902717, + "num_input_tokens_seen": 176091490, + "step": 8189, + "time_per_iteration": 2.6100828647613525 + }, + { + "auxiliary_loss_clip": 0.01136118, + "auxiliary_loss_mlp": 0.01113185, + "balance_loss_clip": 1.00213099, + "balance_loss_mlp": 1.00055623, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 2.2408057449793017, + "language_loss": 0.64547151, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66796458, + "num_input_tokens_seen": 176113200, + "step": 8190, + "time_per_iteration": 2.7260935306549072 + }, + { + "auxiliary_loss_clip": 0.01167818, + "auxiliary_loss_mlp": 0.01114663, + "balance_loss_clip": 1.00232506, + "balance_loss_mlp": 1.00060403, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 1.7993981901851153, + "language_loss": 0.71205813, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73488295, + "num_input_tokens_seen": 176132485, + "step": 8191, + "time_per_iteration": 2.5220587253570557 + }, + { + "auxiliary_loss_clip": 0.01167756, + "auxiliary_loss_mlp": 0.00747382, + "balance_loss_clip": 1.00226557, + "balance_loss_mlp": 1.00032568, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.5778254182748774, + "language_loss": 0.71767545, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.7368269, + "num_input_tokens_seen": 176155755, + "step": 8192, + "time_per_iteration": 2.692227363586426 + }, + { + "auxiliary_loss_clip": 0.01129914, + "auxiliary_loss_mlp": 0.01092752, + "balance_loss_clip": 1.00135756, + "balance_loss_mlp": 1.00005484, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7325348617006261, + "language_loss": 0.52154881, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54377544, + "num_input_tokens_seen": 176216295, + "step": 8193, + "time_per_iteration": 3.204561948776245 + }, + { + "auxiliary_loss_clip": 0.01167781, + "auxiliary_loss_mlp": 0.0111462, + "balance_loss_clip": 1.00231779, + "balance_loss_mlp": 1.00075173, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.516942039209934, + "language_loss": 0.76689601, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.78972, + "num_input_tokens_seen": 176235925, + "step": 8194, + "time_per_iteration": 2.532470703125 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.01114298, + "balance_loss_clip": 1.00209463, + "balance_loss_mlp": 1.00052488, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 2.2875773711531084, + "language_loss": 0.7027812, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72529662, + "num_input_tokens_seen": 176253865, + "step": 8195, + "time_per_iteration": 2.616328716278076 + }, + { + "auxiliary_loss_clip": 0.01103057, + "auxiliary_loss_mlp": 0.01114068, + "balance_loss_clip": 1.00196433, + "balance_loss_mlp": 1.00048566, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 1.9053832658683976, + "language_loss": 0.80630034, + "learning_rate": 2.143781950696001e-06, + "loss": 0.82847166, + "num_input_tokens_seen": 176271525, + "step": 8196, + "time_per_iteration": 2.713240623474121 + }, + { + "auxiliary_loss_clip": 0.01122007, + "auxiliary_loss_mlp": 0.01114781, + "balance_loss_clip": 1.00214326, + "balance_loss_mlp": 1.00072169, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 2.5659758439399445, + "language_loss": 0.70759737, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72996527, + "num_input_tokens_seen": 176290810, + "step": 8197, + "time_per_iteration": 2.6502294540405273 + }, + { + "auxiliary_loss_clip": 0.01150858, + "auxiliary_loss_mlp": 0.011133, + "balance_loss_clip": 1.00223017, + "balance_loss_mlp": 1.00067151, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 1.8745884273700617, + "language_loss": 0.84158838, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86422998, + "num_input_tokens_seen": 176309165, + "step": 8198, + "time_per_iteration": 2.5392589569091797 + }, + { + "auxiliary_loss_clip": 0.01152954, + "auxiliary_loss_mlp": 0.01115021, + "balance_loss_clip": 1.00220668, + "balance_loss_mlp": 1.00058007, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 3.081627142671009, + "language_loss": 0.76536405, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78804374, + "num_input_tokens_seen": 176324960, + "step": 8199, + "time_per_iteration": 2.5317201614379883 + }, + { + "auxiliary_loss_clip": 0.01142549, + "auxiliary_loss_mlp": 0.01115765, + "balance_loss_clip": 1.00224626, + "balance_loss_mlp": 1.00075161, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.928762593460368, + "language_loss": 0.59812057, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.6207037, + "num_input_tokens_seen": 176346195, + "step": 8200, + "time_per_iteration": 2.6326146125793457 + }, + { + "auxiliary_loss_clip": 0.01152578, + "auxiliary_loss_mlp": 0.01113999, + "balance_loss_clip": 1.00215924, + "balance_loss_mlp": 1.00070262, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.545368516975938, + "language_loss": 0.78804529, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81071103, + "num_input_tokens_seen": 176366735, + "step": 8201, + "time_per_iteration": 4.069760322570801 + }, + { + "auxiliary_loss_clip": 0.01152887, + "auxiliary_loss_mlp": 0.01115885, + "balance_loss_clip": 1.00216889, + "balance_loss_mlp": 1.00049007, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.395962181046955, + "language_loss": 0.68068141, + "learning_rate": 2.141451129398785e-06, + "loss": 0.70336914, + "num_input_tokens_seen": 176384475, + "step": 8202, + "time_per_iteration": 2.5210118293762207 + }, + { + "auxiliary_loss_clip": 0.01134066, + "auxiliary_loss_mlp": 0.0111444, + "balance_loss_clip": 1.00203097, + "balance_loss_mlp": 1.00057101, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 47.99598987231296, + "language_loss": 0.75511026, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77759528, + "num_input_tokens_seen": 176402645, + "step": 8203, + "time_per_iteration": 2.632105827331543 + }, + { + "auxiliary_loss_clip": 0.01101157, + "auxiliary_loss_mlp": 0.01115135, + "balance_loss_clip": 1.0017457, + "balance_loss_mlp": 1.00059867, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 3.3410045955302676, + "language_loss": 0.80264568, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82480866, + "num_input_tokens_seen": 176416715, + "step": 8204, + "time_per_iteration": 2.671046257019043 + }, + { + "auxiliary_loss_clip": 0.01150928, + "auxiliary_loss_mlp": 0.0111443, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.00075245, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 2.004982271079954, + "language_loss": 0.66157085, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68422443, + "num_input_tokens_seen": 176435755, + "step": 8205, + "time_per_iteration": 2.61252760887146 + }, + { + "auxiliary_loss_clip": 0.01167902, + "auxiliary_loss_mlp": 0.01115993, + "balance_loss_clip": 1.00217652, + "balance_loss_mlp": 1.00059819, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 3.093693613586198, + "language_loss": 0.66708684, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68992579, + "num_input_tokens_seen": 176453915, + "step": 8206, + "time_per_iteration": 4.3678343296051025 + }, + { + "auxiliary_loss_clip": 0.01121365, + "auxiliary_loss_mlp": 0.01114747, + "balance_loss_clip": 1.00219893, + "balance_loss_mlp": 1.00059187, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.9034016409895256, + "language_loss": 0.76257014, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78493124, + "num_input_tokens_seen": 176475175, + "step": 8207, + "time_per_iteration": 2.8891172409057617 + }, + { + "auxiliary_loss_clip": 0.01140645, + "auxiliary_loss_mlp": 0.01114915, + "balance_loss_clip": 1.00225294, + "balance_loss_mlp": 1.00056934, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 1.972626329915613, + "language_loss": 0.60346341, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.626019, + "num_input_tokens_seen": 176494250, + "step": 8208, + "time_per_iteration": 4.02635931968689 + }, + { + "auxiliary_loss_clip": 0.01135962, + "auxiliary_loss_mlp": 0.01114929, + "balance_loss_clip": 1.0020144, + "balance_loss_mlp": 1.00067842, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 1.8572799839142753, + "language_loss": 0.7842319, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80674076, + "num_input_tokens_seen": 176513325, + "step": 8209, + "time_per_iteration": 4.021469354629517 + }, + { + "auxiliary_loss_clip": 0.01137646, + "auxiliary_loss_mlp": 0.00747492, + "balance_loss_clip": 1.0021261, + "balance_loss_mlp": 1.00039625, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 1.7867296129918457, + "language_loss": 0.78716958, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80602098, + "num_input_tokens_seen": 176532915, + "step": 8210, + "time_per_iteration": 2.636367082595825 + }, + { + "auxiliary_loss_clip": 0.01151443, + "auxiliary_loss_mlp": 0.01114469, + "balance_loss_clip": 1.00208879, + "balance_loss_mlp": 1.0006001, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 2.261205292890612, + "language_loss": 0.81300831, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83566737, + "num_input_tokens_seen": 176552775, + "step": 8211, + "time_per_iteration": 2.5832154750823975 + }, + { + "auxiliary_loss_clip": 0.01121396, + "auxiliary_loss_mlp": 0.01115721, + "balance_loss_clip": 1.0022279, + "balance_loss_mlp": 1.00080347, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.090783198863484, + "language_loss": 0.91400397, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93637514, + "num_input_tokens_seen": 176572185, + "step": 8212, + "time_per_iteration": 2.668407440185547 + }, + { + "auxiliary_loss_clip": 0.01121113, + "auxiliary_loss_mlp": 0.01114202, + "balance_loss_clip": 1.00211155, + "balance_loss_mlp": 1.0007143, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.9491591759411366, + "language_loss": 0.64252764, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.66488081, + "num_input_tokens_seen": 176591490, + "step": 8213, + "time_per_iteration": 2.6962363719940186 + }, + { + "auxiliary_loss_clip": 0.01102542, + "auxiliary_loss_mlp": 0.00747547, + "balance_loss_clip": 1.00198972, + "balance_loss_mlp": 1.00048161, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.7418691936238304, + "language_loss": 0.76233512, + "learning_rate": 2.136788910691711e-06, + "loss": 0.78083599, + "num_input_tokens_seen": 176612715, + "step": 8214, + "time_per_iteration": 2.7653579711914062 + }, + { + "auxiliary_loss_clip": 0.01167863, + "auxiliary_loss_mlp": 0.01115816, + "balance_loss_clip": 1.00235105, + "balance_loss_mlp": 1.00080252, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.8612790589930883, + "language_loss": 0.8478117, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.8706485, + "num_input_tokens_seen": 176631950, + "step": 8215, + "time_per_iteration": 2.5392496585845947 + }, + { + "auxiliary_loss_clip": 0.01150938, + "auxiliary_loss_mlp": 0.01113386, + "balance_loss_clip": 1.00222087, + "balance_loss_mlp": 1.00056696, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.7221395595144602, + "language_loss": 0.83607227, + "learning_rate": 2.136011800934292e-06, + "loss": 0.85871553, + "num_input_tokens_seen": 176653060, + "step": 8216, + "time_per_iteration": 2.6581149101257324 + }, + { + "auxiliary_loss_clip": 0.01133871, + "auxiliary_loss_mlp": 0.01113943, + "balance_loss_clip": 1.00197852, + "balance_loss_mlp": 1.00064659, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.3492511150567783, + "language_loss": 0.74252725, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76500547, + "num_input_tokens_seen": 176673895, + "step": 8217, + "time_per_iteration": 2.63189959526062 + }, + { + "auxiliary_loss_clip": 0.01167744, + "auxiliary_loss_mlp": 0.00747278, + "balance_loss_clip": 1.00231814, + "balance_loss_mlp": 1.00049949, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.8872563978256554, + "language_loss": 0.78389114, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80304134, + "num_input_tokens_seen": 176692550, + "step": 8218, + "time_per_iteration": 2.5298895835876465 + }, + { + "auxiliary_loss_clip": 0.01100165, + "auxiliary_loss_mlp": 0.00747413, + "balance_loss_clip": 1.00188327, + "balance_loss_mlp": 1.00038636, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.773110137419443, + "language_loss": 0.76860154, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78707731, + "num_input_tokens_seen": 176709335, + "step": 8219, + "time_per_iteration": 2.656586170196533 + }, + { + "auxiliary_loss_clip": 0.0113439, + "auxiliary_loss_mlp": 0.01113734, + "balance_loss_clip": 1.0020299, + "balance_loss_mlp": 1.00072384, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.680651749035097, + "language_loss": 0.62520105, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64768231, + "num_input_tokens_seen": 176727715, + "step": 8220, + "time_per_iteration": 2.5843605995178223 + }, + { + "auxiliary_loss_clip": 0.01167664, + "auxiliary_loss_mlp": 0.01114314, + "balance_loss_clip": 1.00222206, + "balance_loss_mlp": 1.00082695, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 2.3385593984543624, + "language_loss": 0.72211504, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74493492, + "num_input_tokens_seen": 176747530, + "step": 8221, + "time_per_iteration": 2.52386474609375 + }, + { + "auxiliary_loss_clip": 0.0111949, + "auxiliary_loss_mlp": 0.01114164, + "balance_loss_clip": 1.00212812, + "balance_loss_mlp": 1.00067687, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.7173157378612882, + "language_loss": 0.7890209, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81135744, + "num_input_tokens_seen": 176765260, + "step": 8222, + "time_per_iteration": 2.657334089279175 + }, + { + "auxiliary_loss_clip": 0.01151174, + "auxiliary_loss_mlp": 0.01114765, + "balance_loss_clip": 1.00229228, + "balance_loss_mlp": 1.00070584, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 3.134386515652611, + "language_loss": 0.72007018, + "learning_rate": 2.133291755093088e-06, + "loss": 0.7427296, + "num_input_tokens_seen": 176781770, + "step": 8223, + "time_per_iteration": 2.5296499729156494 + }, + { + "auxiliary_loss_clip": 0.0115129, + "auxiliary_loss_mlp": 0.01115527, + "balance_loss_clip": 1.00232267, + "balance_loss_mlp": 1.00080013, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 2.766305742321528, + "language_loss": 0.7502358, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77290398, + "num_input_tokens_seen": 176800655, + "step": 8224, + "time_per_iteration": 2.5675501823425293 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01114617, + "balance_loss_clip": 1.00243104, + "balance_loss_mlp": 1.00065303, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.0140880667690673, + "language_loss": 0.6405071, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.66306174, + "num_input_tokens_seen": 176820610, + "step": 8225, + "time_per_iteration": 2.6469638347625732 + }, + { + "auxiliary_loss_clip": 0.01135274, + "auxiliary_loss_mlp": 0.01114381, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.00079811, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 2.4487665304496344, + "language_loss": 0.76095498, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78345156, + "num_input_tokens_seen": 176840520, + "step": 8226, + "time_per_iteration": 2.6080679893493652 + }, + { + "auxiliary_loss_clip": 0.01167718, + "auxiliary_loss_mlp": 0.01114245, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.00066233, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.5616333791879817, + "language_loss": 0.70760733, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73042691, + "num_input_tokens_seen": 176860265, + "step": 8227, + "time_per_iteration": 2.5603041648864746 + }, + { + "auxiliary_loss_clip": 0.01134667, + "auxiliary_loss_mlp": 0.01116096, + "balance_loss_clip": 1.00219405, + "balance_loss_mlp": 1.00060606, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.4929405962569626, + "language_loss": 0.71871185, + "learning_rate": 2.131348713278718e-06, + "loss": 0.74121946, + "num_input_tokens_seen": 176882910, + "step": 8228, + "time_per_iteration": 2.787323236465454 + }, + { + "auxiliary_loss_clip": 0.01167549, + "auxiliary_loss_mlp": 0.01113178, + "balance_loss_clip": 1.00222051, + "balance_loss_mlp": 1.00064433, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.4074375412124513, + "language_loss": 0.84068692, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.86349416, + "num_input_tokens_seen": 176903030, + "step": 8229, + "time_per_iteration": 2.568855047225952 + }, + { + "auxiliary_loss_clip": 0.01152247, + "auxiliary_loss_mlp": 0.01114575, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00061047, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 1.8498986671627, + "language_loss": 0.7461583, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.76882648, + "num_input_tokens_seen": 176919025, + "step": 8230, + "time_per_iteration": 2.543504238128662 + }, + { + "auxiliary_loss_clip": 0.01151414, + "auxiliary_loss_mlp": 0.01114322, + "balance_loss_clip": 1.00221741, + "balance_loss_mlp": 1.000453, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 1.865399182800044, + "language_loss": 0.7968967, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.81955397, + "num_input_tokens_seen": 176937945, + "step": 8231, + "time_per_iteration": 2.5324950218200684 + }, + { + "auxiliary_loss_clip": 0.01145497, + "auxiliary_loss_mlp": 0.01092788, + "balance_loss_clip": 1.00124836, + "balance_loss_mlp": 1.00009048, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.747118153664306, + "language_loss": 0.60255063, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62493348, + "num_input_tokens_seen": 177004575, + "step": 8232, + "time_per_iteration": 3.2440364360809326 + }, + { + "auxiliary_loss_clip": 0.01138033, + "auxiliary_loss_mlp": 0.01115334, + "balance_loss_clip": 1.00221586, + "balance_loss_mlp": 1.00070238, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.8235167324597983, + "language_loss": 0.69049573, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71302938, + "num_input_tokens_seen": 177024155, + "step": 8233, + "time_per_iteration": 2.6341257095336914 + }, + { + "auxiliary_loss_clip": 0.0110474, + "auxiliary_loss_mlp": 0.01114335, + "balance_loss_clip": 1.00203657, + "balance_loss_mlp": 1.00056171, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 1.8795067248691755, + "language_loss": 0.66434455, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68653524, + "num_input_tokens_seen": 177046185, + "step": 8234, + "time_per_iteration": 2.8163180351257324 + }, + { + "auxiliary_loss_clip": 0.01131005, + "auxiliary_loss_mlp": 0.01092428, + "balance_loss_clip": 1.00139475, + "balance_loss_mlp": 1.00011182, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8032676386243727, + "language_loss": 0.57989132, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60212564, + "num_input_tokens_seen": 177099025, + "step": 8235, + "time_per_iteration": 3.065685987472534 + }, + { + "auxiliary_loss_clip": 0.01122892, + "auxiliary_loss_mlp": 0.01114695, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00063515, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 2.119080435149112, + "language_loss": 0.77166444, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.79404026, + "num_input_tokens_seen": 177118365, + "step": 8236, + "time_per_iteration": 2.648360013961792 + }, + { + "auxiliary_loss_clip": 0.01119386, + "auxiliary_loss_mlp": 0.01114085, + "balance_loss_clip": 1.00202286, + "balance_loss_mlp": 1.00059748, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.664943219091403, + "language_loss": 0.72793889, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.75027359, + "num_input_tokens_seen": 177136415, + "step": 8237, + "time_per_iteration": 2.6691012382507324 + }, + { + "auxiliary_loss_clip": 0.01167638, + "auxiliary_loss_mlp": 0.01115071, + "balance_loss_clip": 1.00231373, + "balance_loss_mlp": 1.00072551, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.881748035155134, + "language_loss": 0.7591747, + "learning_rate": 2.127462257935406e-06, + "loss": 0.78200179, + "num_input_tokens_seen": 177155690, + "step": 8238, + "time_per_iteration": 2.546295404434204 + }, + { + "auxiliary_loss_clip": 0.01119251, + "auxiliary_loss_mlp": 0.01115246, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00070989, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 8.460753380229464, + "language_loss": 0.74013329, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.76247823, + "num_input_tokens_seen": 177173350, + "step": 8239, + "time_per_iteration": 3.975353240966797 + }, + { + "auxiliary_loss_clip": 0.01053008, + "auxiliary_loss_mlp": 0.01115715, + "balance_loss_clip": 1.00160217, + "balance_loss_mlp": 1.00060689, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 1.9875139810358042, + "language_loss": 0.78558218, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80726945, + "num_input_tokens_seen": 177191115, + "step": 8240, + "time_per_iteration": 2.85693097114563 + }, + { + "auxiliary_loss_clip": 0.01152756, + "auxiliary_loss_mlp": 0.01113835, + "balance_loss_clip": 1.00218105, + "balance_loss_mlp": 1.00091982, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.110145636896322, + "language_loss": 0.85909301, + "learning_rate": 2.126296226410898e-06, + "loss": 0.88175893, + "num_input_tokens_seen": 177206155, + "step": 8241, + "time_per_iteration": 2.7972381114959717 + }, + { + "auxiliary_loss_clip": 0.0110305, + "auxiliary_loss_mlp": 0.01113814, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00080395, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 2.4205523408722036, + "language_loss": 0.7727564, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79492509, + "num_input_tokens_seen": 177224815, + "step": 8242, + "time_per_iteration": 2.691464900970459 + }, + { + "auxiliary_loss_clip": 0.01134243, + "auxiliary_loss_mlp": 0.00747269, + "balance_loss_clip": 1.00205648, + "balance_loss_mlp": 1.00048733, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.4948731598652865, + "language_loss": 0.66662616, + "learning_rate": 2.125518848090833e-06, + "loss": 0.68544126, + "num_input_tokens_seen": 177244490, + "step": 8243, + "time_per_iteration": 2.653573989868164 + }, + { + "auxiliary_loss_clip": 0.01151189, + "auxiliary_loss_mlp": 0.0111383, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00072408, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.9995633795837757, + "language_loss": 0.68202096, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70467114, + "num_input_tokens_seen": 177264340, + "step": 8244, + "time_per_iteration": 4.095825910568237 + }, + { + "auxiliary_loss_clip": 0.01142383, + "auxiliary_loss_mlp": 0.01114449, + "balance_loss_clip": 1.00238132, + "balance_loss_mlp": 1.00077128, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 2.36452322253969, + "language_loss": 0.7506308, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77319908, + "num_input_tokens_seen": 177283055, + "step": 8245, + "time_per_iteration": 3.9591596126556396 + }, + { + "auxiliary_loss_clip": 0.01150779, + "auxiliary_loss_mlp": 0.01114317, + "balance_loss_clip": 1.00215018, + "balance_loss_mlp": 1.00063932, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 2.3244461046688634, + "language_loss": 0.81560826, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83825916, + "num_input_tokens_seen": 177301140, + "step": 8246, + "time_per_iteration": 3.9928131103515625 + }, + { + "auxiliary_loss_clip": 0.0112137, + "auxiliary_loss_mlp": 0.01115197, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.0007565, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.8075936052789587, + "language_loss": 0.83759671, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85996234, + "num_input_tokens_seen": 177323095, + "step": 8247, + "time_per_iteration": 2.664361000061035 + }, + { + "auxiliary_loss_clip": 0.01119245, + "auxiliary_loss_mlp": 0.01113545, + "balance_loss_clip": 1.0020262, + "balance_loss_mlp": 1.00053418, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 2.0784518229077977, + "language_loss": 0.83560497, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85793287, + "num_input_tokens_seen": 177339845, + "step": 8248, + "time_per_iteration": 2.661306381225586 + }, + { + "auxiliary_loss_clip": 0.01151098, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_clip": 1.00216532, + "balance_loss_mlp": 1.00053632, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 1.858529180114291, + "language_loss": 0.73464996, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75730121, + "num_input_tokens_seen": 177359980, + "step": 8249, + "time_per_iteration": 2.591139554977417 + }, + { + "auxiliary_loss_clip": 0.01135734, + "auxiliary_loss_mlp": 0.01115193, + "balance_loss_clip": 1.00216043, + "balance_loss_mlp": 1.00075245, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 3.2184426582871932, + "language_loss": 0.75864792, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78115726, + "num_input_tokens_seen": 177378580, + "step": 8250, + "time_per_iteration": 2.6306912899017334 + }, + { + "auxiliary_loss_clip": 0.01167644, + "auxiliary_loss_mlp": 0.01115378, + "balance_loss_clip": 1.00218594, + "balance_loss_mlp": 1.00074601, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 2.010623066196522, + "language_loss": 0.70307422, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72590441, + "num_input_tokens_seen": 177398790, + "step": 8251, + "time_per_iteration": 2.5438072681427 + }, + { + "auxiliary_loss_clip": 0.01104722, + "auxiliary_loss_mlp": 0.00747309, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00043917, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 2.665109709143392, + "language_loss": 0.80040038, + "learning_rate": 2.122020411748461e-06, + "loss": 0.81892067, + "num_input_tokens_seen": 177416515, + "step": 8252, + "time_per_iteration": 2.661125659942627 + }, + { + "auxiliary_loss_clip": 0.01167735, + "auxiliary_loss_mlp": 0.01113936, + "balance_loss_clip": 1.00236118, + "balance_loss_mlp": 1.00054383, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.7546662744579442, + "language_loss": 0.81040573, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.83322245, + "num_input_tokens_seen": 177434425, + "step": 8253, + "time_per_iteration": 2.4858899116516113 + }, + { + "auxiliary_loss_clip": 0.01121601, + "auxiliary_loss_mlp": 0.01112681, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00052893, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.4680418904025532, + "language_loss": 0.67398322, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69632602, + "num_input_tokens_seen": 177459675, + "step": 8254, + "time_per_iteration": 2.789578437805176 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01115149, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00061321, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.7175204053619488, + "language_loss": 0.74184549, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76419044, + "num_input_tokens_seen": 177478895, + "step": 8255, + "time_per_iteration": 2.672253131866455 + }, + { + "auxiliary_loss_clip": 0.01135691, + "auxiliary_loss_mlp": 0.01112876, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00053287, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 2.401390600386871, + "language_loss": 0.8122865, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.83477211, + "num_input_tokens_seen": 177494920, + "step": 8256, + "time_per_iteration": 2.5797274112701416 + }, + { + "auxiliary_loss_clip": 0.01134238, + "auxiliary_loss_mlp": 0.01113115, + "balance_loss_clip": 1.00193107, + "balance_loss_mlp": 1.00039124, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 2.8871354561281537, + "language_loss": 0.80773234, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83020592, + "num_input_tokens_seen": 177515455, + "step": 8257, + "time_per_iteration": 2.626328945159912 + }, + { + "auxiliary_loss_clip": 0.01167862, + "auxiliary_loss_mlp": 0.01115591, + "balance_loss_clip": 1.00223553, + "balance_loss_mlp": 1.00067365, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 1.8560698671851381, + "language_loss": 0.66011941, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68295395, + "num_input_tokens_seen": 177534040, + "step": 8258, + "time_per_iteration": 2.54713773727417 + }, + { + "auxiliary_loss_clip": 0.01150248, + "auxiliary_loss_mlp": 0.01111757, + "balance_loss_clip": 1.00205326, + "balance_loss_mlp": 1.00055861, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 1.6408508948660874, + "language_loss": 0.77726638, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79988647, + "num_input_tokens_seen": 177554510, + "step": 8259, + "time_per_iteration": 2.58770751953125 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.0111466, + "balance_loss_clip": 1.00219083, + "balance_loss_mlp": 1.00050521, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.5760778626996474, + "language_loss": 0.78614795, + "learning_rate": 2.1189103755834e-06, + "loss": 0.8086521, + "num_input_tokens_seen": 177575780, + "step": 8260, + "time_per_iteration": 2.628631591796875 + }, + { + "auxiliary_loss_clip": 0.01137762, + "auxiliary_loss_mlp": 0.01114056, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00056911, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 3.270616012392989, + "language_loss": 0.76197577, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78449392, + "num_input_tokens_seen": 177588965, + "step": 8261, + "time_per_iteration": 2.5722568035125732 + }, + { + "auxiliary_loss_clip": 0.01102282, + "auxiliary_loss_mlp": 0.01111895, + "balance_loss_clip": 1.00192773, + "balance_loss_mlp": 1.0006969, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.8379734063594808, + "language_loss": 0.89683855, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91898036, + "num_input_tokens_seen": 177608425, + "step": 8262, + "time_per_iteration": 2.7231569290161133 + }, + { + "auxiliary_loss_clip": 0.01088804, + "auxiliary_loss_mlp": 0.01113729, + "balance_loss_clip": 1.00198209, + "balance_loss_mlp": 1.00062323, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 2.4809009346006596, + "language_loss": 0.73679054, + "learning_rate": 2.11774403721606e-06, + "loss": 0.75881582, + "num_input_tokens_seen": 177628240, + "step": 8263, + "time_per_iteration": 2.746535062789917 + }, + { + "auxiliary_loss_clip": 0.01103174, + "auxiliary_loss_mlp": 0.01114966, + "balance_loss_clip": 1.00202894, + "balance_loss_mlp": 1.00071621, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 2.4409122203013336, + "language_loss": 0.70422381, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.7264052, + "num_input_tokens_seen": 177645920, + "step": 8264, + "time_per_iteration": 2.6621859073638916 + }, + { + "auxiliary_loss_clip": 0.01134437, + "auxiliary_loss_mlp": 0.0111455, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.00049043, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.4550928762620925, + "language_loss": 0.64753371, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67002356, + "num_input_tokens_seen": 177667185, + "step": 8265, + "time_per_iteration": 2.6198220252990723 + }, + { + "auxiliary_loss_clip": 0.01131041, + "auxiliary_loss_mlp": 0.01092, + "balance_loss_clip": 1.00140381, + "balance_loss_mlp": 1.00006604, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.8475411993626132, + "language_loss": 0.53542078, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55765116, + "num_input_tokens_seen": 177733020, + "step": 8266, + "time_per_iteration": 3.200852632522583 + }, + { + "auxiliary_loss_clip": 0.01150983, + "auxiliary_loss_mlp": 0.011131, + "balance_loss_clip": 1.00212622, + "balance_loss_mlp": 1.00056601, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 1.5454551013352176, + "language_loss": 0.79743409, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.82007498, + "num_input_tokens_seen": 177753370, + "step": 8267, + "time_per_iteration": 2.631167411804199 + }, + { + "auxiliary_loss_clip": 0.01134821, + "auxiliary_loss_mlp": 0.01113459, + "balance_loss_clip": 1.002002, + "balance_loss_mlp": 1.00073457, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.1425064238124913, + "language_loss": 0.74784297, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.77032578, + "num_input_tokens_seen": 177771530, + "step": 8268, + "time_per_iteration": 2.686189651489258 + }, + { + "auxiliary_loss_clip": 0.01152707, + "auxiliary_loss_mlp": 0.00747432, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00041521, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.8888689003739696, + "language_loss": 0.6797123, + "learning_rate": 2.115411240328073e-06, + "loss": 0.69871372, + "num_input_tokens_seen": 177796355, + "step": 8269, + "time_per_iteration": 2.775968313217163 + }, + { + "auxiliary_loss_clip": 0.01135121, + "auxiliary_loss_mlp": 0.01112975, + "balance_loss_clip": 1.00194907, + "balance_loss_mlp": 1.00072753, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.4857214143009063, + "language_loss": 0.85287529, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87535632, + "num_input_tokens_seen": 177814300, + "step": 8270, + "time_per_iteration": 2.610978126525879 + }, + { + "auxiliary_loss_clip": 0.01104181, + "auxiliary_loss_mlp": 0.00747242, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00039148, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 2.703708621432928, + "language_loss": 0.70866948, + "learning_rate": 2.114633606196899e-06, + "loss": 0.7271837, + "num_input_tokens_seen": 177833615, + "step": 8271, + "time_per_iteration": 2.6986732482910156 + }, + { + "auxiliary_loss_clip": 0.01153016, + "auxiliary_loss_mlp": 0.01114041, + "balance_loss_clip": 1.00236225, + "balance_loss_mlp": 1.00055349, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.3330341158318888, + "language_loss": 0.78330088, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80597144, + "num_input_tokens_seen": 177855315, + "step": 8272, + "time_per_iteration": 2.6074986457824707 + }, + { + "auxiliary_loss_clip": 0.01123749, + "auxiliary_loss_mlp": 0.01114046, + "balance_loss_clip": 1.00210536, + "balance_loss_mlp": 1.00074935, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.356369400950237, + "language_loss": 0.66709256, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68947053, + "num_input_tokens_seen": 177875590, + "step": 8273, + "time_per_iteration": 2.761669397354126 + }, + { + "auxiliary_loss_clip": 0.01125515, + "auxiliary_loss_mlp": 0.01113735, + "balance_loss_clip": 1.00222516, + "balance_loss_mlp": 1.00072432, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.7995790548520219, + "language_loss": 0.78351724, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80590975, + "num_input_tokens_seen": 177894175, + "step": 8274, + "time_per_iteration": 2.6872055530548096 + }, + { + "auxiliary_loss_clip": 0.01119211, + "auxiliary_loss_mlp": 0.01114271, + "balance_loss_clip": 1.0019474, + "balance_loss_mlp": 1.00049782, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 1.9298327036442806, + "language_loss": 0.75846779, + "learning_rate": 2.113078285889493e-06, + "loss": 0.78080267, + "num_input_tokens_seen": 177913920, + "step": 8275, + "time_per_iteration": 2.8033547401428223 + }, + { + "auxiliary_loss_clip": 0.01150875, + "auxiliary_loss_mlp": 0.01115744, + "balance_loss_clip": 1.00213265, + "balance_loss_mlp": 1.00063562, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 2.2462862668191734, + "language_loss": 0.82984006, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.85250628, + "num_input_tokens_seen": 177930425, + "step": 8276, + "time_per_iteration": 2.5565836429595947 + }, + { + "auxiliary_loss_clip": 0.01167412, + "auxiliary_loss_mlp": 0.00747317, + "balance_loss_clip": 1.00225115, + "balance_loss_mlp": 1.00042248, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.3549507953570965, + "language_loss": 0.70256007, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72170734, + "num_input_tokens_seen": 177949885, + "step": 8277, + "time_per_iteration": 3.9381229877471924 + }, + { + "auxiliary_loss_clip": 0.01150655, + "auxiliary_loss_mlp": 0.01113088, + "balance_loss_clip": 1.00214267, + "balance_loss_mlp": 1.00055408, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.9000807535619635, + "language_loss": 0.822245, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84488243, + "num_input_tokens_seen": 177965720, + "step": 8278, + "time_per_iteration": 2.5472633838653564 + }, + { + "auxiliary_loss_clip": 0.01151239, + "auxiliary_loss_mlp": 0.0111412, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.00053716, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 3.4665511082421037, + "language_loss": 0.6768142, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69946778, + "num_input_tokens_seen": 177983190, + "step": 8279, + "time_per_iteration": 2.529327869415283 + }, + { + "auxiliary_loss_clip": 0.01152497, + "auxiliary_loss_mlp": 0.01114324, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00055099, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 3.7576981939402487, + "language_loss": 0.71293646, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.73560464, + "num_input_tokens_seen": 178000155, + "step": 8280, + "time_per_iteration": 2.563300609588623 + }, + { + "auxiliary_loss_clip": 0.0111911, + "auxiliary_loss_mlp": 0.0111442, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00064635, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 2.090676177093987, + "language_loss": 0.6502369, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.67257226, + "num_input_tokens_seen": 178021060, + "step": 8281, + "time_per_iteration": 4.065327167510986 + }, + { + "auxiliary_loss_clip": 0.01152798, + "auxiliary_loss_mlp": 0.01114914, + "balance_loss_clip": 1.00218475, + "balance_loss_mlp": 1.00056827, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 2.1023095409480326, + "language_loss": 0.73299742, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.7556746, + "num_input_tokens_seen": 178038180, + "step": 8282, + "time_per_iteration": 3.937110662460327 + }, + { + "auxiliary_loss_clip": 0.01134259, + "auxiliary_loss_mlp": 0.01113654, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.00064373, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.4967991586113532, + "language_loss": 0.73638147, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75886065, + "num_input_tokens_seen": 178057565, + "step": 8283, + "time_per_iteration": 2.6595046520233154 + }, + { + "auxiliary_loss_clip": 0.01104045, + "auxiliary_loss_mlp": 0.01113559, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.00064373, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 3.936536790259156, + "language_loss": 0.78880686, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.81098288, + "num_input_tokens_seen": 178076965, + "step": 8284, + "time_per_iteration": 4.044479608535767 + }, + { + "auxiliary_loss_clip": 0.01134476, + "auxiliary_loss_mlp": 0.01115056, + "balance_loss_clip": 1.00194287, + "balance_loss_mlp": 1.00071049, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.7429609775757895, + "language_loss": 0.73704362, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75953889, + "num_input_tokens_seen": 178095105, + "step": 8285, + "time_per_iteration": 2.6502645015716553 + }, + { + "auxiliary_loss_clip": 0.0115752, + "auxiliary_loss_mlp": 0.01114632, + "balance_loss_clip": 1.00257492, + "balance_loss_mlp": 1.00066757, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.7249438813311977, + "language_loss": 0.74554133, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76826286, + "num_input_tokens_seen": 178114505, + "step": 8286, + "time_per_iteration": 2.5981972217559814 + }, + { + "auxiliary_loss_clip": 0.0113421, + "auxiliary_loss_mlp": 0.01114448, + "balance_loss_clip": 1.00215304, + "balance_loss_mlp": 1.00067425, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.8478100858701467, + "language_loss": 0.85339451, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87588108, + "num_input_tokens_seen": 178131595, + "step": 8287, + "time_per_iteration": 2.6248269081115723 + }, + { + "auxiliary_loss_clip": 0.01109531, + "auxiliary_loss_mlp": 0.01114733, + "balance_loss_clip": 1.00215554, + "balance_loss_mlp": 1.00048244, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.522501674712094, + "language_loss": 0.72424442, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74648708, + "num_input_tokens_seen": 178152055, + "step": 8288, + "time_per_iteration": 2.75479793548584 + }, + { + "auxiliary_loss_clip": 0.01136058, + "auxiliary_loss_mlp": 0.01115358, + "balance_loss_clip": 1.0020895, + "balance_loss_mlp": 1.00063062, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 3.0536682223914546, + "language_loss": 0.80603927, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82855344, + "num_input_tokens_seen": 178168150, + "step": 8289, + "time_per_iteration": 2.568819522857666 + }, + { + "auxiliary_loss_clip": 0.01152575, + "auxiliary_loss_mlp": 0.01113947, + "balance_loss_clip": 1.00203931, + "balance_loss_mlp": 1.00055504, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.4458416818688753, + "language_loss": 0.7336005, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75626576, + "num_input_tokens_seen": 178186150, + "step": 8290, + "time_per_iteration": 2.5672264099121094 + }, + { + "auxiliary_loss_clip": 0.0115132, + "auxiliary_loss_mlp": 0.01115261, + "balance_loss_clip": 1.00224996, + "balance_loss_mlp": 1.00062978, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.4909803661663643, + "language_loss": 0.84083879, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86350465, + "num_input_tokens_seen": 178207665, + "step": 8291, + "time_per_iteration": 2.6087491512298584 + }, + { + "auxiliary_loss_clip": 0.01137699, + "auxiliary_loss_mlp": 0.01115411, + "balance_loss_clip": 1.00215435, + "balance_loss_mlp": 1.00077891, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.7198403602323884, + "language_loss": 0.67009008, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69262111, + "num_input_tokens_seen": 178226325, + "step": 8292, + "time_per_iteration": 2.6141316890716553 + }, + { + "auxiliary_loss_clip": 0.01167542, + "auxiliary_loss_mlp": 0.01114882, + "balance_loss_clip": 1.00218749, + "balance_loss_mlp": 1.00063181, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.736003322274005, + "language_loss": 0.67405438, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69687867, + "num_input_tokens_seen": 178244960, + "step": 8293, + "time_per_iteration": 2.4943687915802 + }, + { + "auxiliary_loss_clip": 0.01152705, + "auxiliary_loss_mlp": 0.01115348, + "balance_loss_clip": 1.00232005, + "balance_loss_mlp": 1.00062132, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1.8094100359704681, + "language_loss": 0.82791293, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.85059351, + "num_input_tokens_seen": 178265400, + "step": 8294, + "time_per_iteration": 2.574352264404297 + }, + { + "auxiliary_loss_clip": 0.01152714, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_clip": 1.00215244, + "balance_loss_mlp": 1.00059652, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.660199819325514, + "language_loss": 0.72549963, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.74817997, + "num_input_tokens_seen": 178284535, + "step": 8295, + "time_per_iteration": 2.5569634437561035 + }, + { + "auxiliary_loss_clip": 0.01091727, + "auxiliary_loss_mlp": 0.01114202, + "balance_loss_clip": 1.00193989, + "balance_loss_mlp": 1.00052452, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.8019524799903421, + "language_loss": 0.6787653, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.70082462, + "num_input_tokens_seen": 178302425, + "step": 8296, + "time_per_iteration": 2.7043797969818115 + }, + { + "auxiliary_loss_clip": 0.01141543, + "auxiliary_loss_mlp": 0.0111412, + "balance_loss_clip": 1.00228, + "balance_loss_mlp": 1.00053787, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 2.6843224859671855, + "language_loss": 0.64848065, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.67103732, + "num_input_tokens_seen": 178323065, + "step": 8297, + "time_per_iteration": 2.677856206893921 + }, + { + "auxiliary_loss_clip": 0.01107369, + "auxiliary_loss_mlp": 0.01113523, + "balance_loss_clip": 1.00204051, + "balance_loss_mlp": 1.00070333, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.7991933949430845, + "language_loss": 0.69450867, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71671754, + "num_input_tokens_seen": 178343985, + "step": 8298, + "time_per_iteration": 2.6903305053710938 + }, + { + "auxiliary_loss_clip": 0.01167485, + "auxiliary_loss_mlp": 0.01113815, + "balance_loss_clip": 1.00219131, + "balance_loss_mlp": 1.00061357, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 2.023948072565711, + "language_loss": 0.84469974, + "learning_rate": 2.103744956327814e-06, + "loss": 0.8675127, + "num_input_tokens_seen": 178362345, + "step": 8299, + "time_per_iteration": 2.5274996757507324 + }, + { + "auxiliary_loss_clip": 0.01121564, + "auxiliary_loss_mlp": 0.01114763, + "balance_loss_clip": 1.00204623, + "balance_loss_mlp": 1.00070322, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 1.8434866857926515, + "language_loss": 0.69409245, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.7164557, + "num_input_tokens_seen": 178383190, + "step": 8300, + "time_per_iteration": 2.693406581878662 + }, + { + "auxiliary_loss_clip": 0.01131618, + "auxiliary_loss_mlp": 0.01092999, + "balance_loss_clip": 1.00144207, + "balance_loss_mlp": 1.00030172, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.751867347753388, + "language_loss": 0.51183152, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.5340777, + "num_input_tokens_seen": 178444250, + "step": 8301, + "time_per_iteration": 3.265630006790161 + }, + { + "auxiliary_loss_clip": 0.01135179, + "auxiliary_loss_mlp": 0.0111437, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.0008831, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.6747563247005848, + "language_loss": 0.84391576, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86641127, + "num_input_tokens_seen": 178463250, + "step": 8302, + "time_per_iteration": 2.5929982662200928 + }, + { + "auxiliary_loss_clip": 0.01150666, + "auxiliary_loss_mlp": 0.01113429, + "balance_loss_clip": 1.00211883, + "balance_loss_mlp": 1.00060928, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 2.3040500198988743, + "language_loss": 0.69270957, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71535051, + "num_input_tokens_seen": 178481340, + "step": 8303, + "time_per_iteration": 2.5538783073425293 + }, + { + "auxiliary_loss_clip": 0.0116767, + "auxiliary_loss_mlp": 0.01115241, + "balance_loss_clip": 1.00221074, + "balance_loss_mlp": 1.00051403, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 6.760171831258093, + "language_loss": 0.72713542, + "learning_rate": 2.101800220681144e-06, + "loss": 0.74996454, + "num_input_tokens_seen": 178501545, + "step": 8304, + "time_per_iteration": 2.6025032997131348 + }, + { + "auxiliary_loss_clip": 0.01150913, + "auxiliary_loss_mlp": 0.01114412, + "balance_loss_clip": 1.00221062, + "balance_loss_mlp": 1.00063848, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.1812614529346717, + "language_loss": 0.80745411, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83010739, + "num_input_tokens_seen": 178519700, + "step": 8305, + "time_per_iteration": 2.623117446899414 + }, + { + "auxiliary_loss_clip": 0.01115879, + "auxiliary_loss_mlp": 0.01091955, + "balance_loss_clip": 1.00135875, + "balance_loss_mlp": 1.0000205, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7226192401583585, + "language_loss": 0.56908953, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.59116787, + "num_input_tokens_seen": 178576740, + "step": 8306, + "time_per_iteration": 3.2965221405029297 + }, + { + "auxiliary_loss_clip": 0.01167568, + "auxiliary_loss_mlp": 0.01114367, + "balance_loss_clip": 1.0023216, + "balance_loss_mlp": 1.00059366, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.7193051186543156, + "language_loss": 0.82988894, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.85270828, + "num_input_tokens_seen": 178594745, + "step": 8307, + "time_per_iteration": 2.5068559646606445 + }, + { + "auxiliary_loss_clip": 0.01167461, + "auxiliary_loss_mlp": 0.01114489, + "balance_loss_clip": 1.00212944, + "balance_loss_mlp": 1.000525, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 1.7109507539139983, + "language_loss": 0.61053467, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.63335419, + "num_input_tokens_seen": 178614110, + "step": 8308, + "time_per_iteration": 2.5731124877929688 + }, + { + "auxiliary_loss_clip": 0.01167398, + "auxiliary_loss_mlp": 0.01113192, + "balance_loss_clip": 1.00220573, + "balance_loss_mlp": 1.00065804, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5272857023109658, + "language_loss": 0.74643201, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76923782, + "num_input_tokens_seen": 178634170, + "step": 8309, + "time_per_iteration": 2.5592799186706543 + }, + { + "auxiliary_loss_clip": 0.0113535, + "auxiliary_loss_mlp": 0.01114376, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00069773, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.489445552878638, + "language_loss": 0.79648674, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.81898391, + "num_input_tokens_seen": 178651775, + "step": 8310, + "time_per_iteration": 2.5657289028167725 + }, + { + "auxiliary_loss_clip": 0.01157283, + "auxiliary_loss_mlp": 0.01114888, + "balance_loss_clip": 1.00233471, + "balance_loss_mlp": 1.00063741, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.5363375081107473, + "language_loss": 0.70872891, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73145056, + "num_input_tokens_seen": 178669720, + "step": 8311, + "time_per_iteration": 2.567173480987549 + }, + { + "auxiliary_loss_clip": 0.01134563, + "auxiliary_loss_mlp": 0.01114679, + "balance_loss_clip": 1.00206435, + "balance_loss_mlp": 1.00061977, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.7876639223359407, + "language_loss": 0.7702682, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79276061, + "num_input_tokens_seen": 178686765, + "step": 8312, + "time_per_iteration": 2.558758497238159 + }, + { + "auxiliary_loss_clip": 0.01120728, + "auxiliary_loss_mlp": 0.01113903, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00070214, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.6014651259409354, + "language_loss": 0.84297097, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86531723, + "num_input_tokens_seen": 178705845, + "step": 8313, + "time_per_iteration": 2.6977732181549072 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01114258, + "balance_loss_clip": 1.00211692, + "balance_loss_mlp": 1.00058007, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 2.282974470180306, + "language_loss": 0.80487478, + "learning_rate": 2.097910461710939e-06, + "loss": 0.82737547, + "num_input_tokens_seen": 178723410, + "step": 8314, + "time_per_iteration": 2.5937659740448 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.00747478, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.0004319, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 1.8147392825407997, + "language_loss": 0.79523873, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81390131, + "num_input_tokens_seen": 178743560, + "step": 8315, + "time_per_iteration": 4.023710250854492 + }, + { + "auxiliary_loss_clip": 0.01167573, + "auxiliary_loss_mlp": 0.01113898, + "balance_loss_clip": 1.00225282, + "balance_loss_mlp": 1.0006969, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 2.1569103958081897, + "language_loss": 0.74536145, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.7681762, + "num_input_tokens_seen": 178767225, + "step": 8316, + "time_per_iteration": 2.765031576156616 + }, + { + "auxiliary_loss_clip": 0.01151094, + "auxiliary_loss_mlp": 0.01114308, + "balance_loss_clip": 1.00220144, + "balance_loss_mlp": 1.00063038, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.5270583143145156, + "language_loss": 0.8137002, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83635426, + "num_input_tokens_seen": 178786810, + "step": 8317, + "time_per_iteration": 2.583183765411377 + }, + { + "auxiliary_loss_clip": 0.01133496, + "auxiliary_loss_mlp": 0.01114119, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00063169, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 2.493382238454157, + "language_loss": 0.83185136, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.8543275, + "num_input_tokens_seen": 178805660, + "step": 8318, + "time_per_iteration": 4.024357557296753 + }, + { + "auxiliary_loss_clip": 0.01150666, + "auxiliary_loss_mlp": 0.01113246, + "balance_loss_clip": 1.00204623, + "balance_loss_mlp": 1.00061703, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.744881671690999, + "language_loss": 0.81854439, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.84118342, + "num_input_tokens_seen": 178824780, + "step": 8319, + "time_per_iteration": 2.5665712356567383 + }, + { + "auxiliary_loss_clip": 0.01121104, + "auxiliary_loss_mlp": 0.01113332, + "balance_loss_clip": 1.00207877, + "balance_loss_mlp": 1.00070262, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 4.7432616063994555, + "language_loss": 0.71667826, + "learning_rate": 2.095576427171635e-06, + "loss": 0.73902261, + "num_input_tokens_seen": 178845640, + "step": 8320, + "time_per_iteration": 4.084001779556274 + }, + { + "auxiliary_loss_clip": 0.01123976, + "auxiliary_loss_mlp": 0.01116891, + "balance_loss_clip": 1.00215244, + "balance_loss_mlp": 1.0008285, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 2.939272331149156, + "language_loss": 0.76391745, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.78632611, + "num_input_tokens_seen": 178862290, + "step": 8321, + "time_per_iteration": 4.132070064544678 + }, + { + "auxiliary_loss_clip": 0.01150613, + "auxiliary_loss_mlp": 0.00747375, + "balance_loss_clip": 1.00205517, + "balance_loss_mlp": 1.00048208, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 2.150687034980767, + "language_loss": 0.83196008, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85093999, + "num_input_tokens_seen": 178879805, + "step": 8322, + "time_per_iteration": 2.5494508743286133 + }, + { + "auxiliary_loss_clip": 0.01152755, + "auxiliary_loss_mlp": 0.0111461, + "balance_loss_clip": 1.00223422, + "balance_loss_mlp": 1.000741, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.330196742991446, + "language_loss": 0.73832798, + "learning_rate": 2.094409360775228e-06, + "loss": 0.76100165, + "num_input_tokens_seen": 178896985, + "step": 8323, + "time_per_iteration": 2.5561113357543945 + }, + { + "auxiliary_loss_clip": 0.01117766, + "auxiliary_loss_mlp": 0.01115166, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00072539, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 2.4308214705521847, + "language_loss": 0.6956445, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71797383, + "num_input_tokens_seen": 178920605, + "step": 8324, + "time_per_iteration": 2.7053141593933105 + }, + { + "auxiliary_loss_clip": 0.01150995, + "auxiliary_loss_mlp": 0.00747331, + "balance_loss_clip": 1.00202382, + "balance_loss_mlp": 1.00027525, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 12.192403099267317, + "language_loss": 0.72035378, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.73933709, + "num_input_tokens_seen": 178937760, + "step": 8325, + "time_per_iteration": 2.560032367706299 + }, + { + "auxiliary_loss_clip": 0.01120786, + "auxiliary_loss_mlp": 0.01114088, + "balance_loss_clip": 1.00202811, + "balance_loss_mlp": 1.00079131, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 2.3651673033259395, + "language_loss": 0.73819882, + "learning_rate": 2.093242262158709e-06, + "loss": 0.76054758, + "num_input_tokens_seen": 178957985, + "step": 8326, + "time_per_iteration": 2.6762635707855225 + }, + { + "auxiliary_loss_clip": 0.01136039, + "auxiliary_loss_mlp": 0.01113233, + "balance_loss_clip": 1.00195873, + "balance_loss_mlp": 1.00089002, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.6356223744046368, + "language_loss": 0.78022909, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80272186, + "num_input_tokens_seen": 178977070, + "step": 8327, + "time_per_iteration": 2.616628408432007 + }, + { + "auxiliary_loss_clip": 0.01167477, + "auxiliary_loss_mlp": 0.01114248, + "balance_loss_clip": 1.00221527, + "balance_loss_mlp": 1.00056982, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.2124076643847754, + "language_loss": 0.87854958, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90136683, + "num_input_tokens_seen": 178994175, + "step": 8328, + "time_per_iteration": 2.4862558841705322 + }, + { + "auxiliary_loss_clip": 0.0111903, + "auxiliary_loss_mlp": 0.0111411, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.00052714, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 3.068029351213353, + "language_loss": 0.74588513, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76821649, + "num_input_tokens_seen": 179013710, + "step": 8329, + "time_per_iteration": 2.6514272689819336 + }, + { + "auxiliary_loss_clip": 0.01167397, + "auxiliary_loss_mlp": 0.01114161, + "balance_loss_clip": 1.00220251, + "balance_loss_mlp": 1.00067353, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 2.6925098702671693, + "language_loss": 0.79614097, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81895649, + "num_input_tokens_seen": 179035255, + "step": 8330, + "time_per_iteration": 2.5901882648468018 + }, + { + "auxiliary_loss_clip": 0.01129282, + "auxiliary_loss_mlp": 0.00745897, + "balance_loss_clip": 1.00128925, + "balance_loss_mlp": 0.99991018, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7287114433277456, + "language_loss": 0.5614624, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58021414, + "num_input_tokens_seen": 179090915, + "step": 8331, + "time_per_iteration": 2.9797935485839844 + }, + { + "auxiliary_loss_clip": 0.01150683, + "auxiliary_loss_mlp": 0.01113332, + "balance_loss_clip": 1.00218689, + "balance_loss_mlp": 1.00060785, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 7.207390243384152, + "language_loss": 0.65398753, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.6766277, + "num_input_tokens_seen": 179109160, + "step": 8332, + "time_per_iteration": 2.6140024662017822 + }, + { + "auxiliary_loss_clip": 0.01167274, + "auxiliary_loss_mlp": 0.01113166, + "balance_loss_clip": 1.00216365, + "balance_loss_mlp": 1.00063229, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 2.3177056673652854, + "language_loss": 0.74943483, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.77223927, + "num_input_tokens_seen": 179130610, + "step": 8333, + "time_per_iteration": 2.5564136505126953 + }, + { + "auxiliary_loss_clip": 0.01167537, + "auxiliary_loss_mlp": 0.01115226, + "balance_loss_clip": 1.00220335, + "balance_loss_mlp": 1.00088, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 1.9651834841998843, + "language_loss": 0.804048, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82687557, + "num_input_tokens_seen": 179147860, + "step": 8334, + "time_per_iteration": 2.5028648376464844 + }, + { + "auxiliary_loss_clip": 0.01145852, + "auxiliary_loss_mlp": 0.01091962, + "balance_loss_clip": 1.00131702, + "balance_loss_mlp": 1.00002766, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8983249889398415, + "language_loss": 0.6264782, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64885634, + "num_input_tokens_seen": 179210490, + "step": 8335, + "time_per_iteration": 3.0850114822387695 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01113227, + "balance_loss_clip": 1.00209248, + "balance_loss_mlp": 1.00059843, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.4600943744107393, + "language_loss": 0.79689002, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81954187, + "num_input_tokens_seen": 179231360, + "step": 8336, + "time_per_iteration": 2.588606119155884 + }, + { + "auxiliary_loss_clip": 0.01120549, + "auxiliary_loss_mlp": 0.01113389, + "balance_loss_clip": 1.00205624, + "balance_loss_mlp": 1.00076056, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.6155267367344175, + "language_loss": 0.79963452, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82197392, + "num_input_tokens_seen": 179250625, + "step": 8337, + "time_per_iteration": 2.6261961460113525 + }, + { + "auxiliary_loss_clip": 0.01167557, + "auxiliary_loss_mlp": 0.01114304, + "balance_loss_clip": 1.00217724, + "balance_loss_mlp": 1.00062561, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 3.4976957705416742, + "language_loss": 0.78940737, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81222594, + "num_input_tokens_seen": 179267360, + "step": 8338, + "time_per_iteration": 2.5157806873321533 + }, + { + "auxiliary_loss_clip": 0.01133998, + "auxiliary_loss_mlp": 0.01114166, + "balance_loss_clip": 1.00200975, + "balance_loss_mlp": 1.00058293, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6198713544992476, + "language_loss": 0.8536675, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87614912, + "num_input_tokens_seen": 179289810, + "step": 8339, + "time_per_iteration": 2.6341841220855713 + }, + { + "auxiliary_loss_clip": 0.01150795, + "auxiliary_loss_mlp": 0.0111299, + "balance_loss_clip": 1.0022769, + "balance_loss_mlp": 1.0005517, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.4976658495878312, + "language_loss": 0.70678234, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72942013, + "num_input_tokens_seen": 179310620, + "step": 8340, + "time_per_iteration": 2.6079957485198975 + }, + { + "auxiliary_loss_clip": 0.01120413, + "auxiliary_loss_mlp": 0.01115817, + "balance_loss_clip": 1.00208354, + "balance_loss_mlp": 1.00051785, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 2.4556246809518276, + "language_loss": 0.78265071, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.80501294, + "num_input_tokens_seen": 179329005, + "step": 8341, + "time_per_iteration": 2.6513614654541016 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01114787, + "balance_loss_clip": 1.0020051, + "balance_loss_mlp": 1.00063193, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 2.8726775016710517, + "language_loss": 0.8961544, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91848779, + "num_input_tokens_seen": 179343785, + "step": 8342, + "time_per_iteration": 2.6004014015197754 + }, + { + "auxiliary_loss_clip": 0.01134504, + "auxiliary_loss_mlp": 0.01114446, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00076842, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 4.6909747889540405, + "language_loss": 0.76733768, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.78982711, + "num_input_tokens_seen": 179364070, + "step": 8343, + "time_per_iteration": 2.6527621746063232 + }, + { + "auxiliary_loss_clip": 0.01152026, + "auxiliary_loss_mlp": 0.01113116, + "balance_loss_clip": 1.00219989, + "balance_loss_mlp": 1.00058222, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 1.7170394688631092, + "language_loss": 0.66724867, + "learning_rate": 2.086239016143293e-06, + "loss": 0.68990004, + "num_input_tokens_seen": 179384225, + "step": 8344, + "time_per_iteration": 2.6191184520721436 + }, + { + "auxiliary_loss_clip": 0.01136017, + "auxiliary_loss_mlp": 0.01114543, + "balance_loss_clip": 1.00210118, + "balance_loss_mlp": 1.00076962, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 1.820243463971613, + "language_loss": 0.7549181, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77742374, + "num_input_tokens_seen": 179402595, + "step": 8345, + "time_per_iteration": 2.6970598697662354 + }, + { + "auxiliary_loss_clip": 0.01151152, + "auxiliary_loss_mlp": 0.01114858, + "balance_loss_clip": 1.00214672, + "balance_loss_mlp": 1.00051272, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 3.049583211279622, + "language_loss": 0.78665924, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80931938, + "num_input_tokens_seen": 179419635, + "step": 8346, + "time_per_iteration": 2.566185474395752 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.00747393, + "balance_loss_clip": 1.0020299, + "balance_loss_mlp": 1.00037098, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.6661331744732668, + "language_loss": 0.68970251, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.70853055, + "num_input_tokens_seen": 179438770, + "step": 8347, + "time_per_iteration": 2.5998117923736572 + }, + { + "auxiliary_loss_clip": 0.01120701, + "auxiliary_loss_mlp": 0.01114775, + "balance_loss_clip": 1.0020386, + "balance_loss_mlp": 1.00081134, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 2.555672420739887, + "language_loss": 0.71270144, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73505616, + "num_input_tokens_seen": 179457475, + "step": 8348, + "time_per_iteration": 2.649878978729248 + }, + { + "auxiliary_loss_clip": 0.01150916, + "auxiliary_loss_mlp": 0.01113464, + "balance_loss_clip": 1.00225091, + "balance_loss_mlp": 1.0006448, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 2.004824080643952, + "language_loss": 0.74204004, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76468384, + "num_input_tokens_seen": 179478140, + "step": 8349, + "time_per_iteration": 2.576767921447754 + }, + { + "auxiliary_loss_clip": 0.01150849, + "auxiliary_loss_mlp": 0.01113819, + "balance_loss_clip": 1.00214982, + "balance_loss_mlp": 1.00061738, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.267554755485401, + "language_loss": 0.63727129, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.65991795, + "num_input_tokens_seen": 179494325, + "step": 8350, + "time_per_iteration": 2.5162806510925293 + }, + { + "auxiliary_loss_clip": 0.01114623, + "auxiliary_loss_mlp": 0.01092114, + "balance_loss_clip": 1.00147128, + "balance_loss_mlp": 1.00017989, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 0.7848983305519788, + "language_loss": 0.59781611, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.61988342, + "num_input_tokens_seen": 179553545, + "step": 8351, + "time_per_iteration": 3.3150522708892822 + }, + { + "auxiliary_loss_clip": 0.0113733, + "auxiliary_loss_mlp": 0.01113915, + "balance_loss_clip": 1.00210071, + "balance_loss_mlp": 1.00080895, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 2.336705111886886, + "language_loss": 0.75336826, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77588069, + "num_input_tokens_seen": 179573645, + "step": 8352, + "time_per_iteration": 2.6164050102233887 + }, + { + "auxiliary_loss_clip": 0.01142375, + "auxiliary_loss_mlp": 0.01114625, + "balance_loss_clip": 1.0025456, + "balance_loss_mlp": 1.00056529, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 2.040152750632252, + "language_loss": 0.72332978, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74589974, + "num_input_tokens_seen": 179591435, + "step": 8353, + "time_per_iteration": 4.071436166763306 + }, + { + "auxiliary_loss_clip": 0.01152802, + "auxiliary_loss_mlp": 0.01115233, + "balance_loss_clip": 1.0023011, + "balance_loss_mlp": 1.00069678, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 1.8256523985061832, + "language_loss": 0.74265999, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.76534033, + "num_input_tokens_seen": 179609955, + "step": 8354, + "time_per_iteration": 2.5639469623565674 + }, + { + "auxiliary_loss_clip": 0.01135929, + "auxiliary_loss_mlp": 0.01114763, + "balance_loss_clip": 1.00215268, + "balance_loss_mlp": 1.00070333, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 2.0609761816653305, + "language_loss": 0.72245073, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74495763, + "num_input_tokens_seen": 179630875, + "step": 8355, + "time_per_iteration": 2.6630733013153076 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01115037, + "balance_loss_clip": 1.00224173, + "balance_loss_mlp": 1.00069141, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.5307757956200474, + "language_loss": 0.81113172, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83379209, + "num_input_tokens_seen": 179649835, + "step": 8356, + "time_per_iteration": 4.000725030899048 + }, + { + "auxiliary_loss_clip": 0.01151024, + "auxiliary_loss_mlp": 0.01115513, + "balance_loss_clip": 1.00217032, + "balance_loss_mlp": 1.00069046, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.224795518348188, + "language_loss": 0.76378906, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78645444, + "num_input_tokens_seen": 179667605, + "step": 8357, + "time_per_iteration": 3.921417236328125 + }, + { + "auxiliary_loss_clip": 0.01150844, + "auxiliary_loss_mlp": 0.0111432, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00054669, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 3.1205913288258333, + "language_loss": 0.75877929, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78143084, + "num_input_tokens_seen": 179686910, + "step": 8358, + "time_per_iteration": 2.584099292755127 + }, + { + "auxiliary_loss_clip": 0.01135703, + "auxiliary_loss_mlp": 0.01114761, + "balance_loss_clip": 1.00205779, + "balance_loss_mlp": 1.0008918, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.16062939049147, + "language_loss": 0.71783233, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74033689, + "num_input_tokens_seen": 179706395, + "step": 8359, + "time_per_iteration": 4.053769826889038 + }, + { + "auxiliary_loss_clip": 0.01136121, + "auxiliary_loss_mlp": 0.01113667, + "balance_loss_clip": 1.002195, + "balance_loss_mlp": 1.00084758, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.9382091393325493, + "language_loss": 0.77132952, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79382735, + "num_input_tokens_seen": 179725735, + "step": 8360, + "time_per_iteration": 2.6182098388671875 + }, + { + "auxiliary_loss_clip": 0.01123987, + "auxiliary_loss_mlp": 0.0111472, + "balance_loss_clip": 1.00208998, + "balance_loss_mlp": 1.00075626, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 2.089432483342065, + "language_loss": 0.76865566, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.79104275, + "num_input_tokens_seen": 179746150, + "step": 8361, + "time_per_iteration": 2.6458752155303955 + }, + { + "auxiliary_loss_clip": 0.01120373, + "auxiliary_loss_mlp": 0.01115999, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.00060391, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.8517033901443416, + "language_loss": 0.85057992, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87294364, + "num_input_tokens_seen": 179767550, + "step": 8362, + "time_per_iteration": 2.6989214420318604 + }, + { + "auxiliary_loss_clip": 0.01134052, + "auxiliary_loss_mlp": 0.01115086, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00064492, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.5367687745316727, + "language_loss": 0.78365076, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80614221, + "num_input_tokens_seen": 179790075, + "step": 8363, + "time_per_iteration": 2.6660218238830566 + }, + { + "auxiliary_loss_clip": 0.0115067, + "auxiliary_loss_mlp": 0.01114017, + "balance_loss_clip": 1.00214195, + "balance_loss_mlp": 1.00052941, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 2.327788007559738, + "language_loss": 0.75157189, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.7742188, + "num_input_tokens_seen": 179806515, + "step": 8364, + "time_per_iteration": 2.610294818878174 + }, + { + "auxiliary_loss_clip": 0.0116735, + "auxiliary_loss_mlp": 0.01113562, + "balance_loss_clip": 1.00219357, + "balance_loss_mlp": 1.00064707, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 2.9572953629488405, + "language_loss": 0.69595873, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.71876788, + "num_input_tokens_seen": 179826450, + "step": 8365, + "time_per_iteration": 2.5364720821380615 + }, + { + "auxiliary_loss_clip": 0.01133596, + "auxiliary_loss_mlp": 0.01115135, + "balance_loss_clip": 1.00210392, + "balance_loss_mlp": 1.00059891, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 2.1359305871642658, + "language_loss": 0.73031062, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.7527979, + "num_input_tokens_seen": 179846770, + "step": 8366, + "time_per_iteration": 2.6187753677368164 + }, + { + "auxiliary_loss_clip": 0.01150688, + "auxiliary_loss_mlp": 0.01114266, + "balance_loss_clip": 1.00218391, + "balance_loss_mlp": 1.00058818, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.3697512666921638, + "language_loss": 0.7816295, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80427909, + "num_input_tokens_seen": 179866585, + "step": 8367, + "time_per_iteration": 2.672349452972412 + }, + { + "auxiliary_loss_clip": 0.01152396, + "auxiliary_loss_mlp": 0.01113172, + "balance_loss_clip": 1.00215495, + "balance_loss_mlp": 1.00054336, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.8356858133799954, + "language_loss": 0.69662631, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.71928197, + "num_input_tokens_seen": 179885575, + "step": 8368, + "time_per_iteration": 2.6187174320220947 + }, + { + "auxiliary_loss_clip": 0.01147527, + "auxiliary_loss_mlp": 0.01091725, + "balance_loss_clip": 1.00139356, + "balance_loss_mlp": 1.0001719, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8467879670410091, + "language_loss": 0.63339877, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65579128, + "num_input_tokens_seen": 179939650, + "step": 8369, + "time_per_iteration": 3.0765540599823 + }, + { + "auxiliary_loss_clip": 0.01150812, + "auxiliary_loss_mlp": 0.0111283, + "balance_loss_clip": 1.002056, + "balance_loss_mlp": 1.00058198, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 1.8699064060183623, + "language_loss": 0.60302341, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62565982, + "num_input_tokens_seen": 179961765, + "step": 8370, + "time_per_iteration": 2.6288886070251465 + }, + { + "auxiliary_loss_clip": 0.01104578, + "auxiliary_loss_mlp": 0.01113903, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 1.00060666, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.5128398819037936, + "language_loss": 0.68478125, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.70696604, + "num_input_tokens_seen": 179983015, + "step": 8371, + "time_per_iteration": 2.80033016204834 + }, + { + "auxiliary_loss_clip": 0.01135877, + "auxiliary_loss_mlp": 0.01114696, + "balance_loss_clip": 1.00229824, + "balance_loss_mlp": 1.00073242, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 1.6331806744086834, + "language_loss": 0.67450893, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.69701463, + "num_input_tokens_seen": 180003210, + "step": 8372, + "time_per_iteration": 2.7153542041778564 + }, + { + "auxiliary_loss_clip": 0.01118451, + "auxiliary_loss_mlp": 0.01113759, + "balance_loss_clip": 1.00191319, + "balance_loss_mlp": 1.00065327, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 2.371200322863775, + "language_loss": 0.6673643, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68968642, + "num_input_tokens_seen": 180025530, + "step": 8373, + "time_per_iteration": 2.689819574356079 + }, + { + "auxiliary_loss_clip": 0.01133972, + "auxiliary_loss_mlp": 0.01113235, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.00070143, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.6688982933401921, + "language_loss": 0.74609661, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76856869, + "num_input_tokens_seen": 180043180, + "step": 8374, + "time_per_iteration": 2.5940964221954346 + }, + { + "auxiliary_loss_clip": 0.01135341, + "auxiliary_loss_mlp": 0.01113817, + "balance_loss_clip": 1.00204468, + "balance_loss_mlp": 1.00061536, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.572576057546292, + "language_loss": 0.68120396, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70369554, + "num_input_tokens_seen": 180062905, + "step": 8375, + "time_per_iteration": 2.602552890777588 + }, + { + "auxiliary_loss_clip": 0.01108892, + "auxiliary_loss_mlp": 0.01115973, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.00057852, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.7988082449906335, + "language_loss": 0.79063225, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.81288093, + "num_input_tokens_seen": 180082000, + "step": 8376, + "time_per_iteration": 2.671581506729126 + }, + { + "auxiliary_loss_clip": 0.01150889, + "auxiliary_loss_mlp": 0.00747471, + "balance_loss_clip": 1.0022167, + "balance_loss_mlp": 1.00045526, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 2.4964221089871477, + "language_loss": 0.59593642, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61492002, + "num_input_tokens_seen": 180101340, + "step": 8377, + "time_per_iteration": 2.650965690612793 + }, + { + "auxiliary_loss_clip": 0.0113414, + "auxiliary_loss_mlp": 0.0111466, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.0006001, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 2.133144194409776, + "language_loss": 0.76208925, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78457725, + "num_input_tokens_seen": 180119160, + "step": 8378, + "time_per_iteration": 2.5756115913391113 + }, + { + "auxiliary_loss_clip": 0.01123017, + "auxiliary_loss_mlp": 0.0111494, + "balance_loss_clip": 1.00209153, + "balance_loss_mlp": 1.00068927, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.724343613123027, + "language_loss": 0.74634373, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.76872337, + "num_input_tokens_seen": 180138730, + "step": 8379, + "time_per_iteration": 2.6776466369628906 + }, + { + "auxiliary_loss_clip": 0.01150528, + "auxiliary_loss_mlp": 0.01114118, + "balance_loss_clip": 1.00218225, + "balance_loss_mlp": 1.00072646, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 3.6771907187189146, + "language_loss": 0.66890246, + "learning_rate": 2.072229431544548e-06, + "loss": 0.69154894, + "num_input_tokens_seen": 180158810, + "step": 8380, + "time_per_iteration": 2.6178412437438965 + }, + { + "auxiliary_loss_clip": 0.01102458, + "auxiliary_loss_mlp": 0.01113092, + "balance_loss_clip": 1.00195622, + "balance_loss_mlp": 1.00055838, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 2.0557235262010387, + "language_loss": 0.62948692, + "learning_rate": 2.071840222561051e-06, + "loss": 0.65164244, + "num_input_tokens_seen": 180179700, + "step": 8381, + "time_per_iteration": 2.790200710296631 + }, + { + "auxiliary_loss_clip": 0.01136937, + "auxiliary_loss_mlp": 0.01113889, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00078297, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.6850656099179155, + "language_loss": 0.67458248, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69709069, + "num_input_tokens_seen": 180199890, + "step": 8382, + "time_per_iteration": 2.645860195159912 + }, + { + "auxiliary_loss_clip": 0.01137243, + "auxiliary_loss_mlp": 0.01114446, + "balance_loss_clip": 1.00218415, + "balance_loss_mlp": 1.00067306, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 2.8615508736468582, + "language_loss": 0.62185007, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64436698, + "num_input_tokens_seen": 180217840, + "step": 8383, + "time_per_iteration": 2.5846309661865234 + }, + { + "auxiliary_loss_clip": 0.01125586, + "auxiliary_loss_mlp": 0.01113013, + "balance_loss_clip": 1.00218749, + "balance_loss_mlp": 1.0004791, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 1.7093136959321849, + "language_loss": 0.66766828, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69005418, + "num_input_tokens_seen": 180236465, + "step": 8384, + "time_per_iteration": 2.622225761413574 + }, + { + "auxiliary_loss_clip": 0.01151089, + "auxiliary_loss_mlp": 0.01113792, + "balance_loss_clip": 1.00213075, + "balance_loss_mlp": 1.00068593, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.737022476294291, + "language_loss": 0.70954633, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.73219514, + "num_input_tokens_seen": 180258025, + "step": 8385, + "time_per_iteration": 2.6301767826080322 + }, + { + "auxiliary_loss_clip": 0.01151898, + "auxiliary_loss_mlp": 0.01112838, + "balance_loss_clip": 1.00218952, + "balance_loss_mlp": 1.00049472, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 1.793481334571387, + "language_loss": 0.82609701, + "learning_rate": 2.069894137075919e-06, + "loss": 0.84874439, + "num_input_tokens_seen": 180277825, + "step": 8386, + "time_per_iteration": 2.6187593936920166 + }, + { + "auxiliary_loss_clip": 0.01150965, + "auxiliary_loss_mlp": 0.01114079, + "balance_loss_clip": 1.00207317, + "balance_loss_mlp": 1.0006876, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.6680212421178084, + "language_loss": 0.6637187, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.68636918, + "num_input_tokens_seen": 180300465, + "step": 8387, + "time_per_iteration": 2.6187899112701416 + }, + { + "auxiliary_loss_clip": 0.01088361, + "auxiliary_loss_mlp": 0.01112479, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00051761, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.3252015317785881, + "language_loss": 0.80213201, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82414043, + "num_input_tokens_seen": 180321050, + "step": 8388, + "time_per_iteration": 2.824794292449951 + }, + { + "auxiliary_loss_clip": 0.01150711, + "auxiliary_loss_mlp": 0.01113009, + "balance_loss_clip": 1.00218928, + "balance_loss_mlp": 1.0005703, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.2364087911426407, + "language_loss": 0.6978873, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72052455, + "num_input_tokens_seen": 180338870, + "step": 8389, + "time_per_iteration": 2.644284248352051 + }, + { + "auxiliary_loss_clip": 0.01136348, + "auxiliary_loss_mlp": 0.01113296, + "balance_loss_clip": 1.00204134, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.7143936223853722, + "language_loss": 0.69976169, + "learning_rate": 2.068337220892191e-06, + "loss": 0.72225809, + "num_input_tokens_seen": 180361285, + "step": 8390, + "time_per_iteration": 2.6411478519439697 + }, + { + "auxiliary_loss_clip": 0.01131188, + "auxiliary_loss_mlp": 0.01091212, + "balance_loss_clip": 1.00145984, + "balance_loss_mlp": 1.00004029, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8128310108947834, + "language_loss": 0.52983797, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55206192, + "num_input_tokens_seen": 180415170, + "step": 8391, + "time_per_iteration": 4.3577470779418945 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01091664, + "balance_loss_clip": 1.00151825, + "balance_loss_mlp": 1.00011158, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8653069480685754, + "language_loss": 0.6067009, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62876821, + "num_input_tokens_seen": 180468060, + "step": 8392, + "time_per_iteration": 3.0161759853363037 + }, + { + "auxiliary_loss_clip": 0.01120739, + "auxiliary_loss_mlp": 0.01112522, + "balance_loss_clip": 1.00204253, + "balance_loss_mlp": 1.00056052, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.6646216758064154, + "language_loss": 0.84275174, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86508429, + "num_input_tokens_seen": 180486610, + "step": 8393, + "time_per_iteration": 2.6566109657287598 + }, + { + "auxiliary_loss_clip": 0.01120238, + "auxiliary_loss_mlp": 0.01113089, + "balance_loss_clip": 1.00204718, + "balance_loss_mlp": 1.00055552, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.46967837370522, + "language_loss": 0.50797683, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53031009, + "num_input_tokens_seen": 180508135, + "step": 8394, + "time_per_iteration": 4.135565519332886 + }, + { + "auxiliary_loss_clip": 0.0116747, + "auxiliary_loss_mlp": 0.0111371, + "balance_loss_clip": 1.00218725, + "balance_loss_mlp": 1.00050902, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.6191945615299785, + "language_loss": 0.7534163, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.77622807, + "num_input_tokens_seen": 180527000, + "step": 8395, + "time_per_iteration": 3.929072380065918 + }, + { + "auxiliary_loss_clip": 0.01152725, + "auxiliary_loss_mlp": 0.01113754, + "balance_loss_clip": 1.00217986, + "balance_loss_mlp": 1.00055242, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 1.9265862275630545, + "language_loss": 0.67524743, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.69791222, + "num_input_tokens_seen": 180544715, + "step": 8396, + "time_per_iteration": 2.546056032180786 + }, + { + "auxiliary_loss_clip": 0.01150921, + "auxiliary_loss_mlp": 0.01113211, + "balance_loss_clip": 1.00244737, + "balance_loss_mlp": 1.00048649, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 1.8979704460589244, + "language_loss": 0.78576458, + "learning_rate": 2.065612518371792e-06, + "loss": 0.80840588, + "num_input_tokens_seen": 180565365, + "step": 8397, + "time_per_iteration": 4.072818994522095 + }, + { + "auxiliary_loss_clip": 0.01105111, + "auxiliary_loss_mlp": 0.0111271, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00046265, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 4.631207707107736, + "language_loss": 0.66093808, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68311632, + "num_input_tokens_seen": 180586670, + "step": 8398, + "time_per_iteration": 2.71561861038208 + }, + { + "auxiliary_loss_clip": 0.01150558, + "auxiliary_loss_mlp": 0.00747411, + "balance_loss_clip": 1.00215554, + "balance_loss_mlp": 1.00045848, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.999906116663215, + "language_loss": 0.7180931, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73707283, + "num_input_tokens_seen": 180605085, + "step": 8399, + "time_per_iteration": 2.5700464248657227 + }, + { + "auxiliary_loss_clip": 0.01136073, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_clip": 1.00224674, + "balance_loss_mlp": 1.00075483, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.7270440811964636, + "language_loss": 0.81125802, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83374977, + "num_input_tokens_seen": 180624370, + "step": 8400, + "time_per_iteration": 2.5923237800598145 + }, + { + "auxiliary_loss_clip": 0.01118647, + "auxiliary_loss_mlp": 0.01112965, + "balance_loss_clip": 1.00212276, + "balance_loss_mlp": 1.00052667, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 2.0595590557504098, + "language_loss": 0.78926051, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81157666, + "num_input_tokens_seen": 180642450, + "step": 8401, + "time_per_iteration": 2.6563916206359863 + }, + { + "auxiliary_loss_clip": 0.01167485, + "auxiliary_loss_mlp": 0.0074741, + "balance_loss_clip": 1.00219727, + "balance_loss_mlp": 1.00058067, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.8649930837965072, + "language_loss": 0.69864154, + "learning_rate": 2.063666227349593e-06, + "loss": 0.71779048, + "num_input_tokens_seen": 180665250, + "step": 8402, + "time_per_iteration": 2.6062958240509033 + }, + { + "auxiliary_loss_clip": 0.01151826, + "auxiliary_loss_mlp": 0.00747389, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00046515, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.9462034402426298, + "language_loss": 0.69341433, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71240646, + "num_input_tokens_seen": 180687425, + "step": 8403, + "time_per_iteration": 2.618316411972046 + }, + { + "auxiliary_loss_clip": 0.0115216, + "auxiliary_loss_mlp": 0.01112917, + "balance_loss_clip": 1.00211751, + "balance_loss_mlp": 1.00076437, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.421342349520926, + "language_loss": 0.85662639, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87927711, + "num_input_tokens_seen": 180708725, + "step": 8404, + "time_per_iteration": 2.643793821334839 + }, + { + "auxiliary_loss_clip": 0.01117433, + "auxiliary_loss_mlp": 0.00747297, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.00053167, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.6596653951320879, + "language_loss": 0.75616455, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77481186, + "num_input_tokens_seen": 180727990, + "step": 8405, + "time_per_iteration": 2.6531753540039062 + }, + { + "auxiliary_loss_clip": 0.01167447, + "auxiliary_loss_mlp": 0.01113977, + "balance_loss_clip": 1.00220656, + "balance_loss_mlp": 1.00048971, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.6378090403905674, + "language_loss": 0.72898638, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75180054, + "num_input_tokens_seen": 180749765, + "step": 8406, + "time_per_iteration": 2.6752469539642334 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01112075, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00049531, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 2.0392758653432685, + "language_loss": 0.76915395, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.79146039, + "num_input_tokens_seen": 180769580, + "step": 8407, + "time_per_iteration": 2.6769869327545166 + }, + { + "auxiliary_loss_clip": 0.01126718, + "auxiliary_loss_mlp": 0.01113354, + "balance_loss_clip": 1.00210285, + "balance_loss_mlp": 1.00053406, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 2.3437971461135705, + "language_loss": 0.62636602, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.64876676, + "num_input_tokens_seen": 180790295, + "step": 8408, + "time_per_iteration": 2.71703839302063 + }, + { + "auxiliary_loss_clip": 0.01141783, + "auxiliary_loss_mlp": 0.01113879, + "balance_loss_clip": 1.00216782, + "balance_loss_mlp": 1.00048709, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.7051471196309331, + "language_loss": 0.63652158, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65907824, + "num_input_tokens_seen": 180807875, + "step": 8409, + "time_per_iteration": 2.630876302719116 + }, + { + "auxiliary_loss_clip": 0.01135089, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.0004518, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.3580218779699469, + "language_loss": 0.70935392, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73182893, + "num_input_tokens_seen": 180831300, + "step": 8410, + "time_per_iteration": 2.65126895904541 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01114029, + "balance_loss_clip": 1.00225925, + "balance_loss_mlp": 1.00063682, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.6100645455710143, + "language_loss": 0.79700041, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81949878, + "num_input_tokens_seen": 180849055, + "step": 8411, + "time_per_iteration": 2.5818495750427246 + }, + { + "auxiliary_loss_clip": 0.01167484, + "auxiliary_loss_mlp": 0.01113714, + "balance_loss_clip": 1.00225711, + "balance_loss_mlp": 1.0005132, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 6.74413547517551, + "language_loss": 0.81533432, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83814627, + "num_input_tokens_seen": 180867395, + "step": 8412, + "time_per_iteration": 2.531580686569214 + }, + { + "auxiliary_loss_clip": 0.01136912, + "auxiliary_loss_mlp": 0.01113215, + "balance_loss_clip": 1.00214243, + "balance_loss_mlp": 1.0006814, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 1.9948984288995768, + "language_loss": 0.80179656, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.8242979, + "num_input_tokens_seen": 180886670, + "step": 8413, + "time_per_iteration": 2.5950427055358887 + }, + { + "auxiliary_loss_clip": 0.01117589, + "auxiliary_loss_mlp": 0.00747411, + "balance_loss_clip": 1.0020721, + "balance_loss_mlp": 1.00053179, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 6.335257092575567, + "language_loss": 0.80022639, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.81887639, + "num_input_tokens_seen": 180904645, + "step": 8414, + "time_per_iteration": 2.6803414821624756 + }, + { + "auxiliary_loss_clip": 0.01152488, + "auxiliary_loss_mlp": 0.01113556, + "balance_loss_clip": 1.00216496, + "balance_loss_mlp": 1.00045049, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.4666487352569444, + "language_loss": 0.62256813, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64522851, + "num_input_tokens_seen": 180922340, + "step": 8415, + "time_per_iteration": 2.7069265842437744 + }, + { + "auxiliary_loss_clip": 0.01118508, + "auxiliary_loss_mlp": 0.01113314, + "balance_loss_clip": 1.00202441, + "balance_loss_mlp": 1.00058937, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.5378580806047244, + "language_loss": 0.81774282, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.84006101, + "num_input_tokens_seen": 180941350, + "step": 8416, + "time_per_iteration": 2.6544814109802246 + }, + { + "auxiliary_loss_clip": 0.01117658, + "auxiliary_loss_mlp": 0.01112854, + "balance_loss_clip": 1.00206411, + "balance_loss_mlp": 1.00051141, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.7773234029494902, + "language_loss": 0.78979778, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81210297, + "num_input_tokens_seen": 180960720, + "step": 8417, + "time_per_iteration": 2.6356937885284424 + }, + { + "auxiliary_loss_clip": 0.01102205, + "auxiliary_loss_mlp": 0.01112041, + "balance_loss_clip": 1.00203359, + "balance_loss_mlp": 1.00055623, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.792474740155978, + "language_loss": 0.62973273, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.6518752, + "num_input_tokens_seen": 180979725, + "step": 8418, + "time_per_iteration": 2.703599214553833 + }, + { + "auxiliary_loss_clip": 0.01119094, + "auxiliary_loss_mlp": 0.01113194, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00046992, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 5.643244775014786, + "language_loss": 0.77445859, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79678142, + "num_input_tokens_seen": 180998980, + "step": 8419, + "time_per_iteration": 2.6481964588165283 + }, + { + "auxiliary_loss_clip": 0.0109175, + "auxiliary_loss_mlp": 0.01113882, + "balance_loss_clip": 1.00210464, + "balance_loss_mlp": 1.00049043, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.8770205111040883, + "language_loss": 0.77020442, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79226071, + "num_input_tokens_seen": 181019165, + "step": 8420, + "time_per_iteration": 2.7455782890319824 + }, + { + "auxiliary_loss_clip": 0.01167383, + "auxiliary_loss_mlp": 0.01113647, + "balance_loss_clip": 1.00217628, + "balance_loss_mlp": 1.00054157, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.7153850432208164, + "language_loss": 0.7756778, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79848814, + "num_input_tokens_seen": 181037110, + "step": 8421, + "time_per_iteration": 2.5818400382995605 + }, + { + "auxiliary_loss_clip": 0.01152188, + "auxiliary_loss_mlp": 0.01112786, + "balance_loss_clip": 1.00215435, + "balance_loss_mlp": 1.00053811, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4854714566166338, + "language_loss": 0.66710758, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.68975735, + "num_input_tokens_seen": 181057775, + "step": 8422, + "time_per_iteration": 2.591779947280884 + }, + { + "auxiliary_loss_clip": 0.01167322, + "auxiliary_loss_mlp": 0.01113364, + "balance_loss_clip": 1.00221491, + "balance_loss_mlp": 1.00054467, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.7243000950374026, + "language_loss": 0.81697869, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83978558, + "num_input_tokens_seen": 181078260, + "step": 8423, + "time_per_iteration": 2.5837149620056152 + }, + { + "auxiliary_loss_clip": 0.01167594, + "auxiliary_loss_mlp": 0.00747507, + "balance_loss_clip": 1.00237405, + "balance_loss_mlp": 1.00044405, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 2.6934538823765948, + "language_loss": 0.74057508, + "learning_rate": 2.055101854669237e-06, + "loss": 0.75972605, + "num_input_tokens_seen": 181098755, + "step": 8424, + "time_per_iteration": 2.567833423614502 + }, + { + "auxiliary_loss_clip": 0.01167349, + "auxiliary_loss_mlp": 0.01113878, + "balance_loss_clip": 1.0022614, + "balance_loss_mlp": 1.00067735, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.6438104969434983, + "language_loss": 0.71118128, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73399359, + "num_input_tokens_seen": 181121570, + "step": 8425, + "time_per_iteration": 2.5975706577301025 + }, + { + "auxiliary_loss_clip": 0.0111866, + "auxiliary_loss_mlp": 0.01114306, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00072312, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 3.664146670444424, + "language_loss": 0.78777218, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.81010187, + "num_input_tokens_seen": 181140240, + "step": 8426, + "time_per_iteration": 2.6502864360809326 + }, + { + "auxiliary_loss_clip": 0.01150511, + "auxiliary_loss_mlp": 0.01113964, + "balance_loss_clip": 1.00210881, + "balance_loss_mlp": 1.00066757, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 1.9053507535808905, + "language_loss": 0.78161526, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80426002, + "num_input_tokens_seen": 181158630, + "step": 8427, + "time_per_iteration": 2.5574429035186768 + }, + { + "auxiliary_loss_clip": 0.01167291, + "auxiliary_loss_mlp": 0.01113063, + "balance_loss_clip": 1.00223804, + "balance_loss_mlp": 1.00033832, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 2.3448429650747644, + "language_loss": 0.71314418, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73594773, + "num_input_tokens_seen": 181176405, + "step": 8428, + "time_per_iteration": 2.5498123168945312 + }, + { + "auxiliary_loss_clip": 0.01156933, + "auxiliary_loss_mlp": 0.00747357, + "balance_loss_clip": 1.00235426, + "balance_loss_mlp": 1.00043964, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.6235218787193193, + "language_loss": 0.830935, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84997791, + "num_input_tokens_seen": 181197595, + "step": 8429, + "time_per_iteration": 3.990635633468628 + }, + { + "auxiliary_loss_clip": 0.01117675, + "auxiliary_loss_mlp": 0.01114323, + "balance_loss_clip": 1.00197887, + "balance_loss_mlp": 1.00055003, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 5.264252700336355, + "language_loss": 0.73061931, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75293934, + "num_input_tokens_seen": 181218560, + "step": 8430, + "time_per_iteration": 2.7240304946899414 + }, + { + "auxiliary_loss_clip": 0.01089377, + "auxiliary_loss_mlp": 0.01113534, + "balance_loss_clip": 1.0020858, + "balance_loss_mlp": 1.00061893, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.889375110661611, + "language_loss": 0.76767457, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78970373, + "num_input_tokens_seen": 181237095, + "step": 8431, + "time_per_iteration": 2.77934193611145 + }, + { + "auxiliary_loss_clip": 0.01152403, + "auxiliary_loss_mlp": 0.01112916, + "balance_loss_clip": 1.00225961, + "balance_loss_mlp": 1.0005734, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.784021408285349, + "language_loss": 0.722224, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74487722, + "num_input_tokens_seen": 181255940, + "step": 8432, + "time_per_iteration": 5.522912979125977 + }, + { + "auxiliary_loss_clip": 0.01098197, + "auxiliary_loss_mlp": 0.0109172, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.00016689, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7543378785136379, + "language_loss": 0.63623703, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65813619, + "num_input_tokens_seen": 181316945, + "step": 8433, + "time_per_iteration": 3.341797113418579 + }, + { + "auxiliary_loss_clip": 0.01117205, + "auxiliary_loss_mlp": 0.01112956, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.0007081, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 2.0168718654385334, + "language_loss": 0.77756727, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79986882, + "num_input_tokens_seen": 181335555, + "step": 8434, + "time_per_iteration": 2.9125161170959473 + }, + { + "auxiliary_loss_clip": 0.01135396, + "auxiliary_loss_mlp": 0.01113614, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00060368, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.8153702462555736, + "language_loss": 0.71162188, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73411202, + "num_input_tokens_seen": 181354580, + "step": 8435, + "time_per_iteration": 4.345793724060059 + }, + { + "auxiliary_loss_clip": 0.01152573, + "auxiliary_loss_mlp": 0.01114364, + "balance_loss_clip": 1.00228155, + "balance_loss_mlp": 1.00068593, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 1.9462669467008806, + "language_loss": 0.7234928, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74616218, + "num_input_tokens_seen": 181374320, + "step": 8436, + "time_per_iteration": 2.6319541931152344 + }, + { + "auxiliary_loss_clip": 0.01167418, + "auxiliary_loss_mlp": 0.01113582, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.6894442278880881, + "language_loss": 0.83873403, + "learning_rate": 2.050040603565483e-06, + "loss": 0.86154407, + "num_input_tokens_seen": 181392190, + "step": 8437, + "time_per_iteration": 2.5554771423339844 + }, + { + "auxiliary_loss_clip": 0.01150904, + "auxiliary_loss_mlp": 0.01111876, + "balance_loss_clip": 1.00210249, + "balance_loss_mlp": 1.00048685, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.7347773267310058, + "language_loss": 0.80413145, + "learning_rate": 2.049651262861309e-06, + "loss": 0.82675922, + "num_input_tokens_seen": 181413890, + "step": 8438, + "time_per_iteration": 2.6216845512390137 + }, + { + "auxiliary_loss_clip": 0.01117935, + "auxiliary_loss_mlp": 0.01113671, + "balance_loss_clip": 1.00208449, + "balance_loss_mlp": 1.00066054, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.5902810442299222, + "language_loss": 0.79500645, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81732249, + "num_input_tokens_seen": 181433240, + "step": 8439, + "time_per_iteration": 2.712862730026245 + }, + { + "auxiliary_loss_clip": 0.01135852, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.00211906, + "balance_loss_mlp": 1.00038266, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.4862397130868645, + "language_loss": 0.70906889, + "learning_rate": 2.048872575819383e-06, + "loss": 0.72790098, + "num_input_tokens_seen": 181453535, + "step": 8440, + "time_per_iteration": 2.6492815017700195 + }, + { + "auxiliary_loss_clip": 0.01137943, + "auxiliary_loss_mlp": 0.01112767, + "balance_loss_clip": 1.00229001, + "balance_loss_mlp": 1.00070989, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.697771179203056, + "language_loss": 0.71199286, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73449993, + "num_input_tokens_seen": 181474195, + "step": 8441, + "time_per_iteration": 2.700272798538208 + }, + { + "auxiliary_loss_clip": 0.01150898, + "auxiliary_loss_mlp": 0.00747442, + "balance_loss_clip": 1.00217593, + "balance_loss_mlp": 1.00039649, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 3.559120302395655, + "language_loss": 0.64299631, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.66197968, + "num_input_tokens_seen": 181494000, + "step": 8442, + "time_per_iteration": 2.6186447143554688 + }, + { + "auxiliary_loss_clip": 0.01101494, + "auxiliary_loss_mlp": 0.01111644, + "balance_loss_clip": 1.00161767, + "balance_loss_mlp": 1.00054049, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.5543178837394256, + "language_loss": 0.71429801, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73642939, + "num_input_tokens_seen": 181515955, + "step": 8443, + "time_per_iteration": 2.8160252571105957 + }, + { + "auxiliary_loss_clip": 0.01074939, + "auxiliary_loss_mlp": 0.01114151, + "balance_loss_clip": 1.00210655, + "balance_loss_mlp": 1.00066352, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.486296696956957, + "language_loss": 0.62041688, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64230782, + "num_input_tokens_seen": 181540225, + "step": 8444, + "time_per_iteration": 2.913965940475464 + }, + { + "auxiliary_loss_clip": 0.01118563, + "auxiliary_loss_mlp": 0.01112282, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00051165, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.6853131774662127, + "language_loss": 0.63668311, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65899158, + "num_input_tokens_seen": 181560125, + "step": 8445, + "time_per_iteration": 2.740131139755249 + }, + { + "auxiliary_loss_clip": 0.01120519, + "auxiliary_loss_mlp": 0.01090775, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 0.99998534, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8210969606680283, + "language_loss": 0.61917698, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.64128995, + "num_input_tokens_seen": 181618830, + "step": 8446, + "time_per_iteration": 3.2388012409210205 + }, + { + "auxiliary_loss_clip": 0.01117151, + "auxiliary_loss_mlp": 0.01112326, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00055516, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.7418694163431812, + "language_loss": 0.80912244, + "learning_rate": 2.04614711357029e-06, + "loss": 0.83141726, + "num_input_tokens_seen": 181637120, + "step": 8447, + "time_per_iteration": 2.7294247150421143 + }, + { + "auxiliary_loss_clip": 0.01151726, + "auxiliary_loss_mlp": 0.01112458, + "balance_loss_clip": 1.00211775, + "balance_loss_mlp": 1.00059211, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.4177168477952242, + "language_loss": 0.70601147, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72865331, + "num_input_tokens_seen": 181659965, + "step": 8448, + "time_per_iteration": 2.655991792678833 + }, + { + "auxiliary_loss_clip": 0.01167256, + "auxiliary_loss_mlp": 0.00747364, + "balance_loss_clip": 1.00220346, + "balance_loss_mlp": 1.00052571, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.5538331732680097, + "language_loss": 0.71862435, + "learning_rate": 2.045368394099955e-06, + "loss": 0.73777056, + "num_input_tokens_seen": 181685290, + "step": 8449, + "time_per_iteration": 2.6840391159057617 + }, + { + "auxiliary_loss_clip": 0.0113386, + "auxiliary_loss_mlp": 0.01113382, + "balance_loss_clip": 1.00197792, + "balance_loss_mlp": 1.00056183, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.5965818528531628, + "language_loss": 0.72905421, + "learning_rate": 2.044979031776844e-06, + "loss": 0.75152659, + "num_input_tokens_seen": 181706080, + "step": 8450, + "time_per_iteration": 2.654090642929077 + }, + { + "auxiliary_loss_clip": 0.01167391, + "auxiliary_loss_mlp": 0.01113514, + "balance_loss_clip": 1.00227368, + "balance_loss_mlp": 1.00059867, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 2.0613926459731733, + "language_loss": 0.77104199, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79385102, + "num_input_tokens_seen": 181724805, + "step": 8451, + "time_per_iteration": 2.587024450302124 + }, + { + "auxiliary_loss_clip": 0.01167406, + "auxiliary_loss_mlp": 0.01113696, + "balance_loss_clip": 1.00225306, + "balance_loss_mlp": 1.0007813, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.699065294025688, + "language_loss": 0.84891051, + "learning_rate": 2.044200302028559e-06, + "loss": 0.87172151, + "num_input_tokens_seen": 181743725, + "step": 8452, + "time_per_iteration": 2.516195058822632 + }, + { + "auxiliary_loss_clip": 0.01167548, + "auxiliary_loss_mlp": 0.01114511, + "balance_loss_clip": 1.00232291, + "balance_loss_mlp": 1.00054681, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.4473130624318773, + "language_loss": 0.77947873, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80229926, + "num_input_tokens_seen": 181757720, + "step": 8453, + "time_per_iteration": 2.5085206031799316 + }, + { + "auxiliary_loss_clip": 0.01118502, + "auxiliary_loss_mlp": 0.01112156, + "balance_loss_clip": 1.00216222, + "balance_loss_mlp": 1.00067139, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.7906036922100008, + "language_loss": 0.75866556, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78097212, + "num_input_tokens_seen": 181778545, + "step": 8454, + "time_per_iteration": 2.6912169456481934 + }, + { + "auxiliary_loss_clip": 0.0113589, + "auxiliary_loss_mlp": 0.01114299, + "balance_loss_clip": 1.00230944, + "balance_loss_mlp": 1.00071609, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.6051009905264668, + "language_loss": 0.89169097, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91419291, + "num_input_tokens_seen": 181799495, + "step": 8455, + "time_per_iteration": 2.672327756881714 + }, + { + "auxiliary_loss_clip": 0.01135937, + "auxiliary_loss_mlp": 0.00747552, + "balance_loss_clip": 1.00207889, + "balance_loss_mlp": 1.00046146, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 1.880998672569722, + "language_loss": 0.62309611, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64193106, + "num_input_tokens_seen": 181818400, + "step": 8456, + "time_per_iteration": 2.640575885772705 + }, + { + "auxiliary_loss_clip": 0.01145866, + "auxiliary_loss_mlp": 0.01091159, + "balance_loss_clip": 1.00177372, + "balance_loss_mlp": 0.9999879, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.8148091823130575, + "language_loss": 0.62386239, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.6462326, + "num_input_tokens_seen": 181875975, + "step": 8457, + "time_per_iteration": 3.036160469055176 + }, + { + "auxiliary_loss_clip": 0.01152606, + "auxiliary_loss_mlp": 0.01114103, + "balance_loss_clip": 1.00225234, + "balance_loss_mlp": 1.00061572, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 2.3424759918845366, + "language_loss": 0.67184842, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69451547, + "num_input_tokens_seen": 181896450, + "step": 8458, + "time_per_iteration": 2.597322463989258 + }, + { + "auxiliary_loss_clip": 0.01150764, + "auxiliary_loss_mlp": 0.01113462, + "balance_loss_clip": 1.00216818, + "balance_loss_mlp": 1.00054681, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.7649473693123099, + "language_loss": 0.77649558, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.79913783, + "num_input_tokens_seen": 181916770, + "step": 8459, + "time_per_iteration": 2.6374716758728027 + }, + { + "auxiliary_loss_clip": 0.01167702, + "auxiliary_loss_mlp": 0.01115036, + "balance_loss_clip": 1.00245714, + "balance_loss_mlp": 1.00059521, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 2.255705331092577, + "language_loss": 0.80710429, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82993162, + "num_input_tokens_seen": 181932710, + "step": 8460, + "time_per_iteration": 2.497804880142212 + }, + { + "auxiliary_loss_clip": 0.01137067, + "auxiliary_loss_mlp": 0.01113993, + "balance_loss_clip": 1.00217366, + "balance_loss_mlp": 1.00079226, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.5568301935260453, + "language_loss": 0.68837303, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.71088368, + "num_input_tokens_seen": 181950665, + "step": 8461, + "time_per_iteration": 2.662095546722412 + }, + { + "auxiliary_loss_clip": 0.01167379, + "auxiliary_loss_mlp": 0.01113721, + "balance_loss_clip": 1.00230682, + "balance_loss_mlp": 1.0006156, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.765746351165466, + "language_loss": 0.76225889, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.78506982, + "num_input_tokens_seen": 181971270, + "step": 8462, + "time_per_iteration": 2.5661699771881104 + }, + { + "auxiliary_loss_clip": 0.01118564, + "auxiliary_loss_mlp": 0.01113789, + "balance_loss_clip": 1.00222349, + "balance_loss_mlp": 1.0005883, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 2.0677758986920276, + "language_loss": 0.81144047, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83376408, + "num_input_tokens_seen": 181988410, + "step": 8463, + "time_per_iteration": 2.6319923400878906 + }, + { + "auxiliary_loss_clip": 0.01152375, + "auxiliary_loss_mlp": 0.01113517, + "balance_loss_clip": 1.00230837, + "balance_loss_mlp": 1.00079298, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 3.7097109033168936, + "language_loss": 0.76168931, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78434825, + "num_input_tokens_seen": 182006530, + "step": 8464, + "time_per_iteration": 2.569441556930542 + }, + { + "auxiliary_loss_clip": 0.01145747, + "auxiliary_loss_mlp": 0.01091202, + "balance_loss_clip": 1.0015974, + "balance_loss_mlp": 1.0000304, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6820510744752225, + "language_loss": 0.59387237, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61624187, + "num_input_tokens_seen": 182074240, + "step": 8465, + "time_per_iteration": 3.313425302505493 + }, + { + "auxiliary_loss_clip": 0.01167444, + "auxiliary_loss_mlp": 0.01113228, + "balance_loss_clip": 1.0023191, + "balance_loss_mlp": 1.00059867, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.8225555401991678, + "language_loss": 0.7988826, + "learning_rate": 2.038749012684354e-06, + "loss": 0.82168931, + "num_input_tokens_seen": 182093360, + "step": 8466, + "time_per_iteration": 3.9507126808166504 + }, + { + "auxiliary_loss_clip": 0.01152357, + "auxiliary_loss_mlp": 0.01112432, + "balance_loss_clip": 1.00216365, + "balance_loss_mlp": 1.00047064, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.7033971246406916, + "language_loss": 0.7849375, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80758548, + "num_input_tokens_seen": 182110170, + "step": 8467, + "time_per_iteration": 2.603926658630371 + }, + { + "auxiliary_loss_clip": 0.01167189, + "auxiliary_loss_mlp": 0.01112441, + "balance_loss_clip": 1.00232625, + "balance_loss_mlp": 1.00057507, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 3.8409015865060816, + "language_loss": 0.74328828, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76608455, + "num_input_tokens_seen": 182129570, + "step": 8468, + "time_per_iteration": 2.562199831008911 + }, + { + "auxiliary_loss_clip": 0.01167328, + "auxiliary_loss_mlp": 0.01113522, + "balance_loss_clip": 1.0022819, + "balance_loss_mlp": 1.00051177, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.8459684730276655, + "language_loss": 0.77827942, + "learning_rate": 2.03758084040404e-06, + "loss": 0.80108798, + "num_input_tokens_seen": 182147565, + "step": 8469, + "time_per_iteration": 2.513875722885132 + }, + { + "auxiliary_loss_clip": 0.01152808, + "auxiliary_loss_mlp": 0.01113255, + "balance_loss_clip": 1.00244164, + "balance_loss_mlp": 1.00062633, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.4588219588248452, + "language_loss": 0.69463438, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71729505, + "num_input_tokens_seen": 182169695, + "step": 8470, + "time_per_iteration": 5.5150415897369385 + }, + { + "auxiliary_loss_clip": 0.01135536, + "auxiliary_loss_mlp": 0.01114283, + "balance_loss_clip": 1.00213671, + "balance_loss_mlp": 1.00079608, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.0819721015703836, + "language_loss": 0.7358259, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75832409, + "num_input_tokens_seen": 182186385, + "step": 8471, + "time_per_iteration": 2.6203927993774414 + }, + { + "auxiliary_loss_clip": 0.01162333, + "auxiliary_loss_mlp": 0.01090797, + "balance_loss_clip": 1.00157177, + "balance_loss_mlp": 1.00000727, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7516988990849224, + "language_loss": 0.58139372, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60392499, + "num_input_tokens_seen": 182247095, + "step": 8472, + "time_per_iteration": 3.097275495529175 + }, + { + "auxiliary_loss_clip": 0.01104034, + "auxiliary_loss_mlp": 0.01112744, + "balance_loss_clip": 1.00202072, + "balance_loss_mlp": 1.00068736, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 2.2115302696102375, + "language_loss": 0.69252741, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71469522, + "num_input_tokens_seen": 182266380, + "step": 8473, + "time_per_iteration": 2.6993894577026367 + }, + { + "auxiliary_loss_clip": 0.011333, + "auxiliary_loss_mlp": 0.01113735, + "balance_loss_clip": 1.00208926, + "balance_loss_mlp": 1.00072479, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 1.7844193718340289, + "language_loss": 0.85188401, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.8743543, + "num_input_tokens_seen": 182284685, + "step": 8474, + "time_per_iteration": 4.108074426651001 + }, + { + "auxiliary_loss_clip": 0.01140468, + "auxiliary_loss_mlp": 0.0111382, + "balance_loss_clip": 1.00247276, + "balance_loss_mlp": 1.00061846, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 1.8939623003948882, + "language_loss": 0.65476811, + "learning_rate": 2.035244457765222e-06, + "loss": 0.67731094, + "num_input_tokens_seen": 182301810, + "step": 8475, + "time_per_iteration": 2.5946035385131836 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01114754, + "balance_loss_clip": 1.0023098, + "balance_loss_mlp": 1.00079, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 5.051783736999061, + "language_loss": 0.8143521, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.83685797, + "num_input_tokens_seen": 182320285, + "step": 8476, + "time_per_iteration": 2.603466510772705 + }, + { + "auxiliary_loss_clip": 0.01106064, + "auxiliary_loss_mlp": 0.01114498, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00062966, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 1.9413391911962983, + "language_loss": 0.8083567, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83056229, + "num_input_tokens_seen": 182339465, + "step": 8477, + "time_per_iteration": 2.713052272796631 + }, + { + "auxiliary_loss_clip": 0.01135974, + "auxiliary_loss_mlp": 0.01114272, + "balance_loss_clip": 1.00226808, + "balance_loss_mlp": 1.00049829, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 2.048047034795069, + "language_loss": 0.61959374, + "learning_rate": 2.034076248204082e-06, + "loss": 0.64209616, + "num_input_tokens_seen": 182358375, + "step": 8478, + "time_per_iteration": 2.6182262897491455 + }, + { + "auxiliary_loss_clip": 0.01150504, + "auxiliary_loss_mlp": 0.01113715, + "balance_loss_clip": 1.00212109, + "balance_loss_mlp": 1.00080013, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.5546164401742355, + "language_loss": 0.65452498, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.67716718, + "num_input_tokens_seen": 182377935, + "step": 8479, + "time_per_iteration": 2.6456785202026367 + }, + { + "auxiliary_loss_clip": 0.01150624, + "auxiliary_loss_mlp": 0.01112533, + "balance_loss_clip": 1.00226331, + "balance_loss_mlp": 1.00057149, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.7604547946587927, + "language_loss": 0.69246817, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71509975, + "num_input_tokens_seen": 182396440, + "step": 8480, + "time_per_iteration": 2.574533462524414 + }, + { + "auxiliary_loss_clip": 0.01167541, + "auxiliary_loss_mlp": 0.01113633, + "balance_loss_clip": 1.00231695, + "balance_loss_mlp": 1.00071788, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.8273142860262508, + "language_loss": 0.79381257, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81662428, + "num_input_tokens_seen": 182415890, + "step": 8481, + "time_per_iteration": 2.598475217819214 + }, + { + "auxiliary_loss_clip": 0.01150222, + "auxiliary_loss_mlp": 0.01113154, + "balance_loss_clip": 1.00206554, + "balance_loss_mlp": 1.00081134, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.560991827304863, + "language_loss": 0.8346954, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85732919, + "num_input_tokens_seen": 182434235, + "step": 8482, + "time_per_iteration": 2.558706283569336 + }, + { + "auxiliary_loss_clip": 0.01152599, + "auxiliary_loss_mlp": 0.00747564, + "balance_loss_clip": 1.00209165, + "balance_loss_mlp": 1.00057316, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 2.49197702441103, + "language_loss": 0.85854846, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87755007, + "num_input_tokens_seen": 182454360, + "step": 8483, + "time_per_iteration": 2.687685251235962 + }, + { + "auxiliary_loss_clip": 0.01150594, + "auxiliary_loss_mlp": 0.01112685, + "balance_loss_clip": 1.00210071, + "balance_loss_mlp": 1.00053298, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.8637078189087195, + "language_loss": 0.83370757, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85634035, + "num_input_tokens_seen": 182471940, + "step": 8484, + "time_per_iteration": 2.5557305812835693 + }, + { + "auxiliary_loss_clip": 0.01135874, + "auxiliary_loss_mlp": 0.01112652, + "balance_loss_clip": 1.00229824, + "balance_loss_mlp": 1.00049961, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 2.0325852706372607, + "language_loss": 0.81396997, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83645523, + "num_input_tokens_seen": 182490685, + "step": 8485, + "time_per_iteration": 2.6120998859405518 + }, + { + "auxiliary_loss_clip": 0.01137525, + "auxiliary_loss_mlp": 0.01112483, + "balance_loss_clip": 1.00227571, + "balance_loss_mlp": 1.00061667, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 2.184425793023582, + "language_loss": 0.73828077, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.76078081, + "num_input_tokens_seen": 182508325, + "step": 8486, + "time_per_iteration": 2.6431214809417725 + }, + { + "auxiliary_loss_clip": 0.0112123, + "auxiliary_loss_mlp": 0.01113974, + "balance_loss_clip": 1.00219417, + "balance_loss_mlp": 1.00058198, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 2.045014276062393, + "language_loss": 0.70259506, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72494709, + "num_input_tokens_seen": 182527020, + "step": 8487, + "time_per_iteration": 2.6939120292663574 + }, + { + "auxiliary_loss_clip": 0.01135263, + "auxiliary_loss_mlp": 0.01112686, + "balance_loss_clip": 1.00223541, + "balance_loss_mlp": 1.00072455, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 2.384544382771948, + "language_loss": 0.72923589, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75171536, + "num_input_tokens_seen": 182543505, + "step": 8488, + "time_per_iteration": 2.6292619705200195 + }, + { + "auxiliary_loss_clip": 0.01119388, + "auxiliary_loss_mlp": 0.00747467, + "balance_loss_clip": 1.00208116, + "balance_loss_mlp": 1.0004611, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.9768542711655717, + "language_loss": 0.69804198, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71671057, + "num_input_tokens_seen": 182562250, + "step": 8489, + "time_per_iteration": 2.641932249069214 + }, + { + "auxiliary_loss_clip": 0.01141988, + "auxiliary_loss_mlp": 0.01113719, + "balance_loss_clip": 1.00243068, + "balance_loss_mlp": 1.00070906, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 2.366609002290979, + "language_loss": 0.72519213, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.74774921, + "num_input_tokens_seen": 182581910, + "step": 8490, + "time_per_iteration": 2.6419970989227295 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01112469, + "balance_loss_clip": 1.00203633, + "balance_loss_mlp": 1.00069857, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.5750049009219387, + "language_loss": 0.80549586, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82797182, + "num_input_tokens_seen": 182601350, + "step": 8491, + "time_per_iteration": 2.6280956268310547 + }, + { + "auxiliary_loss_clip": 0.01151717, + "auxiliary_loss_mlp": 0.01111975, + "balance_loss_clip": 1.00222027, + "balance_loss_mlp": 1.0006808, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.049069575830099, + "language_loss": 0.78888762, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81152451, + "num_input_tokens_seen": 182619660, + "step": 8492, + "time_per_iteration": 2.5682992935180664 + }, + { + "auxiliary_loss_clip": 0.01118651, + "auxiliary_loss_mlp": 0.01112847, + "balance_loss_clip": 1.00218809, + "balance_loss_mlp": 1.00059998, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 1.9478875416851469, + "language_loss": 0.78105152, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.80336654, + "num_input_tokens_seen": 182639815, + "step": 8493, + "time_per_iteration": 2.6711297035217285 + }, + { + "auxiliary_loss_clip": 0.01119287, + "auxiliary_loss_mlp": 0.01114377, + "balance_loss_clip": 1.00217247, + "balance_loss_mlp": 1.00060344, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 1.9982728371513676, + "language_loss": 0.83456653, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85690314, + "num_input_tokens_seen": 182659655, + "step": 8494, + "time_per_iteration": 2.705495834350586 + }, + { + "auxiliary_loss_clip": 0.01167523, + "auxiliary_loss_mlp": 0.01113692, + "balance_loss_clip": 1.00238061, + "balance_loss_mlp": 1.00077689, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 2.1404378464349425, + "language_loss": 0.79129732, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81410944, + "num_input_tokens_seen": 182677075, + "step": 8495, + "time_per_iteration": 2.5771923065185547 + }, + { + "auxiliary_loss_clip": 0.01118664, + "auxiliary_loss_mlp": 0.01113161, + "balance_loss_clip": 1.00206125, + "balance_loss_mlp": 1.00053239, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.5085866972543922, + "language_loss": 0.7869736, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80929184, + "num_input_tokens_seen": 182699625, + "step": 8496, + "time_per_iteration": 2.6953489780426025 + }, + { + "auxiliary_loss_clip": 0.01150511, + "auxiliary_loss_mlp": 0.0111356, + "balance_loss_clip": 1.00222373, + "balance_loss_mlp": 1.00054932, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.910060119082307, + "language_loss": 0.79330993, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.81595063, + "num_input_tokens_seen": 182717020, + "step": 8497, + "time_per_iteration": 2.608362913131714 + }, + { + "auxiliary_loss_clip": 0.01167244, + "auxiliary_loss_mlp": 0.01113195, + "balance_loss_clip": 1.00221121, + "balance_loss_mlp": 1.00056577, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.6975945376115564, + "language_loss": 0.8203826, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84318697, + "num_input_tokens_seen": 182736955, + "step": 8498, + "time_per_iteration": 2.5548131465911865 + }, + { + "auxiliary_loss_clip": 0.01137671, + "auxiliary_loss_mlp": 0.0074746, + "balance_loss_clip": 1.00241852, + "balance_loss_mlp": 1.00057626, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.9960232437518173, + "language_loss": 0.71308237, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.73193365, + "num_input_tokens_seen": 182757620, + "step": 8499, + "time_per_iteration": 2.648347854614258 + }, + { + "auxiliary_loss_clip": 0.01102346, + "auxiliary_loss_mlp": 0.01113234, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00050902, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.53240972267212, + "language_loss": 0.72309923, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74525499, + "num_input_tokens_seen": 182780195, + "step": 8500, + "time_per_iteration": 2.775684356689453 + }, + { + "auxiliary_loss_clip": 0.01150878, + "auxiliary_loss_mlp": 0.01115496, + "balance_loss_clip": 1.00213754, + "balance_loss_mlp": 1.0005784, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 4.472408657207338, + "language_loss": 0.62831312, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.6509769, + "num_input_tokens_seen": 182795765, + "step": 8501, + "time_per_iteration": 2.549600839614868 + }, + { + "auxiliary_loss_clip": 0.01167346, + "auxiliary_loss_mlp": 0.01113981, + "balance_loss_clip": 1.00207245, + "balance_loss_mlp": 1.00058937, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.7970409397463827, + "language_loss": 0.87590116, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89871442, + "num_input_tokens_seen": 182813120, + "step": 8502, + "time_per_iteration": 2.508988857269287 + }, + { + "auxiliary_loss_clip": 0.01150372, + "auxiliary_loss_mlp": 0.01113441, + "balance_loss_clip": 1.00212264, + "balance_loss_mlp": 1.00071681, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 3.4154004861304736, + "language_loss": 0.82630169, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84893978, + "num_input_tokens_seen": 182835745, + "step": 8503, + "time_per_iteration": 2.6191251277923584 + }, + { + "auxiliary_loss_clip": 0.0112879, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_clip": 1.00147676, + "balance_loss_mlp": 1.00005805, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.861750779913654, + "language_loss": 0.63891417, + "learning_rate": 2.023951320871339e-06, + "loss": 0.66111058, + "num_input_tokens_seen": 182892540, + "step": 8504, + "time_per_iteration": 4.5568013191223145 + }, + { + "auxiliary_loss_clip": 0.0112073, + "auxiliary_loss_mlp": 0.00747317, + "balance_loss_clip": 1.00221741, + "balance_loss_mlp": 1.00035048, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 2.7742927549943035, + "language_loss": 0.84669948, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86537993, + "num_input_tokens_seen": 182911515, + "step": 8505, + "time_per_iteration": 2.707038402557373 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01113249, + "balance_loss_clip": 1.00218272, + "balance_loss_mlp": 1.00061989, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 2.776522864880333, + "language_loss": 0.75772917, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.78036827, + "num_input_tokens_seen": 182930860, + "step": 8506, + "time_per_iteration": 2.6387410163879395 + }, + { + "auxiliary_loss_clip": 0.01167392, + "auxiliary_loss_mlp": 0.01114489, + "balance_loss_clip": 1.00228226, + "balance_loss_mlp": 1.00062025, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.859276720947418, + "language_loss": 0.57759124, + "learning_rate": 2.022783015592131e-06, + "loss": 0.60040998, + "num_input_tokens_seen": 182949960, + "step": 8507, + "time_per_iteration": 4.075252294540405 + }, + { + "auxiliary_loss_clip": 0.01150899, + "auxiliary_loss_mlp": 0.01114357, + "balance_loss_clip": 1.00234437, + "balance_loss_mlp": 1.00077486, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.9067431552114484, + "language_loss": 0.8630088, + "learning_rate": 2.022393578751503e-06, + "loss": 0.88566136, + "num_input_tokens_seen": 182968085, + "step": 8508, + "time_per_iteration": 3.9536502361297607 + }, + { + "auxiliary_loss_clip": 0.01118266, + "auxiliary_loss_mlp": 0.00747552, + "balance_loss_clip": 1.00199819, + "balance_loss_mlp": 1.0005486, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.645062463773059, + "language_loss": 0.72272646, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74138463, + "num_input_tokens_seen": 182987275, + "step": 8509, + "time_per_iteration": 2.6592445373535156 + }, + { + "auxiliary_loss_clip": 0.01167262, + "auxiliary_loss_mlp": 0.00747448, + "balance_loss_clip": 1.00233245, + "balance_loss_mlp": 1.00044227, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 2.0793806362327394, + "language_loss": 0.76035511, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.77950221, + "num_input_tokens_seen": 183004700, + "step": 8510, + "time_per_iteration": 2.4962899684906006 + }, + { + "auxiliary_loss_clip": 0.01167543, + "auxiliary_loss_mlp": 0.01113164, + "balance_loss_clip": 1.00246799, + "balance_loss_mlp": 1.00063038, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.5656627062529795, + "language_loss": 0.71248794, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73529506, + "num_input_tokens_seen": 183025830, + "step": 8511, + "time_per_iteration": 4.115802049636841 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01113585, + "balance_loss_clip": 1.00236273, + "balance_loss_mlp": 1.00057435, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 1.9640213514757079, + "language_loss": 0.66569328, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68818748, + "num_input_tokens_seen": 183045140, + "step": 8512, + "time_per_iteration": 2.6038289070129395 + }, + { + "auxiliary_loss_clip": 0.01093827, + "auxiliary_loss_mlp": 0.01113872, + "balance_loss_clip": 1.00223756, + "balance_loss_mlp": 1.0005759, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 4.194612924063857, + "language_loss": 0.67086494, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.6929419, + "num_input_tokens_seen": 183063935, + "step": 8513, + "time_per_iteration": 2.7281014919281006 + }, + { + "auxiliary_loss_clip": 0.01117498, + "auxiliary_loss_mlp": 0.01113086, + "balance_loss_clip": 1.00203693, + "balance_loss_mlp": 1.00055265, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.7992362569454805, + "language_loss": 0.68699574, + "learning_rate": 2.0200569403921e-06, + "loss": 0.70930159, + "num_input_tokens_seen": 183084135, + "step": 8514, + "time_per_iteration": 2.6568620204925537 + }, + { + "auxiliary_loss_clip": 0.01167316, + "auxiliary_loss_mlp": 0.01112429, + "balance_loss_clip": 1.00227594, + "balance_loss_mlp": 1.00056279, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.8550400014808184, + "language_loss": 0.66283906, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68563652, + "num_input_tokens_seen": 183104570, + "step": 8515, + "time_per_iteration": 2.5669875144958496 + }, + { + "auxiliary_loss_clip": 0.01150556, + "auxiliary_loss_mlp": 0.01113511, + "balance_loss_clip": 1.00221348, + "balance_loss_mlp": 1.00059569, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 2.0253207576615697, + "language_loss": 0.75287026, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77551097, + "num_input_tokens_seen": 183123850, + "step": 8516, + "time_per_iteration": 2.5894386768341064 + }, + { + "auxiliary_loss_clip": 0.01135984, + "auxiliary_loss_mlp": 0.01114026, + "balance_loss_clip": 1.00232077, + "balance_loss_mlp": 1.00072932, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 1.9564150133065141, + "language_loss": 0.78025353, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80275369, + "num_input_tokens_seen": 183141725, + "step": 8517, + "time_per_iteration": 2.5697262287139893 + }, + { + "auxiliary_loss_clip": 0.01150905, + "auxiliary_loss_mlp": 0.01114839, + "balance_loss_clip": 1.00229788, + "balance_loss_mlp": 1.0005883, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.7227579178272596, + "language_loss": 0.73404503, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.75670242, + "num_input_tokens_seen": 183161300, + "step": 8518, + "time_per_iteration": 2.590792655944824 + }, + { + "auxiliary_loss_clip": 0.01151027, + "auxiliary_loss_mlp": 0.01113835, + "balance_loss_clip": 1.00223291, + "balance_loss_mlp": 1.00072932, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.7533361723302079, + "language_loss": 0.78346348, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80611211, + "num_input_tokens_seen": 183180495, + "step": 8519, + "time_per_iteration": 2.5830397605895996 + }, + { + "auxiliary_loss_clip": 0.01167362, + "auxiliary_loss_mlp": 0.01113464, + "balance_loss_clip": 1.00236011, + "balance_loss_mlp": 1.0005486, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 2.2508547292123713, + "language_loss": 0.79502779, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81783605, + "num_input_tokens_seen": 183200330, + "step": 8520, + "time_per_iteration": 2.551459789276123 + }, + { + "auxiliary_loss_clip": 0.01135836, + "auxiliary_loss_mlp": 0.01114371, + "balance_loss_clip": 1.00222218, + "balance_loss_mlp": 1.00059772, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.8191194127010075, + "language_loss": 0.81175005, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83425212, + "num_input_tokens_seen": 183218230, + "step": 8521, + "time_per_iteration": 2.591632604598999 + }, + { + "auxiliary_loss_clip": 0.01151785, + "auxiliary_loss_mlp": 0.01112937, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00049901, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.9378635715781436, + "language_loss": 0.68225694, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70490408, + "num_input_tokens_seen": 183236735, + "step": 8522, + "time_per_iteration": 2.6076607704162598 + }, + { + "auxiliary_loss_clip": 0.01118171, + "auxiliary_loss_mlp": 0.01116593, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00072169, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 1.7398409718422956, + "language_loss": 0.61742598, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63977361, + "num_input_tokens_seen": 183257550, + "step": 8523, + "time_per_iteration": 2.6823530197143555 + }, + { + "auxiliary_loss_clip": 0.01120269, + "auxiliary_loss_mlp": 0.01113408, + "balance_loss_clip": 1.00230074, + "balance_loss_mlp": 1.00058794, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 3.227276063899061, + "language_loss": 0.78194726, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.8042841, + "num_input_tokens_seen": 183275515, + "step": 8524, + "time_per_iteration": 2.6323161125183105 + }, + { + "auxiliary_loss_clip": 0.01135793, + "auxiliary_loss_mlp": 0.01113604, + "balance_loss_clip": 1.00217319, + "balance_loss_mlp": 1.00068915, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 2.115347694793173, + "language_loss": 0.74521184, + "learning_rate": 2.015773034588706e-06, + "loss": 0.7677058, + "num_input_tokens_seen": 183293880, + "step": 8525, + "time_per_iteration": 2.606067180633545 + }, + { + "auxiliary_loss_clip": 0.01135673, + "auxiliary_loss_mlp": 0.01114965, + "balance_loss_clip": 1.00226307, + "balance_loss_mlp": 1.00071478, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 3.610008722735217, + "language_loss": 0.73932713, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76183355, + "num_input_tokens_seen": 183315860, + "step": 8526, + "time_per_iteration": 2.7168784141540527 + }, + { + "auxiliary_loss_clip": 0.01152009, + "auxiliary_loss_mlp": 0.01113762, + "balance_loss_clip": 1.0022577, + "balance_loss_mlp": 1.00056124, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.5538387827219269, + "language_loss": 0.65332997, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.6759876, + "num_input_tokens_seen": 183335480, + "step": 8527, + "time_per_iteration": 2.614244222640991 + }, + { + "auxiliary_loss_clip": 0.01133197, + "auxiliary_loss_mlp": 0.01112049, + "balance_loss_clip": 1.00220275, + "balance_loss_mlp": 1.00065994, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.7431972884731044, + "language_loss": 0.7436651, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76611757, + "num_input_tokens_seen": 183354395, + "step": 8528, + "time_per_iteration": 2.583686351776123 + }, + { + "auxiliary_loss_clip": 0.01150644, + "auxiliary_loss_mlp": 0.01113554, + "balance_loss_clip": 1.00219142, + "balance_loss_mlp": 1.00054383, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 3.894281826323997, + "language_loss": 0.8273114, + "learning_rate": 2.014215231682995e-06, + "loss": 0.84995341, + "num_input_tokens_seen": 183372980, + "step": 8529, + "time_per_iteration": 2.5882582664489746 + }, + { + "auxiliary_loss_clip": 0.01102313, + "auxiliary_loss_mlp": 0.01112061, + "balance_loss_clip": 1.00208485, + "balance_loss_mlp": 1.00057626, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 2.727849914033018, + "language_loss": 0.7394042, + "learning_rate": 2.01382577957204e-06, + "loss": 0.76154792, + "num_input_tokens_seen": 183390160, + "step": 8530, + "time_per_iteration": 2.711568832397461 + }, + { + "auxiliary_loss_clip": 0.01113962, + "auxiliary_loss_mlp": 0.01091195, + "balance_loss_clip": 1.00149441, + "balance_loss_mlp": 1.00002372, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.815493861525947, + "language_loss": 0.60852873, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.63058031, + "num_input_tokens_seen": 183455280, + "step": 8531, + "time_per_iteration": 3.2971787452697754 + }, + { + "auxiliary_loss_clip": 0.01135956, + "auxiliary_loss_mlp": 0.01113939, + "balance_loss_clip": 1.0022403, + "balance_loss_mlp": 1.00054669, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6494610579914606, + "language_loss": 0.76856625, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.79106516, + "num_input_tokens_seen": 183473955, + "step": 8532, + "time_per_iteration": 2.59088134765625 + }, + { + "auxiliary_loss_clip": 0.01134242, + "auxiliary_loss_mlp": 0.01113472, + "balance_loss_clip": 1.002303, + "balance_loss_mlp": 1.00055671, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 2.3426177773848007, + "language_loss": 0.67595023, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69842738, + "num_input_tokens_seen": 183497195, + "step": 8533, + "time_per_iteration": 2.7146315574645996 + }, + { + "auxiliary_loss_clip": 0.01117282, + "auxiliary_loss_mlp": 0.01113793, + "balance_loss_clip": 1.00211191, + "balance_loss_mlp": 1.00068676, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 2.2202356577690985, + "language_loss": 0.82346487, + "learning_rate": 2.01226796603315e-06, + "loss": 0.8457756, + "num_input_tokens_seen": 183513675, + "step": 8534, + "time_per_iteration": 2.624011278152466 + }, + { + "auxiliary_loss_clip": 0.01152478, + "auxiliary_loss_mlp": 0.01114531, + "balance_loss_clip": 1.00224805, + "balance_loss_mlp": 1.00066233, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.440367486424767, + "language_loss": 0.63917994, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.66185004, + "num_input_tokens_seen": 183535165, + "step": 8535, + "time_per_iteration": 2.6071672439575195 + }, + { + "auxiliary_loss_clip": 0.01150708, + "auxiliary_loss_mlp": 0.01113804, + "balance_loss_clip": 1.00231338, + "balance_loss_mlp": 1.0006988, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.5599615308482666, + "language_loss": 0.69644225, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71908736, + "num_input_tokens_seen": 183553780, + "step": 8536, + "time_per_iteration": 2.536533832550049 + }, + { + "auxiliary_loss_clip": 0.0115051, + "auxiliary_loss_mlp": 0.01114144, + "balance_loss_clip": 1.00222039, + "balance_loss_mlp": 1.00075245, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.4741236611933504, + "language_loss": 0.71165037, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73429686, + "num_input_tokens_seen": 183572285, + "step": 8537, + "time_per_iteration": 2.5396316051483154 + }, + { + "auxiliary_loss_clip": 0.0110364, + "auxiliary_loss_mlp": 0.01112683, + "balance_loss_clip": 1.00208068, + "balance_loss_mlp": 1.00062633, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 1.8346246314131698, + "language_loss": 0.7964834, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.81864667, + "num_input_tokens_seen": 183589330, + "step": 8538, + "time_per_iteration": 2.6331281661987305 + }, + { + "auxiliary_loss_clip": 0.01150599, + "auxiliary_loss_mlp": 0.01112979, + "balance_loss_clip": 1.00214243, + "balance_loss_mlp": 1.00054073, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 3.3671119005011643, + "language_loss": 0.78235626, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80499202, + "num_input_tokens_seen": 183609205, + "step": 8539, + "time_per_iteration": 2.590532064437866 + }, + { + "auxiliary_loss_clip": 0.01135767, + "auxiliary_loss_mlp": 0.01113921, + "balance_loss_clip": 1.00224519, + "balance_loss_mlp": 1.00052941, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 2.4661260901463415, + "language_loss": 0.76042473, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78292161, + "num_input_tokens_seen": 183629985, + "step": 8540, + "time_per_iteration": 2.659966468811035 + }, + { + "auxiliary_loss_clip": 0.01100791, + "auxiliary_loss_mlp": 0.01115278, + "balance_loss_clip": 1.00182486, + "balance_loss_mlp": 1.00064659, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 2.220092639627103, + "language_loss": 0.7458955, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76805621, + "num_input_tokens_seen": 183648220, + "step": 8541, + "time_per_iteration": 4.135653018951416 + }, + { + "auxiliary_loss_clip": 0.01091925, + "auxiliary_loss_mlp": 0.01113482, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00066209, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.6472332703286081, + "language_loss": 0.70653671, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72859073, + "num_input_tokens_seen": 183668230, + "step": 8542, + "time_per_iteration": 2.7320730686187744 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01114435, + "balance_loss_clip": 1.00215793, + "balance_loss_mlp": 1.00056624, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 4.11836176135542, + "language_loss": 0.78932571, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.81182081, + "num_input_tokens_seen": 183687800, + "step": 8543, + "time_per_iteration": 2.639521598815918 + }, + { + "auxiliary_loss_clip": 0.01135067, + "auxiliary_loss_mlp": 0.01114167, + "balance_loss_clip": 1.00212467, + "balance_loss_mlp": 1.00067973, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.633184595099704, + "language_loss": 0.67685783, + "learning_rate": 2.008373401689299e-06, + "loss": 0.69935024, + "num_input_tokens_seen": 183709025, + "step": 8544, + "time_per_iteration": 2.675234317779541 + }, + { + "auxiliary_loss_clip": 0.01122562, + "auxiliary_loss_mlp": 0.0111404, + "balance_loss_clip": 1.00219691, + "balance_loss_mlp": 1.0006485, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.441687368202285, + "language_loss": 0.72208685, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74445289, + "num_input_tokens_seen": 183725740, + "step": 8545, + "time_per_iteration": 4.159254789352417 + }, + { + "auxiliary_loss_clip": 0.01150698, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_clip": 1.00216854, + "balance_loss_mlp": 1.00071716, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 2.3706800063610767, + "language_loss": 0.81659102, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.83924091, + "num_input_tokens_seen": 183743995, + "step": 8546, + "time_per_iteration": 2.5552189350128174 + }, + { + "auxiliary_loss_clip": 0.01152637, + "auxiliary_loss_mlp": 0.01114286, + "balance_loss_clip": 1.0023787, + "balance_loss_mlp": 1.00070357, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 2.901310327082769, + "language_loss": 0.73415613, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75682539, + "num_input_tokens_seen": 183764150, + "step": 8547, + "time_per_iteration": 2.622377872467041 + }, + { + "auxiliary_loss_clip": 0.01151847, + "auxiliary_loss_mlp": 0.01113698, + "balance_loss_clip": 1.00217915, + "balance_loss_mlp": 1.00078273, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 3.833842684433216, + "language_loss": 0.73299575, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75565112, + "num_input_tokens_seen": 183783280, + "step": 8548, + "time_per_iteration": 2.6304445266723633 + }, + { + "auxiliary_loss_clip": 0.01126523, + "auxiliary_loss_mlp": 0.01114245, + "balance_loss_clip": 1.00225043, + "balance_loss_mlp": 1.0006628, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.5843481800108346, + "language_loss": 0.82221305, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84462076, + "num_input_tokens_seen": 183800725, + "step": 8549, + "time_per_iteration": 4.108192443847656 + }, + { + "auxiliary_loss_clip": 0.01152464, + "auxiliary_loss_mlp": 0.01112041, + "balance_loss_clip": 1.00238013, + "balance_loss_mlp": 1.00055659, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 2.9963130340290625, + "language_loss": 0.72218436, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.74482942, + "num_input_tokens_seen": 183818735, + "step": 8550, + "time_per_iteration": 2.546485662460327 + }, + { + "auxiliary_loss_clip": 0.01151043, + "auxiliary_loss_mlp": 0.01114041, + "balance_loss_clip": 1.00217915, + "balance_loss_mlp": 1.00055385, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.542835241490255, + "language_loss": 0.75253737, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77518821, + "num_input_tokens_seen": 183840015, + "step": 8551, + "time_per_iteration": 2.5736937522888184 + }, + { + "auxiliary_loss_clip": 0.01140006, + "auxiliary_loss_mlp": 0.01113552, + "balance_loss_clip": 1.00232267, + "balance_loss_mlp": 1.00054121, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.789588858917041, + "language_loss": 0.69231337, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.714849, + "num_input_tokens_seen": 183860145, + "step": 8552, + "time_per_iteration": 2.6396920680999756 + }, + { + "auxiliary_loss_clip": 0.0115056, + "auxiliary_loss_mlp": 0.01113811, + "balance_loss_clip": 1.00212502, + "balance_loss_mlp": 1.00061011, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 2.2622882791991716, + "language_loss": 0.74333239, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76597607, + "num_input_tokens_seen": 183880540, + "step": 8553, + "time_per_iteration": 2.5789072513580322 + }, + { + "auxiliary_loss_clip": 0.01167218, + "auxiliary_loss_mlp": 0.01113622, + "balance_loss_clip": 1.00219262, + "balance_loss_mlp": 1.00061178, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 3.0390971422709407, + "language_loss": 0.67644656, + "learning_rate": 2.004478805593435e-06, + "loss": 0.69925493, + "num_input_tokens_seen": 183900895, + "step": 8554, + "time_per_iteration": 2.581972599029541 + }, + { + "auxiliary_loss_clip": 0.0115248, + "auxiliary_loss_mlp": 0.01114533, + "balance_loss_clip": 1.00225902, + "balance_loss_mlp": 1.00076008, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 2.1085475914139846, + "language_loss": 0.7375173, + "learning_rate": 2.004089344806068e-06, + "loss": 0.76018739, + "num_input_tokens_seen": 183920335, + "step": 8555, + "time_per_iteration": 2.573289632797241 + }, + { + "auxiliary_loss_clip": 0.01118719, + "auxiliary_loss_mlp": 0.01113584, + "balance_loss_clip": 1.00208831, + "balance_loss_mlp": 1.00076485, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.4414666195459733, + "language_loss": 0.74576342, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76808643, + "num_input_tokens_seen": 183936220, + "step": 8556, + "time_per_iteration": 2.6304891109466553 + }, + { + "auxiliary_loss_clip": 0.01119941, + "auxiliary_loss_mlp": 0.01112316, + "balance_loss_clip": 1.00204468, + "balance_loss_mlp": 1.00064075, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.8980135919006316, + "language_loss": 0.86309201, + "learning_rate": 2.003310422780898e-06, + "loss": 0.8854146, + "num_input_tokens_seen": 183953250, + "step": 8557, + "time_per_iteration": 2.6132023334503174 + }, + { + "auxiliary_loss_clip": 0.01152162, + "auxiliary_loss_mlp": 0.0111285, + "balance_loss_clip": 1.00229251, + "balance_loss_mlp": 1.00069785, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.5128932477307278, + "language_loss": 0.88846791, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91111803, + "num_input_tokens_seen": 183973865, + "step": 8558, + "time_per_iteration": 2.591583013534546 + }, + { + "auxiliary_loss_clip": 0.01167099, + "auxiliary_loss_mlp": 0.00747495, + "balance_loss_clip": 1.00231886, + "balance_loss_mlp": 1.00048161, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 2.126248304716268, + "language_loss": 0.65377331, + "learning_rate": 2.002531500253602e-06, + "loss": 0.67291927, + "num_input_tokens_seen": 183992555, + "step": 8559, + "time_per_iteration": 2.5101146697998047 + }, + { + "auxiliary_loss_clip": 0.01152661, + "auxiliary_loss_mlp": 0.0074758, + "balance_loss_clip": 1.00240099, + "balance_loss_mlp": 1.00053775, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.6058605393498222, + "language_loss": 0.63565403, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65465647, + "num_input_tokens_seen": 184010825, + "step": 8560, + "time_per_iteration": 2.6305253505706787 + }, + { + "auxiliary_loss_clip": 0.01167327, + "auxiliary_loss_mlp": 0.01113324, + "balance_loss_clip": 1.00227165, + "balance_loss_mlp": 1.00050461, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.9983953981891238, + "language_loss": 0.70436275, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72716928, + "num_input_tokens_seen": 184030155, + "step": 8561, + "time_per_iteration": 2.53495717048645 + }, + { + "auxiliary_loss_clip": 0.01137605, + "auxiliary_loss_mlp": 0.0111377, + "balance_loss_clip": 1.00231993, + "balance_loss_mlp": 1.00056922, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.716355673165057, + "language_loss": 0.67067438, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.69318807, + "num_input_tokens_seen": 184051440, + "step": 8562, + "time_per_iteration": 2.6571383476257324 + }, + { + "auxiliary_loss_clip": 0.0115207, + "auxiliary_loss_mlp": 0.01114144, + "balance_loss_clip": 1.00238895, + "balance_loss_mlp": 1.00056159, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 3.7757598967121107, + "language_loss": 0.776218, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79888022, + "num_input_tokens_seen": 184070205, + "step": 8563, + "time_per_iteration": 2.5819926261901855 + }, + { + "auxiliary_loss_clip": 0.01150709, + "auxiliary_loss_mlp": 0.01115232, + "balance_loss_clip": 1.00223279, + "balance_loss_mlp": 1.0006007, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.224941025713827, + "language_loss": 0.82806641, + "learning_rate": 2.0005841925139e-06, + "loss": 0.85072577, + "num_input_tokens_seen": 184087345, + "step": 8564, + "time_per_iteration": 2.5780582427978516 + }, + { + "auxiliary_loss_clip": 0.01135398, + "auxiliary_loss_mlp": 0.0111499, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.00054908, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.7208226285666255, + "language_loss": 0.73290282, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75540674, + "num_input_tokens_seen": 184107110, + "step": 8565, + "time_per_iteration": 2.595673084259033 + }, + { + "auxiliary_loss_clip": 0.01152712, + "auxiliary_loss_mlp": 0.01115197, + "balance_loss_clip": 1.00238359, + "balance_loss_mlp": 1.00056553, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 2.385759169673876, + "language_loss": 0.68270862, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70538765, + "num_input_tokens_seen": 184127105, + "step": 8566, + "time_per_iteration": 2.598498821258545 + }, + { + "auxiliary_loss_clip": 0.01167472, + "auxiliary_loss_mlp": 0.00747494, + "balance_loss_clip": 1.00227308, + "balance_loss_mlp": 1.00040948, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 4.24921253978875, + "language_loss": 0.7843793, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80352896, + "num_input_tokens_seen": 184148060, + "step": 8567, + "time_per_iteration": 2.562713384628296 + }, + { + "auxiliary_loss_clip": 0.01150942, + "auxiliary_loss_mlp": 0.01114597, + "balance_loss_clip": 1.00221848, + "balance_loss_mlp": 1.00044203, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 1.8676200254881197, + "language_loss": 0.79164839, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81430376, + "num_input_tokens_seen": 184166175, + "step": 8568, + "time_per_iteration": 2.6071105003356934 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01113146, + "balance_loss_clip": 1.00227582, + "balance_loss_mlp": 1.00061297, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 3.1929677471381557, + "language_loss": 0.90299112, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.92547393, + "num_input_tokens_seen": 184182600, + "step": 8569, + "time_per_iteration": 2.6513516902923584 + }, + { + "auxiliary_loss_clip": 0.0116741, + "auxiliary_loss_mlp": 0.01114629, + "balance_loss_clip": 1.00232518, + "balance_loss_mlp": 1.00066447, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.493071596872769, + "language_loss": 0.76463819, + "learning_rate": 1.998247422657674e-06, + "loss": 0.7874586, + "num_input_tokens_seen": 184202020, + "step": 8570, + "time_per_iteration": 2.533087730407715 + }, + { + "auxiliary_loss_clip": 0.01150849, + "auxiliary_loss_mlp": 0.01113544, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00062859, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.5432860636287153, + "language_loss": 0.73618329, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.75882715, + "num_input_tokens_seen": 184224850, + "step": 8571, + "time_per_iteration": 2.6981589794158936 + }, + { + "auxiliary_loss_clip": 0.01130439, + "auxiliary_loss_mlp": 0.01091147, + "balance_loss_clip": 1.00130975, + "balance_loss_mlp": 0.9999758, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.797191796812446, + "language_loss": 0.52970499, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.55192083, + "num_input_tokens_seen": 184288520, + "step": 8572, + "time_per_iteration": 3.2536404132843018 + }, + { + "auxiliary_loss_clip": 0.01150636, + "auxiliary_loss_mlp": 0.01114429, + "balance_loss_clip": 1.00243509, + "balance_loss_mlp": 1.00065565, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.9771327898760995, + "language_loss": 0.76096934, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78362, + "num_input_tokens_seen": 184308565, + "step": 8573, + "time_per_iteration": 2.565303325653076 + }, + { + "auxiliary_loss_clip": 0.01150159, + "auxiliary_loss_mlp": 0.0111301, + "balance_loss_clip": 1.00225949, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.7009584215368352, + "language_loss": 0.76765674, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79028845, + "num_input_tokens_seen": 184326795, + "step": 8574, + "time_per_iteration": 2.5815815925598145 + }, + { + "auxiliary_loss_clip": 0.01142005, + "auxiliary_loss_mlp": 0.01112446, + "balance_loss_clip": 1.00228858, + "balance_loss_mlp": 1.00067496, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 2.8007738881806734, + "language_loss": 0.85250109, + "learning_rate": 1.996300116136367e-06, + "loss": 0.8750456, + "num_input_tokens_seen": 184345990, + "step": 8575, + "time_per_iteration": 2.633673667907715 + }, + { + "auxiliary_loss_clip": 0.01152573, + "auxiliary_loss_mlp": 0.01114048, + "balance_loss_clip": 1.00220311, + "balance_loss_mlp": 1.00056052, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 2.74435179301039, + "language_loss": 0.76913416, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79180038, + "num_input_tokens_seen": 184366300, + "step": 8576, + "time_per_iteration": 2.594567060470581 + }, + { + "auxiliary_loss_clip": 0.01102409, + "auxiliary_loss_mlp": 0.00747614, + "balance_loss_clip": 1.00190651, + "balance_loss_mlp": 1.00054264, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 2.1831887843323856, + "language_loss": 0.75681573, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.775316, + "num_input_tokens_seen": 184383030, + "step": 8577, + "time_per_iteration": 2.6568117141723633 + }, + { + "auxiliary_loss_clip": 0.0113517, + "auxiliary_loss_mlp": 0.01114834, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.00067973, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 2.311137927876335, + "language_loss": 0.8104946, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.8329947, + "num_input_tokens_seen": 184403410, + "step": 8578, + "time_per_iteration": 2.6486122608184814 + }, + { + "auxiliary_loss_clip": 0.01167375, + "auxiliary_loss_mlp": 0.01112753, + "balance_loss_clip": 1.00228107, + "balance_loss_mlp": 1.00060105, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.6693341647989741, + "language_loss": 0.75817227, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78097355, + "num_input_tokens_seen": 184423830, + "step": 8579, + "time_per_iteration": 4.013010501861572 + }, + { + "auxiliary_loss_clip": 0.01119231, + "auxiliary_loss_mlp": 0.0111353, + "balance_loss_clip": 1.00200367, + "balance_loss_mlp": 1.00061488, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.7054255683697552, + "language_loss": 0.78991377, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81224138, + "num_input_tokens_seen": 184445050, + "step": 8580, + "time_per_iteration": 2.697829008102417 + }, + { + "auxiliary_loss_clip": 0.01120158, + "auxiliary_loss_mlp": 0.01115352, + "balance_loss_clip": 1.00213361, + "balance_loss_mlp": 1.00072074, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 5.448070766268005, + "language_loss": 0.73018646, + "learning_rate": 1.99396335310315e-06, + "loss": 0.75254166, + "num_input_tokens_seen": 184460775, + "step": 8581, + "time_per_iteration": 2.6196253299713135 + }, + { + "auxiliary_loss_clip": 0.0115054, + "auxiliary_loss_mlp": 0.011135, + "balance_loss_clip": 1.00213087, + "balance_loss_mlp": 1.00058484, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.4086566964283724, + "language_loss": 0.74379718, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76643765, + "num_input_tokens_seen": 184477365, + "step": 8582, + "time_per_iteration": 3.9789419174194336 + }, + { + "auxiliary_loss_clip": 0.01118841, + "auxiliary_loss_mlp": 0.01113111, + "balance_loss_clip": 1.00207269, + "balance_loss_mlp": 1.00057733, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 3.0340086734136196, + "language_loss": 0.6578325, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68015206, + "num_input_tokens_seen": 184497045, + "step": 8583, + "time_per_iteration": 4.026090145111084 + }, + { + "auxiliary_loss_clip": 0.01152638, + "auxiliary_loss_mlp": 0.01114434, + "balance_loss_clip": 1.00242889, + "balance_loss_mlp": 1.0006609, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 2.2488628068183676, + "language_loss": 0.76002151, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78269225, + "num_input_tokens_seen": 184517675, + "step": 8584, + "time_per_iteration": 2.6025917530059814 + }, + { + "auxiliary_loss_clip": 0.01119145, + "auxiliary_loss_mlp": 0.01114005, + "balance_loss_clip": 1.00208497, + "balance_loss_mlp": 1.00070858, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 2.0306499415165487, + "language_loss": 0.7870602, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.80939174, + "num_input_tokens_seen": 184537745, + "step": 8585, + "time_per_iteration": 2.6482534408569336 + }, + { + "auxiliary_loss_clip": 0.01156609, + "auxiliary_loss_mlp": 0.01112517, + "balance_loss_clip": 1.00229263, + "balance_loss_mlp": 1.00055599, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.421697588339855, + "language_loss": 0.80919206, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83188337, + "num_input_tokens_seen": 184553630, + "step": 8586, + "time_per_iteration": 4.077638626098633 + }, + { + "auxiliary_loss_clip": 0.01134219, + "auxiliary_loss_mlp": 0.01113246, + "balance_loss_clip": 1.00215697, + "balance_loss_mlp": 1.00071239, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 1.6410581037514398, + "language_loss": 0.71221781, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73469245, + "num_input_tokens_seen": 184573530, + "step": 8587, + "time_per_iteration": 2.6352319717407227 + }, + { + "auxiliary_loss_clip": 0.01147539, + "auxiliary_loss_mlp": 0.01090808, + "balance_loss_clip": 1.00147533, + "balance_loss_mlp": 1.000018, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7237588666339446, + "language_loss": 0.57804585, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.6004293, + "num_input_tokens_seen": 184637875, + "step": 8588, + "time_per_iteration": 3.1645500659942627 + }, + { + "auxiliary_loss_clip": 0.01141149, + "auxiliary_loss_mlp": 0.01114306, + "balance_loss_clip": 1.00220048, + "balance_loss_mlp": 1.00081825, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.7549332724087692, + "language_loss": 0.75244093, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77499545, + "num_input_tokens_seen": 184656125, + "step": 8589, + "time_per_iteration": 2.5758113861083984 + }, + { + "auxiliary_loss_clip": 0.01151932, + "auxiliary_loss_mlp": 0.01113103, + "balance_loss_clip": 1.00219703, + "balance_loss_mlp": 1.00056982, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 1.5389583179995143, + "language_loss": 0.67405808, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69670838, + "num_input_tokens_seen": 184675920, + "step": 8590, + "time_per_iteration": 2.5662708282470703 + }, + { + "auxiliary_loss_clip": 0.01146342, + "auxiliary_loss_mlp": 0.01090397, + "balance_loss_clip": 1.00162256, + "balance_loss_mlp": 0.99998873, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.7789262735193809, + "language_loss": 0.559075, + "learning_rate": 1.990068767935895e-06, + "loss": 0.58144236, + "num_input_tokens_seen": 184730520, + "step": 8591, + "time_per_iteration": 3.036975383758545 + }, + { + "auxiliary_loss_clip": 0.0113571, + "auxiliary_loss_mlp": 0.01111812, + "balance_loss_clip": 1.00225329, + "balance_loss_mlp": 1.00070906, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 2.2794153503126067, + "language_loss": 0.81245315, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83492833, + "num_input_tokens_seen": 184748340, + "step": 8592, + "time_per_iteration": 2.6105458736419678 + }, + { + "auxiliary_loss_clip": 0.01152428, + "auxiliary_loss_mlp": 0.01112257, + "balance_loss_clip": 1.00241542, + "balance_loss_mlp": 1.00039113, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 2.367230879741987, + "language_loss": 0.83304751, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85569441, + "num_input_tokens_seen": 184766615, + "step": 8593, + "time_per_iteration": 2.6056792736053467 + }, + { + "auxiliary_loss_clip": 0.01135395, + "auxiliary_loss_mlp": 0.0111299, + "balance_loss_clip": 1.00210619, + "balance_loss_mlp": 1.00074255, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.6535131695661391, + "language_loss": 0.6934123, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71589613, + "num_input_tokens_seen": 184788075, + "step": 8594, + "time_per_iteration": 2.6736698150634766 + }, + { + "auxiliary_loss_clip": 0.01119058, + "auxiliary_loss_mlp": 0.01112845, + "balance_loss_clip": 1.00216472, + "balance_loss_mlp": 1.00059772, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.772086570172735, + "language_loss": 0.77396542, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79628444, + "num_input_tokens_seen": 184808710, + "step": 8595, + "time_per_iteration": 2.6533429622650146 + }, + { + "auxiliary_loss_clip": 0.01167435, + "auxiliary_loss_mlp": 0.01113343, + "balance_loss_clip": 1.00240719, + "balance_loss_mlp": 1.00071406, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.6100005832023043, + "language_loss": 0.65404516, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67685294, + "num_input_tokens_seen": 184826475, + "step": 8596, + "time_per_iteration": 2.530742645263672 + }, + { + "auxiliary_loss_clip": 0.01125012, + "auxiliary_loss_mlp": 0.01113671, + "balance_loss_clip": 1.00229359, + "balance_loss_mlp": 1.00056505, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.6409048023021537, + "language_loss": 0.75558513, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77797192, + "num_input_tokens_seen": 184845245, + "step": 8597, + "time_per_iteration": 2.6704299449920654 + }, + { + "auxiliary_loss_clip": 0.01167286, + "auxiliary_loss_mlp": 0.01113691, + "balance_loss_clip": 1.00230253, + "balance_loss_mlp": 1.00048935, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 2.463615328323379, + "language_loss": 0.81051749, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83332723, + "num_input_tokens_seen": 184866605, + "step": 8598, + "time_per_iteration": 2.570344924926758 + }, + { + "auxiliary_loss_clip": 0.01103592, + "auxiliary_loss_mlp": 0.01113287, + "balance_loss_clip": 1.00211167, + "balance_loss_mlp": 1.00075316, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.6229116825073862, + "language_loss": 0.7549473, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77711606, + "num_input_tokens_seen": 184886945, + "step": 8599, + "time_per_iteration": 2.7244808673858643 + }, + { + "auxiliary_loss_clip": 0.01137061, + "auxiliary_loss_mlp": 0.01112406, + "balance_loss_clip": 1.00234771, + "balance_loss_mlp": 1.00063515, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 2.1942788525211507, + "language_loss": 0.72310591, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74560058, + "num_input_tokens_seen": 184905590, + "step": 8600, + "time_per_iteration": 2.6363131999969482 + }, + { + "auxiliary_loss_clip": 0.01120771, + "auxiliary_loss_mlp": 0.01112912, + "balance_loss_clip": 1.00216794, + "balance_loss_mlp": 1.00047421, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.643678789365292, + "language_loss": 0.74518013, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76751697, + "num_input_tokens_seen": 184925555, + "step": 8601, + "time_per_iteration": 2.7380285263061523 + }, + { + "auxiliary_loss_clip": 0.01150617, + "auxiliary_loss_mlp": 0.01114487, + "balance_loss_clip": 1.00230145, + "balance_loss_mlp": 1.00071335, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 2.1941632906762534, + "language_loss": 0.84455633, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.86720729, + "num_input_tokens_seen": 184944490, + "step": 8602, + "time_per_iteration": 2.6136105060577393 + }, + { + "auxiliary_loss_clip": 0.01167344, + "auxiliary_loss_mlp": 0.01114143, + "balance_loss_clip": 1.00234294, + "balance_loss_mlp": 1.00065553, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.7258801298228457, + "language_loss": 0.74489379, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76770866, + "num_input_tokens_seen": 184963190, + "step": 8603, + "time_per_iteration": 2.5758891105651855 + }, + { + "auxiliary_loss_clip": 0.01133903, + "auxiliary_loss_mlp": 0.01113149, + "balance_loss_clip": 1.00226021, + "balance_loss_mlp": 1.00071049, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.2157387149856755, + "language_loss": 0.72748709, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74995756, + "num_input_tokens_seen": 184981220, + "step": 8604, + "time_per_iteration": 2.5877463817596436 + }, + { + "auxiliary_loss_clip": 0.01133895, + "auxiliary_loss_mlp": 0.01114259, + "balance_loss_clip": 1.00213528, + "balance_loss_mlp": 1.00067663, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 2.923763163089263, + "language_loss": 0.85179484, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87427634, + "num_input_tokens_seen": 184998810, + "step": 8605, + "time_per_iteration": 2.580876111984253 + }, + { + "auxiliary_loss_clip": 0.01150876, + "auxiliary_loss_mlp": 0.01112784, + "balance_loss_clip": 1.00222659, + "balance_loss_mlp": 1.00044084, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.724149075922084, + "language_loss": 0.6460228, + "learning_rate": 1.984226965411294e-06, + "loss": 0.66865945, + "num_input_tokens_seen": 185021185, + "step": 8606, + "time_per_iteration": 2.6239805221557617 + }, + { + "auxiliary_loss_clip": 0.01134057, + "auxiliary_loss_mlp": 0.0111304, + "balance_loss_clip": 1.00207889, + "balance_loss_mlp": 1.00050688, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.620636854766002, + "language_loss": 0.77751732, + "learning_rate": 1.983837516143234e-06, + "loss": 0.79998839, + "num_input_tokens_seen": 185038465, + "step": 8607, + "time_per_iteration": 2.6136932373046875 + }, + { + "auxiliary_loss_clip": 0.01149994, + "auxiliary_loss_mlp": 0.01112383, + "balance_loss_clip": 1.00214171, + "balance_loss_mlp": 1.0006125, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 1.6452912053742745, + "language_loss": 0.71822208, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74084586, + "num_input_tokens_seen": 185057340, + "step": 8608, + "time_per_iteration": 2.6130385398864746 + }, + { + "auxiliary_loss_clip": 0.01152617, + "auxiliary_loss_mlp": 0.01113607, + "balance_loss_clip": 1.0023458, + "balance_loss_mlp": 1.00059628, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 2.4920535210354857, + "language_loss": 0.8702758, + "learning_rate": 1.983058619460531e-06, + "loss": 0.89293808, + "num_input_tokens_seen": 185074935, + "step": 8609, + "time_per_iteration": 2.573854684829712 + }, + { + "auxiliary_loss_clip": 0.01152437, + "auxiliary_loss_mlp": 0.01111901, + "balance_loss_clip": 1.00233984, + "balance_loss_mlp": 1.00060725, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.5902372744372375, + "language_loss": 0.73803967, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.76068306, + "num_input_tokens_seen": 185095050, + "step": 8610, + "time_per_iteration": 2.5788421630859375 + }, + { + "auxiliary_loss_clip": 0.01167363, + "auxiliary_loss_mlp": 0.0111353, + "balance_loss_clip": 1.00237215, + "balance_loss_mlp": 1.00051928, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 1.7637567649202353, + "language_loss": 0.67295367, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69576263, + "num_input_tokens_seen": 185112275, + "step": 8611, + "time_per_iteration": 2.4755611419677734 + }, + { + "auxiliary_loss_clip": 0.01167339, + "auxiliary_loss_mlp": 0.0111289, + "balance_loss_clip": 1.00235248, + "balance_loss_mlp": 1.00054753, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.0207832523138634, + "language_loss": 0.77184016, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79464239, + "num_input_tokens_seen": 185132165, + "step": 8612, + "time_per_iteration": 2.54142427444458 + }, + { + "auxiliary_loss_clip": 0.01151893, + "auxiliary_loss_mlp": 0.01112599, + "balance_loss_clip": 1.00220919, + "balance_loss_mlp": 1.00063813, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 2.505044161413442, + "language_loss": 0.8201884, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84283328, + "num_input_tokens_seen": 185151025, + "step": 8613, + "time_per_iteration": 2.5453133583068848 + }, + { + "auxiliary_loss_clip": 0.01167301, + "auxiliary_loss_mlp": 0.01115022, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00067663, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.365500082291151, + "language_loss": 0.66219461, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68501782, + "num_input_tokens_seen": 185168455, + "step": 8614, + "time_per_iteration": 2.482632875442505 + }, + { + "auxiliary_loss_clip": 0.01140198, + "auxiliary_loss_mlp": 0.01113941, + "balance_loss_clip": 1.00227046, + "balance_loss_mlp": 1.00045347, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 3.9910583867729446, + "language_loss": 0.86712945, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88967085, + "num_input_tokens_seen": 185184415, + "step": 8615, + "time_per_iteration": 2.5331780910491943 + }, + { + "auxiliary_loss_clip": 0.011506, + "auxiliary_loss_mlp": 0.01111702, + "balance_loss_clip": 1.00217795, + "balance_loss_mlp": 1.00069475, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.7424275270536538, + "language_loss": 0.8068735, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.8294965, + "num_input_tokens_seen": 185202910, + "step": 8616, + "time_per_iteration": 2.549091339111328 + }, + { + "auxiliary_loss_clip": 0.01150629, + "auxiliary_loss_mlp": 0.00747557, + "balance_loss_clip": 1.00242233, + "balance_loss_mlp": 1.00072801, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 2.2926158681656243, + "language_loss": 0.75233608, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77131796, + "num_input_tokens_seen": 185223085, + "step": 8617, + "time_per_iteration": 2.58293080329895 + }, + { + "auxiliary_loss_clip": 0.01167381, + "auxiliary_loss_mlp": 0.01113614, + "balance_loss_clip": 1.00231576, + "balance_loss_mlp": 1.00079465, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 2.401919360172279, + "language_loss": 0.70641327, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72922325, + "num_input_tokens_seen": 185241295, + "step": 8618, + "time_per_iteration": 3.8454153537750244 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01089994, + "balance_loss_clip": 1.00143075, + "balance_loss_mlp": 0.99996704, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9592131904224205, + "language_loss": 0.67286837, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69522476, + "num_input_tokens_seen": 185298295, + "step": 8619, + "time_per_iteration": 3.0792696475982666 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01112433, + "balance_loss_clip": 1.00209987, + "balance_loss_mlp": 1.00056696, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 2.307804849721817, + "language_loss": 0.79006135, + "learning_rate": 1.97877473680631e-06, + "loss": 0.812204, + "num_input_tokens_seen": 185317000, + "step": 8620, + "time_per_iteration": 4.205958127975464 + }, + { + "auxiliary_loss_clip": 0.01087552, + "auxiliary_loss_mlp": 0.00747437, + "balance_loss_clip": 1.00200164, + "balance_loss_mlp": 1.00062311, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 3.301208606834472, + "language_loss": 0.8193624, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.83771229, + "num_input_tokens_seen": 185331185, + "step": 8621, + "time_per_iteration": 4.064948081970215 + }, + { + "auxiliary_loss_clip": 0.01137534, + "auxiliary_loss_mlp": 0.01112162, + "balance_loss_clip": 1.00222278, + "balance_loss_mlp": 1.00077271, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 1.9673256959507566, + "language_loss": 0.66017044, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.68266737, + "num_input_tokens_seen": 185348955, + "step": 8622, + "time_per_iteration": 2.633195400238037 + }, + { + "auxiliary_loss_clip": 0.0113585, + "auxiliary_loss_mlp": 0.01113327, + "balance_loss_clip": 1.00223553, + "balance_loss_mlp": 1.00060272, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 2.080942996993828, + "language_loss": 0.60736299, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62985468, + "num_input_tokens_seen": 185367330, + "step": 8623, + "time_per_iteration": 2.5913398265838623 + }, + { + "auxiliary_loss_clip": 0.01167242, + "auxiliary_loss_mlp": 0.01112179, + "balance_loss_clip": 1.00232244, + "balance_loss_mlp": 1.00059915, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.7515022733094994, + "language_loss": 0.7622261, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78502035, + "num_input_tokens_seen": 185385060, + "step": 8624, + "time_per_iteration": 2.531141519546509 + }, + { + "auxiliary_loss_clip": 0.01120915, + "auxiliary_loss_mlp": 0.01112574, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00061226, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 2.1322339886545825, + "language_loss": 0.71175683, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73409176, + "num_input_tokens_seen": 185403745, + "step": 8625, + "time_per_iteration": 4.112926006317139 + }, + { + "auxiliary_loss_clip": 0.01137621, + "auxiliary_loss_mlp": 0.01112817, + "balance_loss_clip": 1.00225759, + "balance_loss_mlp": 1.00066519, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.7907256256527244, + "language_loss": 0.67255998, + "learning_rate": 1.976438113333184e-06, + "loss": 0.69506443, + "num_input_tokens_seen": 185422620, + "step": 8626, + "time_per_iteration": 2.6075291633605957 + }, + { + "auxiliary_loss_clip": 0.01152193, + "auxiliary_loss_mlp": 0.01112777, + "balance_loss_clip": 1.00220382, + "balance_loss_mlp": 1.00052989, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 1.9384578615043642, + "language_loss": 0.70543051, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72808021, + "num_input_tokens_seen": 185439380, + "step": 8627, + "time_per_iteration": 2.5526812076568604 + }, + { + "auxiliary_loss_clip": 0.01167522, + "auxiliary_loss_mlp": 0.00747665, + "balance_loss_clip": 1.00251055, + "balance_loss_mlp": 1.00060177, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 2.065248344615954, + "language_loss": 0.73314917, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75230104, + "num_input_tokens_seen": 185458830, + "step": 8628, + "time_per_iteration": 2.5505244731903076 + }, + { + "auxiliary_loss_clip": 0.0113404, + "auxiliary_loss_mlp": 0.01111746, + "balance_loss_clip": 1.00221348, + "balance_loss_mlp": 1.00054765, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 1.6311674696261722, + "language_loss": 0.77787489, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.80033273, + "num_input_tokens_seen": 185477270, + "step": 8629, + "time_per_iteration": 2.5968308448791504 + }, + { + "auxiliary_loss_clip": 0.01152498, + "auxiliary_loss_mlp": 0.01112879, + "balance_loss_clip": 1.00238919, + "balance_loss_mlp": 1.00044036, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 1.992058904434439, + "language_loss": 0.7506876, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.7733413, + "num_input_tokens_seen": 185495795, + "step": 8630, + "time_per_iteration": 2.5952541828155518 + }, + { + "auxiliary_loss_clip": 0.01149963, + "auxiliary_loss_mlp": 0.01113125, + "balance_loss_clip": 1.00206459, + "balance_loss_mlp": 1.0005914, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 1.975133049670963, + "language_loss": 0.80538094, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82801175, + "num_input_tokens_seen": 185514885, + "step": 8631, + "time_per_iteration": 2.567894220352173 + }, + { + "auxiliary_loss_clip": 0.01150747, + "auxiliary_loss_mlp": 0.01113233, + "balance_loss_clip": 1.00229001, + "balance_loss_mlp": 1.0005089, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.528102241706483, + "language_loss": 0.74362063, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76626045, + "num_input_tokens_seen": 185537155, + "step": 8632, + "time_per_iteration": 2.620588541030884 + }, + { + "auxiliary_loss_clip": 0.01119146, + "auxiliary_loss_mlp": 0.01112197, + "balance_loss_clip": 1.00206709, + "balance_loss_mlp": 1.00061703, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 2.4239184945367813, + "language_loss": 0.78780657, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.81011999, + "num_input_tokens_seen": 185555520, + "step": 8633, + "time_per_iteration": 2.6147468090057373 + }, + { + "auxiliary_loss_clip": 0.01151695, + "auxiliary_loss_mlp": 0.01112064, + "balance_loss_clip": 1.00211477, + "balance_loss_mlp": 1.00057912, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 1.6988593111669164, + "language_loss": 0.80373228, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82636982, + "num_input_tokens_seen": 185573855, + "step": 8634, + "time_per_iteration": 2.547213077545166 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01112368, + "balance_loss_clip": 1.00231576, + "balance_loss_mlp": 1.00078785, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 1.9284979636601072, + "language_loss": 0.68846506, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.71126062, + "num_input_tokens_seen": 185595145, + "step": 8635, + "time_per_iteration": 2.575976610183716 + }, + { + "auxiliary_loss_clip": 0.01135973, + "auxiliary_loss_mlp": 0.01113817, + "balance_loss_clip": 1.00234807, + "balance_loss_mlp": 1.00071144, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 2.655719447279282, + "language_loss": 0.77270913, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79520702, + "num_input_tokens_seen": 185613320, + "step": 8636, + "time_per_iteration": 2.573767900466919 + }, + { + "auxiliary_loss_clip": 0.01167298, + "auxiliary_loss_mlp": 0.01113455, + "balance_loss_clip": 1.00229526, + "balance_loss_mlp": 1.00053978, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 1.9553714327491931, + "language_loss": 0.71624506, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73905259, + "num_input_tokens_seen": 185630730, + "step": 8637, + "time_per_iteration": 2.4880661964416504 + }, + { + "auxiliary_loss_clip": 0.01117278, + "auxiliary_loss_mlp": 0.01112317, + "balance_loss_clip": 1.00218654, + "balance_loss_mlp": 1.00073695, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 2.1966257846293593, + "language_loss": 0.76380986, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78610581, + "num_input_tokens_seen": 185648515, + "step": 8638, + "time_per_iteration": 2.6232621669769287 + }, + { + "auxiliary_loss_clip": 0.01118965, + "auxiliary_loss_mlp": 0.01112826, + "balance_loss_clip": 1.00203454, + "balance_loss_mlp": 1.00067377, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 5.366273268969763, + "language_loss": 0.74891579, + "learning_rate": 1.971375543740272e-06, + "loss": 0.77123374, + "num_input_tokens_seen": 185665220, + "step": 8639, + "time_per_iteration": 2.6296558380126953 + }, + { + "auxiliary_loss_clip": 0.01167334, + "auxiliary_loss_mlp": 0.01113198, + "balance_loss_clip": 1.00237501, + "balance_loss_mlp": 1.00056899, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.6211859274281322, + "language_loss": 0.77687114, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79967648, + "num_input_tokens_seen": 185683750, + "step": 8640, + "time_per_iteration": 2.553478717803955 + }, + { + "auxiliary_loss_clip": 0.01118843, + "auxiliary_loss_mlp": 0.01112251, + "balance_loss_clip": 1.00214553, + "balance_loss_mlp": 1.00057602, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 4.80952057957692, + "language_loss": 0.65980816, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68211913, + "num_input_tokens_seen": 185700625, + "step": 8641, + "time_per_iteration": 2.6104276180267334 + }, + { + "auxiliary_loss_clip": 0.01167201, + "auxiliary_loss_mlp": 0.01112917, + "balance_loss_clip": 1.00232553, + "balance_loss_mlp": 1.00057364, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.670908550725741, + "language_loss": 0.76662076, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78942204, + "num_input_tokens_seen": 185721155, + "step": 8642, + "time_per_iteration": 2.6039626598358154 + }, + { + "auxiliary_loss_clip": 0.01167163, + "auxiliary_loss_mlp": 0.01112267, + "balance_loss_clip": 1.00234675, + "balance_loss_mlp": 1.00059223, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.6245722192207777, + "language_loss": 0.82655001, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.84934431, + "num_input_tokens_seen": 185740990, + "step": 8643, + "time_per_iteration": 2.5674872398376465 + }, + { + "auxiliary_loss_clip": 0.0116743, + "auxiliary_loss_mlp": 0.0111367, + "balance_loss_clip": 1.00232959, + "balance_loss_mlp": 1.00075459, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.78498769210314, + "language_loss": 0.6999768, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72278774, + "num_input_tokens_seen": 185762235, + "step": 8644, + "time_per_iteration": 2.5709073543548584 + }, + { + "auxiliary_loss_clip": 0.01150661, + "auxiliary_loss_mlp": 0.00747422, + "balance_loss_clip": 1.00214338, + "balance_loss_mlp": 1.00043082, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.4944292030454556, + "language_loss": 0.80059165, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.81957245, + "num_input_tokens_seen": 185783415, + "step": 8645, + "time_per_iteration": 2.6360435485839844 + }, + { + "auxiliary_loss_clip": 0.01167227, + "auxiliary_loss_mlp": 0.01112511, + "balance_loss_clip": 1.00231934, + "balance_loss_mlp": 1.00054932, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.799996681743111, + "language_loss": 0.78093684, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80373418, + "num_input_tokens_seen": 185801345, + "step": 8646, + "time_per_iteration": 2.5225045680999756 + }, + { + "auxiliary_loss_clip": 0.01151934, + "auxiliary_loss_mlp": 0.01113055, + "balance_loss_clip": 1.00233245, + "balance_loss_mlp": 1.00061715, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.6818157774993547, + "language_loss": 0.65951145, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68216133, + "num_input_tokens_seen": 185820815, + "step": 8647, + "time_per_iteration": 2.552119731903076 + }, + { + "auxiliary_loss_clip": 0.01167326, + "auxiliary_loss_mlp": 0.01114168, + "balance_loss_clip": 1.00226629, + "balance_loss_mlp": 1.00058591, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 1.7463969814437443, + "language_loss": 0.71187401, + "learning_rate": 1.967870793377763e-06, + "loss": 0.734689, + "num_input_tokens_seen": 185841450, + "step": 8648, + "time_per_iteration": 2.555316209793091 + }, + { + "auxiliary_loss_clip": 0.01134278, + "auxiliary_loss_mlp": 0.01112265, + "balance_loss_clip": 1.00225413, + "balance_loss_mlp": 1.00049436, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.7812669136527453, + "language_loss": 0.64270842, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66517389, + "num_input_tokens_seen": 185859935, + "step": 8649, + "time_per_iteration": 2.634103775024414 + }, + { + "auxiliary_loss_clip": 0.01135716, + "auxiliary_loss_mlp": 0.01114429, + "balance_loss_clip": 1.00214624, + "balance_loss_mlp": 1.00065541, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.7408498627487716, + "language_loss": 0.70520282, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72770429, + "num_input_tokens_seen": 185876795, + "step": 8650, + "time_per_iteration": 2.5537800788879395 + }, + { + "auxiliary_loss_clip": 0.01167179, + "auxiliary_loss_mlp": 0.01112298, + "balance_loss_clip": 1.00225937, + "balance_loss_mlp": 1.00052786, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.639656488366663, + "language_loss": 0.77849847, + "learning_rate": 1.966702564655496e-06, + "loss": 0.80129325, + "num_input_tokens_seen": 185895570, + "step": 8651, + "time_per_iteration": 2.485203981399536 + }, + { + "auxiliary_loss_clip": 0.01085091, + "auxiliary_loss_mlp": 0.01113596, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00077677, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 2.193999149247153, + "language_loss": 0.78141069, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80339754, + "num_input_tokens_seen": 185913700, + "step": 8652, + "time_per_iteration": 2.6714653968811035 + }, + { + "auxiliary_loss_clip": 0.011189, + "auxiliary_loss_mlp": 0.01113295, + "balance_loss_clip": 1.00206065, + "balance_loss_mlp": 1.0006659, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 1.7326430546036382, + "language_loss": 0.69824302, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72056502, + "num_input_tokens_seen": 185932460, + "step": 8653, + "time_per_iteration": 2.6892478466033936 + }, + { + "auxiliary_loss_clip": 0.01117107, + "auxiliary_loss_mlp": 0.01113489, + "balance_loss_clip": 1.00208998, + "balance_loss_mlp": 1.00066924, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.8718704094566885, + "language_loss": 0.78477418, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80708015, + "num_input_tokens_seen": 185952030, + "step": 8654, + "time_per_iteration": 2.6686105728149414 + }, + { + "auxiliary_loss_clip": 0.01152588, + "auxiliary_loss_mlp": 0.01114967, + "balance_loss_clip": 1.00238585, + "balance_loss_mlp": 1.00081205, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 1.845690207000418, + "language_loss": 0.84306926, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86574483, + "num_input_tokens_seen": 185973130, + "step": 8655, + "time_per_iteration": 2.6034529209136963 + }, + { + "auxiliary_loss_clip": 0.01151671, + "auxiliary_loss_mlp": 0.01113029, + "balance_loss_clip": 1.00238156, + "balance_loss_mlp": 1.00068653, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 4.720204373241825, + "language_loss": 0.65784818, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.68049514, + "num_input_tokens_seen": 185990200, + "step": 8656, + "time_per_iteration": 3.883413314819336 + }, + { + "auxiliary_loss_clip": 0.01120621, + "auxiliary_loss_mlp": 0.01112638, + "balance_loss_clip": 1.00217557, + "balance_loss_mlp": 1.00058162, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 1.948812896671703, + "language_loss": 0.73304975, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.7553823, + "num_input_tokens_seen": 186009880, + "step": 8657, + "time_per_iteration": 2.6910667419433594 + }, + { + "auxiliary_loss_clip": 0.01118425, + "auxiliary_loss_mlp": 0.01113488, + "balance_loss_clip": 1.00199771, + "balance_loss_mlp": 1.00057292, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.7124970674030033, + "language_loss": 0.71593606, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.7382552, + "num_input_tokens_seen": 186026680, + "step": 8658, + "time_per_iteration": 4.028859376907349 + }, + { + "auxiliary_loss_clip": 0.01167242, + "auxiliary_loss_mlp": 0.01113571, + "balance_loss_clip": 1.00229335, + "balance_loss_mlp": 1.00056064, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 2.1561405334155643, + "language_loss": 0.8346442, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85745233, + "num_input_tokens_seen": 186046920, + "step": 8659, + "time_per_iteration": 3.910071611404419 + }, + { + "auxiliary_loss_clip": 0.01137883, + "auxiliary_loss_mlp": 0.01115369, + "balance_loss_clip": 1.00227749, + "balance_loss_mlp": 1.00073719, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 2.7497015778340983, + "language_loss": 0.75550234, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77803487, + "num_input_tokens_seen": 186062090, + "step": 8660, + "time_per_iteration": 2.5723350048065186 + }, + { + "auxiliary_loss_clip": 0.01167206, + "auxiliary_loss_mlp": 0.01113052, + "balance_loss_clip": 1.00229478, + "balance_loss_mlp": 1.0007093, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.6259684289839937, + "language_loss": 0.77353603, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.79633868, + "num_input_tokens_seen": 186081135, + "step": 8661, + "time_per_iteration": 2.512099027633667 + }, + { + "auxiliary_loss_clip": 0.01135246, + "auxiliary_loss_mlp": 0.01113266, + "balance_loss_clip": 1.00209451, + "balance_loss_mlp": 1.00063729, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.8961107435867226, + "language_loss": 0.70390284, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.7263881, + "num_input_tokens_seen": 186099700, + "step": 8662, + "time_per_iteration": 4.053753852844238 + }, + { + "auxiliary_loss_clip": 0.01152374, + "auxiliary_loss_mlp": 0.011131, + "balance_loss_clip": 1.00241256, + "balance_loss_mlp": 1.00056684, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.7701238553983034, + "language_loss": 0.69329119, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71594596, + "num_input_tokens_seen": 186119740, + "step": 8663, + "time_per_iteration": 2.5892417430877686 + }, + { + "auxiliary_loss_clip": 0.01135959, + "auxiliary_loss_mlp": 0.00747527, + "balance_loss_clip": 1.00229514, + "balance_loss_mlp": 1.00053382, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 1.7609901907735308, + "language_loss": 0.76963049, + "learning_rate": 1.961640376626072e-06, + "loss": 0.78846538, + "num_input_tokens_seen": 186140645, + "step": 8664, + "time_per_iteration": 2.6271069049835205 + }, + { + "auxiliary_loss_clip": 0.01134193, + "auxiliary_loss_mlp": 0.01112209, + "balance_loss_clip": 1.00204206, + "balance_loss_mlp": 1.0007242, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 2.303931831471682, + "language_loss": 0.76186568, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78432971, + "num_input_tokens_seen": 186160130, + "step": 8665, + "time_per_iteration": 2.6091408729553223 + }, + { + "auxiliary_loss_clip": 0.01150845, + "auxiliary_loss_mlp": 0.01112262, + "balance_loss_clip": 1.00224376, + "balance_loss_mlp": 1.00068235, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.6074712313329718, + "language_loss": 0.71964008, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74227118, + "num_input_tokens_seen": 186179485, + "step": 8666, + "time_per_iteration": 2.5992822647094727 + }, + { + "auxiliary_loss_clip": 0.01134249, + "auxiliary_loss_mlp": 0.01114879, + "balance_loss_clip": 1.00219727, + "balance_loss_mlp": 1.00053382, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 3.4027074237642077, + "language_loss": 0.68172383, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70421505, + "num_input_tokens_seen": 186197140, + "step": 8667, + "time_per_iteration": 2.5712411403656006 + }, + { + "auxiliary_loss_clip": 0.01102597, + "auxiliary_loss_mlp": 0.01111834, + "balance_loss_clip": 1.00206447, + "balance_loss_mlp": 1.00054026, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.4846092534177935, + "language_loss": 0.81210989, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83425426, + "num_input_tokens_seen": 186216800, + "step": 8668, + "time_per_iteration": 2.7224857807159424 + }, + { + "auxiliary_loss_clip": 0.01137523, + "auxiliary_loss_mlp": 0.01113125, + "balance_loss_clip": 1.00225055, + "balance_loss_mlp": 1.00049603, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 1.994085960858139, + "language_loss": 0.63975418, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.66226065, + "num_input_tokens_seen": 186235320, + "step": 8669, + "time_per_iteration": 2.6521997451782227 + }, + { + "auxiliary_loss_clip": 0.01133447, + "auxiliary_loss_mlp": 0.00747451, + "balance_loss_clip": 1.002249, + "balance_loss_mlp": 1.00050998, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.4523160636726045, + "language_loss": 0.66505557, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68386459, + "num_input_tokens_seen": 186254460, + "step": 8670, + "time_per_iteration": 2.6260623931884766 + }, + { + "auxiliary_loss_clip": 0.01116247, + "auxiliary_loss_mlp": 0.01112011, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00062168, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 3.1421036525857837, + "language_loss": 0.76087832, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78316087, + "num_input_tokens_seen": 186269465, + "step": 8671, + "time_per_iteration": 2.677882194519043 + }, + { + "auxiliary_loss_clip": 0.01118355, + "auxiliary_loss_mlp": 0.01113211, + "balance_loss_clip": 1.00201571, + "balance_loss_mlp": 1.00067699, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 2.209610267264834, + "language_loss": 0.78180194, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80411768, + "num_input_tokens_seen": 186288660, + "step": 8672, + "time_per_iteration": 2.6552631855010986 + }, + { + "auxiliary_loss_clip": 0.01120089, + "auxiliary_loss_mlp": 0.01111511, + "balance_loss_clip": 1.00204742, + "balance_loss_mlp": 1.00069416, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.8030912387413167, + "language_loss": 0.72408783, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74640381, + "num_input_tokens_seen": 186305760, + "step": 8673, + "time_per_iteration": 2.617133617401123 + }, + { + "auxiliary_loss_clip": 0.01152183, + "auxiliary_loss_mlp": 0.01112748, + "balance_loss_clip": 1.00226021, + "balance_loss_mlp": 1.00069153, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.4935693006574544, + "language_loss": 0.74899697, + "learning_rate": 1.957746551415166e-06, + "loss": 0.77164632, + "num_input_tokens_seen": 186324135, + "step": 8674, + "time_per_iteration": 2.5450634956359863 + }, + { + "auxiliary_loss_clip": 0.01135584, + "auxiliary_loss_mlp": 0.01113417, + "balance_loss_clip": 1.00216174, + "balance_loss_mlp": 1.00069261, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 1.9255658078423112, + "language_loss": 0.85723901, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.87972903, + "num_input_tokens_seen": 186340205, + "step": 8675, + "time_per_iteration": 2.581888198852539 + }, + { + "auxiliary_loss_clip": 0.01130789, + "auxiliary_loss_mlp": 0.01089977, + "balance_loss_clip": 1.00154734, + "balance_loss_mlp": 0.99994993, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8619931973100446, + "language_loss": 0.632038, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65424562, + "num_input_tokens_seen": 186396940, + "step": 8676, + "time_per_iteration": 3.133042335510254 + }, + { + "auxiliary_loss_clip": 0.0114996, + "auxiliary_loss_mlp": 0.01111595, + "balance_loss_clip": 1.00215864, + "balance_loss_mlp": 1.00058722, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 2.0255292014637054, + "language_loss": 0.68873608, + "learning_rate": 1.956578434424046e-06, + "loss": 0.71135163, + "num_input_tokens_seen": 186418680, + "step": 8677, + "time_per_iteration": 2.6119909286499023 + }, + { + "auxiliary_loss_clip": 0.01152248, + "auxiliary_loss_mlp": 0.01111787, + "balance_loss_clip": 1.00224543, + "balance_loss_mlp": 1.00068355, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.974919107919632, + "language_loss": 0.6520803, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67472064, + "num_input_tokens_seen": 186438265, + "step": 8678, + "time_per_iteration": 2.6238532066345215 + }, + { + "auxiliary_loss_clip": 0.01137651, + "auxiliary_loss_mlp": 0.01113575, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.00066006, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 3.290709804212662, + "language_loss": 0.68629402, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.70880628, + "num_input_tokens_seen": 186456870, + "step": 8679, + "time_per_iteration": 2.6252641677856445 + }, + { + "auxiliary_loss_clip": 0.01167254, + "auxiliary_loss_mlp": 0.01112961, + "balance_loss_clip": 1.00234652, + "balance_loss_mlp": 1.00080895, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.6566875781509995, + "language_loss": 0.66916633, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69196844, + "num_input_tokens_seen": 186476425, + "step": 8680, + "time_per_iteration": 2.535799503326416 + }, + { + "auxiliary_loss_clip": 0.0116729, + "auxiliary_loss_mlp": 0.01113322, + "balance_loss_clip": 1.00231409, + "balance_loss_mlp": 1.00059807, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 2.6610392631252737, + "language_loss": 0.83322048, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85602659, + "num_input_tokens_seen": 186492555, + "step": 8681, + "time_per_iteration": 2.507009267807007 + }, + { + "auxiliary_loss_clip": 0.01133688, + "auxiliary_loss_mlp": 0.01112166, + "balance_loss_clip": 1.00209534, + "balance_loss_mlp": 1.00058651, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 3.043763256497941, + "language_loss": 0.77352858, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79598713, + "num_input_tokens_seen": 186513190, + "step": 8682, + "time_per_iteration": 2.657601833343506 + }, + { + "auxiliary_loss_clip": 0.011207, + "auxiliary_loss_mlp": 0.01112778, + "balance_loss_clip": 1.00218225, + "balance_loss_mlp": 1.00072122, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.4286625782190088, + "language_loss": 0.69157994, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71391463, + "num_input_tokens_seen": 186534830, + "step": 8683, + "time_per_iteration": 2.7546067237854004 + }, + { + "auxiliary_loss_clip": 0.01118944, + "auxiliary_loss_mlp": 0.01112741, + "balance_loss_clip": 1.00208676, + "balance_loss_mlp": 1.00087535, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.622003685634452, + "language_loss": 0.76144266, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78375953, + "num_input_tokens_seen": 186554390, + "step": 8684, + "time_per_iteration": 2.6618869304656982 + }, + { + "auxiliary_loss_clip": 0.011523, + "auxiliary_loss_mlp": 0.0074734, + "balance_loss_clip": 1.00223422, + "balance_loss_mlp": 1.00045574, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.7960803016922453, + "language_loss": 0.75708967, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.77608609, + "num_input_tokens_seen": 186572360, + "step": 8685, + "time_per_iteration": 2.5601043701171875 + }, + { + "auxiliary_loss_clip": 0.0113374, + "auxiliary_loss_mlp": 0.01113035, + "balance_loss_clip": 1.00220096, + "balance_loss_mlp": 1.00069249, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 2.267273348685132, + "language_loss": 0.80828881, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83075655, + "num_input_tokens_seen": 186590655, + "step": 8686, + "time_per_iteration": 2.600423812866211 + }, + { + "auxiliary_loss_clip": 0.01136563, + "auxiliary_loss_mlp": 0.01111493, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.00077081, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 2.152950880989981, + "language_loss": 0.69901788, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72149849, + "num_input_tokens_seen": 186610345, + "step": 8687, + "time_per_iteration": 2.69538950920105 + }, + { + "auxiliary_loss_clip": 0.01166931, + "auxiliary_loss_mlp": 0.01111089, + "balance_loss_clip": 1.00227594, + "balance_loss_mlp": 1.00055802, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 1.9399678225477697, + "language_loss": 0.82937604, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85215628, + "num_input_tokens_seen": 186624360, + "step": 8688, + "time_per_iteration": 2.5429277420043945 + }, + { + "auxiliary_loss_clip": 0.01150191, + "auxiliary_loss_mlp": 0.00747462, + "balance_loss_clip": 1.00216103, + "balance_loss_mlp": 1.00059056, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 5.710755873153659, + "language_loss": 0.73589516, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75487173, + "num_input_tokens_seen": 186638680, + "step": 8689, + "time_per_iteration": 2.5564613342285156 + }, + { + "auxiliary_loss_clip": 0.01136882, + "auxiliary_loss_mlp": 0.01110896, + "balance_loss_clip": 1.00220633, + "balance_loss_mlp": 1.00065148, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 2.2809689320903392, + "language_loss": 0.83053458, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85301232, + "num_input_tokens_seen": 186655840, + "step": 8690, + "time_per_iteration": 2.6057708263397217 + }, + { + "auxiliary_loss_clip": 0.01123685, + "auxiliary_loss_mlp": 0.01112099, + "balance_loss_clip": 1.00205767, + "balance_loss_mlp": 1.00061464, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 1.9094384114193652, + "language_loss": 0.79199219, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81435001, + "num_input_tokens_seen": 186674150, + "step": 8691, + "time_per_iteration": 2.6558377742767334 + }, + { + "auxiliary_loss_clip": 0.01152367, + "auxiliary_loss_mlp": 0.01113279, + "balance_loss_clip": 1.00226903, + "balance_loss_mlp": 1.00064993, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 2.204935883213542, + "language_loss": 0.76841074, + "learning_rate": 1.950738079725646e-06, + "loss": 0.79106718, + "num_input_tokens_seen": 186690675, + "step": 8692, + "time_per_iteration": 2.555767774581909 + }, + { + "auxiliary_loss_clip": 0.01150082, + "auxiliary_loss_mlp": 0.01111691, + "balance_loss_clip": 1.0022316, + "balance_loss_mlp": 1.00058782, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 2.8610425683703737, + "language_loss": 0.72940099, + "learning_rate": 1.950348737138691e-06, + "loss": 0.75201869, + "num_input_tokens_seen": 186710380, + "step": 8693, + "time_per_iteration": 3.9996209144592285 + }, + { + "auxiliary_loss_clip": 0.01167322, + "auxiliary_loss_mlp": 0.01113993, + "balance_loss_clip": 1.00227976, + "balance_loss_mlp": 1.00060093, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 3.304510751569124, + "language_loss": 0.82279098, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84560418, + "num_input_tokens_seen": 186729135, + "step": 8694, + "time_per_iteration": 2.53286075592041 + }, + { + "auxiliary_loss_clip": 0.0111267, + "auxiliary_loss_mlp": 0.01090415, + "balance_loss_clip": 1.00156415, + "balance_loss_mlp": 1.00000632, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.7953589107682794, + "language_loss": 0.5566712, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57870203, + "num_input_tokens_seen": 186791115, + "step": 8695, + "time_per_iteration": 3.230539321899414 + }, + { + "auxiliary_loss_clip": 0.01089642, + "auxiliary_loss_mlp": 0.01112469, + "balance_loss_clip": 1.00203466, + "balance_loss_mlp": 1.00060332, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.614886808757056, + "language_loss": 0.73576975, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75779092, + "num_input_tokens_seen": 186808660, + "step": 8696, + "time_per_iteration": 4.055562973022461 + }, + { + "auxiliary_loss_clip": 0.01135491, + "auxiliary_loss_mlp": 0.01113008, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00085592, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.789842221340616, + "language_loss": 0.7137208, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73620582, + "num_input_tokens_seen": 186825900, + "step": 8697, + "time_per_iteration": 3.947988510131836 + }, + { + "auxiliary_loss_clip": 0.01133805, + "auxiliary_loss_mlp": 0.01112039, + "balance_loss_clip": 1.00210643, + "balance_loss_mlp": 1.00055432, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 2.0592826225341043, + "language_loss": 0.80586284, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82832128, + "num_input_tokens_seen": 186843735, + "step": 8698, + "time_per_iteration": 2.592740774154663 + }, + { + "auxiliary_loss_clip": 0.01152203, + "auxiliary_loss_mlp": 0.01111525, + "balance_loss_clip": 1.00218546, + "balance_loss_mlp": 1.00070834, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.6442983912517797, + "language_loss": 0.74112463, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76376194, + "num_input_tokens_seen": 186862440, + "step": 8699, + "time_per_iteration": 2.5633106231689453 + }, + { + "auxiliary_loss_clip": 0.01152484, + "auxiliary_loss_mlp": 0.00747496, + "balance_loss_clip": 1.00221884, + "balance_loss_mlp": 1.00053024, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 4.393186086727676, + "language_loss": 0.73635614, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75535595, + "num_input_tokens_seen": 186880940, + "step": 8700, + "time_per_iteration": 4.061461687088013 + }, + { + "auxiliary_loss_clip": 0.01135946, + "auxiliary_loss_mlp": 0.01113643, + "balance_loss_clip": 1.00231361, + "balance_loss_mlp": 1.00072837, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.8399187122927272, + "language_loss": 0.66872263, + "learning_rate": 1.947234065463318e-06, + "loss": 0.6912185, + "num_input_tokens_seen": 186900785, + "step": 8701, + "time_per_iteration": 2.650222063064575 + }, + { + "auxiliary_loss_clip": 0.01136703, + "auxiliary_loss_mlp": 0.00747483, + "balance_loss_clip": 1.00221324, + "balance_loss_mlp": 1.00057316, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 1.7210137679495667, + "language_loss": 0.66790289, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68674469, + "num_input_tokens_seen": 186920895, + "step": 8702, + "time_per_iteration": 2.6577203273773193 + }, + { + "auxiliary_loss_clip": 0.01134908, + "auxiliary_loss_mlp": 0.01111936, + "balance_loss_clip": 1.00218773, + "balance_loss_mlp": 1.00054646, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.8481847132216889, + "language_loss": 0.76679337, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78926188, + "num_input_tokens_seen": 186940605, + "step": 8703, + "time_per_iteration": 2.594989538192749 + }, + { + "auxiliary_loss_clip": 0.01150555, + "auxiliary_loss_mlp": 0.01113841, + "balance_loss_clip": 1.00226617, + "balance_loss_mlp": 1.00073552, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.6390635032415384, + "language_loss": 0.76901019, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.79165417, + "num_input_tokens_seen": 186960820, + "step": 8704, + "time_per_iteration": 2.7041311264038086 + }, + { + "auxiliary_loss_clip": 0.01134928, + "auxiliary_loss_mlp": 0.01112825, + "balance_loss_clip": 1.00239277, + "balance_loss_mlp": 1.00076795, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.659088608135658, + "language_loss": 0.78054041, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80301791, + "num_input_tokens_seen": 186976240, + "step": 8705, + "time_per_iteration": 2.5831735134124756 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01114166, + "balance_loss_clip": 1.00226057, + "balance_loss_mlp": 1.00058293, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 2.6247962724675786, + "language_loss": 0.69675744, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.71925843, + "num_input_tokens_seen": 186992855, + "step": 8706, + "time_per_iteration": 2.59596586227417 + }, + { + "auxiliary_loss_clip": 0.01146307, + "auxiliary_loss_mlp": 0.0109008, + "balance_loss_clip": 1.00175154, + "balance_loss_mlp": 1.00005269, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.9411917665477477, + "language_loss": 0.52488559, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54724938, + "num_input_tokens_seen": 187051205, + "step": 8707, + "time_per_iteration": 3.230659246444702 + }, + { + "auxiliary_loss_clip": 0.01135549, + "auxiliary_loss_mlp": 0.01112253, + "balance_loss_clip": 1.00219798, + "balance_loss_mlp": 1.00086403, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 2.0468796208303623, + "language_loss": 0.74648255, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.7689606, + "num_input_tokens_seen": 187070540, + "step": 8708, + "time_per_iteration": 2.7436487674713135 + }, + { + "auxiliary_loss_clip": 0.0113423, + "auxiliary_loss_mlp": 0.01111582, + "balance_loss_clip": 1.00224757, + "balance_loss_mlp": 1.00047934, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.5214134247321, + "language_loss": 0.77078599, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79324412, + "num_input_tokens_seen": 187089975, + "step": 8709, + "time_per_iteration": 2.6336312294006348 + }, + { + "auxiliary_loss_clip": 0.01103872, + "auxiliary_loss_mlp": 0.01114032, + "balance_loss_clip": 1.00212431, + "balance_loss_mlp": 1.00073564, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 1.9771772803454628, + "language_loss": 0.83375937, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85593837, + "num_input_tokens_seen": 187108775, + "step": 8710, + "time_per_iteration": 2.7242212295532227 + }, + { + "auxiliary_loss_clip": 0.01135361, + "auxiliary_loss_mlp": 0.01112592, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00063086, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 2.2370727233184553, + "language_loss": 0.68873763, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71121716, + "num_input_tokens_seen": 187128830, + "step": 8711, + "time_per_iteration": 2.620084524154663 + }, + { + "auxiliary_loss_clip": 0.01150811, + "auxiliary_loss_mlp": 0.01111952, + "balance_loss_clip": 1.00227475, + "balance_loss_mlp": 1.00056267, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.6795957683207048, + "language_loss": 0.83089602, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85352361, + "num_input_tokens_seen": 187149570, + "step": 8712, + "time_per_iteration": 2.5946710109710693 + }, + { + "auxiliary_loss_clip": 0.01167277, + "auxiliary_loss_mlp": 0.01112716, + "balance_loss_clip": 1.00240254, + "balance_loss_mlp": 1.00075483, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.6777291224454862, + "language_loss": 0.69905627, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.72185624, + "num_input_tokens_seen": 187170575, + "step": 8713, + "time_per_iteration": 2.539370536804199 + }, + { + "auxiliary_loss_clip": 0.01118736, + "auxiliary_loss_mlp": 0.011136, + "balance_loss_clip": 1.00206566, + "balance_loss_mlp": 1.00058937, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.2355763786243448, + "language_loss": 0.76911974, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79144311, + "num_input_tokens_seen": 187187190, + "step": 8714, + "time_per_iteration": 2.6243536472320557 + }, + { + "auxiliary_loss_clip": 0.01119805, + "auxiliary_loss_mlp": 0.01113544, + "balance_loss_clip": 1.00216115, + "balance_loss_mlp": 1.00043845, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 2.2340722918638742, + "language_loss": 0.76100922, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78334272, + "num_input_tokens_seen": 187204350, + "step": 8715, + "time_per_iteration": 2.6144216060638428 + }, + { + "auxiliary_loss_clip": 0.01135559, + "auxiliary_loss_mlp": 0.01112758, + "balance_loss_clip": 1.0022378, + "balance_loss_mlp": 1.00070095, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.4417356272161623, + "language_loss": 0.70977688, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73226011, + "num_input_tokens_seen": 187225605, + "step": 8716, + "time_per_iteration": 2.690483570098877 + }, + { + "auxiliary_loss_clip": 0.01167158, + "auxiliary_loss_mlp": 0.01112216, + "balance_loss_clip": 1.00231171, + "balance_loss_mlp": 1.00073111, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 2.5894249481377964, + "language_loss": 0.87055403, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89334774, + "num_input_tokens_seen": 187241335, + "step": 8717, + "time_per_iteration": 2.523688793182373 + }, + { + "auxiliary_loss_clip": 0.01150925, + "auxiliary_loss_mlp": 0.01112325, + "balance_loss_clip": 1.00227642, + "balance_loss_mlp": 1.00064969, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 2.0416626135633438, + "language_loss": 0.61002892, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63266146, + "num_input_tokens_seen": 187259925, + "step": 8718, + "time_per_iteration": 2.6457324028015137 + }, + { + "auxiliary_loss_clip": 0.01136169, + "auxiliary_loss_mlp": 0.01112299, + "balance_loss_clip": 1.00231087, + "balance_loss_mlp": 1.00071955, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 2.0590139968838086, + "language_loss": 0.71915817, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74164283, + "num_input_tokens_seen": 187279035, + "step": 8719, + "time_per_iteration": 2.6502177715301514 + }, + { + "auxiliary_loss_clip": 0.01150347, + "auxiliary_loss_mlp": 0.01111391, + "balance_loss_clip": 1.00223875, + "balance_loss_mlp": 1.00057447, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 2.8030335397734945, + "language_loss": 0.73254925, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.75516665, + "num_input_tokens_seen": 187297555, + "step": 8720, + "time_per_iteration": 2.6858365535736084 + }, + { + "auxiliary_loss_clip": 0.01152205, + "auxiliary_loss_mlp": 0.01112583, + "balance_loss_clip": 1.00221288, + "balance_loss_mlp": 1.00071657, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 2.084474717785337, + "language_loss": 0.70083165, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72347951, + "num_input_tokens_seen": 187320265, + "step": 8721, + "time_per_iteration": 2.648167610168457 + }, + { + "auxiliary_loss_clip": 0.01104012, + "auxiliary_loss_mlp": 0.01111881, + "balance_loss_clip": 1.00215101, + "balance_loss_mlp": 1.00068247, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.8496045489059036, + "language_loss": 0.86597985, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88813877, + "num_input_tokens_seen": 187338045, + "step": 8722, + "time_per_iteration": 2.7247414588928223 + }, + { + "auxiliary_loss_clip": 0.01167085, + "auxiliary_loss_mlp": 0.01112503, + "balance_loss_clip": 1.00225949, + "balance_loss_mlp": 1.00073218, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 3.0154363687707577, + "language_loss": 0.79845786, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82125378, + "num_input_tokens_seen": 187356040, + "step": 8723, + "time_per_iteration": 2.5093207359313965 + }, + { + "auxiliary_loss_clip": 0.01151009, + "auxiliary_loss_mlp": 0.01113995, + "balance_loss_clip": 1.00242186, + "balance_loss_mlp": 1.00079429, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 1.7741978675214591, + "language_loss": 0.74747026, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77012026, + "num_input_tokens_seen": 187374185, + "step": 8724, + "time_per_iteration": 2.5864315032958984 + }, + { + "auxiliary_loss_clip": 0.01167314, + "auxiliary_loss_mlp": 0.01113246, + "balance_loss_clip": 1.00227809, + "balance_loss_mlp": 1.0005219, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 2.663890855412376, + "language_loss": 0.70389199, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72669762, + "num_input_tokens_seen": 187396640, + "step": 8725, + "time_per_iteration": 2.6082043647766113 + }, + { + "auxiliary_loss_clip": 0.01114199, + "auxiliary_loss_mlp": 0.01090372, + "balance_loss_clip": 1.00172925, + "balance_loss_mlp": 0.99996388, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7533995721252792, + "language_loss": 0.55629754, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57834327, + "num_input_tokens_seen": 187455945, + "step": 8726, + "time_per_iteration": 3.194955825805664 + }, + { + "auxiliary_loss_clip": 0.01129682, + "auxiliary_loss_mlp": 0.01090798, + "balance_loss_clip": 1.00167334, + "balance_loss_mlp": 1.00000799, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.7948114436490228, + "language_loss": 0.58419538, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60640019, + "num_input_tokens_seen": 187519975, + "step": 8727, + "time_per_iteration": 3.1296987533569336 + }, + { + "auxiliary_loss_clip": 0.01151824, + "auxiliary_loss_mlp": 0.01113253, + "balance_loss_clip": 1.00218284, + "balance_loss_mlp": 1.00071931, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3928567432323182, + "language_loss": 0.70695966, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72961044, + "num_input_tokens_seen": 187541775, + "step": 8728, + "time_per_iteration": 2.632678270339966 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01112363, + "balance_loss_clip": 1.00213957, + "balance_loss_mlp": 1.00049686, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.3900756314861074, + "language_loss": 0.69338584, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71601343, + "num_input_tokens_seen": 187560425, + "step": 8729, + "time_per_iteration": 2.5705931186676025 + }, + { + "auxiliary_loss_clip": 0.01119298, + "auxiliary_loss_mlp": 0.01112579, + "balance_loss_clip": 1.0020802, + "balance_loss_mlp": 1.00052261, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.9604037131642518, + "language_loss": 0.83770859, + "learning_rate": 1.935944509558464e-06, + "loss": 0.86002731, + "num_input_tokens_seen": 187579930, + "step": 8730, + "time_per_iteration": 2.6508872509002686 + }, + { + "auxiliary_loss_clip": 0.0112025, + "auxiliary_loss_mlp": 0.0111234, + "balance_loss_clip": 1.00224435, + "balance_loss_mlp": 1.00056899, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 10.793793417476936, + "language_loss": 0.79608834, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81841421, + "num_input_tokens_seen": 187595365, + "step": 8731, + "time_per_iteration": 4.014423370361328 + }, + { + "auxiliary_loss_clip": 0.01150327, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_clip": 1.00223327, + "balance_loss_mlp": 1.0006392, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.9660089668199747, + "language_loss": 0.83033884, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85295475, + "num_input_tokens_seen": 187614715, + "step": 8732, + "time_per_iteration": 2.6162946224212646 + }, + { + "auxiliary_loss_clip": 0.01149693, + "auxiliary_loss_mlp": 0.01111522, + "balance_loss_clip": 1.00209296, + "balance_loss_mlp": 1.00060928, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 2.844883740006956, + "language_loss": 0.77358079, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.796193, + "num_input_tokens_seen": 187630745, + "step": 8733, + "time_per_iteration": 2.516381025314331 + }, + { + "auxiliary_loss_clip": 0.01167374, + "auxiliary_loss_mlp": 0.01113206, + "balance_loss_clip": 1.00242615, + "balance_loss_mlp": 1.00048137, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 2.391861637052414, + "language_loss": 0.81345087, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83625662, + "num_input_tokens_seen": 187648200, + "step": 8734, + "time_per_iteration": 3.8697562217712402 + }, + { + "auxiliary_loss_clip": 0.01133881, + "auxiliary_loss_mlp": 0.01112056, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00057125, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.4689139809814695, + "language_loss": 0.76672542, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78918481, + "num_input_tokens_seen": 187669205, + "step": 8735, + "time_per_iteration": 3.9985687732696533 + }, + { + "auxiliary_loss_clip": 0.01150419, + "auxiliary_loss_mlp": 0.0111241, + "balance_loss_clip": 1.00229216, + "balance_loss_mlp": 1.00073528, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 2.272463601902733, + "language_loss": 0.80398822, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82661653, + "num_input_tokens_seen": 187690890, + "step": 8736, + "time_per_iteration": 2.5986783504486084 + }, + { + "auxiliary_loss_clip": 0.01167227, + "auxiliary_loss_mlp": 0.01113116, + "balance_loss_clip": 1.00241375, + "balance_loss_mlp": 1.0006783, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 7.086773154651062, + "language_loss": 0.70209825, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.72490168, + "num_input_tokens_seen": 187713045, + "step": 8737, + "time_per_iteration": 2.5810558795928955 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.01112659, + "balance_loss_clip": 1.00212705, + "balance_loss_mlp": 1.00069749, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 2.0903564759419178, + "language_loss": 0.77345687, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79592365, + "num_input_tokens_seen": 187733640, + "step": 8738, + "time_per_iteration": 4.082360744476318 + }, + { + "auxiliary_loss_clip": 0.01135011, + "auxiliary_loss_mlp": 0.0074608, + "balance_loss_clip": 1.00177324, + "balance_loss_mlp": 1.00011659, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7420537241249744, + "language_loss": 0.54445291, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56326377, + "num_input_tokens_seen": 187792930, + "step": 8739, + "time_per_iteration": 3.1500113010406494 + }, + { + "auxiliary_loss_clip": 0.01136738, + "auxiliary_loss_mlp": 0.01112302, + "balance_loss_clip": 1.00223494, + "balance_loss_mlp": 1.00081706, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 1.845950369412853, + "language_loss": 0.84660757, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86909795, + "num_input_tokens_seen": 187812495, + "step": 8740, + "time_per_iteration": 2.720689058303833 + }, + { + "auxiliary_loss_clip": 0.01152322, + "auxiliary_loss_mlp": 0.00747466, + "balance_loss_clip": 1.00233269, + "balance_loss_mlp": 1.00058424, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 1.8639135575496473, + "language_loss": 0.69810635, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.7171042, + "num_input_tokens_seen": 187829685, + "step": 8741, + "time_per_iteration": 2.6117637157440186 + }, + { + "auxiliary_loss_clip": 0.01135571, + "auxiliary_loss_mlp": 0.01113208, + "balance_loss_clip": 1.00225234, + "balance_loss_mlp": 1.00057948, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 3.1245901956125373, + "language_loss": 0.66219056, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68467832, + "num_input_tokens_seen": 187846495, + "step": 8742, + "time_per_iteration": 2.5918023586273193 + }, + { + "auxiliary_loss_clip": 0.01118821, + "auxiliary_loss_mlp": 0.01114243, + "balance_loss_clip": 1.00215411, + "balance_loss_mlp": 1.00066018, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 4.505611259806348, + "language_loss": 0.63108242, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65341312, + "num_input_tokens_seen": 187862010, + "step": 8743, + "time_per_iteration": 2.5988473892211914 + }, + { + "auxiliary_loss_clip": 0.01147523, + "auxiliary_loss_mlp": 0.01090431, + "balance_loss_clip": 1.0017767, + "balance_loss_mlp": 1.00002277, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7721307135406723, + "language_loss": 0.54145956, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56383908, + "num_input_tokens_seen": 187922730, + "step": 8744, + "time_per_iteration": 3.228900671005249 + }, + { + "auxiliary_loss_clip": 0.01135677, + "auxiliary_loss_mlp": 0.0111371, + "balance_loss_clip": 1.0021795, + "balance_loss_mlp": 1.00060463, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.5424863833924034, + "language_loss": 0.76207769, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.78457165, + "num_input_tokens_seen": 187940160, + "step": 8745, + "time_per_iteration": 2.5977978706359863 + }, + { + "auxiliary_loss_clip": 0.01152372, + "auxiliary_loss_mlp": 0.01112168, + "balance_loss_clip": 1.0024358, + "balance_loss_mlp": 1.0005883, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 2.2140732383853914, + "language_loss": 0.80901515, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83166057, + "num_input_tokens_seen": 187958625, + "step": 8746, + "time_per_iteration": 2.5547049045562744 + }, + { + "auxiliary_loss_clip": 0.0115169, + "auxiliary_loss_mlp": 0.01112161, + "balance_loss_clip": 1.00228608, + "balance_loss_mlp": 1.00077224, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 2.8348269586588675, + "language_loss": 0.74831665, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77095515, + "num_input_tokens_seen": 187977575, + "step": 8747, + "time_per_iteration": 2.5686919689178467 + }, + { + "auxiliary_loss_clip": 0.01087331, + "auxiliary_loss_mlp": 0.01111252, + "balance_loss_clip": 1.00212097, + "balance_loss_mlp": 1.00053, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 1.9687442237670834, + "language_loss": 0.83206284, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.85404867, + "num_input_tokens_seen": 187996650, + "step": 8748, + "time_per_iteration": 2.7195630073547363 + }, + { + "auxiliary_loss_clip": 0.01137319, + "auxiliary_loss_mlp": 0.01113053, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.0006144, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 2.0242026638015735, + "language_loss": 0.80199492, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82449865, + "num_input_tokens_seen": 188013510, + "step": 8749, + "time_per_iteration": 2.6005923748016357 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01113198, + "balance_loss_clip": 1.00233185, + "balance_loss_mlp": 1.00066471, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 2.14943650902571, + "language_loss": 0.72201437, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.7446503, + "num_input_tokens_seen": 188032085, + "step": 8750, + "time_per_iteration": 2.595050096511841 + }, + { + "auxiliary_loss_clip": 0.01136764, + "auxiliary_loss_mlp": 0.01113309, + "balance_loss_clip": 1.00216603, + "balance_loss_mlp": 1.00067973, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 2.2051523625043528, + "language_loss": 0.76458406, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78708482, + "num_input_tokens_seen": 188050590, + "step": 8751, + "time_per_iteration": 2.5901639461517334 + }, + { + "auxiliary_loss_clip": 0.01167098, + "auxiliary_loss_mlp": 0.0111219, + "balance_loss_clip": 1.00232625, + "balance_loss_mlp": 1.00060987, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.4625858695009768, + "language_loss": 0.76082385, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78361672, + "num_input_tokens_seen": 188071620, + "step": 8752, + "time_per_iteration": 2.5607104301452637 + }, + { + "auxiliary_loss_clip": 0.01150597, + "auxiliary_loss_mlp": 0.01113472, + "balance_loss_clip": 1.00222242, + "balance_loss_mlp": 1.00065219, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.7084846246888599, + "language_loss": 0.67962295, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70226365, + "num_input_tokens_seen": 188091740, + "step": 8753, + "time_per_iteration": 2.620304584503174 + }, + { + "auxiliary_loss_clip": 0.01151759, + "auxiliary_loss_mlp": 0.01113009, + "balance_loss_clip": 1.00228286, + "balance_loss_mlp": 1.00066614, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.5421766513962814, + "language_loss": 0.83842945, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.86107719, + "num_input_tokens_seen": 188111165, + "step": 8754, + "time_per_iteration": 2.579664468765259 + }, + { + "auxiliary_loss_clip": 0.01150335, + "auxiliary_loss_mlp": 0.01113432, + "balance_loss_clip": 1.00218725, + "balance_loss_mlp": 1.00070763, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.379677522195315, + "language_loss": 0.8720696, + "learning_rate": 1.926213760058522e-06, + "loss": 0.89470732, + "num_input_tokens_seen": 188127825, + "step": 8755, + "time_per_iteration": 2.508582353591919 + }, + { + "auxiliary_loss_clip": 0.01112732, + "auxiliary_loss_mlp": 0.01090117, + "balance_loss_clip": 1.0016253, + "balance_loss_mlp": 1.00009012, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7206369831926013, + "language_loss": 0.58873141, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.61075991, + "num_input_tokens_seen": 188194050, + "step": 8756, + "time_per_iteration": 3.2593114376068115 + }, + { + "auxiliary_loss_clip": 0.01120625, + "auxiliary_loss_mlp": 0.0111315, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00061607, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 1.872746585698782, + "language_loss": 0.70404673, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72638446, + "num_input_tokens_seen": 188212565, + "step": 8757, + "time_per_iteration": 2.63816499710083 + }, + { + "auxiliary_loss_clip": 0.01150479, + "auxiliary_loss_mlp": 0.01112942, + "balance_loss_clip": 1.00231636, + "balance_loss_mlp": 1.00059915, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.5804219624381524, + "language_loss": 0.87619478, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.89882898, + "num_input_tokens_seen": 188229505, + "step": 8758, + "time_per_iteration": 2.527167797088623 + }, + { + "auxiliary_loss_clip": 0.01086931, + "auxiliary_loss_mlp": 0.01113547, + "balance_loss_clip": 1.00199771, + "balance_loss_mlp": 1.00082266, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 3.530048829066253, + "language_loss": 0.76108712, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78309184, + "num_input_tokens_seen": 188250395, + "step": 8759, + "time_per_iteration": 2.72178316116333 + }, + { + "auxiliary_loss_clip": 0.01135534, + "auxiliary_loss_mlp": 0.01111599, + "balance_loss_clip": 1.00215244, + "balance_loss_mlp": 1.00049615, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 2.161358932361247, + "language_loss": 0.71403402, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.73650533, + "num_input_tokens_seen": 188266785, + "step": 8760, + "time_per_iteration": 2.5746030807495117 + }, + { + "auxiliary_loss_clip": 0.01134217, + "auxiliary_loss_mlp": 0.01113565, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00064981, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 5.361098539225, + "language_loss": 0.75746417, + "learning_rate": 1.923878631697736e-06, + "loss": 0.77994204, + "num_input_tokens_seen": 188282525, + "step": 8761, + "time_per_iteration": 2.605762481689453 + }, + { + "auxiliary_loss_clip": 0.01152116, + "auxiliary_loss_mlp": 0.00747336, + "balance_loss_clip": 1.002213, + "balance_loss_mlp": 1.00052285, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 2.1961220361283647, + "language_loss": 0.70706594, + "learning_rate": 1.923489453654373e-06, + "loss": 0.72606039, + "num_input_tokens_seen": 188301395, + "step": 8762, + "time_per_iteration": 2.5593016147613525 + }, + { + "auxiliary_loss_clip": 0.01130728, + "auxiliary_loss_mlp": 0.01089948, + "balance_loss_clip": 1.00162029, + "balance_loss_mlp": 0.99992162, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9145274517640642, + "language_loss": 0.65451866, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67672545, + "num_input_tokens_seen": 188357665, + "step": 8763, + "time_per_iteration": 3.0517735481262207 + }, + { + "auxiliary_loss_clip": 0.01150466, + "auxiliary_loss_mlp": 0.01113484, + "balance_loss_clip": 1.00222111, + "balance_loss_mlp": 1.00066411, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 3.055323576267083, + "language_loss": 0.71196449, + "learning_rate": 1.922711106286265e-06, + "loss": 0.734604, + "num_input_tokens_seen": 188376935, + "step": 8764, + "time_per_iteration": 2.5952134132385254 + }, + { + "auxiliary_loss_clip": 0.01122753, + "auxiliary_loss_mlp": 0.01113454, + "balance_loss_clip": 1.00229645, + "balance_loss_mlp": 1.00053859, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.6227115744273568, + "language_loss": 0.74276638, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76512849, + "num_input_tokens_seen": 188394995, + "step": 8765, + "time_per_iteration": 2.7010557651519775 + }, + { + "auxiliary_loss_clip": 0.01135574, + "auxiliary_loss_mlp": 0.01113115, + "balance_loss_clip": 1.00212061, + "balance_loss_mlp": 1.00048625, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.4807206282257954, + "language_loss": 0.85345852, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87594545, + "num_input_tokens_seen": 188415475, + "step": 8766, + "time_per_iteration": 2.677938938140869 + }, + { + "auxiliary_loss_clip": 0.01167313, + "auxiliary_loss_mlp": 0.01113314, + "balance_loss_clip": 1.00239849, + "balance_loss_mlp": 1.00068498, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 2.036357144243833, + "language_loss": 0.79205847, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81486475, + "num_input_tokens_seen": 188435665, + "step": 8767, + "time_per_iteration": 2.5775678157806396 + }, + { + "auxiliary_loss_clip": 0.01151578, + "auxiliary_loss_mlp": 0.01112752, + "balance_loss_clip": 1.00220382, + "balance_loss_mlp": 1.00050449, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.7626131936211245, + "language_loss": 0.73636842, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75901169, + "num_input_tokens_seen": 188455405, + "step": 8768, + "time_per_iteration": 2.5907087326049805 + }, + { + "auxiliary_loss_clip": 0.01136586, + "auxiliary_loss_mlp": 0.01112772, + "balance_loss_clip": 1.00214577, + "balance_loss_mlp": 1.0008111, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 1.9464603890760046, + "language_loss": 0.73995453, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76244807, + "num_input_tokens_seen": 188472940, + "step": 8769, + "time_per_iteration": 4.073123931884766 + }, + { + "auxiliary_loss_clip": 0.01118395, + "auxiliary_loss_mlp": 0.01112589, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00062752, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.8187559538308269, + "language_loss": 0.73794907, + "learning_rate": 1.920376134993436e-06, + "loss": 0.76025891, + "num_input_tokens_seen": 188493035, + "step": 8770, + "time_per_iteration": 2.6369194984436035 + }, + { + "auxiliary_loss_clip": 0.01167076, + "auxiliary_loss_mlp": 0.0111274, + "balance_loss_clip": 1.00231862, + "balance_loss_mlp": 1.00049293, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.8535279068063206, + "language_loss": 0.68177909, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70457721, + "num_input_tokens_seen": 188513860, + "step": 8771, + "time_per_iteration": 2.5652170181274414 + }, + { + "auxiliary_loss_clip": 0.01150274, + "auxiliary_loss_mlp": 0.0111322, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.00068617, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.67678294239427, + "language_loss": 0.76600194, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.7886368, + "num_input_tokens_seen": 188533345, + "step": 8772, + "time_per_iteration": 5.415428876876831 + }, + { + "auxiliary_loss_clip": 0.01152606, + "auxiliary_loss_mlp": 0.01112895, + "balance_loss_clip": 1.00238895, + "balance_loss_mlp": 1.00093377, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.8102831173031335, + "language_loss": 0.65693831, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.67959332, + "num_input_tokens_seen": 188551550, + "step": 8773, + "time_per_iteration": 2.572829246520996 + }, + { + "auxiliary_loss_clip": 0.01118932, + "auxiliary_loss_mlp": 0.01112508, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00073743, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 2.084547565497379, + "language_loss": 0.86048925, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88280368, + "num_input_tokens_seen": 188571615, + "step": 8774, + "time_per_iteration": 2.7033603191375732 + }, + { + "auxiliary_loss_clip": 0.01135943, + "auxiliary_loss_mlp": 0.01112922, + "balance_loss_clip": 1.00227487, + "balance_loss_mlp": 1.00057948, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 2.262670928136856, + "language_loss": 0.79775953, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82024825, + "num_input_tokens_seen": 188591965, + "step": 8775, + "time_per_iteration": 2.6057615280151367 + }, + { + "auxiliary_loss_clip": 0.01137066, + "auxiliary_loss_mlp": 0.01112298, + "balance_loss_clip": 1.0022856, + "balance_loss_mlp": 1.00071812, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.6515572110142271, + "language_loss": 0.83646691, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85896063, + "num_input_tokens_seen": 188610675, + "step": 8776, + "time_per_iteration": 4.036803483963013 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.01112687, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00072527, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.5678738413119648, + "language_loss": 0.67298353, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.69544816, + "num_input_tokens_seen": 188628235, + "step": 8777, + "time_per_iteration": 2.5600481033325195 + }, + { + "auxiliary_loss_clip": 0.0114162, + "auxiliary_loss_mlp": 0.01112181, + "balance_loss_clip": 1.00233984, + "balance_loss_mlp": 1.00069594, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 2.1366881893729457, + "language_loss": 0.81890291, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84144092, + "num_input_tokens_seen": 188648925, + "step": 8778, + "time_per_iteration": 2.600903034210205 + }, + { + "auxiliary_loss_clip": 0.01150094, + "auxiliary_loss_mlp": 0.01113742, + "balance_loss_clip": 1.00227797, + "balance_loss_mlp": 1.00073195, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 1.8506829723101377, + "language_loss": 0.79366416, + "learning_rate": 1.916873882856013e-06, + "loss": 0.81630248, + "num_input_tokens_seen": 188668125, + "step": 8779, + "time_per_iteration": 2.570988893508911 + }, + { + "auxiliary_loss_clip": 0.01150208, + "auxiliary_loss_mlp": 0.01111625, + "balance_loss_clip": 1.00208354, + "balance_loss_mlp": 1.00061703, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.4136730603805785, + "language_loss": 0.76554298, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78816134, + "num_input_tokens_seen": 188684410, + "step": 8780, + "time_per_iteration": 2.5731637477874756 + }, + { + "auxiliary_loss_clip": 0.0111785, + "auxiliary_loss_mlp": 0.01113367, + "balance_loss_clip": 1.00206804, + "balance_loss_mlp": 1.00045228, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 3.174582993005633, + "language_loss": 0.69582814, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71814036, + "num_input_tokens_seen": 188706130, + "step": 8781, + "time_per_iteration": 2.7794508934020996 + }, + { + "auxiliary_loss_clip": 0.01150307, + "auxiliary_loss_mlp": 0.01111982, + "balance_loss_clip": 1.0022459, + "balance_loss_mlp": 1.00068796, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.6146741112771152, + "language_loss": 0.72079295, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74341589, + "num_input_tokens_seen": 188725030, + "step": 8782, + "time_per_iteration": 2.5815610885620117 + }, + { + "auxiliary_loss_clip": 0.01133337, + "auxiliary_loss_mlp": 0.01112603, + "balance_loss_clip": 1.00204289, + "balance_loss_mlp": 1.00054622, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 4.0799697800891295, + "language_loss": 0.68351996, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70597935, + "num_input_tokens_seen": 188744325, + "step": 8783, + "time_per_iteration": 2.6064047813415527 + }, + { + "auxiliary_loss_clip": 0.01152597, + "auxiliary_loss_mlp": 0.01115918, + "balance_loss_clip": 1.00245547, + "balance_loss_mlp": 1.00052381, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 2.2064349838685384, + "language_loss": 0.69415438, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71683949, + "num_input_tokens_seen": 188765100, + "step": 8784, + "time_per_iteration": 2.6348953247070312 + }, + { + "auxiliary_loss_clip": 0.01167339, + "auxiliary_loss_mlp": 0.01114686, + "balance_loss_clip": 1.00226355, + "balance_loss_mlp": 1.00053096, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.1892939663577495, + "language_loss": 0.75095701, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77377725, + "num_input_tokens_seen": 188783995, + "step": 8785, + "time_per_iteration": 2.5549728870391846 + }, + { + "auxiliary_loss_clip": 0.01150393, + "auxiliary_loss_mlp": 0.01112956, + "balance_loss_clip": 1.00226545, + "balance_loss_mlp": 1.00070858, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 2.757173546288414, + "language_loss": 0.83613324, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85876667, + "num_input_tokens_seen": 188803120, + "step": 8786, + "time_per_iteration": 2.5831398963928223 + }, + { + "auxiliary_loss_clip": 0.01118018, + "auxiliary_loss_mlp": 0.0111193, + "balance_loss_clip": 1.00206816, + "balance_loss_mlp": 1.00063658, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 4.356837647805671, + "language_loss": 0.82900381, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.85130328, + "num_input_tokens_seen": 188820960, + "step": 8787, + "time_per_iteration": 2.640739917755127 + }, + { + "auxiliary_loss_clip": 0.01102476, + "auxiliary_loss_mlp": 0.01112446, + "balance_loss_clip": 1.00200224, + "balance_loss_mlp": 1.00048447, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.690703962498913, + "language_loss": 0.83355665, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85570586, + "num_input_tokens_seen": 188837165, + "step": 8788, + "time_per_iteration": 2.6789844036102295 + }, + { + "auxiliary_loss_clip": 0.01118797, + "auxiliary_loss_mlp": 0.01112402, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00063181, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 2.1767548251887274, + "language_loss": 0.74829608, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.77060807, + "num_input_tokens_seen": 188858555, + "step": 8789, + "time_per_iteration": 2.733076572418213 + }, + { + "auxiliary_loss_clip": 0.01150568, + "auxiliary_loss_mlp": 0.01114525, + "balance_loss_clip": 1.00230813, + "balance_loss_mlp": 1.0006566, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.7850815457402691, + "language_loss": 0.69723356, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71988446, + "num_input_tokens_seen": 188879050, + "step": 8790, + "time_per_iteration": 2.5990943908691406 + }, + { + "auxiliary_loss_clip": 0.0116698, + "auxiliary_loss_mlp": 0.01112798, + "balance_loss_clip": 1.00226712, + "balance_loss_mlp": 1.00064588, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.9208258223357082, + "language_loss": 0.79128152, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81407928, + "num_input_tokens_seen": 188898885, + "step": 8791, + "time_per_iteration": 2.5277390480041504 + }, + { + "auxiliary_loss_clip": 0.01108444, + "auxiliary_loss_mlp": 0.01112151, + "balance_loss_clip": 1.00222957, + "balance_loss_mlp": 1.00066614, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 1.9457076011785341, + "language_loss": 0.66000569, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68221164, + "num_input_tokens_seen": 188917225, + "step": 8792, + "time_per_iteration": 2.6466946601867676 + }, + { + "auxiliary_loss_clip": 0.01135324, + "auxiliary_loss_mlp": 0.01112834, + "balance_loss_clip": 1.0021801, + "balance_loss_mlp": 1.00058651, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 2.207965357573179, + "language_loss": 0.79373556, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.81621718, + "num_input_tokens_seen": 188936120, + "step": 8793, + "time_per_iteration": 2.615837812423706 + }, + { + "auxiliary_loss_clip": 0.01167092, + "auxiliary_loss_mlp": 0.01113089, + "balance_loss_clip": 1.00226283, + "balance_loss_mlp": 1.00074577, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 3.6633086114495494, + "language_loss": 0.84263802, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86543989, + "num_input_tokens_seen": 188953405, + "step": 8794, + "time_per_iteration": 2.525081157684326 + }, + { + "auxiliary_loss_clip": 0.01135675, + "auxiliary_loss_mlp": 0.01113834, + "balance_loss_clip": 1.00203443, + "balance_loss_mlp": 1.00082397, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 2.734410984490857, + "language_loss": 0.6796149, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.70210999, + "num_input_tokens_seen": 188971150, + "step": 8795, + "time_per_iteration": 2.5910751819610596 + }, + { + "auxiliary_loss_clip": 0.01134411, + "auxiliary_loss_mlp": 0.01112766, + "balance_loss_clip": 1.00207603, + "balance_loss_mlp": 1.00061369, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.9354062654311674, + "language_loss": 0.80457473, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82704651, + "num_input_tokens_seen": 188989550, + "step": 8796, + "time_per_iteration": 2.5775222778320312 + }, + { + "auxiliary_loss_clip": 0.0112212, + "auxiliary_loss_mlp": 0.01113212, + "balance_loss_clip": 1.00237334, + "balance_loss_mlp": 1.00067878, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.933467139400768, + "language_loss": 0.69863707, + "learning_rate": 1.909870155310071e-06, + "loss": 0.7209903, + "num_input_tokens_seen": 189008795, + "step": 8797, + "time_per_iteration": 2.640835762023926 + }, + { + "auxiliary_loss_clip": 0.01150585, + "auxiliary_loss_mlp": 0.01112594, + "balance_loss_clip": 1.00220644, + "balance_loss_mlp": 1.00072849, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.5357155261700077, + "language_loss": 0.82104993, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84368169, + "num_input_tokens_seen": 189025540, + "step": 8798, + "time_per_iteration": 2.544132709503174 + }, + { + "auxiliary_loss_clip": 0.01135382, + "auxiliary_loss_mlp": 0.00747441, + "balance_loss_clip": 1.00215566, + "balance_loss_mlp": 1.00050962, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 1.7901594107039323, + "language_loss": 0.71071291, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72954118, + "num_input_tokens_seen": 189044885, + "step": 8799, + "time_per_iteration": 2.5798256397247314 + }, + { + "auxiliary_loss_clip": 0.01150639, + "auxiliary_loss_mlp": 0.01111901, + "balance_loss_clip": 1.00224137, + "balance_loss_mlp": 1.00060761, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.861920477916122, + "language_loss": 0.69729984, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71992517, + "num_input_tokens_seen": 189061280, + "step": 8800, + "time_per_iteration": 2.516937732696533 + }, + { + "auxiliary_loss_clip": 0.01115681, + "auxiliary_loss_mlp": 0.01089991, + "balance_loss_clip": 1.0015974, + "balance_loss_mlp": 0.99996436, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.9936452399039518, + "language_loss": 0.57011008, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59216678, + "num_input_tokens_seen": 189114775, + "step": 8801, + "time_per_iteration": 3.0822036266326904 + }, + { + "auxiliary_loss_clip": 0.01136628, + "auxiliary_loss_mlp": 0.01112259, + "balance_loss_clip": 1.0021857, + "balance_loss_mlp": 1.00077426, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.8499349556954259, + "language_loss": 0.63956392, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66205281, + "num_input_tokens_seen": 189134700, + "step": 8802, + "time_per_iteration": 2.658437967300415 + }, + { + "auxiliary_loss_clip": 0.01135632, + "auxiliary_loss_mlp": 0.01112589, + "balance_loss_clip": 1.00215626, + "balance_loss_mlp": 1.00043678, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.8961451994113339, + "language_loss": 0.68807197, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71055418, + "num_input_tokens_seen": 189155365, + "step": 8803, + "time_per_iteration": 2.7020816802978516 + }, + { + "auxiliary_loss_clip": 0.01150551, + "auxiliary_loss_mlp": 0.00747516, + "balance_loss_clip": 1.00226688, + "balance_loss_mlp": 1.00054765, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.611650426996047, + "language_loss": 0.76345247, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78243315, + "num_input_tokens_seen": 189173885, + "step": 8804, + "time_per_iteration": 2.561643362045288 + }, + { + "auxiliary_loss_clip": 0.01147483, + "auxiliary_loss_mlp": 0.01090039, + "balance_loss_clip": 1.00163317, + "balance_loss_mlp": 1.00001192, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.7501627285308373, + "language_loss": 0.52965331, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55202854, + "num_input_tokens_seen": 189236515, + "step": 8805, + "time_per_iteration": 3.200483798980713 + }, + { + "auxiliary_loss_clip": 0.01147305, + "auxiliary_loss_mlp": 0.01090441, + "balance_loss_clip": 1.0016259, + "balance_loss_mlp": 1.00003266, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7394320570808659, + "language_loss": 0.6384753, + "learning_rate": 1.906368701413693e-06, + "loss": 0.66085279, + "num_input_tokens_seen": 189300500, + "step": 8806, + "time_per_iteration": 3.1186351776123047 + }, + { + "auxiliary_loss_clip": 0.01150556, + "auxiliary_loss_mlp": 0.01113323, + "balance_loss_clip": 1.00218201, + "balance_loss_mlp": 1.00069368, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.5429858229171796, + "language_loss": 0.72386467, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74650347, + "num_input_tokens_seen": 189319745, + "step": 8807, + "time_per_iteration": 3.918306827545166 + }, + { + "auxiliary_loss_clip": 0.0112304, + "auxiliary_loss_mlp": 0.01112115, + "balance_loss_clip": 1.00228953, + "balance_loss_mlp": 1.00072598, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 2.2588081749947255, + "language_loss": 0.69480795, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71715951, + "num_input_tokens_seen": 189334550, + "step": 8808, + "time_per_iteration": 2.741164207458496 + }, + { + "auxiliary_loss_clip": 0.01150559, + "auxiliary_loss_mlp": 0.0111232, + "balance_loss_clip": 1.00222003, + "balance_loss_mlp": 1.00054944, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 1.9098564899089594, + "language_loss": 0.86752617, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.89015496, + "num_input_tokens_seen": 189351735, + "step": 8809, + "time_per_iteration": 2.54032301902771 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.01114406, + "balance_loss_clip": 1.00213861, + "balance_loss_mlp": 1.00072789, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.6949163429036977, + "language_loss": 0.63940036, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66204917, + "num_input_tokens_seen": 189373105, + "step": 8810, + "time_per_iteration": 5.46829628944397 + }, + { + "auxiliary_loss_clip": 0.01166938, + "auxiliary_loss_mlp": 0.01111965, + "balance_loss_clip": 1.00231266, + "balance_loss_mlp": 1.00057554, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.6955248355300188, + "language_loss": 0.68139493, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70418406, + "num_input_tokens_seen": 189394615, + "step": 8811, + "time_per_iteration": 2.6439714431762695 + }, + { + "auxiliary_loss_clip": 0.01101445, + "auxiliary_loss_mlp": 0.0109078, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 0.99998993, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6636006678642797, + "language_loss": 0.53366685, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55558908, + "num_input_tokens_seen": 189459750, + "step": 8812, + "time_per_iteration": 3.408076286315918 + }, + { + "auxiliary_loss_clip": 0.01128839, + "auxiliary_loss_mlp": 0.01090445, + "balance_loss_clip": 1.00160789, + "balance_loss_mlp": 1.00003684, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7513750593855008, + "language_loss": 0.56295937, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58515221, + "num_input_tokens_seen": 189527540, + "step": 8813, + "time_per_iteration": 3.197500467300415 + }, + { + "auxiliary_loss_clip": 0.01110075, + "auxiliary_loss_mlp": 0.01111034, + "balance_loss_clip": 1.00222278, + "balance_loss_mlp": 1.00059795, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.7159446932988438, + "language_loss": 0.81624508, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.83845615, + "num_input_tokens_seen": 189546900, + "step": 8814, + "time_per_iteration": 4.182138204574585 + }, + { + "auxiliary_loss_clip": 0.01167154, + "auxiliary_loss_mlp": 0.0111303, + "balance_loss_clip": 1.00220704, + "balance_loss_mlp": 1.00059223, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 2.141817017346689, + "language_loss": 0.84821296, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87101477, + "num_input_tokens_seen": 189566490, + "step": 8815, + "time_per_iteration": 2.5173652172088623 + }, + { + "auxiliary_loss_clip": 0.01167147, + "auxiliary_loss_mlp": 0.01112526, + "balance_loss_clip": 1.00242019, + "balance_loss_mlp": 1.0006597, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 1.825434621942555, + "language_loss": 0.66589159, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68868834, + "num_input_tokens_seen": 189585580, + "step": 8816, + "time_per_iteration": 2.5133070945739746 + }, + { + "auxiliary_loss_clip": 0.0113552, + "auxiliary_loss_mlp": 0.01112234, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00074923, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.5293514355334048, + "language_loss": 0.72315687, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74563444, + "num_input_tokens_seen": 189608485, + "step": 8817, + "time_per_iteration": 2.788569450378418 + }, + { + "auxiliary_loss_clip": 0.01134985, + "auxiliary_loss_mlp": 0.01112805, + "balance_loss_clip": 1.00195599, + "balance_loss_mlp": 1.00055742, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6528220691216422, + "language_loss": 0.65325564, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67573357, + "num_input_tokens_seen": 189627815, + "step": 8818, + "time_per_iteration": 2.7026994228363037 + }, + { + "auxiliary_loss_clip": 0.01104611, + "auxiliary_loss_mlp": 0.01112568, + "balance_loss_clip": 1.00203025, + "balance_loss_mlp": 1.00051117, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 2.151325047312495, + "language_loss": 0.7455523, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.7677241, + "num_input_tokens_seen": 189644850, + "step": 8819, + "time_per_iteration": 2.6624934673309326 + }, + { + "auxiliary_loss_clip": 0.0111883, + "auxiliary_loss_mlp": 0.01114004, + "balance_loss_clip": 1.00209773, + "balance_loss_mlp": 1.00070715, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.335332135943329, + "language_loss": 0.82329726, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84562558, + "num_input_tokens_seen": 189660945, + "step": 8820, + "time_per_iteration": 2.6118011474609375 + }, + { + "auxiliary_loss_clip": 0.01135077, + "auxiliary_loss_mlp": 0.01111659, + "balance_loss_clip": 1.00211799, + "balance_loss_mlp": 1.0006516, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 1.4170736948173275, + "language_loss": 0.72510469, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.747572, + "num_input_tokens_seen": 189680425, + "step": 8821, + "time_per_iteration": 2.6194658279418945 + }, + { + "auxiliary_loss_clip": 0.01139778, + "auxiliary_loss_mlp": 0.01112457, + "balance_loss_clip": 1.00219917, + "balance_loss_mlp": 1.00049531, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.422362780781109, + "language_loss": 0.7402336, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76275593, + "num_input_tokens_seen": 189700375, + "step": 8822, + "time_per_iteration": 2.609003782272339 + }, + { + "auxiliary_loss_clip": 0.01120133, + "auxiliary_loss_mlp": 0.01113708, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.00069761, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.7516698852235517, + "language_loss": 0.67978299, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.70212138, + "num_input_tokens_seen": 189721225, + "step": 8823, + "time_per_iteration": 2.6989309787750244 + }, + { + "auxiliary_loss_clip": 0.01167176, + "auxiliary_loss_mlp": 0.01112753, + "balance_loss_clip": 1.00225592, + "balance_loss_mlp": 1.00060141, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 2.0755475636801815, + "language_loss": 0.69019127, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71299058, + "num_input_tokens_seen": 189740170, + "step": 8824, + "time_per_iteration": 2.5360140800476074 + }, + { + "auxiliary_loss_clip": 0.01133648, + "auxiliary_loss_mlp": 0.00747357, + "balance_loss_clip": 1.00219059, + "balance_loss_mlp": 1.00046015, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 1.8680237163309237, + "language_loss": 0.76601052, + "learning_rate": 1.898977700702689e-06, + "loss": 0.78482062, + "num_input_tokens_seen": 189757890, + "step": 8825, + "time_per_iteration": 2.5983283519744873 + }, + { + "auxiliary_loss_clip": 0.01070183, + "auxiliary_loss_mlp": 0.01112015, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00081706, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 1.9517465595357164, + "language_loss": 0.85685658, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87867856, + "num_input_tokens_seen": 189775390, + "step": 8826, + "time_per_iteration": 2.7661843299865723 + }, + { + "auxiliary_loss_clip": 0.01166848, + "auxiliary_loss_mlp": 0.01112065, + "balance_loss_clip": 1.00214946, + "balance_loss_mlp": 1.00077128, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.632907363582354, + "language_loss": 0.6438185, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66660762, + "num_input_tokens_seen": 189793975, + "step": 8827, + "time_per_iteration": 2.500453472137451 + }, + { + "auxiliary_loss_clip": 0.01135083, + "auxiliary_loss_mlp": 0.01112543, + "balance_loss_clip": 1.00219345, + "balance_loss_mlp": 1.00067687, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.4659204755130193, + "language_loss": 0.60010254, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.6225788, + "num_input_tokens_seen": 189817870, + "step": 8828, + "time_per_iteration": 2.78031325340271 + }, + { + "auxiliary_loss_clip": 0.01152198, + "auxiliary_loss_mlp": 0.01112913, + "balance_loss_clip": 1.00233006, + "balance_loss_mlp": 1.00047445, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.6470134834510322, + "language_loss": 0.8130371, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83568823, + "num_input_tokens_seen": 189837905, + "step": 8829, + "time_per_iteration": 2.5833752155303955 + }, + { + "auxiliary_loss_clip": 0.01150521, + "auxiliary_loss_mlp": 0.01111961, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.469712649845116, + "language_loss": 0.78361404, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80623889, + "num_input_tokens_seen": 189856970, + "step": 8830, + "time_per_iteration": 2.5739965438842773 + }, + { + "auxiliary_loss_clip": 0.01150365, + "auxiliary_loss_mlp": 0.01112126, + "balance_loss_clip": 1.00207782, + "balance_loss_mlp": 1.00045109, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.3944541915182738, + "language_loss": 0.80746496, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.83008987, + "num_input_tokens_seen": 189872830, + "step": 8831, + "time_per_iteration": 2.5460762977600098 + }, + { + "auxiliary_loss_clip": 0.01150204, + "auxiliary_loss_mlp": 0.0111196, + "balance_loss_clip": 1.00214946, + "balance_loss_mlp": 1.00047565, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 2.248146887988538, + "language_loss": 0.72890085, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75152248, + "num_input_tokens_seen": 189891635, + "step": 8832, + "time_per_iteration": 2.531743049621582 + }, + { + "auxiliary_loss_clip": 0.0112075, + "auxiliary_loss_mlp": 0.01114016, + "balance_loss_clip": 1.00221133, + "balance_loss_mlp": 1.00062466, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 2.6152925665165987, + "language_loss": 0.75514323, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77749085, + "num_input_tokens_seen": 189909050, + "step": 8833, + "time_per_iteration": 2.6494390964508057 + }, + { + "auxiliary_loss_clip": 0.01118528, + "auxiliary_loss_mlp": 0.01112052, + "balance_loss_clip": 1.00195301, + "balance_loss_mlp": 1.00056767, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.9670744330785859, + "language_loss": 0.73608351, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75838929, + "num_input_tokens_seen": 189927405, + "step": 8834, + "time_per_iteration": 2.6675450801849365 + }, + { + "auxiliary_loss_clip": 0.01167259, + "auxiliary_loss_mlp": 0.01114383, + "balance_loss_clip": 1.00232172, + "balance_loss_mlp": 1.00070453, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 1.849061485434869, + "language_loss": 0.77393591, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79675239, + "num_input_tokens_seen": 189947740, + "step": 8835, + "time_per_iteration": 2.5436007976531982 + }, + { + "auxiliary_loss_clip": 0.01134043, + "auxiliary_loss_mlp": 0.011121, + "balance_loss_clip": 1.00207591, + "balance_loss_mlp": 1.00071096, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.5234121689565598, + "language_loss": 0.72443873, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74690014, + "num_input_tokens_seen": 189966495, + "step": 8836, + "time_per_iteration": 2.596885919570923 + }, + { + "auxiliary_loss_clip": 0.01134073, + "auxiliary_loss_mlp": 0.01112333, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.00056195, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 1.6858567427314535, + "language_loss": 0.80887753, + "learning_rate": 1.894310406375987e-06, + "loss": 0.83134162, + "num_input_tokens_seen": 189985325, + "step": 8837, + "time_per_iteration": 2.586581230163574 + }, + { + "auxiliary_loss_clip": 0.01150495, + "auxiliary_loss_mlp": 0.01112031, + "balance_loss_clip": 1.00222182, + "balance_loss_mlp": 1.00064182, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 2.145793402767973, + "language_loss": 0.85902059, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88164586, + "num_input_tokens_seen": 190003290, + "step": 8838, + "time_per_iteration": 2.5568795204162598 + }, + { + "auxiliary_loss_clip": 0.01133628, + "auxiliary_loss_mlp": 0.01112227, + "balance_loss_clip": 1.00213838, + "balance_loss_mlp": 1.00064719, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.8382689830155388, + "language_loss": 0.72725821, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.74971676, + "num_input_tokens_seen": 190023260, + "step": 8839, + "time_per_iteration": 2.5953307151794434 + }, + { + "auxiliary_loss_clip": 0.01134971, + "auxiliary_loss_mlp": 0.01112129, + "balance_loss_clip": 1.00208962, + "balance_loss_mlp": 1.00064409, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.871972677502477, + "language_loss": 0.76419425, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.78666532, + "num_input_tokens_seen": 190042035, + "step": 8840, + "time_per_iteration": 2.611238718032837 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01112525, + "balance_loss_clip": 1.00184, + "balance_loss_mlp": 1.00075436, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 2.8604745315670774, + "language_loss": 0.77313733, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79542816, + "num_input_tokens_seen": 190057545, + "step": 8841, + "time_per_iteration": 2.645958185195923 + }, + { + "auxiliary_loss_clip": 0.01129712, + "auxiliary_loss_mlp": 0.01089285, + "balance_loss_clip": 1.00178158, + "balance_loss_mlp": 1.00002122, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6849552604950904, + "language_loss": 0.56737578, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58956575, + "num_input_tokens_seen": 190123800, + "step": 8842, + "time_per_iteration": 3.305138111114502 + }, + { + "auxiliary_loss_clip": 0.01133608, + "auxiliary_loss_mlp": 0.01113741, + "balance_loss_clip": 1.00208879, + "balance_loss_mlp": 1.00063539, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.7218876592529453, + "language_loss": 0.73418224, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.75665569, + "num_input_tokens_seen": 190141625, + "step": 8843, + "time_per_iteration": 2.5963504314422607 + }, + { + "auxiliary_loss_clip": 0.01128284, + "auxiliary_loss_mlp": 0.01088859, + "balance_loss_clip": 1.00151515, + "balance_loss_mlp": 0.99997634, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8808448000143998, + "language_loss": 0.61040479, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63257623, + "num_input_tokens_seen": 190198110, + "step": 8844, + "time_per_iteration": 4.590775012969971 + }, + { + "auxiliary_loss_clip": 0.01147601, + "auxiliary_loss_mlp": 0.01088914, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00003183, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.846379084854204, + "language_loss": 0.6219902, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64435542, + "num_input_tokens_seen": 190259950, + "step": 8845, + "time_per_iteration": 3.1101276874542236 + }, + { + "auxiliary_loss_clip": 0.01119103, + "auxiliary_loss_mlp": 0.01112428, + "balance_loss_clip": 1.00201225, + "balance_loss_mlp": 1.00056255, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 1.8838876466422427, + "language_loss": 0.75651985, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77883518, + "num_input_tokens_seen": 190278265, + "step": 8846, + "time_per_iteration": 2.6340248584747314 + }, + { + "auxiliary_loss_clip": 0.01150446, + "auxiliary_loss_mlp": 0.01112151, + "balance_loss_clip": 1.0022428, + "balance_loss_mlp": 1.00085711, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.6792285194389565, + "language_loss": 0.75278533, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.77541125, + "num_input_tokens_seen": 190298400, + "step": 8847, + "time_per_iteration": 4.00723934173584 + }, + { + "auxiliary_loss_clip": 0.01134868, + "auxiliary_loss_mlp": 0.01111114, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.00048804, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.6156754727267268, + "language_loss": 0.87506199, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89752173, + "num_input_tokens_seen": 190316235, + "step": 8848, + "time_per_iteration": 3.9557206630706787 + }, + { + "auxiliary_loss_clip": 0.01120723, + "auxiliary_loss_mlp": 0.01112399, + "balance_loss_clip": 1.00213981, + "balance_loss_mlp": 1.00062835, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 2.8577851752334102, + "language_loss": 0.74559504, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76792622, + "num_input_tokens_seen": 190335060, + "step": 8849, + "time_per_iteration": 2.5952694416046143 + }, + { + "auxiliary_loss_clip": 0.01152169, + "auxiliary_loss_mlp": 0.01112835, + "balance_loss_clip": 1.00216818, + "balance_loss_mlp": 1.00058722, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 1.7360918173651108, + "language_loss": 0.79536223, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81801224, + "num_input_tokens_seen": 190353265, + "step": 8850, + "time_per_iteration": 2.5689895153045654 + }, + { + "auxiliary_loss_clip": 0.01167032, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_clip": 1.00223911, + "balance_loss_mlp": 1.00065064, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.5015507802598516, + "language_loss": 0.54768366, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57046866, + "num_input_tokens_seen": 190376575, + "step": 8851, + "time_per_iteration": 2.680901527404785 + }, + { + "auxiliary_loss_clip": 0.01152054, + "auxiliary_loss_mlp": 0.01112049, + "balance_loss_clip": 1.00218987, + "balance_loss_mlp": 1.00056505, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.5556113527475155, + "language_loss": 0.68422621, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70686722, + "num_input_tokens_seen": 190395185, + "step": 8852, + "time_per_iteration": 4.044157981872559 + }, + { + "auxiliary_loss_clip": 0.01130499, + "auxiliary_loss_mlp": 0.00746013, + "balance_loss_clip": 1.00182855, + "balance_loss_mlp": 0.99995995, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8032272040039274, + "language_loss": 0.62893665, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64770174, + "num_input_tokens_seen": 190452595, + "step": 8853, + "time_per_iteration": 3.1394004821777344 + }, + { + "auxiliary_loss_clip": 0.01151839, + "auxiliary_loss_mlp": 0.01112285, + "balance_loss_clip": 1.00216794, + "balance_loss_mlp": 1.000705, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.4351091655445334, + "language_loss": 0.79507011, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81771135, + "num_input_tokens_seen": 190469140, + "step": 8854, + "time_per_iteration": 2.5299713611602783 + }, + { + "auxiliary_loss_clip": 0.01135771, + "auxiliary_loss_mlp": 0.01110688, + "balance_loss_clip": 1.00217462, + "balance_loss_mlp": 1.00063407, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 2.5518940805743684, + "language_loss": 0.73276335, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75522792, + "num_input_tokens_seen": 190489015, + "step": 8855, + "time_per_iteration": 2.6452476978302 + }, + { + "auxiliary_loss_clip": 0.0113546, + "auxiliary_loss_mlp": 0.00747298, + "balance_loss_clip": 1.0021528, + "balance_loss_mlp": 1.00046933, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 2.0203379184852963, + "language_loss": 0.64665204, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66547966, + "num_input_tokens_seen": 190508065, + "step": 8856, + "time_per_iteration": 2.6205697059631348 + }, + { + "auxiliary_loss_clip": 0.01134969, + "auxiliary_loss_mlp": 0.01112741, + "balance_loss_clip": 1.0020355, + "balance_loss_mlp": 1.00077939, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 1.6991896192080664, + "language_loss": 0.77478671, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79726386, + "num_input_tokens_seen": 190527045, + "step": 8857, + "time_per_iteration": 2.6996936798095703 + }, + { + "auxiliary_loss_clip": 0.01122547, + "auxiliary_loss_mlp": 0.01111022, + "balance_loss_clip": 1.0021503, + "balance_loss_mlp": 1.00068176, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 4.500099229498074, + "language_loss": 0.7120887, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73442441, + "num_input_tokens_seen": 190544075, + "step": 8858, + "time_per_iteration": 2.7093923091888428 + }, + { + "auxiliary_loss_clip": 0.01152077, + "auxiliary_loss_mlp": 0.01112349, + "balance_loss_clip": 1.00240505, + "balance_loss_mlp": 1.00076914, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.8291004954002898, + "language_loss": 0.69580829, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71845245, + "num_input_tokens_seen": 190566030, + "step": 8859, + "time_per_iteration": 2.597447633743286 + }, + { + "auxiliary_loss_clip": 0.01150269, + "auxiliary_loss_mlp": 0.0111032, + "balance_loss_clip": 1.00220549, + "balance_loss_mlp": 1.00064778, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.5934779182003105, + "language_loss": 0.69464201, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.7172479, + "num_input_tokens_seen": 190585605, + "step": 8860, + "time_per_iteration": 2.546013116836548 + }, + { + "auxiliary_loss_clip": 0.01134822, + "auxiliary_loss_mlp": 0.01111369, + "balance_loss_clip": 1.00215054, + "balance_loss_mlp": 1.00055194, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 2.41027251971135, + "language_loss": 0.77477235, + "learning_rate": 1.884977574556683e-06, + "loss": 0.79723424, + "num_input_tokens_seen": 190604625, + "step": 8861, + "time_per_iteration": 2.601487398147583 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.01111389, + "balance_loss_clip": 1.00200224, + "balance_loss_mlp": 1.0006671, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 1.5013750794511735, + "language_loss": 0.85090142, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87304056, + "num_input_tokens_seen": 190625060, + "step": 8862, + "time_per_iteration": 2.721337080001831 + }, + { + "auxiliary_loss_clip": 0.01135179, + "auxiliary_loss_mlp": 0.0111335, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00062561, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 1.764807530774305, + "language_loss": 0.61642075, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.638906, + "num_input_tokens_seen": 190643150, + "step": 8863, + "time_per_iteration": 2.575545072555542 + }, + { + "auxiliary_loss_clip": 0.01135634, + "auxiliary_loss_mlp": 0.01111347, + "balance_loss_clip": 1.00231385, + "balance_loss_mlp": 1.00072122, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.8395274171364562, + "language_loss": 0.73752916, + "learning_rate": 1.883811143046377e-06, + "loss": 0.75999892, + "num_input_tokens_seen": 190662725, + "step": 8864, + "time_per_iteration": 2.6361615657806396 + }, + { + "auxiliary_loss_clip": 0.0116694, + "auxiliary_loss_mlp": 0.01111572, + "balance_loss_clip": 1.00221491, + "balance_loss_mlp": 1.00084996, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.760705003464555, + "language_loss": 0.64204979, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66483486, + "num_input_tokens_seen": 190683680, + "step": 8865, + "time_per_iteration": 2.5781588554382324 + }, + { + "auxiliary_loss_clip": 0.01151628, + "auxiliary_loss_mlp": 0.0111154, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00062811, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 2.0599825581169586, + "language_loss": 0.78436017, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80699188, + "num_input_tokens_seen": 190703350, + "step": 8866, + "time_per_iteration": 2.5640029907226562 + }, + { + "auxiliary_loss_clip": 0.01150345, + "auxiliary_loss_mlp": 0.01111046, + "balance_loss_clip": 1.00230646, + "balance_loss_mlp": 1.0004195, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 2.1647057547797943, + "language_loss": 0.73683172, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75944567, + "num_input_tokens_seen": 190721170, + "step": 8867, + "time_per_iteration": 2.535271406173706 + }, + { + "auxiliary_loss_clip": 0.01135347, + "auxiliary_loss_mlp": 0.01111499, + "balance_loss_clip": 1.00207925, + "balance_loss_mlp": 1.00058722, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.6945775327729933, + "language_loss": 0.72387636, + "learning_rate": 1.88225596278394e-06, + "loss": 0.7463448, + "num_input_tokens_seen": 190743795, + "step": 8868, + "time_per_iteration": 2.758775234222412 + }, + { + "auxiliary_loss_clip": 0.01120552, + "auxiliary_loss_mlp": 0.01112104, + "balance_loss_clip": 1.00205874, + "balance_loss_mlp": 1.00071514, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 2.0910122455613487, + "language_loss": 0.7822246, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80455112, + "num_input_tokens_seen": 190761560, + "step": 8869, + "time_per_iteration": 2.6553823947906494 + }, + { + "auxiliary_loss_clip": 0.01150354, + "auxiliary_loss_mlp": 0.01112424, + "balance_loss_clip": 1.00212574, + "balance_loss_mlp": 1.00065362, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.817518332949534, + "language_loss": 0.75922334, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.78185117, + "num_input_tokens_seen": 190778875, + "step": 8870, + "time_per_iteration": 2.528110980987549 + }, + { + "auxiliary_loss_clip": 0.01133765, + "auxiliary_loss_mlp": 0.01113236, + "balance_loss_clip": 1.0021137, + "balance_loss_mlp": 1.00060678, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 1.735813246863479, + "language_loss": 0.75383162, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77630156, + "num_input_tokens_seen": 190799830, + "step": 8871, + "time_per_iteration": 2.6026551723480225 + }, + { + "auxiliary_loss_clip": 0.01133237, + "auxiliary_loss_mlp": 0.01111456, + "balance_loss_clip": 1.00204074, + "balance_loss_mlp": 1.00063896, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 11.355492588568051, + "language_loss": 0.72305518, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74550211, + "num_input_tokens_seen": 190817155, + "step": 8872, + "time_per_iteration": 2.542494773864746 + }, + { + "auxiliary_loss_clip": 0.0113958, + "auxiliary_loss_mlp": 0.01112022, + "balance_loss_clip": 1.00237155, + "balance_loss_mlp": 1.00072837, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.6736984945076196, + "language_loss": 0.65025115, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67276716, + "num_input_tokens_seen": 190835240, + "step": 8873, + "time_per_iteration": 2.5864832401275635 + }, + { + "auxiliary_loss_clip": 0.0113338, + "auxiliary_loss_mlp": 0.01111607, + "balance_loss_clip": 1.00203443, + "balance_loss_mlp": 1.00078976, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 2.388054587767112, + "language_loss": 0.80240339, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82485324, + "num_input_tokens_seen": 190851620, + "step": 8874, + "time_per_iteration": 2.5752177238464355 + }, + { + "auxiliary_loss_clip": 0.01151474, + "auxiliary_loss_mlp": 0.01111634, + "balance_loss_clip": 1.00210238, + "balance_loss_mlp": 1.00053072, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 1.6668577136769718, + "language_loss": 0.69931716, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72194827, + "num_input_tokens_seen": 190870545, + "step": 8875, + "time_per_iteration": 2.5489256381988525 + }, + { + "auxiliary_loss_clip": 0.01162073, + "auxiliary_loss_mlp": 0.01088567, + "balance_loss_clip": 1.00167167, + "balance_loss_mlp": 1.00006628, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7197469786908477, + "language_loss": 0.5964824, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61898875, + "num_input_tokens_seen": 190931995, + "step": 8876, + "time_per_iteration": 3.2262537479400635 + }, + { + "auxiliary_loss_clip": 0.01150601, + "auxiliary_loss_mlp": 0.01111352, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00063086, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.9537175635563595, + "language_loss": 0.74845862, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.77107823, + "num_input_tokens_seen": 190949890, + "step": 8877, + "time_per_iteration": 2.5341684818267822 + }, + { + "auxiliary_loss_clip": 0.01145311, + "auxiliary_loss_mlp": 0.01088648, + "balance_loss_clip": 1.00167406, + "balance_loss_mlp": 1.00014722, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7909484248211683, + "language_loss": 0.5720607, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59440029, + "num_input_tokens_seen": 191008480, + "step": 8878, + "time_per_iteration": 3.034971237182617 + }, + { + "auxiliary_loss_clip": 0.01167035, + "auxiliary_loss_mlp": 0.01112837, + "balance_loss_clip": 1.00216269, + "balance_loss_mlp": 1.00068438, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 2.299719026155923, + "language_loss": 0.72692597, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74972469, + "num_input_tokens_seen": 191028995, + "step": 8879, + "time_per_iteration": 2.5549025535583496 + }, + { + "auxiliary_loss_clip": 0.01167068, + "auxiliary_loss_mlp": 0.01113068, + "balance_loss_clip": 1.00223505, + "balance_loss_mlp": 1.00053453, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.1906739215244735, + "language_loss": 0.83671314, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85951453, + "num_input_tokens_seen": 191045285, + "step": 8880, + "time_per_iteration": 2.481166124343872 + }, + { + "auxiliary_loss_clip": 0.01087297, + "auxiliary_loss_mlp": 0.0111111, + "balance_loss_clip": 1.00174928, + "balance_loss_mlp": 1.00048363, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.4544754712526835, + "language_loss": 0.79346311, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81544721, + "num_input_tokens_seen": 191066105, + "step": 8881, + "time_per_iteration": 2.7354040145874023 + }, + { + "auxiliary_loss_clip": 0.01128206, + "auxiliary_loss_mlp": 0.01088567, + "balance_loss_clip": 1.00158525, + "balance_loss_mlp": 1.00006568, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7799407972336346, + "language_loss": 0.59248668, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61465442, + "num_input_tokens_seen": 191126315, + "step": 8882, + "time_per_iteration": 4.461606025695801 + }, + { + "auxiliary_loss_clip": 0.01131209, + "auxiliary_loss_mlp": 0.01089023, + "balance_loss_clip": 1.00168324, + "balance_loss_mlp": 1.00014043, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8667729858049402, + "language_loss": 0.63688266, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65908492, + "num_input_tokens_seen": 191174240, + "step": 8883, + "time_per_iteration": 2.950921058654785 + }, + { + "auxiliary_loss_clip": 0.01104032, + "auxiliary_loss_mlp": 0.0111249, + "balance_loss_clip": 1.00210822, + "balance_loss_mlp": 1.00062358, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.0450889467269002, + "language_loss": 0.81893647, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84110165, + "num_input_tokens_seen": 191193335, + "step": 8884, + "time_per_iteration": 2.7631852626800537 + }, + { + "auxiliary_loss_clip": 0.01137283, + "auxiliary_loss_mlp": 0.01110907, + "balance_loss_clip": 1.0022788, + "balance_loss_mlp": 1.00066209, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.66077207607208, + "language_loss": 0.72044897, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74293089, + "num_input_tokens_seen": 191210900, + "step": 8885, + "time_per_iteration": 4.065836668014526 + }, + { + "auxiliary_loss_clip": 0.01135074, + "auxiliary_loss_mlp": 0.01112372, + "balance_loss_clip": 1.00214696, + "balance_loss_mlp": 1.00060177, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 2.7653930959222683, + "language_loss": 0.78904867, + "learning_rate": 1.87525854926798e-06, + "loss": 0.81152308, + "num_input_tokens_seen": 191226730, + "step": 8886, + "time_per_iteration": 3.9285829067230225 + }, + { + "auxiliary_loss_clip": 0.0111689, + "auxiliary_loss_mlp": 0.00747338, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.00036311, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 2.432441495643402, + "language_loss": 0.7488122, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76745439, + "num_input_tokens_seen": 191250435, + "step": 8887, + "time_per_iteration": 2.73323130607605 + }, + { + "auxiliary_loss_clip": 0.01135448, + "auxiliary_loss_mlp": 0.01111467, + "balance_loss_clip": 1.00208998, + "balance_loss_mlp": 1.00055444, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 3.184922633309123, + "language_loss": 0.69113767, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.71360683, + "num_input_tokens_seen": 191268315, + "step": 8888, + "time_per_iteration": 2.5594489574432373 + }, + { + "auxiliary_loss_clip": 0.0115232, + "auxiliary_loss_mlp": 0.01112559, + "balance_loss_clip": 1.00212848, + "balance_loss_mlp": 1.00069332, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.507375111447012, + "language_loss": 0.78244352, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.80509233, + "num_input_tokens_seen": 191287000, + "step": 8889, + "time_per_iteration": 2.5237491130828857 + }, + { + "auxiliary_loss_clip": 0.01167046, + "auxiliary_loss_mlp": 0.01111936, + "balance_loss_clip": 1.0023632, + "balance_loss_mlp": 1.00083303, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 7.336033662680917, + "language_loss": 0.6924414, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71523118, + "num_input_tokens_seen": 191304565, + "step": 8890, + "time_per_iteration": 3.932048797607422 + }, + { + "auxiliary_loss_clip": 0.01167141, + "auxiliary_loss_mlp": 0.01112633, + "balance_loss_clip": 1.0022136, + "balance_loss_mlp": 1.00076687, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 2.256489960851774, + "language_loss": 0.76428878, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.78708649, + "num_input_tokens_seen": 191318300, + "step": 8891, + "time_per_iteration": 2.476489305496216 + }, + { + "auxiliary_loss_clip": 0.0115181, + "auxiliary_loss_mlp": 0.01110808, + "balance_loss_clip": 1.00216758, + "balance_loss_mlp": 1.00065827, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.8085469767543403, + "language_loss": 0.74243653, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76506269, + "num_input_tokens_seen": 191337925, + "step": 8892, + "time_per_iteration": 2.55420184135437 + }, + { + "auxiliary_loss_clip": 0.01136526, + "auxiliary_loss_mlp": 0.01111471, + "balance_loss_clip": 1.00205135, + "balance_loss_mlp": 1.00055861, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.5001727555161484, + "language_loss": 0.87863225, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.9011122, + "num_input_tokens_seen": 191357120, + "step": 8893, + "time_per_iteration": 2.6080594062805176 + }, + { + "auxiliary_loss_clip": 0.01166879, + "auxiliary_loss_mlp": 0.01110905, + "balance_loss_clip": 1.00218379, + "balance_loss_mlp": 1.00056505, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.6173004564929219, + "language_loss": 0.72208333, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74486119, + "num_input_tokens_seen": 191375395, + "step": 8894, + "time_per_iteration": 2.508303165435791 + }, + { + "auxiliary_loss_clip": 0.01151433, + "auxiliary_loss_mlp": 0.01110587, + "balance_loss_clip": 1.00212395, + "balance_loss_mlp": 1.00062788, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 1.7756876131763681, + "language_loss": 0.74958163, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.77220184, + "num_input_tokens_seen": 191395595, + "step": 8895, + "time_per_iteration": 2.5768327713012695 + }, + { + "auxiliary_loss_clip": 0.01135855, + "auxiliary_loss_mlp": 0.01110939, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00059891, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.6310010014726006, + "language_loss": 0.76954979, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79201776, + "num_input_tokens_seen": 191413730, + "step": 8896, + "time_per_iteration": 2.598653554916382 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01110717, + "balance_loss_clip": 1.00210965, + "balance_loss_mlp": 1.00047207, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 2.3438918741082757, + "language_loss": 0.78843731, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.81088531, + "num_input_tokens_seen": 191432400, + "step": 8897, + "time_per_iteration": 2.591923475265503 + }, + { + "auxiliary_loss_clip": 0.01150133, + "auxiliary_loss_mlp": 0.01111537, + "balance_loss_clip": 1.00208449, + "balance_loss_mlp": 1.00071979, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.798527823421771, + "language_loss": 0.76264715, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.7852639, + "num_input_tokens_seen": 191448855, + "step": 8898, + "time_per_iteration": 2.529327392578125 + }, + { + "auxiliary_loss_clip": 0.01147385, + "auxiliary_loss_mlp": 0.0108862, + "balance_loss_clip": 1.00151873, + "balance_loss_mlp": 1.00011921, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8529823675315035, + "language_loss": 0.58096731, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60332739, + "num_input_tokens_seen": 191519690, + "step": 8899, + "time_per_iteration": 3.2996010780334473 + }, + { + "auxiliary_loss_clip": 0.01133985, + "auxiliary_loss_mlp": 0.01110807, + "balance_loss_clip": 1.00203669, + "balance_loss_mlp": 1.00056267, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.5701056721306141, + "language_loss": 0.69758892, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72003686, + "num_input_tokens_seen": 191539380, + "step": 8900, + "time_per_iteration": 2.6749987602233887 + }, + { + "auxiliary_loss_clip": 0.01135471, + "auxiliary_loss_mlp": 0.01112206, + "balance_loss_clip": 1.00202799, + "balance_loss_mlp": 1.00062656, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.6385494302608237, + "language_loss": 0.71534407, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73782086, + "num_input_tokens_seen": 191557400, + "step": 8901, + "time_per_iteration": 2.589970350265503 + }, + { + "auxiliary_loss_clip": 0.01121876, + "auxiliary_loss_mlp": 0.01112268, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00049758, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 1.8797797878614757, + "language_loss": 0.77230531, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79464674, + "num_input_tokens_seen": 191575860, + "step": 8902, + "time_per_iteration": 2.6314826011657715 + }, + { + "auxiliary_loss_clip": 0.0113543, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_clip": 1.00210381, + "balance_loss_mlp": 1.00044847, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.6769446444781078, + "language_loss": 0.69857091, + "learning_rate": 1.868651286721281e-06, + "loss": 0.72103786, + "num_input_tokens_seen": 191595775, + "step": 8903, + "time_per_iteration": 2.616198778152466 + }, + { + "auxiliary_loss_clip": 0.01150314, + "auxiliary_loss_mlp": 0.00747349, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.0003916, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.6899173846162405, + "language_loss": 0.72589386, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74487042, + "num_input_tokens_seen": 191617785, + "step": 8904, + "time_per_iteration": 2.770455837249756 + }, + { + "auxiliary_loss_clip": 0.0113345, + "auxiliary_loss_mlp": 0.0111243, + "balance_loss_clip": 1.00215232, + "balance_loss_mlp": 1.00065899, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 2.07501944697882, + "language_loss": 0.73581988, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75827873, + "num_input_tokens_seen": 191636900, + "step": 8905, + "time_per_iteration": 2.6709587574005127 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.01110955, + "balance_loss_clip": 1.00205481, + "balance_loss_mlp": 1.00061512, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.4115051581222007, + "language_loss": 0.83400875, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85661852, + "num_input_tokens_seen": 191656720, + "step": 8906, + "time_per_iteration": 2.5930440425872803 + }, + { + "auxiliary_loss_clip": 0.01150396, + "auxiliary_loss_mlp": 0.0074738, + "balance_loss_clip": 1.00214219, + "balance_loss_mlp": 1.00034392, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 2.4251225090686703, + "language_loss": 0.73946536, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.75844312, + "num_input_tokens_seen": 191674445, + "step": 8907, + "time_per_iteration": 2.5664961338043213 + }, + { + "auxiliary_loss_clip": 0.01151901, + "auxiliary_loss_mlp": 0.01110864, + "balance_loss_clip": 1.00227785, + "balance_loss_mlp": 1.00052392, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.7373105186432534, + "language_loss": 0.76534766, + "learning_rate": 1.866708244906912e-06, + "loss": 0.78797531, + "num_input_tokens_seen": 191695000, + "step": 8908, + "time_per_iteration": 2.5870542526245117 + }, + { + "auxiliary_loss_clip": 0.01135942, + "auxiliary_loss_mlp": 0.00747201, + "balance_loss_clip": 1.00207675, + "balance_loss_mlp": 1.0002892, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 2.2758569405561726, + "language_loss": 0.74091971, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.75975114, + "num_input_tokens_seen": 191713295, + "step": 8909, + "time_per_iteration": 2.583717107772827 + }, + { + "auxiliary_loss_clip": 0.01126316, + "auxiliary_loss_mlp": 0.0111096, + "balance_loss_clip": 1.00237942, + "balance_loss_mlp": 1.0007149, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 3.8200683046832515, + "language_loss": 0.8360095, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.85838223, + "num_input_tokens_seen": 191732725, + "step": 8910, + "time_per_iteration": 2.6173791885375977 + }, + { + "auxiliary_loss_clip": 0.01135507, + "auxiliary_loss_mlp": 0.01111805, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00051105, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.4890139111989968, + "language_loss": 0.81596017, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.83843327, + "num_input_tokens_seen": 191753765, + "step": 8911, + "time_per_iteration": 2.6082234382629395 + }, + { + "auxiliary_loss_clip": 0.01118384, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_clip": 1.00191116, + "balance_loss_mlp": 1.0006516, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 2.5716411004247806, + "language_loss": 0.68956649, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71186507, + "num_input_tokens_seen": 191773560, + "step": 8912, + "time_per_iteration": 2.637977361679077 + }, + { + "auxiliary_loss_clip": 0.01134614, + "auxiliary_loss_mlp": 0.01111352, + "balance_loss_clip": 1.0020076, + "balance_loss_mlp": 1.00063026, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 2.005764492583619, + "language_loss": 0.71263731, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73509693, + "num_input_tokens_seen": 191791255, + "step": 8913, + "time_per_iteration": 2.557692527770996 + }, + { + "auxiliary_loss_clip": 0.01118228, + "auxiliary_loss_mlp": 0.01111324, + "balance_loss_clip": 1.00195718, + "balance_loss_mlp": 1.00069809, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.9498253530487244, + "language_loss": 0.72407109, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74636662, + "num_input_tokens_seen": 191809325, + "step": 8914, + "time_per_iteration": 2.6144323348999023 + }, + { + "auxiliary_loss_clip": 0.01140036, + "auxiliary_loss_mlp": 0.01113, + "balance_loss_clip": 1.00223088, + "balance_loss_mlp": 1.00065732, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 2.2511738319205765, + "language_loss": 0.70522785, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72775829, + "num_input_tokens_seen": 191829795, + "step": 8915, + "time_per_iteration": 2.6150400638580322 + }, + { + "auxiliary_loss_clip": 0.01135615, + "auxiliary_loss_mlp": 0.01111322, + "balance_loss_clip": 1.00219131, + "balance_loss_mlp": 1.00069559, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 1.9615733948641596, + "language_loss": 0.75244296, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77491224, + "num_input_tokens_seen": 191850840, + "step": 8916, + "time_per_iteration": 2.601517915725708 + }, + { + "auxiliary_loss_clip": 0.01102148, + "auxiliary_loss_mlp": 0.00747257, + "balance_loss_clip": 1.00182712, + "balance_loss_mlp": 1.00027299, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.2469832966658565, + "language_loss": 0.72563612, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74413019, + "num_input_tokens_seen": 191869520, + "step": 8917, + "time_per_iteration": 2.755699872970581 + }, + { + "auxiliary_loss_clip": 0.01133476, + "auxiliary_loss_mlp": 0.01112536, + "balance_loss_clip": 1.00202703, + "balance_loss_mlp": 1.00076568, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 2.25699971893889, + "language_loss": 0.71523702, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73769718, + "num_input_tokens_seen": 191887240, + "step": 8918, + "time_per_iteration": 2.648930549621582 + }, + { + "auxiliary_loss_clip": 0.01133902, + "auxiliary_loss_mlp": 0.01111678, + "balance_loss_clip": 1.00204504, + "balance_loss_mlp": 1.00067019, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 2.172607601458963, + "language_loss": 0.75029528, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77275109, + "num_input_tokens_seen": 191905690, + "step": 8919, + "time_per_iteration": 4.030436038970947 + }, + { + "auxiliary_loss_clip": 0.01133602, + "auxiliary_loss_mlp": 0.0111197, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00067639, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 1.9232434877754174, + "language_loss": 0.71400648, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73646218, + "num_input_tokens_seen": 191920725, + "step": 8920, + "time_per_iteration": 2.573408603668213 + }, + { + "auxiliary_loss_clip": 0.01151802, + "auxiliary_loss_mlp": 0.01111106, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.00057483, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.3650834490007242, + "language_loss": 0.68683696, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.70946604, + "num_input_tokens_seen": 191944645, + "step": 8921, + "time_per_iteration": 2.7515010833740234 + }, + { + "auxiliary_loss_clip": 0.01150353, + "auxiliary_loss_mlp": 0.01112186, + "balance_loss_clip": 1.00223279, + "balance_loss_mlp": 1.00060582, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.8653796311617217, + "language_loss": 0.81864917, + "learning_rate": 1.86126840594594e-06, + "loss": 0.8412745, + "num_input_tokens_seen": 191962265, + "step": 8922, + "time_per_iteration": 2.550081253051758 + }, + { + "auxiliary_loss_clip": 0.01150542, + "auxiliary_loss_mlp": 0.01111974, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00048923, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.9034933548011779, + "language_loss": 0.76566422, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78828931, + "num_input_tokens_seen": 191978850, + "step": 8923, + "time_per_iteration": 3.892103672027588 + }, + { + "auxiliary_loss_clip": 0.01136015, + "auxiliary_loss_mlp": 0.01112411, + "balance_loss_clip": 1.00210524, + "balance_loss_mlp": 1.00054526, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.5517107758243902, + "language_loss": 0.70308167, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72556597, + "num_input_tokens_seen": 192002000, + "step": 8924, + "time_per_iteration": 4.2448203563690186 + }, + { + "auxiliary_loss_clip": 0.011178, + "auxiliary_loss_mlp": 0.01112683, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.00062644, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.8336747398381037, + "language_loss": 0.87046695, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89277178, + "num_input_tokens_seen": 192019100, + "step": 8925, + "time_per_iteration": 2.6940853595733643 + }, + { + "auxiliary_loss_clip": 0.0116698, + "auxiliary_loss_mlp": 0.01111649, + "balance_loss_clip": 1.00209761, + "balance_loss_mlp": 1.00054646, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 3.254963027998985, + "language_loss": 0.78090751, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80369383, + "num_input_tokens_seen": 192041660, + "step": 8926, + "time_per_iteration": 2.5717601776123047 + }, + { + "auxiliary_loss_clip": 0.01119082, + "auxiliary_loss_mlp": 0.01110919, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00057852, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.3995911567211174, + "language_loss": 0.67036903, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69266909, + "num_input_tokens_seen": 192063540, + "step": 8927, + "time_per_iteration": 2.6843414306640625 + }, + { + "auxiliary_loss_clip": 0.01151682, + "auxiliary_loss_mlp": 0.01112486, + "balance_loss_clip": 1.00212741, + "balance_loss_mlp": 1.00052428, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 4.683536401186566, + "language_loss": 0.73860008, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.76124173, + "num_input_tokens_seen": 192081760, + "step": 8928, + "time_per_iteration": 3.9732348918914795 + }, + { + "auxiliary_loss_clip": 0.01134874, + "auxiliary_loss_mlp": 0.01110992, + "balance_loss_clip": 1.00198686, + "balance_loss_mlp": 1.0004611, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.7983784842488, + "language_loss": 0.63090634, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65336502, + "num_input_tokens_seen": 192101620, + "step": 8929, + "time_per_iteration": 2.675537347793579 + }, + { + "auxiliary_loss_clip": 0.0115028, + "auxiliary_loss_mlp": 0.011117, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.00059724, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.7520479409874234, + "language_loss": 0.65995222, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68257201, + "num_input_tokens_seen": 192121805, + "step": 8930, + "time_per_iteration": 2.6177594661712646 + }, + { + "auxiliary_loss_clip": 0.01100972, + "auxiliary_loss_mlp": 0.01110744, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.00049901, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4855401712474854, + "language_loss": 0.67114133, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.6932584, + "num_input_tokens_seen": 192141765, + "step": 8931, + "time_per_iteration": 2.7271640300750732 + }, + { + "auxiliary_loss_clip": 0.01105022, + "auxiliary_loss_mlp": 0.01111567, + "balance_loss_clip": 1.00214565, + "balance_loss_mlp": 1.00055933, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.7823991473224177, + "language_loss": 0.76022375, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.78238964, + "num_input_tokens_seen": 192161560, + "step": 8932, + "time_per_iteration": 2.7314019203186035 + }, + { + "auxiliary_loss_clip": 0.011264, + "auxiliary_loss_mlp": 0.01111312, + "balance_loss_clip": 1.00215411, + "balance_loss_mlp": 1.00059068, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 2.1534420778582812, + "language_loss": 0.65882385, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68120098, + "num_input_tokens_seen": 192180190, + "step": 8933, + "time_per_iteration": 2.729996681213379 + }, + { + "auxiliary_loss_clip": 0.01150015, + "auxiliary_loss_mlp": 0.00747331, + "balance_loss_clip": 1.00203717, + "balance_loss_mlp": 1.00034475, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.7488867269959754, + "language_loss": 0.830172, + "learning_rate": 1.856606505975565e-06, + "loss": 0.84914547, + "num_input_tokens_seen": 192198855, + "step": 8934, + "time_per_iteration": 2.593165397644043 + }, + { + "auxiliary_loss_clip": 0.01118153, + "auxiliary_loss_mlp": 0.01111034, + "balance_loss_clip": 1.00185072, + "balance_loss_mlp": 1.00059819, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 2.283248801655276, + "language_loss": 0.79902297, + "learning_rate": 1.856218049303999e-06, + "loss": 0.82131487, + "num_input_tokens_seen": 192216555, + "step": 8935, + "time_per_iteration": 2.610682964324951 + }, + { + "auxiliary_loss_clip": 0.01150298, + "auxiliary_loss_mlp": 0.011113, + "balance_loss_clip": 1.00205588, + "balance_loss_mlp": 1.00057876, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.696559521375054, + "language_loss": 0.83620667, + "learning_rate": 1.855829598084659e-06, + "loss": 0.85882264, + "num_input_tokens_seen": 192236910, + "step": 8936, + "time_per_iteration": 2.5934066772460938 + }, + { + "auxiliary_loss_clip": 0.01118436, + "auxiliary_loss_mlp": 0.01110631, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00057662, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.2450235519742436, + "language_loss": 0.72576129, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74805194, + "num_input_tokens_seen": 192260790, + "step": 8937, + "time_per_iteration": 2.8051750659942627 + }, + { + "auxiliary_loss_clip": 0.01135043, + "auxiliary_loss_mlp": 0.01111372, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00036466, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 1.971303021820752, + "language_loss": 0.81476176, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83722591, + "num_input_tokens_seen": 192277230, + "step": 8938, + "time_per_iteration": 2.5715091228485107 + }, + { + "auxiliary_loss_clip": 0.01167059, + "auxiliary_loss_mlp": 0.0111309, + "balance_loss_clip": 1.00210989, + "balance_loss_mlp": 1.00055647, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.3871369149170443, + "language_loss": 0.80748153, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.83028299, + "num_input_tokens_seen": 192292840, + "step": 8939, + "time_per_iteration": 2.516758441925049 + }, + { + "auxiliary_loss_clip": 0.01114062, + "auxiliary_loss_mlp": 0.01088541, + "balance_loss_clip": 1.00141788, + "balance_loss_mlp": 1.00003958, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7090347589769068, + "language_loss": 0.52487957, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54690564, + "num_input_tokens_seen": 192358240, + "step": 8940, + "time_per_iteration": 3.29364013671875 + }, + { + "auxiliary_loss_clip": 0.01119911, + "auxiliary_loss_mlp": 0.01110857, + "balance_loss_clip": 1.00201786, + "balance_loss_mlp": 1.00042188, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 1.7752200471495554, + "language_loss": 0.71699297, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73930067, + "num_input_tokens_seen": 192377370, + "step": 8941, + "time_per_iteration": 2.611952066421509 + }, + { + "auxiliary_loss_clip": 0.01135174, + "auxiliary_loss_mlp": 0.01110513, + "balance_loss_clip": 1.00207567, + "balance_loss_mlp": 1.00055408, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.6798689168576353, + "language_loss": 0.79456568, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81702244, + "num_input_tokens_seen": 192396450, + "step": 8942, + "time_per_iteration": 2.6434991359710693 + }, + { + "auxiliary_loss_clip": 0.0116706, + "auxiliary_loss_mlp": 0.01113045, + "balance_loss_clip": 1.00221229, + "balance_loss_mlp": 1.00060725, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.755428700268662, + "language_loss": 0.69987404, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72267509, + "num_input_tokens_seen": 192417390, + "step": 8943, + "time_per_iteration": 2.623300313949585 + }, + { + "auxiliary_loss_clip": 0.01145125, + "auxiliary_loss_mlp": 0.01088169, + "balance_loss_clip": 1.00148642, + "balance_loss_mlp": 1.00004983, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8063763295780064, + "language_loss": 0.59684443, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61917734, + "num_input_tokens_seen": 192478060, + "step": 8944, + "time_per_iteration": 3.1375887393951416 + }, + { + "auxiliary_loss_clip": 0.0110038, + "auxiliary_loss_mlp": 0.01112819, + "balance_loss_clip": 1.00178087, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.0950191477453206, + "language_loss": 0.78062975, + "learning_rate": 1.852333784891169e-06, + "loss": 0.80276179, + "num_input_tokens_seen": 192495985, + "step": 8945, + "time_per_iteration": 2.684455633163452 + }, + { + "auxiliary_loss_clip": 0.0115192, + "auxiliary_loss_mlp": 0.0111088, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.0006355, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 2.15926052742011, + "language_loss": 0.68343675, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70606482, + "num_input_tokens_seen": 192515445, + "step": 8946, + "time_per_iteration": 2.5703232288360596 + }, + { + "auxiliary_loss_clip": 0.011015, + "auxiliary_loss_mlp": 0.01111117, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.0006814, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.4755889157782163, + "language_loss": 0.77362126, + "learning_rate": 1.851556998731498e-06, + "loss": 0.7957474, + "num_input_tokens_seen": 192536530, + "step": 8947, + "time_per_iteration": 2.7124505043029785 + }, + { + "auxiliary_loss_clip": 0.01152085, + "auxiliary_loss_mlp": 0.01111567, + "balance_loss_clip": 1.00210464, + "balance_loss_mlp": 1.00055957, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.7343617949911567, + "language_loss": 0.60061026, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62324679, + "num_input_tokens_seen": 192556075, + "step": 8948, + "time_per_iteration": 2.5939390659332275 + }, + { + "auxiliary_loss_clip": 0.01120738, + "auxiliary_loss_mlp": 0.01111628, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.0006206, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.5520650978080954, + "language_loss": 0.79355872, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.81588233, + "num_input_tokens_seen": 192575535, + "step": 8949, + "time_per_iteration": 2.6312568187713623 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.0111084, + "balance_loss_clip": 1.00201023, + "balance_loss_mlp": 1.00069022, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.6498044300364278, + "language_loss": 0.77896053, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80126917, + "num_input_tokens_seen": 192594490, + "step": 8950, + "time_per_iteration": 2.683495283126831 + }, + { + "auxiliary_loss_clip": 0.01136226, + "auxiliary_loss_mlp": 0.01111225, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.00059891, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.5832551819110168, + "language_loss": 0.72681129, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74928582, + "num_input_tokens_seen": 192615650, + "step": 8951, + "time_per_iteration": 2.7432737350463867 + }, + { + "auxiliary_loss_clip": 0.01166846, + "auxiliary_loss_mlp": 0.00747438, + "balance_loss_clip": 1.00207436, + "balance_loss_mlp": 1.00036931, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.8735035229867405, + "language_loss": 0.75175911, + "learning_rate": 1.849615132097085e-06, + "loss": 0.77090192, + "num_input_tokens_seen": 192633840, + "step": 8952, + "time_per_iteration": 2.5190958976745605 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01111049, + "balance_loss_clip": 1.00199413, + "balance_loss_mlp": 1.0004226, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.9381476226543841, + "language_loss": 0.79317772, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81563896, + "num_input_tokens_seen": 192655890, + "step": 8953, + "time_per_iteration": 2.6408369541168213 + }, + { + "auxiliary_loss_clip": 0.01121442, + "auxiliary_loss_mlp": 0.01111597, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00049388, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 1.9349755421965094, + "language_loss": 0.80866247, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.83099288, + "num_input_tokens_seen": 192673025, + "step": 8954, + "time_per_iteration": 2.6244776248931885 + }, + { + "auxiliary_loss_clip": 0.01166886, + "auxiliary_loss_mlp": 0.01110872, + "balance_loss_clip": 1.00226235, + "balance_loss_mlp": 1.00053215, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.303056236673434, + "language_loss": 0.76367259, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78645015, + "num_input_tokens_seen": 192692190, + "step": 8955, + "time_per_iteration": 2.545314073562622 + }, + { + "auxiliary_loss_clip": 0.01133433, + "auxiliary_loss_mlp": 0.01111472, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00065541, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.8569075952000795, + "language_loss": 0.78142136, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80387044, + "num_input_tokens_seen": 192710380, + "step": 8956, + "time_per_iteration": 2.5983688831329346 + }, + { + "auxiliary_loss_clip": 0.01135233, + "auxiliary_loss_mlp": 0.01087933, + "balance_loss_clip": 1.00154471, + "balance_loss_mlp": 1.00019538, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8569761035183928, + "language_loss": 0.63420868, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65644038, + "num_input_tokens_seen": 192768995, + "step": 8957, + "time_per_iteration": 3.103518009185791 + }, + { + "auxiliary_loss_clip": 0.01098228, + "auxiliary_loss_mlp": 0.01088067, + "balance_loss_clip": 1.00139236, + "balance_loss_mlp": 0.99994755, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.725548477133178, + "language_loss": 0.51699829, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53886116, + "num_input_tokens_seen": 192825585, + "step": 8958, + "time_per_iteration": 4.602313280105591 + }, + { + "auxiliary_loss_clip": 0.01150313, + "auxiliary_loss_mlp": 0.01112331, + "balance_loss_clip": 1.00234365, + "balance_loss_mlp": 1.00046504, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.814489833320406, + "language_loss": 0.7742551, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79688156, + "num_input_tokens_seen": 192847335, + "step": 8959, + "time_per_iteration": 2.6139142513275146 + }, + { + "auxiliary_loss_clip": 0.01101972, + "auxiliary_loss_mlp": 0.01111608, + "balance_loss_clip": 1.00183618, + "balance_loss_mlp": 1.00050497, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.6877115841432806, + "language_loss": 0.83477432, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85691011, + "num_input_tokens_seen": 192862205, + "step": 8960, + "time_per_iteration": 2.631922721862793 + }, + { + "auxiliary_loss_clip": 0.01150218, + "auxiliary_loss_mlp": 0.01111184, + "balance_loss_clip": 1.00221658, + "balance_loss_mlp": 1.00055766, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.4996957502682449, + "language_loss": 0.78578174, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80839574, + "num_input_tokens_seen": 192883695, + "step": 8961, + "time_per_iteration": 4.052433013916016 + }, + { + "auxiliary_loss_clip": 0.0111868, + "auxiliary_loss_mlp": 0.01110618, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00056362, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.7086789928698285, + "language_loss": 0.84064293, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86293596, + "num_input_tokens_seen": 192900190, + "step": 8962, + "time_per_iteration": 4.030648469924927 + }, + { + "auxiliary_loss_clip": 0.01127873, + "auxiliary_loss_mlp": 0.01088123, + "balance_loss_clip": 1.001405, + "balance_loss_mlp": 1.0000031, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7317681070680822, + "language_loss": 0.54197526, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56413519, + "num_input_tokens_seen": 192958675, + "step": 8963, + "time_per_iteration": 3.0953915119171143 + }, + { + "auxiliary_loss_clip": 0.01128644, + "auxiliary_loss_mlp": 0.01088196, + "balance_loss_clip": 1.00146544, + "balance_loss_mlp": 1.00007629, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8048280612149379, + "language_loss": 0.63417363, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65634203, + "num_input_tokens_seen": 193033135, + "step": 8964, + "time_per_iteration": 3.2361230850219727 + }, + { + "auxiliary_loss_clip": 0.01087242, + "auxiliary_loss_mlp": 0.0111153, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.00042701, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.811928875077576, + "language_loss": 0.70085919, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72284687, + "num_input_tokens_seen": 193055570, + "step": 8965, + "time_per_iteration": 4.190657138824463 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.00747273, + "balance_loss_clip": 1.00220358, + "balance_loss_mlp": 1.00035489, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.3008335648031424, + "language_loss": 0.82010758, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.83893645, + "num_input_tokens_seen": 193073120, + "step": 8966, + "time_per_iteration": 2.611442804336548 + }, + { + "auxiliary_loss_clip": 0.0116673, + "auxiliary_loss_mlp": 0.01111796, + "balance_loss_clip": 1.00227237, + "balance_loss_mlp": 1.00059772, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 1.783424695436232, + "language_loss": 0.72199786, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74478316, + "num_input_tokens_seen": 193090105, + "step": 8967, + "time_per_iteration": 2.4949820041656494 + }, + { + "auxiliary_loss_clip": 0.01134896, + "auxiliary_loss_mlp": 0.01110396, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00043738, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.5962000179255509, + "language_loss": 0.8159368, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.83838969, + "num_input_tokens_seen": 193109325, + "step": 8968, + "time_per_iteration": 2.60832142829895 + }, + { + "auxiliary_loss_clip": 0.01118548, + "auxiliary_loss_mlp": 0.01111457, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00063968, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.849005462261561, + "language_loss": 0.73784924, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76014936, + "num_input_tokens_seen": 193130595, + "step": 8969, + "time_per_iteration": 2.6505091190338135 + }, + { + "auxiliary_loss_clip": 0.01120363, + "auxiliary_loss_mlp": 0.0074744, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00031328, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 1.697913918173344, + "language_loss": 0.81795388, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.83663189, + "num_input_tokens_seen": 193148930, + "step": 8970, + "time_per_iteration": 2.645944833755493 + }, + { + "auxiliary_loss_clip": 0.01133284, + "auxiliary_loss_mlp": 0.01110836, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00049615, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.5364267080592975, + "language_loss": 0.75359416, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77603543, + "num_input_tokens_seen": 193170140, + "step": 8971, + "time_per_iteration": 2.7158048152923584 + }, + { + "auxiliary_loss_clip": 0.01147066, + "auxiliary_loss_mlp": 0.01087741, + "balance_loss_clip": 1.00153673, + "balance_loss_mlp": 1.00000286, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8804788091928856, + "language_loss": 0.60281968, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62516773, + "num_input_tokens_seen": 193227235, + "step": 8972, + "time_per_iteration": 3.113128423690796 + }, + { + "auxiliary_loss_clip": 0.01152067, + "auxiliary_loss_mlp": 0.01111158, + "balance_loss_clip": 1.00215673, + "balance_loss_mlp": 1.00081801, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.4556281246660145, + "language_loss": 0.78567624, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80830848, + "num_input_tokens_seen": 193248435, + "step": 8973, + "time_per_iteration": 2.6077256202697754 + }, + { + "auxiliary_loss_clip": 0.01151347, + "auxiliary_loss_mlp": 0.01112593, + "balance_loss_clip": 1.00211406, + "balance_loss_mlp": 1.00063157, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 2.64961505528426, + "language_loss": 0.73922551, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.76186496, + "num_input_tokens_seen": 193267490, + "step": 8974, + "time_per_iteration": 2.595179319381714 + }, + { + "auxiliary_loss_clip": 0.0116177, + "auxiliary_loss_mlp": 0.01087688, + "balance_loss_clip": 1.00148916, + "balance_loss_mlp": 0.99995023, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7327177610183369, + "language_loss": 0.51039374, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53288829, + "num_input_tokens_seen": 193326050, + "step": 8975, + "time_per_iteration": 3.073871374130249 + }, + { + "auxiliary_loss_clip": 0.01152003, + "auxiliary_loss_mlp": 0.01111377, + "balance_loss_clip": 1.00219798, + "balance_loss_mlp": 1.0007503, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.526163852495118, + "language_loss": 0.72241044, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74504423, + "num_input_tokens_seen": 193348785, + "step": 8976, + "time_per_iteration": 2.6198112964630127 + }, + { + "auxiliary_loss_clip": 0.01151927, + "auxiliary_loss_mlp": 0.00747297, + "balance_loss_clip": 1.0021162, + "balance_loss_mlp": 1.000368, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 1.709153025200383, + "language_loss": 0.69927907, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71827137, + "num_input_tokens_seen": 193367080, + "step": 8977, + "time_per_iteration": 2.5876243114471436 + }, + { + "auxiliary_loss_clip": 0.01085327, + "auxiliary_loss_mlp": 0.0111184, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00045061, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 2.7674451198368555, + "language_loss": 0.72608554, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.74805725, + "num_input_tokens_seen": 193383715, + "step": 8978, + "time_per_iteration": 2.6864476203918457 + }, + { + "auxiliary_loss_clip": 0.01123027, + "auxiliary_loss_mlp": 0.01113113, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00067496, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 1.8160424063615892, + "language_loss": 0.73864186, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76100326, + "num_input_tokens_seen": 193400560, + "step": 8979, + "time_per_iteration": 2.6079981327056885 + }, + { + "auxiliary_loss_clip": 0.01090059, + "auxiliary_loss_mlp": 0.01112416, + "balance_loss_clip": 1.00210261, + "balance_loss_mlp": 1.00074077, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 1.8902234613053588, + "language_loss": 0.77001762, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79204237, + "num_input_tokens_seen": 193418680, + "step": 8980, + "time_per_iteration": 2.73464035987854 + }, + { + "auxiliary_loss_clip": 0.01166773, + "auxiliary_loss_mlp": 0.01111153, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.00052702, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.7019804100305616, + "language_loss": 0.82128918, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.84406841, + "num_input_tokens_seen": 193439310, + "step": 8981, + "time_per_iteration": 2.570544958114624 + }, + { + "auxiliary_loss_clip": 0.01151452, + "auxiliary_loss_mlp": 0.01112147, + "balance_loss_clip": 1.00206852, + "balance_loss_mlp": 1.00066292, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.7153800137725403, + "language_loss": 0.67026854, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.69290459, + "num_input_tokens_seen": 193458115, + "step": 8982, + "time_per_iteration": 2.5783188343048096 + }, + { + "auxiliary_loss_clip": 0.01119275, + "auxiliary_loss_mlp": 0.0074732, + "balance_loss_clip": 1.00195158, + "balance_loss_mlp": 1.0004468, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.6009658026712252, + "language_loss": 0.82768095, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84634686, + "num_input_tokens_seen": 193477365, + "step": 8983, + "time_per_iteration": 2.7642221450805664 + }, + { + "auxiliary_loss_clip": 0.01101987, + "auxiliary_loss_mlp": 0.01111417, + "balance_loss_clip": 1.00183856, + "balance_loss_mlp": 1.00059986, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.7460875682833166, + "language_loss": 0.70728141, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72941542, + "num_input_tokens_seen": 193495595, + "step": 8984, + "time_per_iteration": 2.6797170639038086 + }, + { + "auxiliary_loss_clip": 0.01166916, + "auxiliary_loss_mlp": 0.01112056, + "balance_loss_clip": 1.00223398, + "balance_loss_mlp": 1.00057161, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.9343855503282479, + "language_loss": 0.80380106, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82659078, + "num_input_tokens_seen": 193514035, + "step": 8985, + "time_per_iteration": 2.504638195037842 + }, + { + "auxiliary_loss_clip": 0.01118719, + "auxiliary_loss_mlp": 0.01109919, + "balance_loss_clip": 1.00194001, + "balance_loss_mlp": 1.00043678, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 1.4512301466806286, + "language_loss": 0.7908355, + "learning_rate": 1.83641431418363e-06, + "loss": 0.8131218, + "num_input_tokens_seen": 193535445, + "step": 8986, + "time_per_iteration": 2.7000339031219482 + }, + { + "auxiliary_loss_clip": 0.0115152, + "auxiliary_loss_mlp": 0.0111035, + "balance_loss_clip": 1.00199795, + "balance_loss_mlp": 1.00048685, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.5123554651049502, + "language_loss": 0.76591575, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.78853446, + "num_input_tokens_seen": 193554780, + "step": 8987, + "time_per_iteration": 2.5713579654693604 + }, + { + "auxiliary_loss_clip": 0.01133613, + "auxiliary_loss_mlp": 0.01111482, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00056994, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 2.521588764216759, + "language_loss": 0.71470392, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73715484, + "num_input_tokens_seen": 193573580, + "step": 8988, + "time_per_iteration": 2.57951283454895 + }, + { + "auxiliary_loss_clip": 0.01101285, + "auxiliary_loss_mlp": 0.01111664, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.00046551, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.4291412236287138, + "language_loss": 0.67838061, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.70051008, + "num_input_tokens_seen": 193590490, + "step": 8989, + "time_per_iteration": 2.715090274810791 + }, + { + "auxiliary_loss_clip": 0.0115025, + "auxiliary_loss_mlp": 0.01112192, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.00080276, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.4924122820686763, + "language_loss": 0.77994323, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.8025676, + "num_input_tokens_seen": 193609900, + "step": 8990, + "time_per_iteration": 2.5778839588165283 + }, + { + "auxiliary_loss_clip": 0.01149796, + "auxiliary_loss_mlp": 0.01110303, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00043988, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 2.201996135040462, + "language_loss": 0.69501674, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71761775, + "num_input_tokens_seen": 193629775, + "step": 8991, + "time_per_iteration": 2.5413362979888916 + }, + { + "auxiliary_loss_clip": 0.01088472, + "auxiliary_loss_mlp": 0.01110864, + "balance_loss_clip": 1.00193679, + "balance_loss_mlp": 1.00061929, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.9052646620269074, + "language_loss": 0.76251733, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78451073, + "num_input_tokens_seen": 193648070, + "step": 8992, + "time_per_iteration": 2.7113876342773438 + }, + { + "auxiliary_loss_clip": 0.01135127, + "auxiliary_loss_mlp": 0.01111284, + "balance_loss_clip": 1.002105, + "balance_loss_mlp": 1.00046682, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 8.012835524986999, + "language_loss": 0.75923568, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78169978, + "num_input_tokens_seen": 193665060, + "step": 8993, + "time_per_iteration": 2.581716537475586 + }, + { + "auxiliary_loss_clip": 0.0115172, + "auxiliary_loss_mlp": 0.01110879, + "balance_loss_clip": 1.00206721, + "balance_loss_mlp": 1.00063455, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.8729167908257027, + "language_loss": 0.70508307, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72770905, + "num_input_tokens_seen": 193683620, + "step": 8994, + "time_per_iteration": 3.979588270187378 + }, + { + "auxiliary_loss_clip": 0.01151928, + "auxiliary_loss_mlp": 0.01111656, + "balance_loss_clip": 1.00223517, + "balance_loss_mlp": 1.000458, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 1.7804047713446531, + "language_loss": 0.75386965, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.77650547, + "num_input_tokens_seen": 193702990, + "step": 8995, + "time_per_iteration": 2.5944221019744873 + }, + { + "auxiliary_loss_clip": 0.01149863, + "auxiliary_loss_mlp": 0.01110415, + "balance_loss_clip": 1.00204206, + "balance_loss_mlp": 1.00055218, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.8534482985813654, + "language_loss": 0.73383129, + "learning_rate": 1.832533059471282e-06, + "loss": 0.75643402, + "num_input_tokens_seen": 193721785, + "step": 8996, + "time_per_iteration": 2.5518805980682373 + }, + { + "auxiliary_loss_clip": 0.01103512, + "auxiliary_loss_mlp": 0.0111001, + "balance_loss_clip": 1.00195026, + "balance_loss_mlp": 1.00062335, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.770049948227515, + "language_loss": 0.73407745, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75621271, + "num_input_tokens_seen": 193740315, + "step": 8997, + "time_per_iteration": 2.7131950855255127 + }, + { + "auxiliary_loss_clip": 0.01166727, + "auxiliary_loss_mlp": 0.01111392, + "balance_loss_clip": 1.00212646, + "balance_loss_mlp": 1.00048006, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.1610361865195804, + "language_loss": 0.71441883, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.73720002, + "num_input_tokens_seen": 193757580, + "step": 8998, + "time_per_iteration": 2.54341983795166 + }, + { + "auxiliary_loss_clip": 0.01118068, + "auxiliary_loss_mlp": 0.01111449, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00053644, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.469002811198233, + "language_loss": 0.70406866, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.7263639, + "num_input_tokens_seen": 193780965, + "step": 8999, + "time_per_iteration": 4.209726333618164 + }, + { + "auxiliary_loss_clip": 0.01135619, + "auxiliary_loss_mlp": 0.01111501, + "balance_loss_clip": 1.00209093, + "balance_loss_mlp": 1.00058889, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.3837402202568096, + "language_loss": 0.80392361, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.8263948, + "num_input_tokens_seen": 193797855, + "step": 9000, + "time_per_iteration": 3.9574058055877686 + }, + { + "auxiliary_loss_clip": 0.01105541, + "auxiliary_loss_mlp": 0.01110839, + "balance_loss_clip": 1.00209522, + "balance_loss_mlp": 1.00068974, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 2.0420466337699525, + "language_loss": 0.73317409, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75533783, + "num_input_tokens_seen": 193817375, + "step": 9001, + "time_per_iteration": 2.7012441158294678 + }, + { + "auxiliary_loss_clip": 0.01119939, + "auxiliary_loss_mlp": 0.01111772, + "balance_loss_clip": 1.00195134, + "balance_loss_mlp": 1.000669, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.856968386330605, + "language_loss": 0.8535499, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87586701, + "num_input_tokens_seen": 193832205, + "step": 9002, + "time_per_iteration": 4.115927457809448 + }, + { + "auxiliary_loss_clip": 0.01103334, + "auxiliary_loss_mlp": 0.01110019, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00063276, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.719342424079215, + "language_loss": 0.77899534, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80112886, + "num_input_tokens_seen": 193849830, + "step": 9003, + "time_per_iteration": 2.6848838329315186 + }, + { + "auxiliary_loss_clip": 0.01149401, + "auxiliary_loss_mlp": 0.0111041, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00054705, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 1.6946919833794005, + "language_loss": 0.69736218, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71996027, + "num_input_tokens_seen": 193869945, + "step": 9004, + "time_per_iteration": 2.623762369155884 + }, + { + "auxiliary_loss_clip": 0.01145229, + "auxiliary_loss_mlp": 0.01087808, + "balance_loss_clip": 1.00168133, + "balance_loss_mlp": 1.0000701, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9694864181556851, + "language_loss": 0.59121352, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61354393, + "num_input_tokens_seen": 193930860, + "step": 9005, + "time_per_iteration": 3.2389614582061768 + }, + { + "auxiliary_loss_clip": 0.01166886, + "auxiliary_loss_mlp": 0.00747448, + "balance_loss_clip": 1.00217652, + "balance_loss_mlp": 1.00040102, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 2.005170433351581, + "language_loss": 0.78109193, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.80023527, + "num_input_tokens_seen": 193949075, + "step": 9006, + "time_per_iteration": 2.567321300506592 + }, + { + "auxiliary_loss_clip": 0.01135439, + "auxiliary_loss_mlp": 0.01109623, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.00071287, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.7147137724549326, + "language_loss": 0.83207846, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85452908, + "num_input_tokens_seen": 193967630, + "step": 9007, + "time_per_iteration": 2.6224148273468018 + }, + { + "auxiliary_loss_clip": 0.01150199, + "auxiliary_loss_mlp": 0.01111196, + "balance_loss_clip": 1.00199568, + "balance_loss_mlp": 1.00066507, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 1.8587338167119491, + "language_loss": 0.67190897, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69452298, + "num_input_tokens_seen": 193988730, + "step": 9008, + "time_per_iteration": 2.6306326389312744 + }, + { + "auxiliary_loss_clip": 0.01166863, + "auxiliary_loss_mlp": 0.01111355, + "balance_loss_clip": 1.00213087, + "balance_loss_mlp": 1.00072896, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 3.435981418717575, + "language_loss": 0.73616374, + "learning_rate": 1.827488379924234e-06, + "loss": 0.75894594, + "num_input_tokens_seen": 194005160, + "step": 9009, + "time_per_iteration": 2.5052058696746826 + }, + { + "auxiliary_loss_clip": 0.01100229, + "auxiliary_loss_mlp": 0.01111878, + "balance_loss_clip": 1.00189292, + "balance_loss_mlp": 1.00067925, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.242841208380999, + "language_loss": 0.8796947, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.90181577, + "num_input_tokens_seen": 194021700, + "step": 9010, + "time_per_iteration": 2.653945207595825 + }, + { + "auxiliary_loss_clip": 0.01166633, + "auxiliary_loss_mlp": 0.01110538, + "balance_loss_clip": 1.00215161, + "balance_loss_mlp": 1.00067484, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 2.6661150491713146, + "language_loss": 0.65242684, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67519855, + "num_input_tokens_seen": 194042620, + "step": 9011, + "time_per_iteration": 2.5818660259246826 + }, + { + "auxiliary_loss_clip": 0.01150022, + "auxiliary_loss_mlp": 0.01110314, + "balance_loss_clip": 1.00207305, + "balance_loss_mlp": 1.00064158, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 2.675325726778836, + "language_loss": 0.79513228, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81773567, + "num_input_tokens_seen": 194061800, + "step": 9012, + "time_per_iteration": 2.596156120300293 + }, + { + "auxiliary_loss_clip": 0.01166842, + "auxiliary_loss_mlp": 0.01110858, + "balance_loss_clip": 1.00220573, + "balance_loss_mlp": 1.00061297, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.9805657765339508, + "language_loss": 0.74657631, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76935327, + "num_input_tokens_seen": 194079890, + "step": 9013, + "time_per_iteration": 2.4924209117889404 + }, + { + "auxiliary_loss_clip": 0.01122973, + "auxiliary_loss_mlp": 0.01111642, + "balance_loss_clip": 1.0020988, + "balance_loss_mlp": 1.00063396, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 3.8388560521522157, + "language_loss": 0.7254836, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74782974, + "num_input_tokens_seen": 194097625, + "step": 9014, + "time_per_iteration": 2.6262660026550293 + }, + { + "auxiliary_loss_clip": 0.0113507, + "auxiliary_loss_mlp": 0.01110324, + "balance_loss_clip": 1.00200498, + "balance_loss_mlp": 1.00055599, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.491361216837668, + "language_loss": 0.80446434, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82691824, + "num_input_tokens_seen": 194116055, + "step": 9015, + "time_per_iteration": 2.593161106109619 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01111714, + "balance_loss_clip": 1.00204694, + "balance_loss_mlp": 1.00061059, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 2.1704181610515803, + "language_loss": 0.81102073, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83364183, + "num_input_tokens_seen": 194130365, + "step": 9016, + "time_per_iteration": 2.5664429664611816 + }, + { + "auxiliary_loss_clip": 0.01166598, + "auxiliary_loss_mlp": 0.01110395, + "balance_loss_clip": 1.00211477, + "balance_loss_mlp": 1.00053191, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 1.61426324117502, + "language_loss": 0.81411016, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83688009, + "num_input_tokens_seen": 194148975, + "step": 9017, + "time_per_iteration": 2.515152931213379 + }, + { + "auxiliary_loss_clip": 0.01166694, + "auxiliary_loss_mlp": 0.01111133, + "balance_loss_clip": 1.00224841, + "balance_loss_mlp": 1.00060189, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.9635503907314171, + "language_loss": 0.77456009, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79733837, + "num_input_tokens_seen": 194167185, + "step": 9018, + "time_per_iteration": 2.5143325328826904 + }, + { + "auxiliary_loss_clip": 0.01166711, + "auxiliary_loss_mlp": 0.01112216, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00073171, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.491323308873137, + "language_loss": 0.66723621, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.69002539, + "num_input_tokens_seen": 194192840, + "step": 9019, + "time_per_iteration": 2.748617649078369 + }, + { + "auxiliary_loss_clip": 0.01151498, + "auxiliary_loss_mlp": 0.01110204, + "balance_loss_clip": 1.00208426, + "balance_loss_mlp": 1.00053191, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.7273540383588017, + "language_loss": 0.69818592, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.72080296, + "num_input_tokens_seen": 194213150, + "step": 9020, + "time_per_iteration": 2.672433376312256 + }, + { + "auxiliary_loss_clip": 0.01117842, + "auxiliary_loss_mlp": 0.01110115, + "balance_loss_clip": 1.0020299, + "balance_loss_mlp": 1.00063264, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.6252504727249166, + "language_loss": 0.80545527, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82773483, + "num_input_tokens_seen": 194234665, + "step": 9021, + "time_per_iteration": 2.7165234088897705 + }, + { + "auxiliary_loss_clip": 0.01101137, + "auxiliary_loss_mlp": 0.01111005, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.00076008, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.7901404705076245, + "language_loss": 0.78656214, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80868357, + "num_input_tokens_seen": 194253790, + "step": 9022, + "time_per_iteration": 2.6878228187561035 + }, + { + "auxiliary_loss_clip": 0.01135432, + "auxiliary_loss_mlp": 0.00747358, + "balance_loss_clip": 1.00208926, + "balance_loss_mlp": 1.00042021, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 1.7030512804252453, + "language_loss": 0.81985432, + "learning_rate": 1.822056885403915e-06, + "loss": 0.83868223, + "num_input_tokens_seen": 194274950, + "step": 9023, + "time_per_iteration": 2.663355588912964 + }, + { + "auxiliary_loss_clip": 0.0115024, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_clip": 1.00212669, + "balance_loss_mlp": 1.00045395, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.8495017971350574, + "language_loss": 0.71487844, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73748493, + "num_input_tokens_seen": 194296155, + "step": 9024, + "time_per_iteration": 2.586909532546997 + }, + { + "auxiliary_loss_clip": 0.01151414, + "auxiliary_loss_mlp": 0.01110545, + "balance_loss_clip": 1.00212431, + "balance_loss_mlp": 1.0004909, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.5860784123125908, + "language_loss": 0.6507411, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67336065, + "num_input_tokens_seen": 194318025, + "step": 9025, + "time_per_iteration": 2.6366126537323 + }, + { + "auxiliary_loss_clip": 0.01119203, + "auxiliary_loss_mlp": 0.00747416, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00050843, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 1.8932327930253567, + "language_loss": 0.73617637, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.75484258, + "num_input_tokens_seen": 194336150, + "step": 9026, + "time_per_iteration": 2.658837080001831 + }, + { + "auxiliary_loss_clip": 0.01135176, + "auxiliary_loss_mlp": 0.01111382, + "balance_loss_clip": 1.00204217, + "balance_loss_mlp": 1.00056541, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 2.2626463437719653, + "language_loss": 0.78635901, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.8088246, + "num_input_tokens_seen": 194355980, + "step": 9027, + "time_per_iteration": 2.647803544998169 + }, + { + "auxiliary_loss_clip": 0.0111577, + "auxiliary_loss_mlp": 0.01087494, + "balance_loss_clip": 1.00156116, + "balance_loss_mlp": 1.00013757, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7527642496535039, + "language_loss": 0.56569147, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58772409, + "num_input_tokens_seen": 194422660, + "step": 9028, + "time_per_iteration": 3.2650156021118164 + }, + { + "auxiliary_loss_clip": 0.01103216, + "auxiliary_loss_mlp": 0.01111126, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00049996, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 1.8770941248054944, + "language_loss": 0.78062093, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80276436, + "num_input_tokens_seen": 194438545, + "step": 9029, + "time_per_iteration": 2.6839191913604736 + }, + { + "auxiliary_loss_clip": 0.0110686, + "auxiliary_loss_mlp": 0.01111066, + "balance_loss_clip": 1.00204015, + "balance_loss_mlp": 1.00053525, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.869021283063478, + "language_loss": 0.83072209, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85290134, + "num_input_tokens_seen": 194458060, + "step": 9030, + "time_per_iteration": 2.718017339706421 + }, + { + "auxiliary_loss_clip": 0.01166642, + "auxiliary_loss_mlp": 0.01110789, + "balance_loss_clip": 1.00218415, + "balance_loss_mlp": 1.00044918, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.690980879728648, + "language_loss": 0.74807942, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.77085376, + "num_input_tokens_seen": 194477405, + "step": 9031, + "time_per_iteration": 2.585087537765503 + }, + { + "auxiliary_loss_clip": 0.01151404, + "auxiliary_loss_mlp": 0.01109641, + "balance_loss_clip": 1.00214839, + "balance_loss_mlp": 1.00063562, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.9429355201878566, + "language_loss": 0.85462677, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87723726, + "num_input_tokens_seen": 194497085, + "step": 9032, + "time_per_iteration": 4.0070881843566895 + }, + { + "auxiliary_loss_clip": 0.01135282, + "auxiliary_loss_mlp": 0.01111499, + "balance_loss_clip": 1.0019846, + "balance_loss_mlp": 1.00068259, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 2.2708604312705165, + "language_loss": 0.73754454, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.76001239, + "num_input_tokens_seen": 194516785, + "step": 9033, + "time_per_iteration": 2.607074499130249 + }, + { + "auxiliary_loss_clip": 0.011168, + "auxiliary_loss_mlp": 0.01111174, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00073862, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.6629529813331787, + "language_loss": 0.75525165, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77753139, + "num_input_tokens_seen": 194536475, + "step": 9034, + "time_per_iteration": 2.6747403144836426 + }, + { + "auxiliary_loss_clip": 0.01116466, + "auxiliary_loss_mlp": 0.0110998, + "balance_loss_clip": 1.00184357, + "balance_loss_mlp": 1.00059342, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 2.4128355361796205, + "language_loss": 0.84212494, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86438942, + "num_input_tokens_seen": 194554495, + "step": 9035, + "time_per_iteration": 2.6227989196777344 + }, + { + "auxiliary_loss_clip": 0.01111364, + "auxiliary_loss_mlp": 0.01087493, + "balance_loss_clip": 1.0015043, + "balance_loss_mlp": 1.00013685, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7235366228883892, + "language_loss": 0.55873775, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.58072627, + "num_input_tokens_seen": 194617620, + "step": 9036, + "time_per_iteration": 4.671886444091797 + }, + { + "auxiliary_loss_clip": 0.01084997, + "auxiliary_loss_mlp": 0.01110926, + "balance_loss_clip": 1.00181305, + "balance_loss_mlp": 1.00049067, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.5399067843627037, + "language_loss": 0.7525869, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77454615, + "num_input_tokens_seen": 194637690, + "step": 9037, + "time_per_iteration": 4.138727188110352 + }, + { + "auxiliary_loss_clip": 0.01139157, + "auxiliary_loss_mlp": 0.01110356, + "balance_loss_clip": 1.00205576, + "balance_loss_mlp": 1.00058794, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.6151405205964076, + "language_loss": 0.66607547, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68857062, + "num_input_tokens_seen": 194659520, + "step": 9038, + "time_per_iteration": 2.708735466003418 + }, + { + "auxiliary_loss_clip": 0.01151311, + "auxiliary_loss_mlp": 0.01111132, + "balance_loss_clip": 1.00205457, + "balance_loss_mlp": 1.00050545, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.979704084820017, + "language_loss": 0.78329939, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80592382, + "num_input_tokens_seen": 194677645, + "step": 9039, + "time_per_iteration": 2.578627347946167 + }, + { + "auxiliary_loss_clip": 0.01125988, + "auxiliary_loss_mlp": 0.01111041, + "balance_loss_clip": 1.00211, + "balance_loss_mlp": 1.00060594, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 2.444495600320104, + "language_loss": 0.76224858, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78461891, + "num_input_tokens_seen": 194697400, + "step": 9040, + "time_per_iteration": 4.047276496887207 + }, + { + "auxiliary_loss_clip": 0.01128415, + "auxiliary_loss_mlp": 0.01087485, + "balance_loss_clip": 1.00149977, + "balance_loss_mlp": 1.00012827, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.67402395407052, + "language_loss": 0.52504325, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54720229, + "num_input_tokens_seen": 194761205, + "step": 9041, + "time_per_iteration": 3.1549971103668213 + }, + { + "auxiliary_loss_clip": 0.01133185, + "auxiliary_loss_mlp": 0.01110639, + "balance_loss_clip": 1.00198781, + "balance_loss_mlp": 1.00068045, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.5431236561971509, + "language_loss": 0.7604332, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78287148, + "num_input_tokens_seen": 194782445, + "step": 9042, + "time_per_iteration": 2.688885450363159 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01110496, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00072789, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.6635355872137787, + "language_loss": 0.67446476, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69673437, + "num_input_tokens_seen": 194800325, + "step": 9043, + "time_per_iteration": 2.6403281688690186 + }, + { + "auxiliary_loss_clip": 0.01119998, + "auxiliary_loss_mlp": 0.01110128, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00055051, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.542692564091702, + "language_loss": 0.84244132, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86474252, + "num_input_tokens_seen": 194818675, + "step": 9044, + "time_per_iteration": 2.6342718601226807 + }, + { + "auxiliary_loss_clip": 0.01166838, + "auxiliary_loss_mlp": 0.01110601, + "balance_loss_clip": 1.00217974, + "balance_loss_mlp": 1.00045145, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 2.8173464464888562, + "language_loss": 0.62001717, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64279157, + "num_input_tokens_seen": 194836595, + "step": 9045, + "time_per_iteration": 2.5974817276000977 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01110606, + "balance_loss_clip": 1.00223196, + "balance_loss_mlp": 1.00064707, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.6475964549102229, + "language_loss": 0.70181918, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72459292, + "num_input_tokens_seen": 194857520, + "step": 9046, + "time_per_iteration": 2.556384325027466 + }, + { + "auxiliary_loss_clip": 0.01166525, + "auxiliary_loss_mlp": 0.01109654, + "balance_loss_clip": 1.00217867, + "balance_loss_mlp": 1.00045824, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 1.5078676027509184, + "language_loss": 0.77216941, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79493117, + "num_input_tokens_seen": 194876020, + "step": 9047, + "time_per_iteration": 2.5239503383636475 + }, + { + "auxiliary_loss_clip": 0.01135177, + "auxiliary_loss_mlp": 0.01111512, + "balance_loss_clip": 1.00212848, + "balance_loss_mlp": 1.00079024, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.7838150759913929, + "language_loss": 0.72769707, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.75016403, + "num_input_tokens_seen": 194894650, + "step": 9048, + "time_per_iteration": 2.59721040725708 + }, + { + "auxiliary_loss_clip": 0.01106641, + "auxiliary_loss_mlp": 0.01110867, + "balance_loss_clip": 1.00205612, + "balance_loss_mlp": 1.00062251, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.246843412502415, + "language_loss": 0.93609393, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95826894, + "num_input_tokens_seen": 194911935, + "step": 9049, + "time_per_iteration": 2.645991802215576 + }, + { + "auxiliary_loss_clip": 0.01151673, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_clip": 1.00198579, + "balance_loss_mlp": 1.0006814, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.6632421105203674, + "language_loss": 0.74439466, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.7670092, + "num_input_tokens_seen": 194931620, + "step": 9050, + "time_per_iteration": 2.5900330543518066 + }, + { + "auxiliary_loss_clip": 0.01150256, + "auxiliary_loss_mlp": 0.01110169, + "balance_loss_clip": 1.00216758, + "balance_loss_mlp": 1.00059211, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 2.0742999485358484, + "language_loss": 0.66933727, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.6919415, + "num_input_tokens_seen": 194952560, + "step": 9051, + "time_per_iteration": 2.6061501502990723 + }, + { + "auxiliary_loss_clip": 0.01118247, + "auxiliary_loss_mlp": 0.01109738, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.00054204, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 2.041317607022306, + "language_loss": 0.67524368, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69752347, + "num_input_tokens_seen": 194973915, + "step": 9052, + "time_per_iteration": 2.738022804260254 + }, + { + "auxiliary_loss_clip": 0.01166653, + "auxiliary_loss_mlp": 0.01110609, + "balance_loss_clip": 1.00214958, + "balance_loss_mlp": 1.00065053, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.7676202711290279, + "language_loss": 0.9289459, + "learning_rate": 1.810422473773436e-06, + "loss": 0.95171857, + "num_input_tokens_seen": 194990170, + "step": 9053, + "time_per_iteration": 2.567021131515503 + }, + { + "auxiliary_loss_clip": 0.01139399, + "auxiliary_loss_mlp": 0.01111475, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.00075376, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 1.9348173516343923, + "language_loss": 0.8330186, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85552728, + "num_input_tokens_seen": 195006395, + "step": 9054, + "time_per_iteration": 2.602700710296631 + }, + { + "auxiliary_loss_clip": 0.011413, + "auxiliary_loss_mlp": 0.01110889, + "balance_loss_clip": 1.00236166, + "balance_loss_mlp": 1.00064385, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.076226954138241, + "language_loss": 0.68258071, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.70510262, + "num_input_tokens_seen": 195025080, + "step": 9055, + "time_per_iteration": 2.6160967350006104 + }, + { + "auxiliary_loss_clip": 0.01119235, + "auxiliary_loss_mlp": 0.01087139, + "balance_loss_clip": 1.00144315, + "balance_loss_mlp": 1.00016391, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.76376378486096, + "language_loss": 0.5763793, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59844303, + "num_input_tokens_seen": 195085725, + "step": 9056, + "time_per_iteration": 3.1998751163482666 + }, + { + "auxiliary_loss_clip": 0.01116594, + "auxiliary_loss_mlp": 0.01110714, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00046945, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 2.2543274181284363, + "language_loss": 0.69627059, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71854365, + "num_input_tokens_seen": 195102585, + "step": 9057, + "time_per_iteration": 2.6116323471069336 + }, + { + "auxiliary_loss_clip": 0.01149692, + "auxiliary_loss_mlp": 0.01109312, + "balance_loss_clip": 1.0019784, + "balance_loss_mlp": 1.0005933, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 1.8641405057824452, + "language_loss": 0.74599886, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.7685889, + "num_input_tokens_seen": 195120055, + "step": 9058, + "time_per_iteration": 2.717606782913208 + }, + { + "auxiliary_loss_clip": 0.01114128, + "auxiliary_loss_mlp": 0.01087087, + "balance_loss_clip": 1.00162506, + "balance_loss_mlp": 1.00011241, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7972465941661558, + "language_loss": 0.62674654, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64875865, + "num_input_tokens_seen": 195181045, + "step": 9059, + "time_per_iteration": 3.2859373092651367 + }, + { + "auxiliary_loss_clip": 0.01149876, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00050533, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.9004635862844526, + "language_loss": 0.79284966, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81544924, + "num_input_tokens_seen": 195198840, + "step": 9060, + "time_per_iteration": 2.5833234786987305 + }, + { + "auxiliary_loss_clip": 0.01151691, + "auxiliary_loss_mlp": 0.01109949, + "balance_loss_clip": 1.00206602, + "balance_loss_mlp": 1.00056291, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.8803491171701712, + "language_loss": 0.79409146, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81670785, + "num_input_tokens_seen": 195218720, + "step": 9061, + "time_per_iteration": 2.6214168071746826 + }, + { + "auxiliary_loss_clip": 0.01149968, + "auxiliary_loss_mlp": 0.01110332, + "balance_loss_clip": 1.00218821, + "balance_loss_mlp": 1.00056434, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 3.0089021055967837, + "language_loss": 0.87161183, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89421481, + "num_input_tokens_seen": 195235770, + "step": 9062, + "time_per_iteration": 2.574577569961548 + }, + { + "auxiliary_loss_clip": 0.01141001, + "auxiliary_loss_mlp": 0.01111907, + "balance_loss_clip": 1.0021323, + "balance_loss_mlp": 1.00051761, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.8940150305418206, + "language_loss": 0.8263675, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.8488965, + "num_input_tokens_seen": 195254870, + "step": 9063, + "time_per_iteration": 2.614699602127075 + }, + { + "auxiliary_loss_clip": 0.01166618, + "auxiliary_loss_mlp": 0.01111237, + "balance_loss_clip": 1.00212479, + "balance_loss_mlp": 1.00051546, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 2.647181195776577, + "language_loss": 0.63984901, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.66262758, + "num_input_tokens_seen": 195273390, + "step": 9064, + "time_per_iteration": 2.5342066287994385 + }, + { + "auxiliary_loss_clip": 0.01166717, + "auxiliary_loss_mlp": 0.01110697, + "balance_loss_clip": 1.0021224, + "balance_loss_mlp": 1.0006429, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.7595450336294265, + "language_loss": 0.79904044, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.82181454, + "num_input_tokens_seen": 195295635, + "step": 9065, + "time_per_iteration": 2.564779043197632 + }, + { + "auxiliary_loss_clip": 0.01117029, + "auxiliary_loss_mlp": 0.0110895, + "balance_loss_clip": 1.00184, + "balance_loss_mlp": 1.00061226, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 1.9268089742515246, + "language_loss": 0.78465772, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80691755, + "num_input_tokens_seen": 195312545, + "step": 9066, + "time_per_iteration": 2.6007230281829834 + }, + { + "auxiliary_loss_clip": 0.01156041, + "auxiliary_loss_mlp": 0.01110868, + "balance_loss_clip": 1.00209522, + "balance_loss_mlp": 1.00062323, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.7169668670020388, + "language_loss": 0.76207328, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.78474241, + "num_input_tokens_seen": 195332955, + "step": 9067, + "time_per_iteration": 2.6179964542388916 + }, + { + "auxiliary_loss_clip": 0.01117074, + "auxiliary_loss_mlp": 0.01111361, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00073481, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 2.2351372337405873, + "language_loss": 0.63631618, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65860057, + "num_input_tokens_seen": 195355930, + "step": 9068, + "time_per_iteration": 2.763289213180542 + }, + { + "auxiliary_loss_clip": 0.01123999, + "auxiliary_loss_mlp": 0.01110335, + "balance_loss_clip": 1.00212264, + "balance_loss_mlp": 1.00075829, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.5636192161035516, + "language_loss": 0.72040343, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74274677, + "num_input_tokens_seen": 195376445, + "step": 9069, + "time_per_iteration": 2.66941237449646 + }, + { + "auxiliary_loss_clip": 0.01166504, + "auxiliary_loss_mlp": 0.01109967, + "balance_loss_clip": 1.00216556, + "balance_loss_mlp": 1.00048542, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 2.1038518970672997, + "language_loss": 0.73433352, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.7570982, + "num_input_tokens_seen": 195393725, + "step": 9070, + "time_per_iteration": 3.842594623565674 + }, + { + "auxiliary_loss_clip": 0.01150016, + "auxiliary_loss_mlp": 0.01110424, + "balance_loss_clip": 1.0021069, + "balance_loss_mlp": 1.00056124, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 2.174861654172096, + "language_loss": 0.60755515, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.63015956, + "num_input_tokens_seen": 195411380, + "step": 9071, + "time_per_iteration": 2.586587429046631 + }, + { + "auxiliary_loss_clip": 0.0116177, + "auxiliary_loss_mlp": 0.01087014, + "balance_loss_clip": 1.00162005, + "balance_loss_mlp": 1.00003898, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.698981668513708, + "language_loss": 0.57146949, + "learning_rate": 1.80305733435899e-06, + "loss": 0.5939573, + "num_input_tokens_seen": 195482015, + "step": 9072, + "time_per_iteration": 3.1963415145874023 + }, + { + "auxiliary_loss_clip": 0.01134619, + "auxiliary_loss_mlp": 0.01110419, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.00065088, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 2.071530825439696, + "language_loss": 0.6998719, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72232223, + "num_input_tokens_seen": 195500440, + "step": 9073, + "time_per_iteration": 2.558884620666504 + }, + { + "auxiliary_loss_clip": 0.01134386, + "auxiliary_loss_mlp": 0.0110935, + "balance_loss_clip": 1.00193787, + "balance_loss_mlp": 1.00072598, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.9165314761388736, + "language_loss": 0.7148515, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73728883, + "num_input_tokens_seen": 195520860, + "step": 9074, + "time_per_iteration": 4.118977785110474 + }, + { + "auxiliary_loss_clip": 0.01149857, + "auxiliary_loss_mlp": 0.01110107, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.00062573, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 1.8463477945610673, + "language_loss": 0.68842953, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.71102917, + "num_input_tokens_seen": 195538615, + "step": 9075, + "time_per_iteration": 3.9726951122283936 + }, + { + "auxiliary_loss_clip": 0.0115013, + "auxiliary_loss_mlp": 0.01109732, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00053608, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 1.6840081375634695, + "language_loss": 0.81319511, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.83579379, + "num_input_tokens_seen": 195557460, + "step": 9076, + "time_per_iteration": 2.5637311935424805 + }, + { + "auxiliary_loss_clip": 0.01151173, + "auxiliary_loss_mlp": 0.01110574, + "balance_loss_clip": 1.00208676, + "balance_loss_mlp": 1.00061584, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.7747136730938848, + "language_loss": 0.80388361, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82650107, + "num_input_tokens_seen": 195577985, + "step": 9077, + "time_per_iteration": 2.600268840789795 + }, + { + "auxiliary_loss_clip": 0.01155799, + "auxiliary_loss_mlp": 0.01109656, + "balance_loss_clip": 1.00217056, + "balance_loss_mlp": 1.00055611, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.7277905401076064, + "language_loss": 0.67689836, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69955283, + "num_input_tokens_seen": 195597620, + "step": 9078, + "time_per_iteration": 4.041167974472046 + }, + { + "auxiliary_loss_clip": 0.01151397, + "auxiliary_loss_mlp": 0.01111279, + "balance_loss_clip": 1.00218511, + "balance_loss_mlp": 1.00055707, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 2.4487513096918008, + "language_loss": 0.80934036, + "learning_rate": 1.800344536188764e-06, + "loss": 0.83196723, + "num_input_tokens_seen": 195615910, + "step": 9079, + "time_per_iteration": 2.628215789794922 + }, + { + "auxiliary_loss_clip": 0.01166773, + "auxiliary_loss_mlp": 0.01111248, + "balance_loss_clip": 1.00212634, + "balance_loss_mlp": 1.00062203, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 4.468523210673498, + "language_loss": 0.75592387, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77870417, + "num_input_tokens_seen": 195635620, + "step": 9080, + "time_per_iteration": 2.537191390991211 + }, + { + "auxiliary_loss_clip": 0.01118738, + "auxiliary_loss_mlp": 0.01111032, + "balance_loss_clip": 1.0020138, + "balance_loss_mlp": 1.00069141, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 1.9941106929925934, + "language_loss": 0.83899385, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.86129153, + "num_input_tokens_seen": 195652495, + "step": 9081, + "time_per_iteration": 2.688756227493286 + }, + { + "auxiliary_loss_clip": 0.01166871, + "auxiliary_loss_mlp": 0.01111775, + "balance_loss_clip": 1.00224638, + "balance_loss_mlp": 1.00057638, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.6347498752782876, + "language_loss": 0.70248896, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72527546, + "num_input_tokens_seen": 195671965, + "step": 9082, + "time_per_iteration": 2.540616750717163 + }, + { + "auxiliary_loss_clip": 0.01166485, + "auxiliary_loss_mlp": 0.01109827, + "balance_loss_clip": 1.00211346, + "balance_loss_mlp": 1.0004406, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.633505348857292, + "language_loss": 0.66608131, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68884444, + "num_input_tokens_seen": 195694725, + "step": 9083, + "time_per_iteration": 2.6503751277923584 + }, + { + "auxiliary_loss_clip": 0.01133083, + "auxiliary_loss_mlp": 0.01109689, + "balance_loss_clip": 1.00204408, + "balance_loss_mlp": 1.0005883, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.648976509237895, + "language_loss": 0.79465556, + "learning_rate": 1.798407050044766e-06, + "loss": 0.8170833, + "num_input_tokens_seen": 195714090, + "step": 9084, + "time_per_iteration": 2.6329333782196045 + }, + { + "auxiliary_loss_clip": 0.01149956, + "auxiliary_loss_mlp": 0.01111151, + "balance_loss_clip": 1.00201654, + "balance_loss_mlp": 1.00061965, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 1.703456361332396, + "language_loss": 0.74854678, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77115786, + "num_input_tokens_seen": 195733585, + "step": 9085, + "time_per_iteration": 2.548865795135498 + }, + { + "auxiliary_loss_clip": 0.01135211, + "auxiliary_loss_mlp": 0.0111035, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.00067782, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 2.9947539988694634, + "language_loss": 0.74853182, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.77098739, + "num_input_tokens_seen": 195752820, + "step": 9086, + "time_per_iteration": 2.641270399093628 + }, + { + "auxiliary_loss_clip": 0.01150167, + "auxiliary_loss_mlp": 0.01109379, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00046945, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.4155955430613891, + "language_loss": 0.76852715, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79112256, + "num_input_tokens_seen": 195773740, + "step": 9087, + "time_per_iteration": 2.594252824783325 + }, + { + "auxiliary_loss_clip": 0.01149843, + "auxiliary_loss_mlp": 0.01111223, + "balance_loss_clip": 1.00208688, + "balance_loss_mlp": 1.00059664, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.6066938222833405, + "language_loss": 0.77730632, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79991698, + "num_input_tokens_seen": 195792125, + "step": 9088, + "time_per_iteration": 2.556875705718994 + }, + { + "auxiliary_loss_clip": 0.01066712, + "auxiliary_loss_mlp": 0.0108737, + "balance_loss_clip": 1.00162637, + "balance_loss_mlp": 1.00001323, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7234863766774343, + "language_loss": 0.57724845, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59878927, + "num_input_tokens_seen": 195854935, + "step": 9089, + "time_per_iteration": 3.5295681953430176 + }, + { + "auxiliary_loss_clip": 0.01119594, + "auxiliary_loss_mlp": 0.01110937, + "balance_loss_clip": 1.00206876, + "balance_loss_mlp": 1.00050187, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.7193691071559598, + "language_loss": 0.76706636, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78937161, + "num_input_tokens_seen": 195874715, + "step": 9090, + "time_per_iteration": 3.1611297130584717 + }, + { + "auxiliary_loss_clip": 0.01152017, + "auxiliary_loss_mlp": 0.01111423, + "balance_loss_clip": 1.00207734, + "balance_loss_mlp": 1.0006063, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 2.0029809685345947, + "language_loss": 0.74224001, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.7648744, + "num_input_tokens_seen": 195892610, + "step": 9091, + "time_per_iteration": 2.5737338066101074 + }, + { + "auxiliary_loss_clip": 0.01135177, + "auxiliary_loss_mlp": 0.01111576, + "balance_loss_clip": 1.00214398, + "balance_loss_mlp": 1.00066388, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.6416348073748672, + "language_loss": 0.77960992, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80207747, + "num_input_tokens_seen": 195911085, + "step": 9092, + "time_per_iteration": 2.640610456466675 + }, + { + "auxiliary_loss_clip": 0.01166734, + "auxiliary_loss_mlp": 0.01111372, + "balance_loss_clip": 1.00215077, + "balance_loss_mlp": 1.00045919, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 2.1034297049450887, + "language_loss": 0.75342703, + "learning_rate": 1.794920057818476e-06, + "loss": 0.77620804, + "num_input_tokens_seen": 195929845, + "step": 9093, + "time_per_iteration": 2.4961724281311035 + }, + { + "auxiliary_loss_clip": 0.01151415, + "auxiliary_loss_mlp": 0.01111379, + "balance_loss_clip": 1.00220072, + "balance_loss_mlp": 1.00046694, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 2.275074745906236, + "language_loss": 0.68716055, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.7097885, + "num_input_tokens_seen": 195946350, + "step": 9094, + "time_per_iteration": 2.5263779163360596 + }, + { + "auxiliary_loss_clip": 0.01133325, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_clip": 1.00207877, + "balance_loss_mlp": 1.00065482, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 4.383727669595265, + "language_loss": 0.67803299, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70047241, + "num_input_tokens_seen": 195959840, + "step": 9095, + "time_per_iteration": 2.6099331378936768 + }, + { + "auxiliary_loss_clip": 0.01117942, + "auxiliary_loss_mlp": 0.01110972, + "balance_loss_clip": 1.00197339, + "balance_loss_mlp": 1.00072682, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.7112917169155883, + "language_loss": 0.66503894, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.6873281, + "num_input_tokens_seen": 195981125, + "step": 9096, + "time_per_iteration": 2.7047014236450195 + }, + { + "auxiliary_loss_clip": 0.01115798, + "auxiliary_loss_mlp": 0.01087429, + "balance_loss_clip": 1.00146246, + "balance_loss_mlp": 1.00007296, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7416605076867663, + "language_loss": 0.57534611, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59737837, + "num_input_tokens_seen": 196038880, + "step": 9097, + "time_per_iteration": 3.332507371902466 + }, + { + "auxiliary_loss_clip": 0.0114538, + "auxiliary_loss_mlp": 0.01086997, + "balance_loss_clip": 1.00162244, + "balance_loss_mlp": 1.00002193, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9188089558264865, + "language_loss": 0.64850396, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.67082775, + "num_input_tokens_seen": 196099215, + "step": 9098, + "time_per_iteration": 3.0629124641418457 + }, + { + "auxiliary_loss_clip": 0.01151868, + "auxiliary_loss_mlp": 0.01111527, + "balance_loss_clip": 1.00210142, + "balance_loss_mlp": 1.00071001, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.58218127309683, + "language_loss": 0.73419607, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75682998, + "num_input_tokens_seen": 196120370, + "step": 9099, + "time_per_iteration": 2.6155455112457275 + }, + { + "auxiliary_loss_clip": 0.01133285, + "auxiliary_loss_mlp": 0.0111023, + "balance_loss_clip": 1.00211811, + "balance_loss_mlp": 1.00055707, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.8895287192488845, + "language_loss": 0.72367167, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74610686, + "num_input_tokens_seen": 196139075, + "step": 9100, + "time_per_iteration": 2.6736159324645996 + }, + { + "auxiliary_loss_clip": 0.01149845, + "auxiliary_loss_mlp": 0.00747247, + "balance_loss_clip": 1.00206017, + "balance_loss_mlp": 1.00031006, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 2.0629715439711256, + "language_loss": 0.68225205, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.70122296, + "num_input_tokens_seen": 196159990, + "step": 9101, + "time_per_iteration": 2.7028074264526367 + }, + { + "auxiliary_loss_clip": 0.01166597, + "auxiliary_loss_mlp": 0.01109953, + "balance_loss_clip": 1.00218344, + "balance_loss_mlp": 1.00056648, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.9077804336469173, + "language_loss": 0.78341752, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80618298, + "num_input_tokens_seen": 196180570, + "step": 9102, + "time_per_iteration": 2.596351146697998 + }, + { + "auxiliary_loss_clip": 0.01119596, + "auxiliary_loss_mlp": 0.01110041, + "balance_loss_clip": 1.00207686, + "balance_loss_mlp": 1.00065446, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.4463525412366929, + "language_loss": 0.72190249, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74419892, + "num_input_tokens_seen": 196200300, + "step": 9103, + "time_per_iteration": 2.728769540786743 + }, + { + "auxiliary_loss_clip": 0.01139606, + "auxiliary_loss_mlp": 0.0111035, + "balance_loss_clip": 1.00213826, + "balance_loss_mlp": 1.00048685, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.3649301240451577, + "language_loss": 0.65644598, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67894548, + "num_input_tokens_seen": 196228525, + "step": 9104, + "time_per_iteration": 2.970609426498413 + }, + { + "auxiliary_loss_clip": 0.01150372, + "auxiliary_loss_mlp": 0.01111396, + "balance_loss_clip": 1.00214624, + "balance_loss_mlp": 1.00057888, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.9758581595950688, + "language_loss": 0.81650227, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83911991, + "num_input_tokens_seen": 196247690, + "step": 9105, + "time_per_iteration": 2.546107530593872 + }, + { + "auxiliary_loss_clip": 0.01166487, + "auxiliary_loss_mlp": 0.01109806, + "balance_loss_clip": 1.00217462, + "balance_loss_mlp": 1.00051522, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.5626702569856843, + "language_loss": 0.80268651, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82544947, + "num_input_tokens_seen": 196268555, + "step": 9106, + "time_per_iteration": 2.546884059906006 + }, + { + "auxiliary_loss_clip": 0.01150921, + "auxiliary_loss_mlp": 0.01110047, + "balance_loss_clip": 1.00215483, + "balance_loss_mlp": 1.00066066, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 2.436224130678619, + "language_loss": 0.69474202, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71735168, + "num_input_tokens_seen": 196285585, + "step": 9107, + "time_per_iteration": 2.5333309173583984 + }, + { + "auxiliary_loss_clip": 0.0115136, + "auxiliary_loss_mlp": 0.01111108, + "balance_loss_clip": 1.00211298, + "balance_loss_mlp": 1.00038671, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 2.027948435686945, + "language_loss": 0.63440841, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65703309, + "num_input_tokens_seen": 196305085, + "step": 9108, + "time_per_iteration": 4.265549898147583 + }, + { + "auxiliary_loss_clip": 0.01166577, + "auxiliary_loss_mlp": 0.0110958, + "balance_loss_clip": 1.00221419, + "balance_loss_mlp": 1.00038457, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.8243020386407465, + "language_loss": 0.75073773, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77349931, + "num_input_tokens_seen": 196323945, + "step": 9109, + "time_per_iteration": 2.531809091567993 + }, + { + "auxiliary_loss_clip": 0.01139295, + "auxiliary_loss_mlp": 0.01110156, + "balance_loss_clip": 1.00216103, + "balance_loss_mlp": 1.00057876, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.7943720756151291, + "language_loss": 0.77675897, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79925346, + "num_input_tokens_seen": 196342200, + "step": 9110, + "time_per_iteration": 2.576613664627075 + }, + { + "auxiliary_loss_clip": 0.01149432, + "auxiliary_loss_mlp": 0.01109832, + "balance_loss_clip": 1.0019542, + "balance_loss_mlp": 1.00073123, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.5022058902870234, + "language_loss": 0.70942998, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73202252, + "num_input_tokens_seen": 196362940, + "step": 9111, + "time_per_iteration": 2.6171228885650635 + }, + { + "auxiliary_loss_clip": 0.01149945, + "auxiliary_loss_mlp": 0.01109974, + "balance_loss_clip": 1.00204945, + "balance_loss_mlp": 1.00058746, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.5426508190427206, + "language_loss": 0.71399558, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.7365948, + "num_input_tokens_seen": 196383070, + "step": 9112, + "time_per_iteration": 4.094215393066406 + }, + { + "auxiliary_loss_clip": 0.01105773, + "auxiliary_loss_mlp": 0.01111112, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.00067616, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.1613825851989743, + "language_loss": 0.87863892, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90080774, + "num_input_tokens_seen": 196398485, + "step": 9113, + "time_per_iteration": 4.270246744155884 + }, + { + "auxiliary_loss_clip": 0.01086932, + "auxiliary_loss_mlp": 0.01109596, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.0004003, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.5902964511940765, + "language_loss": 0.72979689, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75176215, + "num_input_tokens_seen": 196417725, + "step": 9114, + "time_per_iteration": 2.922332763671875 + }, + { + "auxiliary_loss_clip": 0.01135114, + "auxiliary_loss_mlp": 0.00747149, + "balance_loss_clip": 1.00212538, + "balance_loss_mlp": 1.00024366, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.5622994768775749, + "language_loss": 0.71992004, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.73874265, + "num_input_tokens_seen": 196437840, + "step": 9115, + "time_per_iteration": 2.6731178760528564 + }, + { + "auxiliary_loss_clip": 0.01118243, + "auxiliary_loss_mlp": 0.00747321, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00026917, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 6.073808344024177, + "language_loss": 0.72318965, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74184531, + "num_input_tokens_seen": 196457300, + "step": 9116, + "time_per_iteration": 4.078898668289185 + }, + { + "auxiliary_loss_clip": 0.01135059, + "auxiliary_loss_mlp": 0.01109985, + "balance_loss_clip": 1.00227237, + "balance_loss_mlp": 1.00069392, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 2.6681412197186622, + "language_loss": 0.76782793, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.79027838, + "num_input_tokens_seen": 196476720, + "step": 9117, + "time_per_iteration": 2.6452460289001465 + }, + { + "auxiliary_loss_clip": 0.01120546, + "auxiliary_loss_mlp": 0.01109605, + "balance_loss_clip": 1.00196803, + "balance_loss_mlp": 1.00060046, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 2.11861826439263, + "language_loss": 0.62948453, + "learning_rate": 1.785237306671674e-06, + "loss": 0.65178603, + "num_input_tokens_seen": 196496765, + "step": 9118, + "time_per_iteration": 2.750896692276001 + }, + { + "auxiliary_loss_clip": 0.01166747, + "auxiliary_loss_mlp": 0.01111116, + "balance_loss_clip": 1.00224853, + "balance_loss_mlp": 1.00058472, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 1.6587648053800221, + "language_loss": 0.79083288, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81361151, + "num_input_tokens_seen": 196516220, + "step": 9119, + "time_per_iteration": 2.5264360904693604 + }, + { + "auxiliary_loss_clip": 0.0113304, + "auxiliary_loss_mlp": 0.0074718, + "balance_loss_clip": 1.00208354, + "balance_loss_mlp": 1.00030243, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.645906862680198, + "language_loss": 0.82147717, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84027934, + "num_input_tokens_seen": 196533860, + "step": 9120, + "time_per_iteration": 2.626786231994629 + }, + { + "auxiliary_loss_clip": 0.0111648, + "auxiliary_loss_mlp": 0.01110811, + "balance_loss_clip": 1.00200975, + "balance_loss_mlp": 1.00066161, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.9596887226697208, + "language_loss": 0.80329072, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82556361, + "num_input_tokens_seen": 196551305, + "step": 9121, + "time_per_iteration": 2.662506580352783 + }, + { + "auxiliary_loss_clip": 0.0110269, + "auxiliary_loss_mlp": 0.01110924, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00058389, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 1.716772945973335, + "language_loss": 0.61229396, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63443005, + "num_input_tokens_seen": 196569420, + "step": 9122, + "time_per_iteration": 2.727766752243042 + }, + { + "auxiliary_loss_clip": 0.01134646, + "auxiliary_loss_mlp": 0.01110093, + "balance_loss_clip": 1.00208139, + "balance_loss_mlp": 1.0007062, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.5777921183917807, + "language_loss": 0.71250284, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.73495024, + "num_input_tokens_seen": 196590610, + "step": 9123, + "time_per_iteration": 2.644308567047119 + }, + { + "auxiliary_loss_clip": 0.01166629, + "auxiliary_loss_mlp": 0.01110392, + "balance_loss_clip": 1.00224972, + "balance_loss_mlp": 1.00052905, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 2.205741955155299, + "language_loss": 0.83183229, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85460258, + "num_input_tokens_seen": 196606495, + "step": 9124, + "time_per_iteration": 2.497861385345459 + }, + { + "auxiliary_loss_clip": 0.01133735, + "auxiliary_loss_mlp": 0.01109571, + "balance_loss_clip": 1.00225234, + "balance_loss_mlp": 1.00056624, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.770082370137387, + "language_loss": 0.80503333, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82746637, + "num_input_tokens_seen": 196626365, + "step": 9125, + "time_per_iteration": 2.675956964492798 + }, + { + "auxiliary_loss_clip": 0.01150019, + "auxiliary_loss_mlp": 0.01110309, + "balance_loss_clip": 1.00223851, + "balance_loss_mlp": 1.00054073, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 2.031459577918804, + "language_loss": 0.74754131, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.77014458, + "num_input_tokens_seen": 196644465, + "step": 9126, + "time_per_iteration": 2.5346832275390625 + }, + { + "auxiliary_loss_clip": 0.01151708, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_clip": 1.00216866, + "balance_loss_mlp": 1.00047219, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.255670292165032, + "language_loss": 0.66723335, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.68986332, + "num_input_tokens_seen": 196659160, + "step": 9127, + "time_per_iteration": 2.531158685684204 + }, + { + "auxiliary_loss_clip": 0.01124081, + "auxiliary_loss_mlp": 0.01111007, + "balance_loss_clip": 1.00228024, + "balance_loss_mlp": 1.00066721, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.6768127418299088, + "language_loss": 0.82962567, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85197651, + "num_input_tokens_seen": 196677410, + "step": 9128, + "time_per_iteration": 2.6154630184173584 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01109987, + "balance_loss_clip": 1.00203156, + "balance_loss_mlp": 1.00050497, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.7882584374312929, + "language_loss": 0.74387026, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76617193, + "num_input_tokens_seen": 196696765, + "step": 9129, + "time_per_iteration": 2.620694637298584 + }, + { + "auxiliary_loss_clip": 0.01120846, + "auxiliary_loss_mlp": 0.01112224, + "balance_loss_clip": 1.00216269, + "balance_loss_mlp": 1.00054872, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 3.010039893055786, + "language_loss": 0.63913149, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.66146219, + "num_input_tokens_seen": 196714895, + "step": 9130, + "time_per_iteration": 2.6871418952941895 + }, + { + "auxiliary_loss_clip": 0.01166702, + "auxiliary_loss_mlp": 0.00747212, + "balance_loss_clip": 1.00218832, + "balance_loss_mlp": 1.00025058, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 1.9264094793281261, + "language_loss": 0.6305716, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.64971077, + "num_input_tokens_seen": 196735510, + "step": 9131, + "time_per_iteration": 2.576594114303589 + }, + { + "auxiliary_loss_clip": 0.01151913, + "auxiliary_loss_mlp": 0.01111606, + "balance_loss_clip": 1.00217056, + "balance_loss_mlp": 1.00050282, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.8113301071003316, + "language_loss": 0.74138743, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76402259, + "num_input_tokens_seen": 196752855, + "step": 9132, + "time_per_iteration": 2.547079086303711 + }, + { + "auxiliary_loss_clip": 0.01151675, + "auxiliary_loss_mlp": 0.01110631, + "balance_loss_clip": 1.00208592, + "balance_loss_mlp": 1.00048161, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.520364568869551, + "language_loss": 0.81499696, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83762002, + "num_input_tokens_seen": 196772230, + "step": 9133, + "time_per_iteration": 2.5952301025390625 + }, + { + "auxiliary_loss_clip": 0.01133051, + "auxiliary_loss_mlp": 0.00747212, + "balance_loss_clip": 1.0020206, + "balance_loss_mlp": 1.00024438, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 2.397852908883095, + "language_loss": 0.70191896, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72072154, + "num_input_tokens_seen": 196790405, + "step": 9134, + "time_per_iteration": 2.622466564178467 + }, + { + "auxiliary_loss_clip": 0.01123737, + "auxiliary_loss_mlp": 0.01110331, + "balance_loss_clip": 1.00198293, + "balance_loss_mlp": 1.00056326, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 2.824569192361487, + "language_loss": 0.61276138, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63510215, + "num_input_tokens_seen": 196813785, + "step": 9135, + "time_per_iteration": 2.8932149410247803 + }, + { + "auxiliary_loss_clip": 0.01149919, + "auxiliary_loss_mlp": 0.01111637, + "balance_loss_clip": 1.00210249, + "balance_loss_mlp": 1.0005343, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 2.8654038067665883, + "language_loss": 0.72151804, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74413359, + "num_input_tokens_seen": 196834390, + "step": 9136, + "time_per_iteration": 2.7170212268829346 + }, + { + "auxiliary_loss_clip": 0.01084797, + "auxiliary_loss_mlp": 0.01111281, + "balance_loss_clip": 1.00175309, + "balance_loss_mlp": 1.00055969, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 3.076084213813439, + "language_loss": 0.67937666, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.7013374, + "num_input_tokens_seen": 196853290, + "step": 9137, + "time_per_iteration": 2.766972303390503 + }, + { + "auxiliary_loss_clip": 0.01146287, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_clip": 1.001688, + "balance_loss_mlp": 0.99996102, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.7428947483794839, + "language_loss": 0.65286791, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67519641, + "num_input_tokens_seen": 196913120, + "step": 9138, + "time_per_iteration": 3.2097742557525635 + }, + { + "auxiliary_loss_clip": 0.01149702, + "auxiliary_loss_mlp": 0.01110535, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00048161, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.925758912401427, + "language_loss": 0.75276107, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.77536345, + "num_input_tokens_seen": 196931530, + "step": 9139, + "time_per_iteration": 2.569967269897461 + }, + { + "auxiliary_loss_clip": 0.01149755, + "auxiliary_loss_mlp": 0.01110053, + "balance_loss_clip": 1.00205874, + "balance_loss_mlp": 1.00047529, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.973367618702643, + "language_loss": 0.71481025, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73740828, + "num_input_tokens_seen": 196949430, + "step": 9140, + "time_per_iteration": 2.661569595336914 + }, + { + "auxiliary_loss_clip": 0.01135249, + "auxiliary_loss_mlp": 0.01110039, + "balance_loss_clip": 1.0019294, + "balance_loss_mlp": 1.00055766, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 2.0910717723040366, + "language_loss": 0.76727605, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78972888, + "num_input_tokens_seen": 196968265, + "step": 9141, + "time_per_iteration": 2.6696159839630127 + }, + { + "auxiliary_loss_clip": 0.01115691, + "auxiliary_loss_mlp": 0.01109106, + "balance_loss_clip": 1.00193441, + "balance_loss_mlp": 1.00057817, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.9157272043642102, + "language_loss": 0.74938369, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77163166, + "num_input_tokens_seen": 196984930, + "step": 9142, + "time_per_iteration": 2.6591732501983643 + }, + { + "auxiliary_loss_clip": 0.01139297, + "auxiliary_loss_mlp": 0.01110968, + "balance_loss_clip": 1.00222921, + "balance_loss_mlp": 1.00072312, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 3.2437699517607896, + "language_loss": 0.76732671, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78982937, + "num_input_tokens_seen": 197002320, + "step": 9143, + "time_per_iteration": 2.593536615371704 + }, + { + "auxiliary_loss_clip": 0.01134643, + "auxiliary_loss_mlp": 0.01110279, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00060606, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 3.2530369197287334, + "language_loss": 0.79298818, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.81543744, + "num_input_tokens_seen": 197020825, + "step": 9144, + "time_per_iteration": 2.5784695148468018 + }, + { + "auxiliary_loss_clip": 0.0115, + "auxiliary_loss_mlp": 0.01110278, + "balance_loss_clip": 1.00215816, + "balance_loss_mlp": 1.0005101, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 1.9813217695387428, + "language_loss": 0.71291262, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73551536, + "num_input_tokens_seen": 197040450, + "step": 9145, + "time_per_iteration": 3.9867985248565674 + }, + { + "auxiliary_loss_clip": 0.01151823, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_clip": 1.00212491, + "balance_loss_mlp": 1.00046444, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 2.3995648017785784, + "language_loss": 0.70344746, + "learning_rate": 1.774398678985076e-06, + "loss": 0.7260747, + "num_input_tokens_seen": 197063930, + "step": 9146, + "time_per_iteration": 2.673529863357544 + }, + { + "auxiliary_loss_clip": 0.01136165, + "auxiliary_loss_mlp": 0.01109025, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.00059199, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.6869940778348729, + "language_loss": 0.64049983, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66295177, + "num_input_tokens_seen": 197082660, + "step": 9147, + "time_per_iteration": 2.6813361644744873 + }, + { + "auxiliary_loss_clip": 0.0116644, + "auxiliary_loss_mlp": 0.01110426, + "balance_loss_clip": 1.00215149, + "balance_loss_mlp": 1.00056291, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.0747682719502865, + "language_loss": 0.80859059, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.83135927, + "num_input_tokens_seen": 197100675, + "step": 9148, + "time_per_iteration": 2.547147274017334 + }, + { + "auxiliary_loss_clip": 0.0113334, + "auxiliary_loss_mlp": 0.01110955, + "balance_loss_clip": 1.00214028, + "balance_loss_mlp": 1.00061488, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.691280424498512, + "language_loss": 0.79296565, + "learning_rate": 1.773237789559453e-06, + "loss": 0.81540859, + "num_input_tokens_seen": 197121320, + "step": 9149, + "time_per_iteration": 2.656114339828491 + }, + { + "auxiliary_loss_clip": 0.0111682, + "auxiliary_loss_mlp": 0.01109794, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00040746, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 2.050538097515294, + "language_loss": 0.72241151, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74467766, + "num_input_tokens_seen": 197138965, + "step": 9150, + "time_per_iteration": 4.1128621101379395 + }, + { + "auxiliary_loss_clip": 0.01137054, + "auxiliary_loss_mlp": 0.0111034, + "balance_loss_clip": 1.00210285, + "balance_loss_mlp": 1.00038099, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 2.0683791169542864, + "language_loss": 0.74757385, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77004778, + "num_input_tokens_seen": 197156460, + "step": 9151, + "time_per_iteration": 3.9796791076660156 + }, + { + "auxiliary_loss_clip": 0.011353, + "auxiliary_loss_mlp": 0.01110748, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.00059891, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 2.004768565019623, + "language_loss": 0.76047254, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.782933, + "num_input_tokens_seen": 197175140, + "step": 9152, + "time_per_iteration": 2.6237614154815674 + }, + { + "auxiliary_loss_clip": 0.01133082, + "auxiliary_loss_mlp": 0.01109452, + "balance_loss_clip": 1.00207174, + "balance_loss_mlp": 1.00063753, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 1.7392063511208447, + "language_loss": 0.82296312, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84538847, + "num_input_tokens_seen": 197194345, + "step": 9153, + "time_per_iteration": 2.662703275680542 + }, + { + "auxiliary_loss_clip": 0.01149511, + "auxiliary_loss_mlp": 0.01109484, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.00057435, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7482487893919225, + "language_loss": 0.74278092, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76537085, + "num_input_tokens_seen": 197215535, + "step": 9154, + "time_per_iteration": 4.0352537631988525 + }, + { + "auxiliary_loss_clip": 0.0113701, + "auxiliary_loss_mlp": 0.01111307, + "balance_loss_clip": 1.00196052, + "balance_loss_mlp": 1.00048995, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.6617281528544585, + "language_loss": 0.72663927, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74912244, + "num_input_tokens_seen": 197234945, + "step": 9155, + "time_per_iteration": 2.6139132976531982 + }, + { + "auxiliary_loss_clip": 0.01150488, + "auxiliary_loss_mlp": 0.01085785, + "balance_loss_clip": 1.00150299, + "balance_loss_mlp": 0.9999544, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7492878962642886, + "language_loss": 0.5539701, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57633281, + "num_input_tokens_seen": 197302285, + "step": 9156, + "time_per_iteration": 3.2582693099975586 + }, + { + "auxiliary_loss_clip": 0.01151643, + "auxiliary_loss_mlp": 0.01110664, + "balance_loss_clip": 1.00212753, + "balance_loss_mlp": 1.00041974, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.7546362562423725, + "language_loss": 0.83004189, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.85266495, + "num_input_tokens_seen": 197321575, + "step": 9157, + "time_per_iteration": 2.6009843349456787 + }, + { + "auxiliary_loss_clip": 0.01166743, + "auxiliary_loss_mlp": 0.01110646, + "balance_loss_clip": 1.00217843, + "balance_loss_mlp": 1.00049686, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 8.96810754992824, + "language_loss": 0.75448918, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77726305, + "num_input_tokens_seen": 197340255, + "step": 9158, + "time_per_iteration": 2.58315372467041 + }, + { + "auxiliary_loss_clip": 0.01104794, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_clip": 1.00196385, + "balance_loss_mlp": 1.00052559, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.5370862339412346, + "language_loss": 0.69804269, + "learning_rate": 1.769368719290979e-06, + "loss": 0.72018218, + "num_input_tokens_seen": 197360360, + "step": 9159, + "time_per_iteration": 2.716169595718384 + }, + { + "auxiliary_loss_clip": 0.01105065, + "auxiliary_loss_mlp": 0.0074708, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00018287, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.7230473125990047, + "language_loss": 0.68289006, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.70141149, + "num_input_tokens_seen": 197381905, + "step": 9160, + "time_per_iteration": 2.764124870300293 + }, + { + "auxiliary_loss_clip": 0.01166561, + "auxiliary_loss_mlp": 0.01109448, + "balance_loss_clip": 1.00225472, + "balance_loss_mlp": 1.00053895, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 3.397032707895675, + "language_loss": 0.71476662, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.73752671, + "num_input_tokens_seen": 197398555, + "step": 9161, + "time_per_iteration": 2.5301380157470703 + }, + { + "auxiliary_loss_clip": 0.01149905, + "auxiliary_loss_mlp": 0.01110282, + "balance_loss_clip": 1.00204551, + "balance_loss_mlp": 1.000705, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 2.0528669932361283, + "language_loss": 0.69360387, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71620578, + "num_input_tokens_seen": 197419630, + "step": 9162, + "time_per_iteration": 2.632206678390503 + }, + { + "auxiliary_loss_clip": 0.01166575, + "auxiliary_loss_mlp": 0.01109511, + "balance_loss_clip": 1.00226653, + "balance_loss_mlp": 1.00050616, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6772772657316268, + "language_loss": 0.85667002, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87943077, + "num_input_tokens_seen": 197438480, + "step": 9163, + "time_per_iteration": 2.5463404655456543 + }, + { + "auxiliary_loss_clip": 0.01120086, + "auxiliary_loss_mlp": 0.01109166, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00044692, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 2.338817899564377, + "language_loss": 0.8047781, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.82707071, + "num_input_tokens_seen": 197456755, + "step": 9164, + "time_per_iteration": 2.657144546508789 + }, + { + "auxiliary_loss_clip": 0.01133345, + "auxiliary_loss_mlp": 0.01110671, + "balance_loss_clip": 1.00208521, + "balance_loss_mlp": 1.00042605, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 2.0884249412585327, + "language_loss": 0.73106092, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75350106, + "num_input_tokens_seen": 197475530, + "step": 9165, + "time_per_iteration": 2.6333212852478027 + }, + { + "auxiliary_loss_clip": 0.0115133, + "auxiliary_loss_mlp": 0.0110945, + "balance_loss_clip": 1.00213742, + "balance_loss_mlp": 1.00054038, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 1.9844921928492865, + "language_loss": 0.79200387, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81461167, + "num_input_tokens_seen": 197490835, + "step": 9166, + "time_per_iteration": 2.5268352031707764 + }, + { + "auxiliary_loss_clip": 0.01119363, + "auxiliary_loss_mlp": 0.01110408, + "balance_loss_clip": 1.00203025, + "balance_loss_mlp": 1.00054431, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.2066326388848867, + "language_loss": 0.76449341, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78679109, + "num_input_tokens_seen": 197508770, + "step": 9167, + "time_per_iteration": 2.640805959701538 + }, + { + "auxiliary_loss_clip": 0.01151556, + "auxiliary_loss_mlp": 0.01109521, + "balance_loss_clip": 1.00207996, + "balance_loss_mlp": 1.00051594, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.8568297147357367, + "language_loss": 0.80030406, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.82291484, + "num_input_tokens_seen": 197527340, + "step": 9168, + "time_per_iteration": 2.550027847290039 + }, + { + "auxiliary_loss_clip": 0.0115006, + "auxiliary_loss_mlp": 0.01110837, + "balance_loss_clip": 1.00209618, + "balance_loss_mlp": 1.00059235, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.9382420004005885, + "language_loss": 0.68946087, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71206987, + "num_input_tokens_seen": 197547280, + "step": 9169, + "time_per_iteration": 2.607786178588867 + }, + { + "auxiliary_loss_clip": 0.01149745, + "auxiliary_loss_mlp": 0.01109578, + "balance_loss_clip": 1.00211394, + "balance_loss_mlp": 1.00047779, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 1.894344165077927, + "language_loss": 0.85410237, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87669563, + "num_input_tokens_seen": 197565045, + "step": 9170, + "time_per_iteration": 2.5654616355895996 + }, + { + "auxiliary_loss_clip": 0.01128876, + "auxiliary_loss_mlp": 0.01087387, + "balance_loss_clip": 1.00180948, + "balance_loss_mlp": 1.00003088, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7775774819559732, + "language_loss": 0.59902346, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.62118614, + "num_input_tokens_seen": 197625005, + "step": 9171, + "time_per_iteration": 3.1913108825683594 + }, + { + "auxiliary_loss_clip": 0.01119523, + "auxiliary_loss_mlp": 0.01109899, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00070274, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 2.5423439639466183, + "language_loss": 0.705158, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72745216, + "num_input_tokens_seen": 197645050, + "step": 9172, + "time_per_iteration": 2.636805295944214 + }, + { + "auxiliary_loss_clip": 0.01166376, + "auxiliary_loss_mlp": 0.01109827, + "balance_loss_clip": 1.00206673, + "balance_loss_mlp": 1.00063133, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.6550918766507978, + "language_loss": 0.7551862, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.77794814, + "num_input_tokens_seen": 197663910, + "step": 9173, + "time_per_iteration": 2.547050952911377 + }, + { + "auxiliary_loss_clip": 0.01115646, + "auxiliary_loss_mlp": 0.01108625, + "balance_loss_clip": 1.00183165, + "balance_loss_mlp": 1.0005734, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 2.456224587365152, + "language_loss": 0.75290203, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.7751447, + "num_input_tokens_seen": 197681580, + "step": 9174, + "time_per_iteration": 2.635221242904663 + }, + { + "auxiliary_loss_clip": 0.01135483, + "auxiliary_loss_mlp": 0.0110906, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00043607, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.8629053349005587, + "language_loss": 0.72557622, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74802172, + "num_input_tokens_seen": 197702095, + "step": 9175, + "time_per_iteration": 2.6999683380126953 + }, + { + "auxiliary_loss_clip": 0.01149689, + "auxiliary_loss_mlp": 0.01109975, + "balance_loss_clip": 1.00211799, + "balance_loss_mlp": 1.00058842, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.7531060261455542, + "language_loss": 0.69056565, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.7131623, + "num_input_tokens_seen": 197720720, + "step": 9176, + "time_per_iteration": 2.610929250717163 + }, + { + "auxiliary_loss_clip": 0.01151504, + "auxiliary_loss_mlp": 0.01109696, + "balance_loss_clip": 1.00205791, + "balance_loss_mlp": 1.00059509, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.7006938946160661, + "language_loss": 0.70708764, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.72969967, + "num_input_tokens_seen": 197741820, + "step": 9177, + "time_per_iteration": 2.6325111389160156 + }, + { + "auxiliary_loss_clip": 0.01149256, + "auxiliary_loss_mlp": 0.01109869, + "balance_loss_clip": 1.00202966, + "balance_loss_mlp": 1.0005784, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 5.613620752114736, + "language_loss": 0.80208921, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82468045, + "num_input_tokens_seen": 197759160, + "step": 9178, + "time_per_iteration": 2.545424222946167 + }, + { + "auxiliary_loss_clip": 0.01091433, + "auxiliary_loss_mlp": 0.01110422, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.0006541, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.8349166764325322, + "language_loss": 0.75002813, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77204669, + "num_input_tokens_seen": 197779760, + "step": 9179, + "time_per_iteration": 2.752431631088257 + }, + { + "auxiliary_loss_clip": 0.01150829, + "auxiliary_loss_mlp": 0.01109865, + "balance_loss_clip": 1.002105, + "balance_loss_mlp": 1.00066888, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.718680712549269, + "language_loss": 0.69956231, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72216928, + "num_input_tokens_seen": 197801545, + "step": 9180, + "time_per_iteration": 2.7427074909210205 + }, + { + "auxiliary_loss_clip": 0.01151467, + "auxiliary_loss_mlp": 0.01110556, + "balance_loss_clip": 1.00209653, + "balance_loss_mlp": 1.00059748, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 2.6906487365351115, + "language_loss": 0.67265379, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69527406, + "num_input_tokens_seen": 197820760, + "step": 9181, + "time_per_iteration": 2.5785014629364014 + }, + { + "auxiliary_loss_clip": 0.01166583, + "auxiliary_loss_mlp": 0.01111003, + "balance_loss_clip": 1.00210834, + "balance_loss_mlp": 1.00066316, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 2.200919930636192, + "language_loss": 0.78602314, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.80879903, + "num_input_tokens_seen": 197840195, + "step": 9182, + "time_per_iteration": 3.9142003059387207 + }, + { + "auxiliary_loss_clip": 0.01118601, + "auxiliary_loss_mlp": 0.01110316, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.00045228, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 6.622102097516848, + "language_loss": 0.82922888, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.85151803, + "num_input_tokens_seen": 197859475, + "step": 9183, + "time_per_iteration": 2.7643277645111084 + }, + { + "auxiliary_loss_clip": 0.01134502, + "auxiliary_loss_mlp": 0.01109245, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00052643, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.4339050707927536, + "language_loss": 0.67323267, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69567013, + "num_input_tokens_seen": 197879395, + "step": 9184, + "time_per_iteration": 2.6613166332244873 + }, + { + "auxiliary_loss_clip": 0.01149764, + "auxiliary_loss_mlp": 0.0110947, + "balance_loss_clip": 1.002002, + "balance_loss_mlp": 1.00046539, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 3.3801681347189945, + "language_loss": 0.76436663, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78695893, + "num_input_tokens_seen": 197900815, + "step": 9185, + "time_per_iteration": 2.607104778289795 + }, + { + "auxiliary_loss_clip": 0.01119867, + "auxiliary_loss_mlp": 0.01110327, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.00074971, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 2.0016141480466687, + "language_loss": 0.74269056, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76499248, + "num_input_tokens_seen": 197918985, + "step": 9186, + "time_per_iteration": 2.662536144256592 + }, + { + "auxiliary_loss_clip": 0.01118412, + "auxiliary_loss_mlp": 0.01110742, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00068808, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 1.89997533196652, + "language_loss": 0.66643894, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68873048, + "num_input_tokens_seen": 197937725, + "step": 9187, + "time_per_iteration": 2.6761341094970703 + }, + { + "auxiliary_loss_clip": 0.0113292, + "auxiliary_loss_mlp": 0.0111011, + "balance_loss_clip": 1.00193715, + "balance_loss_mlp": 1.00053287, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.5261933694287142, + "language_loss": 0.7812115, + "learning_rate": 1.758153413657318e-06, + "loss": 0.8036418, + "num_input_tokens_seen": 197955635, + "step": 9188, + "time_per_iteration": 4.062025547027588 + }, + { + "auxiliary_loss_clip": 0.01134752, + "auxiliary_loss_mlp": 0.01110363, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00059497, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 2.337033960958663, + "language_loss": 0.816953, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83940411, + "num_input_tokens_seen": 197974490, + "step": 9189, + "time_per_iteration": 4.005409479141235 + }, + { + "auxiliary_loss_clip": 0.01149913, + "auxiliary_loss_mlp": 0.00747064, + "balance_loss_clip": 1.00203216, + "balance_loss_mlp": 1.00035441, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.8496353950672408, + "language_loss": 0.76596987, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78493965, + "num_input_tokens_seen": 197995735, + "step": 9190, + "time_per_iteration": 2.6327338218688965 + }, + { + "auxiliary_loss_clip": 0.01166782, + "auxiliary_loss_mlp": 0.01111793, + "balance_loss_clip": 1.00222456, + "balance_loss_mlp": 1.00059462, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 4.579663457795435, + "language_loss": 0.78984141, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81262714, + "num_input_tokens_seen": 198009685, + "step": 9191, + "time_per_iteration": 3.939741373062134 + }, + { + "auxiliary_loss_clip": 0.01088753, + "auxiliary_loss_mlp": 0.01109148, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00052488, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 1.8204826860343322, + "language_loss": 0.68656546, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70854449, + "num_input_tokens_seen": 198026845, + "step": 9192, + "time_per_iteration": 2.7032039165496826 + }, + { + "auxiliary_loss_clip": 0.01150701, + "auxiliary_loss_mlp": 0.01108845, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00060344, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.6622995864001566, + "language_loss": 0.77572954, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79832506, + "num_input_tokens_seen": 198045275, + "step": 9193, + "time_per_iteration": 2.5722320079803467 + }, + { + "auxiliary_loss_clip": 0.01119821, + "auxiliary_loss_mlp": 0.0110994, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00055361, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.662989502191968, + "language_loss": 0.78609741, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80839491, + "num_input_tokens_seen": 198065760, + "step": 9194, + "time_per_iteration": 2.669221878051758 + }, + { + "auxiliary_loss_clip": 0.01122889, + "auxiliary_loss_mlp": 0.01110506, + "balance_loss_clip": 1.00203383, + "balance_loss_mlp": 1.00054741, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 1.9417292586954376, + "language_loss": 0.69522679, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71756065, + "num_input_tokens_seen": 198087595, + "step": 9195, + "time_per_iteration": 2.7814416885375977 + }, + { + "auxiliary_loss_clip": 0.01135206, + "auxiliary_loss_mlp": 0.01110843, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.00059819, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 2.053223116650602, + "language_loss": 0.74125516, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76371562, + "num_input_tokens_seen": 198104620, + "step": 9196, + "time_per_iteration": 2.6001198291778564 + }, + { + "auxiliary_loss_clip": 0.01151908, + "auxiliary_loss_mlp": 0.01110414, + "balance_loss_clip": 1.00222588, + "balance_loss_mlp": 1.00074124, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.7917854613117108, + "language_loss": 0.76814365, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.79076684, + "num_input_tokens_seen": 198123565, + "step": 9197, + "time_per_iteration": 2.5734331607818604 + }, + { + "auxiliary_loss_clip": 0.01134886, + "auxiliary_loss_mlp": 0.01109529, + "balance_loss_clip": 1.00191092, + "balance_loss_mlp": 1.00042915, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 2.2657984723744304, + "language_loss": 0.76627117, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78871536, + "num_input_tokens_seen": 198148270, + "step": 9198, + "time_per_iteration": 2.8290464878082275 + }, + { + "auxiliary_loss_clip": 0.01166445, + "auxiliary_loss_mlp": 0.01109772, + "balance_loss_clip": 1.00200415, + "balance_loss_mlp": 1.00048089, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.4061048536039713, + "language_loss": 0.79235238, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81511456, + "num_input_tokens_seen": 198168810, + "step": 9199, + "time_per_iteration": 2.554898738861084 + }, + { + "auxiliary_loss_clip": 0.01118029, + "auxiliary_loss_mlp": 0.01109572, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.00047135, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.7312733839190997, + "language_loss": 0.6382376, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66051358, + "num_input_tokens_seen": 198186200, + "step": 9200, + "time_per_iteration": 2.680387258529663 + }, + { + "auxiliary_loss_clip": 0.01133291, + "auxiliary_loss_mlp": 0.0111024, + "balance_loss_clip": 1.00205112, + "balance_loss_mlp": 1.00047243, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.5351085274632483, + "language_loss": 0.66120625, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68364149, + "num_input_tokens_seen": 198207050, + "step": 9201, + "time_per_iteration": 2.6310365200042725 + }, + { + "auxiliary_loss_clip": 0.01151757, + "auxiliary_loss_mlp": 0.01111144, + "balance_loss_clip": 1.00221252, + "balance_loss_mlp": 1.00061309, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 1.973050397065144, + "language_loss": 0.60191774, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.62454677, + "num_input_tokens_seen": 198224565, + "step": 9202, + "time_per_iteration": 2.6280477046966553 + }, + { + "auxiliary_loss_clip": 0.0114956, + "auxiliary_loss_mlp": 0.00747099, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.00027966, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 5.536321542966475, + "language_loss": 0.64205229, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66101885, + "num_input_tokens_seen": 198244790, + "step": 9203, + "time_per_iteration": 2.585675001144409 + }, + { + "auxiliary_loss_clip": 0.01149674, + "auxiliary_loss_mlp": 0.01110143, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00047052, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.5141825364032524, + "language_loss": 0.63771856, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.66031671, + "num_input_tokens_seen": 198264375, + "step": 9204, + "time_per_iteration": 2.5940163135528564 + }, + { + "auxiliary_loss_clip": 0.01149646, + "auxiliary_loss_mlp": 0.01109475, + "balance_loss_clip": 1.00201404, + "balance_loss_mlp": 1.00047028, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.6000352139741756, + "language_loss": 0.77246714, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79505831, + "num_input_tokens_seen": 198283895, + "step": 9205, + "time_per_iteration": 2.5919694900512695 + }, + { + "auxiliary_loss_clip": 0.01109402, + "auxiliary_loss_mlp": 0.01109014, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00067616, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.7733185225690633, + "language_loss": 0.72407746, + "learning_rate": 1.751196045993537e-06, + "loss": 0.7462616, + "num_input_tokens_seen": 198310035, + "step": 9206, + "time_per_iteration": 2.828794479370117 + }, + { + "auxiliary_loss_clip": 0.01104754, + "auxiliary_loss_mlp": 0.01109561, + "balance_loss_clip": 1.00199771, + "balance_loss_mlp": 1.00065148, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 2.4010807835821857, + "language_loss": 0.75656825, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77871144, + "num_input_tokens_seen": 198327810, + "step": 9207, + "time_per_iteration": 2.6778316497802734 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01111115, + "balance_loss_clip": 1.00193834, + "balance_loss_mlp": 1.00048912, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.5584443299654023, + "language_loss": 0.61436182, + "learning_rate": 1.750423192272189e-06, + "loss": 0.63663638, + "num_input_tokens_seen": 198343150, + "step": 9208, + "time_per_iteration": 2.619915246963501 + }, + { + "auxiliary_loss_clip": 0.01166535, + "auxiliary_loss_mlp": 0.01110646, + "balance_loss_clip": 1.00207949, + "balance_loss_mlp": 1.00059223, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.0701849710997635, + "language_loss": 0.6441738, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66694564, + "num_input_tokens_seen": 198360925, + "step": 9209, + "time_per_iteration": 2.4997668266296387 + }, + { + "auxiliary_loss_clip": 0.0111899, + "auxiliary_loss_mlp": 0.01110489, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00072098, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 2.2991216276540345, + "language_loss": 0.82383859, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84613341, + "num_input_tokens_seen": 198379265, + "step": 9210, + "time_per_iteration": 2.6519298553466797 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.01109965, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.0004828, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 1.774092873207883, + "language_loss": 0.72988868, + "learning_rate": 1.74926398270663e-06, + "loss": 0.75235063, + "num_input_tokens_seen": 198399490, + "step": 9211, + "time_per_iteration": 2.6518867015838623 + }, + { + "auxiliary_loss_clip": 0.01119551, + "auxiliary_loss_mlp": 0.01111216, + "balance_loss_clip": 1.00182724, + "balance_loss_mlp": 1.00058961, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 2.781891787732659, + "language_loss": 0.66564262, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68795025, + "num_input_tokens_seen": 198419110, + "step": 9212, + "time_per_iteration": 2.633220672607422 + }, + { + "auxiliary_loss_clip": 0.01119396, + "auxiliary_loss_mlp": 0.01110612, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.00036752, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.6630555974699053, + "language_loss": 0.51882571, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.54112577, + "num_input_tokens_seen": 198441360, + "step": 9213, + "time_per_iteration": 2.726438283920288 + }, + { + "auxiliary_loss_clip": 0.01118322, + "auxiliary_loss_mlp": 0.01110349, + "balance_loss_clip": 1.00201833, + "balance_loss_mlp": 1.00039065, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 2.1569232676970387, + "language_loss": 0.85528004, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.87756675, + "num_input_tokens_seen": 198459835, + "step": 9214, + "time_per_iteration": 2.6483964920043945 + }, + { + "auxiliary_loss_clip": 0.01151066, + "auxiliary_loss_mlp": 0.0110953, + "balance_loss_clip": 1.00204372, + "balance_loss_mlp": 1.000525, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.716951035161207, + "language_loss": 0.69933522, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72194123, + "num_input_tokens_seen": 198478955, + "step": 9215, + "time_per_iteration": 2.6274726390838623 + }, + { + "auxiliary_loss_clip": 0.01135038, + "auxiliary_loss_mlp": 0.01110347, + "balance_loss_clip": 1.00214648, + "balance_loss_mlp": 1.00048327, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.9544435416473012, + "language_loss": 0.73191905, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75437295, + "num_input_tokens_seen": 198499030, + "step": 9216, + "time_per_iteration": 2.6671204566955566 + }, + { + "auxiliary_loss_clip": 0.01132541, + "auxiliary_loss_mlp": 0.01109616, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.0005157, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 30.348644088177984, + "language_loss": 0.72446716, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.7468887, + "num_input_tokens_seen": 198520265, + "step": 9217, + "time_per_iteration": 2.6505494117736816 + }, + { + "auxiliary_loss_clip": 0.01150027, + "auxiliary_loss_mlp": 0.01108977, + "balance_loss_clip": 1.00198495, + "balance_loss_mlp": 1.0004487, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.8544688857960012, + "language_loss": 0.78574109, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80833113, + "num_input_tokens_seen": 198539645, + "step": 9218, + "time_per_iteration": 2.576019525527954 + }, + { + "auxiliary_loss_clip": 0.01118817, + "auxiliary_loss_mlp": 0.01110231, + "balance_loss_clip": 1.00196278, + "balance_loss_mlp": 1.00046325, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 1.8636858189645673, + "language_loss": 0.71987265, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74216312, + "num_input_tokens_seen": 198558710, + "step": 9219, + "time_per_iteration": 2.6591386795043945 + }, + { + "auxiliary_loss_clip": 0.01150314, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_clip": 1.00216198, + "balance_loss_mlp": 1.00063896, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.5469506618875233, + "language_loss": 0.71478903, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73740476, + "num_input_tokens_seen": 198577050, + "step": 9220, + "time_per_iteration": 3.988365888595581 + }, + { + "auxiliary_loss_clip": 0.0116617, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.0004189, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.6159293483019697, + "language_loss": 0.79847312, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.82122338, + "num_input_tokens_seen": 198595290, + "step": 9221, + "time_per_iteration": 2.5317459106445312 + }, + { + "auxiliary_loss_clip": 0.01124893, + "auxiliary_loss_mlp": 0.01110878, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00044227, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.8400814614963448, + "language_loss": 0.83767301, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86003071, + "num_input_tokens_seen": 198614110, + "step": 9222, + "time_per_iteration": 2.7491726875305176 + }, + { + "auxiliary_loss_clip": 0.01118914, + "auxiliary_loss_mlp": 0.00747301, + "balance_loss_clip": 1.00195396, + "balance_loss_mlp": 1.00023532, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.7383209703342242, + "language_loss": 0.75599045, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.7746526, + "num_input_tokens_seen": 198633880, + "step": 9223, + "time_per_iteration": 2.7142140865325928 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.01109887, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.0005002, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.8312064172083635, + "language_loss": 0.81771696, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84016103, + "num_input_tokens_seen": 198653505, + "step": 9224, + "time_per_iteration": 2.670443058013916 + }, + { + "auxiliary_loss_clip": 0.0115026, + "auxiliary_loss_mlp": 0.01111015, + "balance_loss_clip": 1.00215709, + "balance_loss_mlp": 1.00076985, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 3.2841587373427052, + "language_loss": 0.56979752, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59241033, + "num_input_tokens_seen": 198671890, + "step": 9225, + "time_per_iteration": 2.648815870285034 + }, + { + "auxiliary_loss_clip": 0.01150643, + "auxiliary_loss_mlp": 0.01110203, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00062609, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.5451399412205467, + "language_loss": 0.6768381, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69944656, + "num_input_tokens_seen": 198691995, + "step": 9226, + "time_per_iteration": 3.972395896911621 + }, + { + "auxiliary_loss_clip": 0.01118248, + "auxiliary_loss_mlp": 0.01109025, + "balance_loss_clip": 1.00192702, + "balance_loss_mlp": 1.00059247, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.4702619156199104, + "language_loss": 0.74382293, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76609564, + "num_input_tokens_seen": 198712440, + "step": 9227, + "time_per_iteration": 4.3205482959747314 + }, + { + "auxiliary_loss_clip": 0.01119157, + "auxiliary_loss_mlp": 0.01110586, + "balance_loss_clip": 1.00206065, + "balance_loss_mlp": 1.00053167, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 1.5617246869950585, + "language_loss": 0.73386216, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75615954, + "num_input_tokens_seen": 198731515, + "step": 9228, + "time_per_iteration": 2.6600887775421143 + }, + { + "auxiliary_loss_clip": 0.01166521, + "auxiliary_loss_mlp": 0.01110142, + "balance_loss_clip": 1.00217462, + "balance_loss_mlp": 1.00056505, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 1.7992719796800953, + "language_loss": 0.75616765, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.77893424, + "num_input_tokens_seen": 198749750, + "step": 9229, + "time_per_iteration": 4.004991054534912 + }, + { + "auxiliary_loss_clip": 0.01149402, + "auxiliary_loss_mlp": 0.00747194, + "balance_loss_clip": 1.00210857, + "balance_loss_mlp": 1.00022125, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.4673620753648349, + "language_loss": 0.68637276, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70533872, + "num_input_tokens_seen": 198768320, + "step": 9230, + "time_per_iteration": 2.5856337547302246 + }, + { + "auxiliary_loss_clip": 0.01103356, + "auxiliary_loss_mlp": 0.01110831, + "balance_loss_clip": 1.00177646, + "balance_loss_mlp": 1.00049114, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.4838730126976896, + "language_loss": 0.67978132, + "learning_rate": 1.741538124855163e-06, + "loss": 0.70192313, + "num_input_tokens_seen": 198787230, + "step": 9231, + "time_per_iteration": 2.7064130306243896 + }, + { + "auxiliary_loss_clip": 0.01166525, + "auxiliary_loss_mlp": 0.01110428, + "balance_loss_clip": 1.00211573, + "balance_loss_mlp": 1.00056469, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.6496309497411894, + "language_loss": 0.7838105, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80657995, + "num_input_tokens_seen": 198806720, + "step": 9232, + "time_per_iteration": 2.59360671043396 + }, + { + "auxiliary_loss_clip": 0.01120256, + "auxiliary_loss_mlp": 0.01109018, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.00058484, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.9084234374510711, + "language_loss": 0.8312903, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.85358304, + "num_input_tokens_seen": 198826235, + "step": 9233, + "time_per_iteration": 2.6814541816711426 + }, + { + "auxiliary_loss_clip": 0.0115158, + "auxiliary_loss_mlp": 0.0111095, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00061011, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.179315317821488, + "language_loss": 0.74996603, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77259135, + "num_input_tokens_seen": 198842655, + "step": 9234, + "time_per_iteration": 2.5557079315185547 + }, + { + "auxiliary_loss_clip": 0.01134571, + "auxiliary_loss_mlp": 0.01109665, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00056458, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 2.8724895645012003, + "language_loss": 0.64891499, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.67135733, + "num_input_tokens_seen": 198861210, + "step": 9235, + "time_per_iteration": 2.6304128170013428 + }, + { + "auxiliary_loss_clip": 0.01105748, + "auxiliary_loss_mlp": 0.01110709, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00055957, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.7421395513298483, + "language_loss": 0.6840207, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70618522, + "num_input_tokens_seen": 198880045, + "step": 9236, + "time_per_iteration": 2.6710824966430664 + }, + { + "auxiliary_loss_clip": 0.01166084, + "auxiliary_loss_mlp": 0.0110893, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.00040233, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 3.3493417875376075, + "language_loss": 0.86243296, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.8851831, + "num_input_tokens_seen": 198900210, + "step": 9237, + "time_per_iteration": 2.558270215988159 + }, + { + "auxiliary_loss_clip": 0.01150476, + "auxiliary_loss_mlp": 0.01109247, + "balance_loss_clip": 1.00189912, + "balance_loss_mlp": 1.00062323, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.965430877083915, + "language_loss": 0.73472047, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75731772, + "num_input_tokens_seen": 198919055, + "step": 9238, + "time_per_iteration": 2.5839650630950928 + }, + { + "auxiliary_loss_clip": 0.01151549, + "auxiliary_loss_mlp": 0.01110187, + "balance_loss_clip": 1.00215316, + "balance_loss_mlp": 1.00051427, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 2.093665957246502, + "language_loss": 0.78344584, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80606318, + "num_input_tokens_seen": 198943505, + "step": 9239, + "time_per_iteration": 2.8652124404907227 + }, + { + "auxiliary_loss_clip": 0.01133551, + "auxiliary_loss_mlp": 0.01108766, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00042844, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.6401614597094385, + "language_loss": 0.79996347, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82238662, + "num_input_tokens_seen": 198963590, + "step": 9240, + "time_per_iteration": 2.62691593170166 + }, + { + "auxiliary_loss_clip": 0.01132724, + "auxiliary_loss_mlp": 0.01108814, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00047624, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.8116328403897637, + "language_loss": 0.65220428, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67461967, + "num_input_tokens_seen": 198982680, + "step": 9241, + "time_per_iteration": 2.6400699615478516 + }, + { + "auxiliary_loss_clip": 0.01149586, + "auxiliary_loss_mlp": 0.00747163, + "balance_loss_clip": 1.0021013, + "balance_loss_mlp": 1.00030684, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 2.012991009504382, + "language_loss": 0.7274735, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74644101, + "num_input_tokens_seen": 199000185, + "step": 9242, + "time_per_iteration": 2.5555803775787354 + }, + { + "auxiliary_loss_clip": 0.01135872, + "auxiliary_loss_mlp": 0.01109063, + "balance_loss_clip": 1.00192535, + "balance_loss_mlp": 1.00043988, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.6200291335320203, + "language_loss": 0.63874215, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.66119152, + "num_input_tokens_seen": 199018380, + "step": 9243, + "time_per_iteration": 2.5970497131347656 + }, + { + "auxiliary_loss_clip": 0.01133361, + "auxiliary_loss_mlp": 0.00747215, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00027645, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.8058866042416513, + "language_loss": 0.75511003, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77391577, + "num_input_tokens_seen": 199037115, + "step": 9244, + "time_per_iteration": 2.6070973873138428 + }, + { + "auxiliary_loss_clip": 0.0113481, + "auxiliary_loss_mlp": 0.01108472, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00042105, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.359034813339808, + "language_loss": 0.7466315, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.76906437, + "num_input_tokens_seen": 199053375, + "step": 9245, + "time_per_iteration": 2.593794584274292 + }, + { + "auxiliary_loss_clip": 0.0114089, + "auxiliary_loss_mlp": 0.01110709, + "balance_loss_clip": 1.00214064, + "balance_loss_mlp": 1.00055993, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.5085913442318266, + "language_loss": 0.79760337, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.82011938, + "num_input_tokens_seen": 199070930, + "step": 9246, + "time_per_iteration": 2.6291539669036865 + }, + { + "auxiliary_loss_clip": 0.01166337, + "auxiliary_loss_mlp": 0.01109827, + "balance_loss_clip": 1.00206864, + "balance_loss_mlp": 1.00072694, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 2.125473220599294, + "language_loss": 0.73699301, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.75975466, + "num_input_tokens_seen": 199088675, + "step": 9247, + "time_per_iteration": 2.524411678314209 + }, + { + "auxiliary_loss_clip": 0.01132917, + "auxiliary_loss_mlp": 0.01109748, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00055265, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 7.0710639275161995, + "language_loss": 0.75766426, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78009087, + "num_input_tokens_seen": 199103075, + "step": 9248, + "time_per_iteration": 2.5736875534057617 + }, + { + "auxiliary_loss_clip": 0.01098333, + "auxiliary_loss_mlp": 0.01085877, + "balance_loss_clip": 1.00119638, + "balance_loss_mlp": 1.00004673, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8448296719936594, + "language_loss": 0.5945549, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61639702, + "num_input_tokens_seen": 199160325, + "step": 9249, + "time_per_iteration": 3.5282509326934814 + }, + { + "auxiliary_loss_clip": 0.011663, + "auxiliary_loss_mlp": 0.01109697, + "balance_loss_clip": 1.00196612, + "balance_loss_mlp": 1.00050175, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 1.8765429456449436, + "language_loss": 0.80054212, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82330215, + "num_input_tokens_seen": 199179760, + "step": 9250, + "time_per_iteration": 2.728724241256714 + }, + { + "auxiliary_loss_clip": 0.0113249, + "auxiliary_loss_mlp": 0.01110393, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00052965, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 2.076419231152314, + "language_loss": 0.68605542, + "learning_rate": 1.733816187358836e-06, + "loss": 0.70848429, + "num_input_tokens_seen": 199196695, + "step": 9251, + "time_per_iteration": 2.5719521045684814 + }, + { + "auxiliary_loss_clip": 0.01149824, + "auxiliary_loss_mlp": 0.01109172, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.0005486, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.521233319553084, + "language_loss": 0.75471622, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.7773062, + "num_input_tokens_seen": 199217845, + "step": 9252, + "time_per_iteration": 2.6217074394226074 + }, + { + "auxiliary_loss_clip": 0.01151362, + "auxiliary_loss_mlp": 0.01110242, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00066507, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.4281101201939206, + "language_loss": 0.72936809, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.75198406, + "num_input_tokens_seen": 199239250, + "step": 9253, + "time_per_iteration": 2.6978018283843994 + }, + { + "auxiliary_loss_clip": 0.01119945, + "auxiliary_loss_mlp": 0.01108776, + "balance_loss_clip": 1.00198483, + "balance_loss_mlp": 1.00062943, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 2.3244112106318413, + "language_loss": 0.82960641, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.8518936, + "num_input_tokens_seen": 199258320, + "step": 9254, + "time_per_iteration": 2.663684844970703 + }, + { + "auxiliary_loss_clip": 0.01135504, + "auxiliary_loss_mlp": 0.01085849, + "balance_loss_clip": 1.0014255, + "balance_loss_mlp": 1.00001848, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.8913540198896391, + "language_loss": 0.64909732, + "learning_rate": 1.732272280610387e-06, + "loss": 0.67131078, + "num_input_tokens_seen": 199314840, + "step": 9255, + "time_per_iteration": 3.0292820930480957 + }, + { + "auxiliary_loss_clip": 0.0115112, + "auxiliary_loss_mlp": 0.01108836, + "balance_loss_clip": 1.00217366, + "balance_loss_mlp": 1.00049829, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.8543358103879726, + "language_loss": 0.69386923, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71646881, + "num_input_tokens_seen": 199335405, + "step": 9256, + "time_per_iteration": 2.5803442001342773 + }, + { + "auxiliary_loss_clip": 0.01115987, + "auxiliary_loss_mlp": 0.01108248, + "balance_loss_clip": 1.00178015, + "balance_loss_mlp": 1.00067389, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.6724114955242457, + "language_loss": 0.76125443, + "learning_rate": 1.73150038809119e-06, + "loss": 0.7834968, + "num_input_tokens_seen": 199354345, + "step": 9257, + "time_per_iteration": 4.185161113739014 + }, + { + "auxiliary_loss_clip": 0.01099506, + "auxiliary_loss_mlp": 0.01109123, + "balance_loss_clip": 1.00182736, + "balance_loss_mlp": 1.00068986, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 2.2739298133872152, + "language_loss": 0.61145514, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63354146, + "num_input_tokens_seen": 199372250, + "step": 9258, + "time_per_iteration": 2.667402982711792 + }, + { + "auxiliary_loss_clip": 0.01118097, + "auxiliary_loss_mlp": 0.01110599, + "balance_loss_clip": 1.00183356, + "balance_loss_mlp": 1.00054538, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.6951335991873069, + "language_loss": 0.78622949, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.8085165, + "num_input_tokens_seen": 199392815, + "step": 9259, + "time_per_iteration": 2.6965484619140625 + }, + { + "auxiliary_loss_clip": 0.01133491, + "auxiliary_loss_mlp": 0.01109331, + "balance_loss_clip": 1.00186503, + "balance_loss_mlp": 1.0006125, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 2.1686642706448342, + "language_loss": 0.82128024, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.84370852, + "num_input_tokens_seen": 199412375, + "step": 9260, + "time_per_iteration": 2.686096668243408 + }, + { + "auxiliary_loss_clip": 0.01166484, + "auxiliary_loss_mlp": 0.01109949, + "balance_loss_clip": 1.0021013, + "balance_loss_mlp": 1.00056291, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.7868899131873939, + "language_loss": 0.68699956, + "learning_rate": 1.729956725348256e-06, + "loss": 0.70976388, + "num_input_tokens_seen": 199431490, + "step": 9261, + "time_per_iteration": 2.539334297180176 + }, + { + "auxiliary_loss_clip": 0.01111802, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_clip": 1.00134778, + "balance_loss_mlp": 1.00035143, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7408358061226232, + "language_loss": 0.61109817, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63308561, + "num_input_tokens_seen": 199495855, + "step": 9262, + "time_per_iteration": 3.206770896911621 + }, + { + "auxiliary_loss_clip": 0.01151063, + "auxiliary_loss_mlp": 0.01110532, + "balance_loss_clip": 1.00197363, + "balance_loss_mlp": 1.00057364, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.6320968864568561, + "language_loss": 0.64314032, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66575623, + "num_input_tokens_seen": 199515870, + "step": 9263, + "time_per_iteration": 2.60310697555542 + }, + { + "auxiliary_loss_clip": 0.01136752, + "auxiliary_loss_mlp": 0.01110208, + "balance_loss_clip": 1.00203216, + "balance_loss_mlp": 1.00072646, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 2.170516370960263, + "language_loss": 0.72731757, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.74978709, + "num_input_tokens_seen": 199535745, + "step": 9264, + "time_per_iteration": 4.029955863952637 + }, + { + "auxiliary_loss_clip": 0.01119529, + "auxiliary_loss_mlp": 0.01109764, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.00056803, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.9391940072439586, + "language_loss": 0.76496053, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78725344, + "num_input_tokens_seen": 199554035, + "step": 9265, + "time_per_iteration": 4.010787487030029 + }, + { + "auxiliary_loss_clip": 0.0113328, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00047922, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.414693239931922, + "language_loss": 0.70771587, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73013496, + "num_input_tokens_seen": 199576120, + "step": 9266, + "time_per_iteration": 2.621812105178833 + }, + { + "auxiliary_loss_clip": 0.01134967, + "auxiliary_loss_mlp": 0.01108865, + "balance_loss_clip": 1.00209093, + "balance_loss_mlp": 1.00052738, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7603790567812607, + "language_loss": 0.6813969, + "learning_rate": 1.727641538728533e-06, + "loss": 0.70383525, + "num_input_tokens_seen": 199593780, + "step": 9267, + "time_per_iteration": 4.149031162261963 + }, + { + "auxiliary_loss_clip": 0.01150844, + "auxiliary_loss_mlp": 0.01108882, + "balance_loss_clip": 1.00201213, + "balance_loss_mlp": 1.00064039, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 2.771166728828646, + "language_loss": 0.74638551, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76898277, + "num_input_tokens_seen": 199613220, + "step": 9268, + "time_per_iteration": 2.578310489654541 + }, + { + "auxiliary_loss_clip": 0.01150111, + "auxiliary_loss_mlp": 0.00747017, + "balance_loss_clip": 1.00207007, + "balance_loss_mlp": 1.00024903, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 3.290971539305688, + "language_loss": 0.74963868, + "learning_rate": 1.726869892322104e-06, + "loss": 0.76860988, + "num_input_tokens_seen": 199632085, + "step": 9269, + "time_per_iteration": 2.5916194915771484 + }, + { + "auxiliary_loss_clip": 0.01119405, + "auxiliary_loss_mlp": 0.01109223, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00050378, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.755877997577948, + "language_loss": 0.82710975, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84939599, + "num_input_tokens_seen": 199649295, + "step": 9270, + "time_per_iteration": 2.6793410778045654 + }, + { + "auxiliary_loss_clip": 0.01110684, + "auxiliary_loss_mlp": 0.01110084, + "balance_loss_clip": 1.00219798, + "balance_loss_mlp": 1.00050688, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 1.9746128075719525, + "language_loss": 0.79992485, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.82213259, + "num_input_tokens_seen": 199668870, + "step": 9271, + "time_per_iteration": 2.704457998275757 + }, + { + "auxiliary_loss_clip": 0.01134885, + "auxiliary_loss_mlp": 0.01110197, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00052488, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.7719606861887085, + "language_loss": 0.90567154, + "learning_rate": 1.725712500427442e-06, + "loss": 0.9281224, + "num_input_tokens_seen": 199684870, + "step": 9272, + "time_per_iteration": 2.6360270977020264 + }, + { + "auxiliary_loss_clip": 0.01118029, + "auxiliary_loss_mlp": 0.01109021, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00049329, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 1.9869230427732005, + "language_loss": 0.84221703, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.86448753, + "num_input_tokens_seen": 199701975, + "step": 9273, + "time_per_iteration": 2.678105354309082 + }, + { + "auxiliary_loss_clip": 0.01151605, + "auxiliary_loss_mlp": 0.01109953, + "balance_loss_clip": 1.00213003, + "balance_loss_mlp": 1.00075722, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.4021826366011507, + "language_loss": 0.74277955, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76539516, + "num_input_tokens_seen": 199721865, + "step": 9274, + "time_per_iteration": 2.626042366027832 + }, + { + "auxiliary_loss_clip": 0.01133035, + "auxiliary_loss_mlp": 0.01110247, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.00067019, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 2.9645384924258376, + "language_loss": 0.77621984, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.79865265, + "num_input_tokens_seen": 199736455, + "step": 9275, + "time_per_iteration": 2.6615688800811768 + }, + { + "auxiliary_loss_clip": 0.01133195, + "auxiliary_loss_mlp": 0.01109886, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00059474, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.5859329718662207, + "language_loss": 0.75082159, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.77325243, + "num_input_tokens_seen": 199753125, + "step": 9276, + "time_per_iteration": 2.596548080444336 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01108621, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00056982, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.7572760388720523, + "language_loss": 0.75224388, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77468431, + "num_input_tokens_seen": 199771365, + "step": 9277, + "time_per_iteration": 2.6283981800079346 + }, + { + "auxiliary_loss_clip": 0.01166246, + "auxiliary_loss_mlp": 0.01109459, + "balance_loss_clip": 1.00209332, + "balance_loss_mlp": 1.00064492, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.666165516530509, + "language_loss": 0.7172997, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.74005675, + "num_input_tokens_seen": 199790035, + "step": 9278, + "time_per_iteration": 2.542041540145874 + }, + { + "auxiliary_loss_clip": 0.01116718, + "auxiliary_loss_mlp": 0.01109721, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00052571, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.4839514964733214, + "language_loss": 0.75898081, + "learning_rate": 1.723012284057868e-06, + "loss": 0.78124517, + "num_input_tokens_seen": 199811125, + "step": 9279, + "time_per_iteration": 2.7120606899261475 + }, + { + "auxiliary_loss_clip": 0.01134759, + "auxiliary_loss_mlp": 0.01109176, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.00064814, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 1.7852587481673923, + "language_loss": 0.67325842, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69569778, + "num_input_tokens_seen": 199829915, + "step": 9280, + "time_per_iteration": 2.6043198108673096 + }, + { + "auxiliary_loss_clip": 0.01151604, + "auxiliary_loss_mlp": 0.0110963, + "balance_loss_clip": 1.00213552, + "balance_loss_mlp": 1.0007205, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.5704252439360018, + "language_loss": 0.73017383, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75278616, + "num_input_tokens_seen": 199850670, + "step": 9281, + "time_per_iteration": 2.604217529296875 + }, + { + "auxiliary_loss_clip": 0.01119329, + "auxiliary_loss_mlp": 0.00747148, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00023818, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 2.1353806248511553, + "language_loss": 0.75510448, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77376926, + "num_input_tokens_seen": 199867645, + "step": 9282, + "time_per_iteration": 2.6559486389160156 + }, + { + "auxiliary_loss_clip": 0.01084547, + "auxiliary_loss_mlp": 0.01109149, + "balance_loss_clip": 1.00168872, + "balance_loss_mlp": 1.00052536, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.81352263768592, + "language_loss": 0.66003799, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68197501, + "num_input_tokens_seen": 199886320, + "step": 9283, + "time_per_iteration": 2.725285530090332 + }, + { + "auxiliary_loss_clip": 0.01115781, + "auxiliary_loss_mlp": 0.01108555, + "balance_loss_clip": 1.00185466, + "balance_loss_mlp": 1.00050402, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 1.7467928019824057, + "language_loss": 0.83103168, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.853275, + "num_input_tokens_seen": 199904895, + "step": 9284, + "time_per_iteration": 2.633310079574585 + }, + { + "auxiliary_loss_clip": 0.01132463, + "auxiliary_loss_mlp": 0.01109968, + "balance_loss_clip": 1.00189984, + "balance_loss_mlp": 1.0005815, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.4400154950885824, + "language_loss": 0.856668, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87909234, + "num_input_tokens_seen": 199921090, + "step": 9285, + "time_per_iteration": 2.5932657718658447 + }, + { + "auxiliary_loss_clip": 0.01133364, + "auxiliary_loss_mlp": 0.01110004, + "balance_loss_clip": 1.0019654, + "balance_loss_mlp": 1.00071311, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 2.098146230835618, + "language_loss": 0.73996878, + "learning_rate": 1.720312582354912e-06, + "loss": 0.76240253, + "num_input_tokens_seen": 199939925, + "step": 9286, + "time_per_iteration": 2.592296600341797 + }, + { + "auxiliary_loss_clip": 0.01166308, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_clip": 1.00216305, + "balance_loss_mlp": 1.0004431, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.638050595707928, + "language_loss": 0.73876441, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76151818, + "num_input_tokens_seen": 199960015, + "step": 9287, + "time_per_iteration": 2.5786118507385254 + }, + { + "auxiliary_loss_clip": 0.01120138, + "auxiliary_loss_mlp": 0.011104, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00053692, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 1.9559569228214242, + "language_loss": 0.75304461, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77535003, + "num_input_tokens_seen": 199980505, + "step": 9288, + "time_per_iteration": 2.666896343231201 + }, + { + "auxiliary_loss_clip": 0.01135399, + "auxiliary_loss_mlp": 0.01110165, + "balance_loss_clip": 1.00212955, + "balance_loss_mlp": 1.00068331, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 2.1689051699895217, + "language_loss": 0.77414328, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79659891, + "num_input_tokens_seen": 199999020, + "step": 9289, + "time_per_iteration": 2.583523988723755 + }, + { + "auxiliary_loss_clip": 0.01118312, + "auxiliary_loss_mlp": 0.01111353, + "balance_loss_clip": 1.00198483, + "balance_loss_mlp": 1.00082183, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.7905396002476512, + "language_loss": 0.61401725, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63631392, + "num_input_tokens_seen": 200019020, + "step": 9290, + "time_per_iteration": 2.7188172340393066 + }, + { + "auxiliary_loss_clip": 0.01103885, + "auxiliary_loss_mlp": 0.01110023, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00054133, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 3.207061889313829, + "language_loss": 0.68025672, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70239586, + "num_input_tokens_seen": 200038110, + "step": 9291, + "time_per_iteration": 2.7283358573913574 + }, + { + "auxiliary_loss_clip": 0.0111926, + "auxiliary_loss_mlp": 0.01109496, + "balance_loss_clip": 1.0018543, + "balance_loss_mlp": 1.00058627, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 2.9096448688701844, + "language_loss": 0.83679521, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.8590827, + "num_input_tokens_seen": 200056210, + "step": 9292, + "time_per_iteration": 2.6824941635131836 + }, + { + "auxiliary_loss_clip": 0.01135063, + "auxiliary_loss_mlp": 0.01109073, + "balance_loss_clip": 1.00208485, + "balance_loss_mlp": 1.00064075, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 2.0319301437173403, + "language_loss": 0.74092746, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.76336884, + "num_input_tokens_seen": 200075620, + "step": 9293, + "time_per_iteration": 2.694835901260376 + }, + { + "auxiliary_loss_clip": 0.01139124, + "auxiliary_loss_mlp": 0.01109126, + "balance_loss_clip": 1.00232482, + "balance_loss_mlp": 1.00059843, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.9157062075341278, + "language_loss": 0.72707713, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.7495597, + "num_input_tokens_seen": 200095945, + "step": 9294, + "time_per_iteration": 4.012557506561279 + }, + { + "auxiliary_loss_clip": 0.01132919, + "auxiliary_loss_mlp": 0.00747072, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00015879, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 1.9923426134631372, + "language_loss": 0.68389374, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70269364, + "num_input_tokens_seen": 200114185, + "step": 9295, + "time_per_iteration": 2.6089727878570557 + }, + { + "auxiliary_loss_clip": 0.01166259, + "auxiliary_loss_mlp": 0.01109503, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.00059354, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 2.04074572417619, + "language_loss": 0.80722284, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82998049, + "num_input_tokens_seen": 200135030, + "step": 9296, + "time_per_iteration": 2.556828260421753 + }, + { + "auxiliary_loss_clip": 0.0114962, + "auxiliary_loss_mlp": 0.01109777, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.0005815, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.670079625393036, + "language_loss": 0.65795279, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.68054664, + "num_input_tokens_seen": 200154290, + "step": 9297, + "time_per_iteration": 2.5665953159332275 + }, + { + "auxiliary_loss_clip": 0.01116078, + "auxiliary_loss_mlp": 0.01110794, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.00074017, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 2.7731491643332205, + "language_loss": 0.75536966, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77763838, + "num_input_tokens_seen": 200171555, + "step": 9298, + "time_per_iteration": 2.638903856277466 + }, + { + "auxiliary_loss_clip": 0.01130509, + "auxiliary_loss_mlp": 0.0108641, + "balance_loss_clip": 1.00141501, + "balance_loss_mlp": 1.00019777, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6782816754140282, + "language_loss": 0.52377117, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54594034, + "num_input_tokens_seen": 200237010, + "step": 9299, + "time_per_iteration": 3.2480034828186035 + }, + { + "auxiliary_loss_clip": 0.01151382, + "auxiliary_loss_mlp": 0.01108079, + "balance_loss_clip": 1.00214612, + "balance_loss_mlp": 1.00060022, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 2.118647423835474, + "language_loss": 0.68295288, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.70554751, + "num_input_tokens_seen": 200260820, + "step": 9300, + "time_per_iteration": 2.6624438762664795 + }, + { + "auxiliary_loss_clip": 0.01105084, + "auxiliary_loss_mlp": 0.01110121, + "balance_loss_clip": 1.0020647, + "balance_loss_mlp": 1.00083017, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 3.04225863816872, + "language_loss": 0.81731403, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.83946609, + "num_input_tokens_seen": 200278035, + "step": 9301, + "time_per_iteration": 4.049944877624512 + }, + { + "auxiliary_loss_clip": 0.01166364, + "auxiliary_loss_mlp": 0.01109719, + "balance_loss_clip": 1.00201893, + "balance_loss_mlp": 1.00052357, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 1.9052885165400664, + "language_loss": 0.67623919, + "learning_rate": 1.714143795138756e-06, + "loss": 0.699, + "num_input_tokens_seen": 200297255, + "step": 9302, + "time_per_iteration": 3.9122776985168457 + }, + { + "auxiliary_loss_clip": 0.01121753, + "auxiliary_loss_mlp": 0.01110236, + "balance_loss_clip": 1.00201488, + "balance_loss_mlp": 1.0004679, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 1.5744700293679434, + "language_loss": 0.70970142, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73202133, + "num_input_tokens_seen": 200317505, + "step": 9303, + "time_per_iteration": 2.6555988788604736 + }, + { + "auxiliary_loss_clip": 0.01093542, + "auxiliary_loss_mlp": 0.01108485, + "balance_loss_clip": 1.00215149, + "balance_loss_mlp": 1.00062466, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.6644606295285964, + "language_loss": 0.72652829, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74854863, + "num_input_tokens_seen": 200338350, + "step": 9304, + "time_per_iteration": 4.3604700565338135 + }, + { + "auxiliary_loss_clip": 0.01149586, + "auxiliary_loss_mlp": 0.01110065, + "balance_loss_clip": 1.00196874, + "balance_loss_mlp": 1.00058317, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 2.103740509700553, + "language_loss": 0.78222179, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80481839, + "num_input_tokens_seen": 200353965, + "step": 9305, + "time_per_iteration": 2.755098819732666 + }, + { + "auxiliary_loss_clip": 0.01101583, + "auxiliary_loss_mlp": 0.01108487, + "balance_loss_clip": 1.00174904, + "balance_loss_mlp": 1.00053072, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.851846745469513, + "language_loss": 0.69389331, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.715994, + "num_input_tokens_seen": 200373595, + "step": 9306, + "time_per_iteration": 2.685593366622925 + }, + { + "auxiliary_loss_clip": 0.01129804, + "auxiliary_loss_mlp": 0.01085892, + "balance_loss_clip": 1.0013051, + "balance_loss_mlp": 1.00006151, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9023295618842037, + "language_loss": 0.60307276, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62522972, + "num_input_tokens_seen": 200429155, + "step": 9307, + "time_per_iteration": 3.2837724685668945 + }, + { + "auxiliary_loss_clip": 0.01155028, + "auxiliary_loss_mlp": 0.01108442, + "balance_loss_clip": 1.00207138, + "balance_loss_mlp": 1.00067711, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.9337598535600646, + "language_loss": 0.74401218, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76664692, + "num_input_tokens_seen": 200448290, + "step": 9308, + "time_per_iteration": 2.596323251724243 + }, + { + "auxiliary_loss_clip": 0.01088417, + "auxiliary_loss_mlp": 0.01109914, + "balance_loss_clip": 1.00181317, + "balance_loss_mlp": 1.00062275, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 2.1247641090180482, + "language_loss": 0.69748044, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71946377, + "num_input_tokens_seen": 200466555, + "step": 9309, + "time_per_iteration": 2.741656541824341 + }, + { + "auxiliary_loss_clip": 0.01133263, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.00055838, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 1.8510578607598378, + "language_loss": 0.75037706, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77281582, + "num_input_tokens_seen": 200485980, + "step": 9310, + "time_per_iteration": 2.627372980117798 + }, + { + "auxiliary_loss_clip": 0.01149786, + "auxiliary_loss_mlp": 0.01111601, + "balance_loss_clip": 1.002195, + "balance_loss_mlp": 1.00059295, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 3.0652471951234794, + "language_loss": 0.70128214, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.72389603, + "num_input_tokens_seen": 200504555, + "step": 9311, + "time_per_iteration": 2.6186773777008057 + }, + { + "auxiliary_loss_clip": 0.01149615, + "auxiliary_loss_mlp": 0.01109684, + "balance_loss_clip": 1.00201774, + "balance_loss_mlp": 1.00058401, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 3.1282613726564503, + "language_loss": 0.72441846, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74701148, + "num_input_tokens_seen": 200522700, + "step": 9312, + "time_per_iteration": 2.5507936477661133 + }, + { + "auxiliary_loss_clip": 0.01115986, + "auxiliary_loss_mlp": 0.0110961, + "balance_loss_clip": 1.00194085, + "balance_loss_mlp": 1.0005095, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 3.034565637921977, + "language_loss": 0.89223289, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91448891, + "num_input_tokens_seen": 200541910, + "step": 9313, + "time_per_iteration": 2.6314456462860107 + }, + { + "auxiliary_loss_clip": 0.01119641, + "auxiliary_loss_mlp": 0.01110146, + "balance_loss_clip": 1.0020802, + "balance_loss_mlp": 1.000664, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.492833568007223, + "language_loss": 0.77709568, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79939348, + "num_input_tokens_seen": 200562600, + "step": 9314, + "time_per_iteration": 2.693240165710449 + }, + { + "auxiliary_loss_clip": 0.01118345, + "auxiliary_loss_mlp": 0.01109401, + "balance_loss_clip": 1.00181377, + "balance_loss_mlp": 1.00049126, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.828966735130595, + "language_loss": 0.7053827, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72766018, + "num_input_tokens_seen": 200584795, + "step": 9315, + "time_per_iteration": 2.771298408508301 + }, + { + "auxiliary_loss_clip": 0.01135223, + "auxiliary_loss_mlp": 0.01110691, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00063753, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 1.7117243763017178, + "language_loss": 0.67100728, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.69346654, + "num_input_tokens_seen": 200606945, + "step": 9316, + "time_per_iteration": 2.6425387859344482 + }, + { + "auxiliary_loss_clip": 0.01119636, + "auxiliary_loss_mlp": 0.01109273, + "balance_loss_clip": 1.00180423, + "balance_loss_mlp": 1.00045872, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 1.926027863268309, + "language_loss": 0.87011552, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.89240456, + "num_input_tokens_seen": 200626340, + "step": 9317, + "time_per_iteration": 2.7228028774261475 + }, + { + "auxiliary_loss_clip": 0.01150116, + "auxiliary_loss_mlp": 0.01110468, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00050974, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.5917398736256634, + "language_loss": 0.77014232, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79274809, + "num_input_tokens_seen": 200644520, + "step": 9318, + "time_per_iteration": 2.603440046310425 + }, + { + "auxiliary_loss_clip": 0.01149352, + "auxiliary_loss_mlp": 0.01109422, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00060773, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 5.009924327710442, + "language_loss": 0.76359427, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78618205, + "num_input_tokens_seen": 200664845, + "step": 9319, + "time_per_iteration": 2.5988240242004395 + }, + { + "auxiliary_loss_clip": 0.01149748, + "auxiliary_loss_mlp": 0.0110864, + "balance_loss_clip": 1.00214458, + "balance_loss_mlp": 1.00058901, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.4038339311345058, + "language_loss": 0.85223728, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87482119, + "num_input_tokens_seen": 200686535, + "step": 9320, + "time_per_iteration": 2.608208656311035 + }, + { + "auxiliary_loss_clip": 0.01145836, + "auxiliary_loss_mlp": 0.01085945, + "balance_loss_clip": 1.00139546, + "balance_loss_mlp": 1.00011408, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7590192248043109, + "language_loss": 0.52622777, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54854554, + "num_input_tokens_seen": 200736965, + "step": 9321, + "time_per_iteration": 2.953925609588623 + }, + { + "auxiliary_loss_clip": 0.01134644, + "auxiliary_loss_mlp": 0.01109456, + "balance_loss_clip": 1.0019505, + "balance_loss_mlp": 1.00064206, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 2.0666150376472916, + "language_loss": 0.74560982, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76805079, + "num_input_tokens_seen": 200757420, + "step": 9322, + "time_per_iteration": 2.648376941680908 + }, + { + "auxiliary_loss_clip": 0.01166337, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.002033, + "balance_loss_mlp": 1.00060046, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.6030640454975822, + "language_loss": 0.73937762, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.76214182, + "num_input_tokens_seen": 200779520, + "step": 9323, + "time_per_iteration": 2.642781972885132 + }, + { + "auxiliary_loss_clip": 0.01133253, + "auxiliary_loss_mlp": 0.01109687, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00049138, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.4678425042232888, + "language_loss": 0.61616576, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.6385951, + "num_input_tokens_seen": 200799485, + "step": 9324, + "time_per_iteration": 2.5997776985168457 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01110356, + "balance_loss_clip": 1.00196242, + "balance_loss_mlp": 1.00058842, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 1.7437270308735482, + "language_loss": 0.87705004, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89924115, + "num_input_tokens_seen": 200817540, + "step": 9325, + "time_per_iteration": 2.643327474594116 + }, + { + "auxiliary_loss_clip": 0.01134783, + "auxiliary_loss_mlp": 0.01110291, + "balance_loss_clip": 1.00200701, + "balance_loss_mlp": 1.00052261, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.6349214603224695, + "language_loss": 0.73720986, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.7596606, + "num_input_tokens_seen": 200838380, + "step": 9326, + "time_per_iteration": 2.6271820068359375 + }, + { + "auxiliary_loss_clip": 0.01136339, + "auxiliary_loss_mlp": 0.01110967, + "balance_loss_clip": 1.0021311, + "balance_loss_mlp": 1.00053203, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 1.798575412544955, + "language_loss": 0.78194672, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80441976, + "num_input_tokens_seen": 200855640, + "step": 9327, + "time_per_iteration": 2.6078174114227295 + }, + { + "auxiliary_loss_clip": 0.01150079, + "auxiliary_loss_mlp": 0.01110274, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00050628, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 1.8381249152122896, + "language_loss": 0.78876877, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.81137234, + "num_input_tokens_seen": 200876585, + "step": 9328, + "time_per_iteration": 2.597982406616211 + }, + { + "auxiliary_loss_clip": 0.01166329, + "auxiliary_loss_mlp": 0.01109105, + "balance_loss_clip": 1.00217271, + "balance_loss_mlp": 1.00048113, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.4536056268035682, + "language_loss": 0.73468542, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.75743973, + "num_input_tokens_seen": 200898175, + "step": 9329, + "time_per_iteration": 2.5392701625823975 + }, + { + "auxiliary_loss_clip": 0.01133065, + "auxiliary_loss_mlp": 0.00747318, + "balance_loss_clip": 1.001845, + "balance_loss_mlp": 1.00026202, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 2.8352350413099017, + "language_loss": 0.83413327, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.8529371, + "num_input_tokens_seen": 200917515, + "step": 9330, + "time_per_iteration": 2.607656478881836 + }, + { + "auxiliary_loss_clip": 0.01161136, + "auxiliary_loss_mlp": 0.01085925, + "balance_loss_clip": 1.00130785, + "balance_loss_mlp": 1.00009429, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7206707300518416, + "language_loss": 0.57824624, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.60071683, + "num_input_tokens_seen": 200978615, + "step": 9331, + "time_per_iteration": 3.137127637863159 + }, + { + "auxiliary_loss_clip": 0.01101469, + "auxiliary_loss_mlp": 0.01109399, + "balance_loss_clip": 1.00176764, + "balance_loss_mlp": 1.00058484, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 2.235291734500983, + "language_loss": 0.81552815, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.83763683, + "num_input_tokens_seen": 200997745, + "step": 9332, + "time_per_iteration": 4.0757129192352295 + }, + { + "auxiliary_loss_clip": 0.01151965, + "auxiliary_loss_mlp": 0.01111182, + "balance_loss_clip": 1.00217962, + "balance_loss_mlp": 1.00065112, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 3.865635230753519, + "language_loss": 0.81525856, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83789003, + "num_input_tokens_seen": 201016370, + "step": 9333, + "time_per_iteration": 2.5562124252319336 + }, + { + "auxiliary_loss_clip": 0.01166386, + "auxiliary_loss_mlp": 0.01109934, + "balance_loss_clip": 1.00206947, + "balance_loss_mlp": 1.00054753, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.8515392057864133, + "language_loss": 0.7246834, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.74744666, + "num_input_tokens_seen": 201034310, + "step": 9334, + "time_per_iteration": 2.5558509826660156 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01109589, + "balance_loss_clip": 1.00219464, + "balance_loss_mlp": 1.000489, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.7099433676429334, + "language_loss": 0.71424854, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73675025, + "num_input_tokens_seen": 201052030, + "step": 9335, + "time_per_iteration": 2.581031322479248 + }, + { + "auxiliary_loss_clip": 0.01135073, + "auxiliary_loss_mlp": 0.01109891, + "balance_loss_clip": 1.00201285, + "balance_loss_mlp": 1.00060022, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.8485939116508867, + "language_loss": 0.76651984, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78896952, + "num_input_tokens_seen": 201068445, + "step": 9336, + "time_per_iteration": 2.577080011367798 + }, + { + "auxiliary_loss_clip": 0.01149747, + "auxiliary_loss_mlp": 0.01110091, + "balance_loss_clip": 1.00213695, + "balance_loss_mlp": 1.00060928, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.237540161814564, + "language_loss": 0.64263105, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66522944, + "num_input_tokens_seen": 201082140, + "step": 9337, + "time_per_iteration": 2.581791639328003 + }, + { + "auxiliary_loss_clip": 0.01127811, + "auxiliary_loss_mlp": 0.01086852, + "balance_loss_clip": 1.00114989, + "balance_loss_mlp": 1.00025845, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.8788879129871695, + "language_loss": 0.62620664, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64835322, + "num_input_tokens_seen": 201137245, + "step": 9338, + "time_per_iteration": 3.1160330772399902 + }, + { + "auxiliary_loss_clip": 0.01120025, + "auxiliary_loss_mlp": 0.01109715, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00051916, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.935474168378378, + "language_loss": 0.65364242, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.6759398, + "num_input_tokens_seen": 201157270, + "step": 9339, + "time_per_iteration": 4.235727071762085 + }, + { + "auxiliary_loss_clip": 0.01151171, + "auxiliary_loss_mlp": 0.01109921, + "balance_loss_clip": 1.00204968, + "balance_loss_mlp": 1.00053453, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 1.8891794146577414, + "language_loss": 0.698376, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.7209869, + "num_input_tokens_seen": 201174530, + "step": 9340, + "time_per_iteration": 2.5340232849121094 + }, + { + "auxiliary_loss_clip": 0.01118408, + "auxiliary_loss_mlp": 0.0110961, + "balance_loss_clip": 1.00194383, + "balance_loss_mlp": 1.00060475, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.6263667029463165, + "language_loss": 0.77078414, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79306436, + "num_input_tokens_seen": 201194905, + "step": 9341, + "time_per_iteration": 2.650364875793457 + }, + { + "auxiliary_loss_clip": 0.01102635, + "auxiliary_loss_mlp": 0.01110111, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00062895, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.6351096298854197, + "language_loss": 0.80087692, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.82300436, + "num_input_tokens_seen": 201213715, + "step": 9342, + "time_per_iteration": 4.129619359970093 + }, + { + "auxiliary_loss_clip": 0.01119738, + "auxiliary_loss_mlp": 0.01109815, + "balance_loss_clip": 1.00194824, + "balance_loss_mlp": 1.00052452, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.1688263835036885, + "language_loss": 0.76815015, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.79044569, + "num_input_tokens_seen": 201231415, + "step": 9343, + "time_per_iteration": 2.6223092079162598 + }, + { + "auxiliary_loss_clip": 0.01101713, + "auxiliary_loss_mlp": 0.01110596, + "balance_loss_clip": 1.00200248, + "balance_loss_mlp": 1.00073302, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.94493702636527, + "language_loss": 0.6882236, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.7103467, + "num_input_tokens_seen": 201249625, + "step": 9344, + "time_per_iteration": 2.6869375705718994 + }, + { + "auxiliary_loss_clip": 0.01166383, + "auxiliary_loss_mlp": 0.01109441, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00053179, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.350941192322124, + "language_loss": 0.66072369, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68348193, + "num_input_tokens_seen": 201271205, + "step": 9345, + "time_per_iteration": 2.5906713008880615 + }, + { + "auxiliary_loss_clip": 0.01133142, + "auxiliary_loss_mlp": 0.01110424, + "balance_loss_clip": 1.00216579, + "balance_loss_mlp": 1.00046563, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 2.6812193121795618, + "language_loss": 0.8709814, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89341706, + "num_input_tokens_seen": 201287700, + "step": 9346, + "time_per_iteration": 2.560694932937622 + }, + { + "auxiliary_loss_clip": 0.01140159, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_clip": 1.00219285, + "balance_loss_mlp": 1.00062406, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.055540346572536, + "language_loss": 0.59891242, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.62141412, + "num_input_tokens_seen": 201307530, + "step": 9347, + "time_per_iteration": 2.646010160446167 + }, + { + "auxiliary_loss_clip": 0.011501, + "auxiliary_loss_mlp": 0.01110562, + "balance_loss_clip": 1.00207245, + "balance_loss_mlp": 1.00050843, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.184094987897251, + "language_loss": 0.68516064, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.70776725, + "num_input_tokens_seen": 201326210, + "step": 9348, + "time_per_iteration": 2.525383472442627 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01110751, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00041127, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.9285332605965302, + "language_loss": 0.79282403, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81494224, + "num_input_tokens_seen": 201346120, + "step": 9349, + "time_per_iteration": 2.6549229621887207 + }, + { + "auxiliary_loss_clip": 0.01086919, + "auxiliary_loss_mlp": 0.01110201, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00062335, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.0314290451499564, + "language_loss": 0.66624922, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.68822044, + "num_input_tokens_seen": 201365700, + "step": 9350, + "time_per_iteration": 2.7856459617614746 + }, + { + "auxiliary_loss_clip": 0.01109057, + "auxiliary_loss_mlp": 0.01111132, + "balance_loss_clip": 1.00206351, + "balance_loss_mlp": 1.00060153, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 2.3297668315307614, + "language_loss": 0.78365207, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80585396, + "num_input_tokens_seen": 201382795, + "step": 9351, + "time_per_iteration": 2.6459996700286865 + }, + { + "auxiliary_loss_clip": 0.01135211, + "auxiliary_loss_mlp": 0.00747324, + "balance_loss_clip": 1.0020045, + "balance_loss_mlp": 1.00030243, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.6408997311852092, + "language_loss": 0.58860272, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.60742801, + "num_input_tokens_seen": 201402780, + "step": 9352, + "time_per_iteration": 2.7043659687042236 + }, + { + "auxiliary_loss_clip": 0.01151359, + "auxiliary_loss_mlp": 0.0110838, + "balance_loss_clip": 1.00217295, + "balance_loss_mlp": 1.00061524, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.7166882457658446, + "language_loss": 0.71930963, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.741907, + "num_input_tokens_seen": 201424140, + "step": 9353, + "time_per_iteration": 2.6079325675964355 + }, + { + "auxiliary_loss_clip": 0.01132529, + "auxiliary_loss_mlp": 0.01110412, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.0005486, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 2.5480496680713114, + "language_loss": 0.76349205, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.78592151, + "num_input_tokens_seen": 201439645, + "step": 9354, + "time_per_iteration": 2.55946683883667 + }, + { + "auxiliary_loss_clip": 0.01118096, + "auxiliary_loss_mlp": 0.01110782, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00063241, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 2.009100010987294, + "language_loss": 0.72559249, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74788123, + "num_input_tokens_seen": 201459970, + "step": 9355, + "time_per_iteration": 2.6531732082366943 + }, + { + "auxiliary_loss_clip": 0.01149929, + "auxiliary_loss_mlp": 0.01109219, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00050044, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 2.648789511416065, + "language_loss": 0.7306844, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75327587, + "num_input_tokens_seen": 201480055, + "step": 9356, + "time_per_iteration": 2.6120314598083496 + }, + { + "auxiliary_loss_clip": 0.01166331, + "auxiliary_loss_mlp": 0.01109822, + "balance_loss_clip": 1.00211453, + "balance_loss_mlp": 1.0006268, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 3.8567686025547494, + "language_loss": 0.83336651, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.8561281, + "num_input_tokens_seen": 201497645, + "step": 9357, + "time_per_iteration": 2.4936347007751465 + }, + { + "auxiliary_loss_clip": 0.01149586, + "auxiliary_loss_mlp": 0.01109257, + "balance_loss_clip": 1.00199819, + "balance_loss_mlp": 1.00053787, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 3.3177821253947193, + "language_loss": 0.72792631, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.75051475, + "num_input_tokens_seen": 201515455, + "step": 9358, + "time_per_iteration": 2.524707555770874 + }, + { + "auxiliary_loss_clip": 0.01166347, + "auxiliary_loss_mlp": 0.01110057, + "balance_loss_clip": 1.00211, + "balance_loss_mlp": 1.00076556, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 1.678189432019929, + "language_loss": 0.77547926, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79824328, + "num_input_tokens_seen": 201534500, + "step": 9359, + "time_per_iteration": 2.552675247192383 + }, + { + "auxiliary_loss_clip": 0.01133107, + "auxiliary_loss_mlp": 0.01111375, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.0005579, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.7816565093480767, + "language_loss": 0.70718765, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72963244, + "num_input_tokens_seen": 201553280, + "step": 9360, + "time_per_iteration": 2.6345338821411133 + }, + { + "auxiliary_loss_clip": 0.01104042, + "auxiliary_loss_mlp": 0.01086251, + "balance_loss_clip": 1.00126123, + "balance_loss_mlp": 1.0000391, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.8437178529239763, + "language_loss": 0.55610836, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57801139, + "num_input_tokens_seen": 201610030, + "step": 9361, + "time_per_iteration": 3.1626627445220947 + }, + { + "auxiliary_loss_clip": 0.01132293, + "auxiliary_loss_mlp": 0.01109094, + "balance_loss_clip": 1.00197625, + "balance_loss_mlp": 1.0006609, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.4891644090303109, + "language_loss": 0.81980288, + "learning_rate": 1.691036046141018e-06, + "loss": 0.84221673, + "num_input_tokens_seen": 201628370, + "step": 9362, + "time_per_iteration": 2.602449893951416 + }, + { + "auxiliary_loss_clip": 0.01117518, + "auxiliary_loss_mlp": 0.00747225, + "balance_loss_clip": 1.00188947, + "balance_loss_mlp": 1.00027251, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.790530382254686, + "language_loss": 0.74244177, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.76108921, + "num_input_tokens_seen": 201649790, + "step": 9363, + "time_per_iteration": 2.786322593688965 + }, + { + "auxiliary_loss_clip": 0.01151168, + "auxiliary_loss_mlp": 0.01109993, + "balance_loss_clip": 1.00204253, + "balance_loss_mlp": 1.00060642, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.8779605042384628, + "language_loss": 0.8267405, + "learning_rate": 1.690266496731839e-06, + "loss": 0.84935212, + "num_input_tokens_seen": 201669175, + "step": 9364, + "time_per_iteration": 2.6357715129852295 + }, + { + "auxiliary_loss_clip": 0.01123713, + "auxiliary_loss_mlp": 0.011098, + "balance_loss_clip": 1.00210583, + "balance_loss_mlp": 1.00069928, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.5590626438756234, + "language_loss": 0.65388829, + "learning_rate": 1.689881739637642e-06, + "loss": 0.6762234, + "num_input_tokens_seen": 201687000, + "step": 9365, + "time_per_iteration": 2.639991521835327 + }, + { + "auxiliary_loss_clip": 0.01135473, + "auxiliary_loss_mlp": 0.011113, + "balance_loss_clip": 1.00206876, + "balance_loss_mlp": 1.00067401, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 4.5715042703969, + "language_loss": 0.82194138, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.84440905, + "num_input_tokens_seen": 201703335, + "step": 9366, + "time_per_iteration": 2.5995166301727295 + }, + { + "auxiliary_loss_clip": 0.01166279, + "auxiliary_loss_mlp": 0.01109038, + "balance_loss_clip": 1.0021534, + "balance_loss_mlp": 1.00060546, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.5006184405791638, + "language_loss": 0.73272455, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75547767, + "num_input_tokens_seen": 201723495, + "step": 9367, + "time_per_iteration": 2.52303409576416 + }, + { + "auxiliary_loss_clip": 0.01127674, + "auxiliary_loss_mlp": 0.01086503, + "balance_loss_clip": 1.00104237, + "balance_loss_mlp": 1.00029111, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6328136273191319, + "language_loss": 0.53541386, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55755556, + "num_input_tokens_seen": 201792615, + "step": 9368, + "time_per_iteration": 3.2671313285827637 + }, + { + "auxiliary_loss_clip": 0.01166378, + "auxiliary_loss_mlp": 0.01110026, + "balance_loss_clip": 1.00217867, + "balance_loss_mlp": 1.00054383, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.687325792093094, + "language_loss": 0.69366711, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71643114, + "num_input_tokens_seen": 201812520, + "step": 9369, + "time_per_iteration": 3.987178087234497 + }, + { + "auxiliary_loss_clip": 0.0111939, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00062203, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 2.500702068629991, + "language_loss": 0.76137316, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.78366232, + "num_input_tokens_seen": 201834185, + "step": 9370, + "time_per_iteration": 2.7131903171539307 + }, + { + "auxiliary_loss_clip": 0.0113872, + "auxiliary_loss_mlp": 0.01111024, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.00058806, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 3.2562417010022497, + "language_loss": 0.76167583, + "learning_rate": 1.687573444537108e-06, + "loss": 0.78417331, + "num_input_tokens_seen": 201851305, + "step": 9371, + "time_per_iteration": 2.5781846046447754 + }, + { + "auxiliary_loss_clip": 0.01149749, + "auxiliary_loss_mlp": 0.01109205, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.000772, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 1.7517646905014432, + "language_loss": 0.76007783, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78266734, + "num_input_tokens_seen": 201870350, + "step": 9372, + "time_per_iteration": 2.543015480041504 + }, + { + "auxiliary_loss_clip": 0.01134597, + "auxiliary_loss_mlp": 0.01109592, + "balance_loss_clip": 1.00198781, + "balance_loss_mlp": 1.00058711, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 1.9448191689162684, + "language_loss": 0.71165919, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73410112, + "num_input_tokens_seen": 201886800, + "step": 9373, + "time_per_iteration": 2.5687875747680664 + }, + { + "auxiliary_loss_clip": 0.01118302, + "auxiliary_loss_mlp": 0.01109948, + "balance_loss_clip": 1.00199986, + "balance_loss_mlp": 1.00046635, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 8.56092329694232, + "language_loss": 0.82534528, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84762776, + "num_input_tokens_seen": 201904730, + "step": 9374, + "time_per_iteration": 2.633889675140381 + }, + { + "auxiliary_loss_clip": 0.01150949, + "auxiliary_loss_mlp": 0.01108711, + "balance_loss_clip": 1.00198197, + "balance_loss_mlp": 1.00046873, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 2.3820467376454824, + "language_loss": 0.66183031, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.6844269, + "num_input_tokens_seen": 201924850, + "step": 9375, + "time_per_iteration": 2.644178867340088 + }, + { + "auxiliary_loss_clip": 0.01117796, + "auxiliary_loss_mlp": 0.00747131, + "balance_loss_clip": 1.0019356, + "balance_loss_mlp": 1.00020194, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 4.824378653833676, + "language_loss": 0.80935657, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.82800579, + "num_input_tokens_seen": 201939500, + "step": 9376, + "time_per_iteration": 2.6149022579193115 + }, + { + "auxiliary_loss_clip": 0.0113438, + "auxiliary_loss_mlp": 0.01110372, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00069952, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.3245561466972122, + "language_loss": 0.6941641, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71661162, + "num_input_tokens_seen": 201963000, + "step": 9377, + "time_per_iteration": 4.251716136932373 + }, + { + "auxiliary_loss_clip": 0.01123819, + "auxiliary_loss_mlp": 0.01109401, + "balance_loss_clip": 1.00223422, + "balance_loss_mlp": 1.00058651, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.3821854232785098, + "language_loss": 0.74512583, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76745802, + "num_input_tokens_seen": 201983145, + "step": 9378, + "time_per_iteration": 2.700486421585083 + }, + { + "auxiliary_loss_clip": 0.01166562, + "auxiliary_loss_mlp": 0.01111404, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00058675, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 3.0456761326767343, + "language_loss": 0.81982934, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84260905, + "num_input_tokens_seen": 202000335, + "step": 9379, + "time_per_iteration": 2.53391695022583 + }, + { + "auxiliary_loss_clip": 0.01134795, + "auxiliary_loss_mlp": 0.01109669, + "balance_loss_clip": 1.00197244, + "balance_loss_mlp": 1.00066376, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 3.727487728276947, + "language_loss": 0.71742558, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73987019, + "num_input_tokens_seen": 202018275, + "step": 9380, + "time_per_iteration": 4.040493011474609 + }, + { + "auxiliary_loss_clip": 0.01103441, + "auxiliary_loss_mlp": 0.01110034, + "balance_loss_clip": 1.00184929, + "balance_loss_mlp": 1.00064731, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 2.527702076345482, + "language_loss": 0.7444492, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76658398, + "num_input_tokens_seen": 202034330, + "step": 9381, + "time_per_iteration": 2.6308493614196777 + }, + { + "auxiliary_loss_clip": 0.01105405, + "auxiliary_loss_mlp": 0.01110412, + "balance_loss_clip": 1.00203133, + "balance_loss_mlp": 1.0007391, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 1.9682725783225299, + "language_loss": 0.72316992, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74532807, + "num_input_tokens_seen": 202053100, + "step": 9382, + "time_per_iteration": 2.66890025138855 + }, + { + "auxiliary_loss_clip": 0.01160928, + "auxiliary_loss_mlp": 0.01085956, + "balance_loss_clip": 1.00126863, + "balance_loss_mlp": 1.00012541, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.8076530196555756, + "language_loss": 0.54494965, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56741846, + "num_input_tokens_seen": 202120125, + "step": 9383, + "time_per_iteration": 3.21120285987854 + }, + { + "auxiliary_loss_clip": 0.01155271, + "auxiliary_loss_mlp": 0.01110521, + "balance_loss_clip": 1.00213718, + "balance_loss_mlp": 1.00056243, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 2.2530145436998095, + "language_loss": 0.70783025, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73048818, + "num_input_tokens_seen": 202138030, + "step": 9384, + "time_per_iteration": 2.5423507690429688 + }, + { + "auxiliary_loss_clip": 0.01132786, + "auxiliary_loss_mlp": 0.01109817, + "balance_loss_clip": 1.00187874, + "balance_loss_mlp": 1.00043046, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 2.087009064504852, + "language_loss": 0.75932354, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78174961, + "num_input_tokens_seen": 202155580, + "step": 9385, + "time_per_iteration": 2.6038026809692383 + }, + { + "auxiliary_loss_clip": 0.01151507, + "auxiliary_loss_mlp": 0.01109858, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00066245, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 2.108046730107934, + "language_loss": 0.82053387, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84314752, + "num_input_tokens_seen": 202170365, + "step": 9386, + "time_per_iteration": 2.531697988510132 + }, + { + "auxiliary_loss_clip": 0.01155405, + "auxiliary_loss_mlp": 0.01111322, + "balance_loss_clip": 1.00220144, + "balance_loss_mlp": 1.00069559, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 4.387828142149606, + "language_loss": 0.69605708, + "learning_rate": 1.681420084607516e-06, + "loss": 0.71872437, + "num_input_tokens_seen": 202189095, + "step": 9387, + "time_per_iteration": 2.531519651412964 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01110462, + "balance_loss_clip": 1.00203276, + "balance_loss_mlp": 1.00069427, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.5637028164918199, + "language_loss": 0.74426413, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76686692, + "num_input_tokens_seen": 202213500, + "step": 9388, + "time_per_iteration": 2.677947521209717 + }, + { + "auxiliary_loss_clip": 0.01149636, + "auxiliary_loss_mlp": 0.01108683, + "balance_loss_clip": 1.00206661, + "balance_loss_mlp": 1.00053644, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 2.250277146396002, + "language_loss": 0.82131779, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84390092, + "num_input_tokens_seen": 202231920, + "step": 9389, + "time_per_iteration": 2.5758209228515625 + }, + { + "auxiliary_loss_clip": 0.0111977, + "auxiliary_loss_mlp": 0.01110601, + "balance_loss_clip": 1.00196815, + "balance_loss_mlp": 1.00073731, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 1.9861004588124926, + "language_loss": 0.63704324, + "learning_rate": 1.680266672116467e-06, + "loss": 0.65934694, + "num_input_tokens_seen": 202247600, + "step": 9390, + "time_per_iteration": 2.611802577972412 + }, + { + "auxiliary_loss_clip": 0.01133002, + "auxiliary_loss_mlp": 0.01110118, + "balance_loss_clip": 1.0020647, + "balance_loss_mlp": 1.00073218, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.9477863217587024, + "language_loss": 0.91797125, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94040251, + "num_input_tokens_seen": 202265350, + "step": 9391, + "time_per_iteration": 2.5822298526763916 + }, + { + "auxiliary_loss_clip": 0.01150365, + "auxiliary_loss_mlp": 0.01111286, + "balance_loss_clip": 1.00209975, + "balance_loss_mlp": 1.00066006, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 2.3072077439184238, + "language_loss": 0.60146314, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62407964, + "num_input_tokens_seen": 202284285, + "step": 9392, + "time_per_iteration": 2.617739200592041 + }, + { + "auxiliary_loss_clip": 0.01102756, + "auxiliary_loss_mlp": 0.01109906, + "balance_loss_clip": 1.0017854, + "balance_loss_mlp": 1.00061512, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 2.1072677133749598, + "language_loss": 0.8143748, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83650136, + "num_input_tokens_seen": 202303450, + "step": 9393, + "time_per_iteration": 2.7482874393463135 + }, + { + "auxiliary_loss_clip": 0.01132779, + "auxiliary_loss_mlp": 0.0110991, + "balance_loss_clip": 1.00190818, + "balance_loss_mlp": 1.0007143, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.7663667668914467, + "language_loss": 0.87326336, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89569026, + "num_input_tokens_seen": 202322315, + "step": 9394, + "time_per_iteration": 2.6076033115386963 + }, + { + "auxiliary_loss_clip": 0.01151771, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_clip": 1.00226498, + "balance_loss_mlp": 1.00072455, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 1.9421184401114093, + "language_loss": 0.84712982, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.86973721, + "num_input_tokens_seen": 202339905, + "step": 9395, + "time_per_iteration": 2.629365921020508 + }, + { + "auxiliary_loss_clip": 0.01144281, + "auxiliary_loss_mlp": 0.01086075, + "balance_loss_clip": 1.00122571, + "balance_loss_mlp": 1.00024402, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.8732699881328276, + "language_loss": 0.58270955, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60501313, + "num_input_tokens_seen": 202397320, + "step": 9396, + "time_per_iteration": 3.1234922409057617 + }, + { + "auxiliary_loss_clip": 0.01133082, + "auxiliary_loss_mlp": 0.01110405, + "balance_loss_clip": 1.0019542, + "balance_loss_mlp": 1.00054181, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.0279336273918402, + "language_loss": 0.69506276, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.71749759, + "num_input_tokens_seen": 202416865, + "step": 9397, + "time_per_iteration": 2.6330406665802 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.01110787, + "balance_loss_clip": 1.00186372, + "balance_loss_mlp": 1.00063801, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 2.2969556420411785, + "language_loss": 0.66663641, + "learning_rate": 1.67719144001275e-06, + "loss": 0.68891811, + "num_input_tokens_seen": 202436210, + "step": 9398, + "time_per_iteration": 2.6451375484466553 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01085899, + "balance_loss_clip": 1.00126624, + "balance_loss_mlp": 1.00006795, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.7592398708022497, + "language_loss": 0.58145344, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60361111, + "num_input_tokens_seen": 202492925, + "step": 9399, + "time_per_iteration": 3.0770368576049805 + }, + { + "auxiliary_loss_clip": 0.01104604, + "auxiliary_loss_mlp": 0.01110354, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00058639, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 2.0262993684000947, + "language_loss": 0.73058426, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75273389, + "num_input_tokens_seen": 202511905, + "step": 9400, + "time_per_iteration": 2.705906867980957 + }, + { + "auxiliary_loss_clip": 0.01118261, + "auxiliary_loss_mlp": 0.01111037, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00050628, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 1.7119857804772212, + "language_loss": 0.60959435, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63188732, + "num_input_tokens_seen": 202529815, + "step": 9401, + "time_per_iteration": 2.643552303314209 + }, + { + "auxiliary_loss_clip": 0.01123683, + "auxiliary_loss_mlp": 0.01108983, + "balance_loss_clip": 1.00203443, + "balance_loss_mlp": 1.00055027, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.8670561505980978, + "language_loss": 0.81112307, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83344972, + "num_input_tokens_seen": 202547710, + "step": 9402, + "time_per_iteration": 2.622663736343384 + }, + { + "auxiliary_loss_clip": 0.01104878, + "auxiliary_loss_mlp": 0.01109082, + "balance_loss_clip": 1.00188601, + "balance_loss_mlp": 1.00064898, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.5140125777487095, + "language_loss": 0.77772617, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.79986572, + "num_input_tokens_seen": 202568835, + "step": 9403, + "time_per_iteration": 2.766632080078125 + }, + { + "auxiliary_loss_clip": 0.01105315, + "auxiliary_loss_mlp": 0.01109464, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00045872, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.5865020121521103, + "language_loss": 0.68806779, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.71021557, + "num_input_tokens_seen": 202587385, + "step": 9404, + "time_per_iteration": 2.6656112670898438 + }, + { + "auxiliary_loss_clip": 0.01138815, + "auxiliary_loss_mlp": 0.01108659, + "balance_loss_clip": 1.00208163, + "balance_loss_mlp": 1.00060725, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 2.2001367400825576, + "language_loss": 0.67599595, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.69847071, + "num_input_tokens_seen": 202604815, + "step": 9405, + "time_per_iteration": 2.6597299575805664 + }, + { + "auxiliary_loss_clip": 0.01134867, + "auxiliary_loss_mlp": 0.01108468, + "balance_loss_clip": 1.00211883, + "balance_loss_mlp": 1.00060749, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.8294473053730451, + "language_loss": 0.74590617, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76833951, + "num_input_tokens_seen": 202623775, + "step": 9406, + "time_per_iteration": 2.61776065826416 + }, + { + "auxiliary_loss_clip": 0.01107058, + "auxiliary_loss_mlp": 0.01110714, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.0006597, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.800859514471088, + "language_loss": 0.79444617, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81662387, + "num_input_tokens_seen": 202643375, + "step": 9407, + "time_per_iteration": 4.098676681518555 + }, + { + "auxiliary_loss_clip": 0.01119093, + "auxiliary_loss_mlp": 0.01108327, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00065696, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.5482158704285853, + "language_loss": 0.7090162, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.73129046, + "num_input_tokens_seen": 202668400, + "step": 9408, + "time_per_iteration": 2.7528672218322754 + }, + { + "auxiliary_loss_clip": 0.01107037, + "auxiliary_loss_mlp": 0.01109417, + "balance_loss_clip": 1.00199759, + "balance_loss_mlp": 1.00069761, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 1.78798756092494, + "language_loss": 0.80947101, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83163553, + "num_input_tokens_seen": 202685125, + "step": 9409, + "time_per_iteration": 2.6686174869537354 + }, + { + "auxiliary_loss_clip": 0.01123589, + "auxiliary_loss_mlp": 0.01109883, + "balance_loss_clip": 1.00209057, + "balance_loss_mlp": 1.00059199, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.5431949687182502, + "language_loss": 0.78262579, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80496061, + "num_input_tokens_seen": 202703830, + "step": 9410, + "time_per_iteration": 2.6314022541046143 + }, + { + "auxiliary_loss_clip": 0.011662, + "auxiliary_loss_mlp": 0.0111006, + "balance_loss_clip": 1.00207353, + "balance_loss_mlp": 1.0005784, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.4045575179396823, + "language_loss": 0.83123493, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85399753, + "num_input_tokens_seen": 202719835, + "step": 9411, + "time_per_iteration": 2.499417543411255 + }, + { + "auxiliary_loss_clip": 0.01149721, + "auxiliary_loss_mlp": 0.01110482, + "balance_loss_clip": 1.00199115, + "balance_loss_mlp": 1.00052357, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.053267014930074, + "language_loss": 0.66967517, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69227719, + "num_input_tokens_seen": 202736795, + "step": 9412, + "time_per_iteration": 2.558208465576172 + }, + { + "auxiliary_loss_clip": 0.01149532, + "auxiliary_loss_mlp": 0.01107907, + "balance_loss_clip": 1.00203025, + "balance_loss_mlp": 1.00052381, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 2.2351632625460827, + "language_loss": 0.58432478, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60689914, + "num_input_tokens_seen": 202756900, + "step": 9413, + "time_per_iteration": 2.6075687408447266 + }, + { + "auxiliary_loss_clip": 0.01070043, + "auxiliary_loss_mlp": 0.01108802, + "balance_loss_clip": 1.00174928, + "balance_loss_mlp": 1.00055957, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.6548111450109695, + "language_loss": 0.69753432, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71932268, + "num_input_tokens_seen": 202775145, + "step": 9414, + "time_per_iteration": 4.112274169921875 + }, + { + "auxiliary_loss_clip": 0.01067833, + "auxiliary_loss_mlp": 0.01108595, + "balance_loss_clip": 1.00168157, + "balance_loss_mlp": 1.00054336, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 2.0475124236086537, + "language_loss": 0.78243786, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80420208, + "num_input_tokens_seen": 202794505, + "step": 9415, + "time_per_iteration": 4.290862560272217 + }, + { + "auxiliary_loss_clip": 0.01129024, + "auxiliary_loss_mlp": 0.01085534, + "balance_loss_clip": 1.00125611, + "balance_loss_mlp": 1.0000844, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.6854775218296851, + "language_loss": 0.49223837, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51438391, + "num_input_tokens_seen": 202858580, + "step": 9416, + "time_per_iteration": 3.5135927200317383 + }, + { + "auxiliary_loss_clip": 0.01149653, + "auxiliary_loss_mlp": 0.0074726, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00032043, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 1.8753732323213794, + "language_loss": 0.62772924, + "learning_rate": 1.6698909172706e-06, + "loss": 0.64669836, + "num_input_tokens_seen": 202878565, + "step": 9417, + "time_per_iteration": 4.166104555130005 + }, + { + "auxiliary_loss_clip": 0.01134869, + "auxiliary_loss_mlp": 0.0111007, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00049293, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.8145832212712616, + "language_loss": 0.68948245, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71193182, + "num_input_tokens_seen": 202897350, + "step": 9418, + "time_per_iteration": 2.626194477081299 + }, + { + "auxiliary_loss_clip": 0.01151373, + "auxiliary_loss_mlp": 0.0111029, + "balance_loss_clip": 1.00197339, + "balance_loss_mlp": 1.00042653, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.8490269795605851, + "language_loss": 0.64944655, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.67206317, + "num_input_tokens_seen": 202916745, + "step": 9419, + "time_per_iteration": 2.5967721939086914 + }, + { + "auxiliary_loss_clip": 0.01061696, + "auxiliary_loss_mlp": 0.01087238, + "balance_loss_clip": 1.00128961, + "balance_loss_mlp": 1.00026321, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7482406565824105, + "language_loss": 0.59687674, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.618366, + "num_input_tokens_seen": 202982375, + "step": 9420, + "time_per_iteration": 3.742680072784424 + }, + { + "auxiliary_loss_clip": 0.01136014, + "auxiliary_loss_mlp": 0.00747091, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00027323, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.679318407967829, + "language_loss": 0.73731232, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.75614333, + "num_input_tokens_seen": 203002430, + "step": 9421, + "time_per_iteration": 2.9379055500030518 + }, + { + "auxiliary_loss_clip": 0.01117779, + "auxiliary_loss_mlp": 0.01110277, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.00060391, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.7973475325894477, + "language_loss": 0.72945666, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.75173736, + "num_input_tokens_seen": 203019425, + "step": 9422, + "time_per_iteration": 2.6020591259002686 + }, + { + "auxiliary_loss_clip": 0.01149521, + "auxiliary_loss_mlp": 0.01108653, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00079203, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.5878211747843247, + "language_loss": 0.81971455, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.84229636, + "num_input_tokens_seen": 203039035, + "step": 9423, + "time_per_iteration": 2.5820608139038086 + }, + { + "auxiliary_loss_clip": 0.01139199, + "auxiliary_loss_mlp": 0.0110951, + "balance_loss_clip": 1.00202644, + "balance_loss_mlp": 1.00069571, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.767670967513287, + "language_loss": 0.80910051, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.83158755, + "num_input_tokens_seen": 203059320, + "step": 9424, + "time_per_iteration": 2.595538377761841 + }, + { + "auxiliary_loss_clip": 0.0116639, + "auxiliary_loss_mlp": 0.00747243, + "balance_loss_clip": 1.00212526, + "balance_loss_mlp": 1.00019836, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 2.58940468910197, + "language_loss": 0.78884959, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.8079859, + "num_input_tokens_seen": 203078490, + "step": 9425, + "time_per_iteration": 2.583810329437256 + }, + { + "auxiliary_loss_clip": 0.01133206, + "auxiliary_loss_mlp": 0.01109911, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00061977, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 2.385268001878372, + "language_loss": 0.59124571, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61367685, + "num_input_tokens_seen": 203096065, + "step": 9426, + "time_per_iteration": 2.5812110900878906 + }, + { + "auxiliary_loss_clip": 0.0114983, + "auxiliary_loss_mlp": 0.01109552, + "balance_loss_clip": 1.0019989, + "balance_loss_mlp": 1.00073802, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 2.20829015840999, + "language_loss": 0.81786692, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.84046078, + "num_input_tokens_seen": 203115270, + "step": 9427, + "time_per_iteration": 2.5598502159118652 + }, + { + "auxiliary_loss_clip": 0.01166232, + "auxiliary_loss_mlp": 0.01109505, + "balance_loss_clip": 1.00218356, + "balance_loss_mlp": 1.00069094, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 1.87961117440967, + "language_loss": 0.86551565, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.888273, + "num_input_tokens_seen": 203134290, + "step": 9428, + "time_per_iteration": 2.5264732837677 + }, + { + "auxiliary_loss_clip": 0.01136194, + "auxiliary_loss_mlp": 0.01110122, + "balance_loss_clip": 1.00212348, + "balance_loss_mlp": 1.00063968, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.0076874855284284, + "language_loss": 0.73218274, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.75464594, + "num_input_tokens_seen": 203152935, + "step": 9429, + "time_per_iteration": 2.6039516925811768 + }, + { + "auxiliary_loss_clip": 0.01133998, + "auxiliary_loss_mlp": 0.0074723, + "balance_loss_clip": 1.00192797, + "balance_loss_mlp": 1.00022995, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.8293903864146543, + "language_loss": 0.75456154, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77337384, + "num_input_tokens_seen": 203170110, + "step": 9430, + "time_per_iteration": 2.578176259994507 + }, + { + "auxiliary_loss_clip": 0.01166241, + "auxiliary_loss_mlp": 0.0110992, + "balance_loss_clip": 1.00211263, + "balance_loss_mlp": 1.0005331, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 1.8539420385545413, + "language_loss": 0.72689617, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74965781, + "num_input_tokens_seen": 203188825, + "step": 9431, + "time_per_iteration": 2.512507915496826 + }, + { + "auxiliary_loss_clip": 0.01103426, + "auxiliary_loss_mlp": 0.0110753, + "balance_loss_clip": 1.00181746, + "balance_loss_mlp": 1.00052762, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.8063595727712085, + "language_loss": 0.73225319, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.7543627, + "num_input_tokens_seen": 203206860, + "step": 9432, + "time_per_iteration": 2.660367965698242 + }, + { + "auxiliary_loss_clip": 0.01108333, + "auxiliary_loss_mlp": 0.0110976, + "balance_loss_clip": 1.00197971, + "balance_loss_mlp": 1.00056398, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.465081386460907, + "language_loss": 0.78150356, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80368447, + "num_input_tokens_seen": 203225625, + "step": 9433, + "time_per_iteration": 2.7380359172821045 + }, + { + "auxiliary_loss_clip": 0.01166364, + "auxiliary_loss_mlp": 0.01111422, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.00060546, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 2.1094792373893974, + "language_loss": 0.63404912, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65682697, + "num_input_tokens_seen": 203242920, + "step": 9434, + "time_per_iteration": 2.5285744667053223 + }, + { + "auxiliary_loss_clip": 0.01151212, + "auxiliary_loss_mlp": 0.01108328, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00056243, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.7206934190813044, + "language_loss": 0.66279781, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68539321, + "num_input_tokens_seen": 203261995, + "step": 9435, + "time_per_iteration": 2.581486463546753 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.00747168, + "balance_loss_clip": 1.00191128, + "balance_loss_mlp": 1.00022197, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.480072486029937, + "language_loss": 0.71845192, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73726952, + "num_input_tokens_seen": 203280670, + "step": 9436, + "time_per_iteration": 2.6768813133239746 + }, + { + "auxiliary_loss_clip": 0.01166282, + "auxiliary_loss_mlp": 0.01110778, + "balance_loss_clip": 1.00202537, + "balance_loss_mlp": 1.0007236, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 2.3728993568602106, + "language_loss": 0.74080753, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76357812, + "num_input_tokens_seen": 203304800, + "step": 9437, + "time_per_iteration": 2.6211483478546143 + }, + { + "auxiliary_loss_clip": 0.01150043, + "auxiliary_loss_mlp": 0.01109289, + "balance_loss_clip": 1.00217485, + "balance_loss_mlp": 1.00057018, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 1.7580697539186627, + "language_loss": 0.61269677, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63529009, + "num_input_tokens_seen": 203324060, + "step": 9438, + "time_per_iteration": 2.630281686782837 + }, + { + "auxiliary_loss_clip": 0.01134212, + "auxiliary_loss_mlp": 0.01108725, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.0004828, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.5654487139942042, + "language_loss": 0.75002992, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77245927, + "num_input_tokens_seen": 203344360, + "step": 9439, + "time_per_iteration": 2.6920881271362305 + }, + { + "auxiliary_loss_clip": 0.01132492, + "auxiliary_loss_mlp": 0.01109975, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.00058854, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.8367563227067911, + "language_loss": 0.83613366, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.8585583, + "num_input_tokens_seen": 203362115, + "step": 9440, + "time_per_iteration": 2.599717378616333 + }, + { + "auxiliary_loss_clip": 0.01119624, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_clip": 1.00179005, + "balance_loss_mlp": 1.00066757, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 2.0087716663627755, + "language_loss": 0.75480384, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77710533, + "num_input_tokens_seen": 203380550, + "step": 9441, + "time_per_iteration": 2.6296494007110596 + }, + { + "auxiliary_loss_clip": 0.01102666, + "auxiliary_loss_mlp": 0.01109807, + "balance_loss_clip": 1.00190902, + "balance_loss_mlp": 1.00070691, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 2.2239266532109614, + "language_loss": 0.83022553, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85235023, + "num_input_tokens_seen": 203396590, + "step": 9442, + "time_per_iteration": 2.6292333602905273 + }, + { + "auxiliary_loss_clip": 0.01119176, + "auxiliary_loss_mlp": 0.01107791, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00059843, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 1.7794414144394857, + "language_loss": 0.74482942, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76709902, + "num_input_tokens_seen": 203414280, + "step": 9443, + "time_per_iteration": 2.606238603591919 + }, + { + "auxiliary_loss_clip": 0.0113833, + "auxiliary_loss_mlp": 0.01110486, + "balance_loss_clip": 1.00211549, + "balance_loss_mlp": 1.00062275, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 2.0654319080339265, + "language_loss": 0.77319413, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79568231, + "num_input_tokens_seen": 203433280, + "step": 9444, + "time_per_iteration": 4.004214286804199 + }, + { + "auxiliary_loss_clip": 0.01117774, + "auxiliary_loss_mlp": 0.01110269, + "balance_loss_clip": 1.00182486, + "balance_loss_mlp": 1.00088227, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 2.1132121332633345, + "language_loss": 0.81062502, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83290541, + "num_input_tokens_seen": 203449935, + "step": 9445, + "time_per_iteration": 2.631584882736206 + }, + { + "auxiliary_loss_clip": 0.01166099, + "auxiliary_loss_mlp": 0.01109165, + "balance_loss_clip": 1.00199032, + "balance_loss_mlp": 1.00054145, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.2721222665925123, + "language_loss": 0.71012855, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73288119, + "num_input_tokens_seen": 203473025, + "step": 9446, + "time_per_iteration": 2.62107515335083 + }, + { + "auxiliary_loss_clip": 0.0111748, + "auxiliary_loss_mlp": 0.01110311, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00063837, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.9345516977977246, + "language_loss": 0.73378664, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75606459, + "num_input_tokens_seen": 203492895, + "step": 9447, + "time_per_iteration": 2.6454739570617676 + }, + { + "auxiliary_loss_clip": 0.0113517, + "auxiliary_loss_mlp": 0.01109447, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00053763, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 1.711194164255658, + "language_loss": 0.75110722, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77355343, + "num_input_tokens_seen": 203513710, + "step": 9448, + "time_per_iteration": 2.6342291831970215 + }, + { + "auxiliary_loss_clip": 0.01103166, + "auxiliary_loss_mlp": 0.01111421, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00060439, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.924411941237804, + "language_loss": 0.76346803, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78561395, + "num_input_tokens_seen": 203531630, + "step": 9449, + "time_per_iteration": 2.6706557273864746 + }, + { + "auxiliary_loss_clip": 0.0113575, + "auxiliary_loss_mlp": 0.01109741, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.000736, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.8598506940580144, + "language_loss": 0.74685705, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76931202, + "num_input_tokens_seen": 203551885, + "step": 9450, + "time_per_iteration": 2.6886494159698486 + }, + { + "auxiliary_loss_clip": 0.01136065, + "auxiliary_loss_mlp": 0.0110898, + "balance_loss_clip": 1.00204098, + "balance_loss_mlp": 1.00073767, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 9.31090146723441, + "language_loss": 0.66837609, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69082654, + "num_input_tokens_seen": 203572250, + "step": 9451, + "time_per_iteration": 2.625741958618164 + }, + { + "auxiliary_loss_clip": 0.01135101, + "auxiliary_loss_mlp": 0.01111654, + "balance_loss_clip": 1.00192976, + "balance_loss_mlp": 1.000646, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 2.1709208809101006, + "language_loss": 0.72395599, + "learning_rate": 1.656454488573026e-06, + "loss": 0.7464236, + "num_input_tokens_seen": 203590605, + "step": 9452, + "time_per_iteration": 4.199565172195435 + }, + { + "auxiliary_loss_clip": 0.01118113, + "auxiliary_loss_mlp": 0.01109092, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00075471, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.4285695907085596, + "language_loss": 0.7051779, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72744995, + "num_input_tokens_seen": 203610080, + "step": 9453, + "time_per_iteration": 4.041194915771484 + }, + { + "auxiliary_loss_clip": 0.011011, + "auxiliary_loss_mlp": 0.00747087, + "balance_loss_clip": 1.00179732, + "balance_loss_mlp": 1.00030828, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 1.8945352974558751, + "language_loss": 0.70028126, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71876311, + "num_input_tokens_seen": 203630060, + "step": 9454, + "time_per_iteration": 4.196339845657349 + }, + { + "auxiliary_loss_clip": 0.01134518, + "auxiliary_loss_mlp": 0.01108653, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.00060129, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 1.8169062449162627, + "language_loss": 0.60715741, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62958908, + "num_input_tokens_seen": 203649065, + "step": 9455, + "time_per_iteration": 2.618492364883423 + }, + { + "auxiliary_loss_clip": 0.01102017, + "auxiliary_loss_mlp": 0.0111001, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.0006237, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 1.9049990159293995, + "language_loss": 0.73269367, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75481391, + "num_input_tokens_seen": 203667545, + "step": 9456, + "time_per_iteration": 2.7250893115997314 + }, + { + "auxiliary_loss_clip": 0.01135371, + "auxiliary_loss_mlp": 0.01109408, + "balance_loss_clip": 1.00201166, + "balance_loss_mlp": 1.0005939, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.6053602746119264, + "language_loss": 0.76788229, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.79033011, + "num_input_tokens_seen": 203686025, + "step": 9457, + "time_per_iteration": 2.6403286457061768 + }, + { + "auxiliary_loss_clip": 0.01149126, + "auxiliary_loss_mlp": 0.01109917, + "balance_loss_clip": 1.00198483, + "balance_loss_mlp": 1.00053024, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.6295030543340923, + "language_loss": 0.65815294, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68074334, + "num_input_tokens_seen": 203705540, + "step": 9458, + "time_per_iteration": 2.656780958175659 + }, + { + "auxiliary_loss_clip": 0.01149637, + "auxiliary_loss_mlp": 0.01110242, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00066483, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.210588334924356, + "language_loss": 0.6783697, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.7009685, + "num_input_tokens_seen": 203723670, + "step": 9459, + "time_per_iteration": 2.5660531520843506 + }, + { + "auxiliary_loss_clip": 0.01119533, + "auxiliary_loss_mlp": 0.01109299, + "balance_loss_clip": 1.00189698, + "balance_loss_mlp": 1.0005796, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 5.438600765841896, + "language_loss": 0.76863575, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.79092407, + "num_input_tokens_seen": 203739705, + "step": 9460, + "time_per_iteration": 2.6443541049957275 + }, + { + "auxiliary_loss_clip": 0.01083134, + "auxiliary_loss_mlp": 0.01109498, + "balance_loss_clip": 1.00161791, + "balance_loss_mlp": 1.00068414, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.747794178882087, + "language_loss": 0.71720791, + "learning_rate": 1.65300196133547e-06, + "loss": 0.73913425, + "num_input_tokens_seen": 203759000, + "step": 9461, + "time_per_iteration": 2.811211347579956 + }, + { + "auxiliary_loss_clip": 0.01150488, + "auxiliary_loss_mlp": 0.01109082, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.00064921, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.0256166373371065, + "language_loss": 0.73310781, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75570345, + "num_input_tokens_seen": 203774295, + "step": 9462, + "time_per_iteration": 2.613906145095825 + }, + { + "auxiliary_loss_clip": 0.01150717, + "auxiliary_loss_mlp": 0.01109407, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00049746, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.8596306441128172, + "language_loss": 0.72480249, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.74740374, + "num_input_tokens_seen": 203792710, + "step": 9463, + "time_per_iteration": 2.564866065979004 + }, + { + "auxiliary_loss_clip": 0.01149558, + "auxiliary_loss_mlp": 0.01109247, + "balance_loss_clip": 1.00203872, + "balance_loss_mlp": 1.00062311, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.8951797773603343, + "language_loss": 0.74100918, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76359725, + "num_input_tokens_seen": 203811645, + "step": 9464, + "time_per_iteration": 2.5546250343322754 + }, + { + "auxiliary_loss_clip": 0.01150784, + "auxiliary_loss_mlp": 0.00747269, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00032365, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 3.396795246026668, + "language_loss": 0.83867574, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.85765624, + "num_input_tokens_seen": 203830040, + "step": 9465, + "time_per_iteration": 2.574587821960449 + }, + { + "auxiliary_loss_clip": 0.01135803, + "auxiliary_loss_mlp": 0.01108719, + "balance_loss_clip": 1.00184417, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 2.847188721012862, + "language_loss": 0.72289538, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74534059, + "num_input_tokens_seen": 203851245, + "step": 9466, + "time_per_iteration": 2.643411636352539 + }, + { + "auxiliary_loss_clip": 0.01115062, + "auxiliary_loss_mlp": 0.0108509, + "balance_loss_clip": 1.00122881, + "balance_loss_mlp": 1.00002193, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7117248074325323, + "language_loss": 0.55351806, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57551956, + "num_input_tokens_seen": 203916400, + "step": 9467, + "time_per_iteration": 3.276948928833008 + }, + { + "auxiliary_loss_clip": 0.01151608, + "auxiliary_loss_mlp": 0.01110635, + "balance_loss_clip": 1.00206316, + "balance_loss_mlp": 1.00058079, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.038242184309035, + "language_loss": 0.63689893, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65952134, + "num_input_tokens_seen": 203935870, + "step": 9468, + "time_per_iteration": 2.5712368488311768 + }, + { + "auxiliary_loss_clip": 0.01104607, + "auxiliary_loss_mlp": 0.01109452, + "balance_loss_clip": 1.00180113, + "balance_loss_mlp": 1.00054264, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 2.1574592599058766, + "language_loss": 0.79119062, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81333119, + "num_input_tokens_seen": 203954950, + "step": 9469, + "time_per_iteration": 2.693161725997925 + }, + { + "auxiliary_loss_clip": 0.01117769, + "auxiliary_loss_mlp": 0.01111115, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00067902, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 2.60212543671233, + "language_loss": 0.69661665, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71890545, + "num_input_tokens_seen": 203972715, + "step": 9470, + "time_per_iteration": 2.603842258453369 + }, + { + "auxiliary_loss_clip": 0.01133668, + "auxiliary_loss_mlp": 0.01109571, + "balance_loss_clip": 1.00205314, + "balance_loss_mlp": 1.00066113, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.9097844293151314, + "language_loss": 0.74220395, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76463634, + "num_input_tokens_seen": 203990775, + "step": 9471, + "time_per_iteration": 2.601619243621826 + }, + { + "auxiliary_loss_clip": 0.01117697, + "auxiliary_loss_mlp": 0.01109487, + "balance_loss_clip": 1.0017904, + "balance_loss_mlp": 1.00067258, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.8010333646165406, + "language_loss": 0.57486612, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59713793, + "num_input_tokens_seen": 204008845, + "step": 9472, + "time_per_iteration": 2.6261394023895264 + }, + { + "auxiliary_loss_clip": 0.01120278, + "auxiliary_loss_mlp": 0.01108481, + "balance_loss_clip": 1.00197756, + "balance_loss_mlp": 1.00062013, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.8594195346352291, + "language_loss": 0.7394067, + "learning_rate": 1.648400251450638e-06, + "loss": 0.76169431, + "num_input_tokens_seen": 204023755, + "step": 9473, + "time_per_iteration": 2.611192464828491 + }, + { + "auxiliary_loss_clip": 0.01115198, + "auxiliary_loss_mlp": 0.01085407, + "balance_loss_clip": 1.00110459, + "balance_loss_mlp": 0.99995786, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6516408375626166, + "language_loss": 0.57603723, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59804326, + "num_input_tokens_seen": 204091255, + "step": 9474, + "time_per_iteration": 3.224501609802246 + }, + { + "auxiliary_loss_clip": 0.01151455, + "auxiliary_loss_mlp": 0.01109592, + "balance_loss_clip": 1.0021466, + "balance_loss_mlp": 1.00068212, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 2.7942065896660457, + "language_loss": 0.53899479, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.56160522, + "num_input_tokens_seen": 204113285, + "step": 9475, + "time_per_iteration": 2.676617383956909 + }, + { + "auxiliary_loss_clip": 0.01166314, + "auxiliary_loss_mlp": 0.01110142, + "balance_loss_clip": 1.00214601, + "balance_loss_mlp": 1.00065994, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.5733934299171495, + "language_loss": 0.79399544, + "learning_rate": 1.647250122983675e-06, + "loss": 0.81676, + "num_input_tokens_seen": 204133045, + "step": 9476, + "time_per_iteration": 2.5677337646484375 + }, + { + "auxiliary_loss_clip": 0.0113309, + "auxiliary_loss_mlp": 0.01110498, + "balance_loss_clip": 1.00196576, + "balance_loss_mlp": 1.00073051, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 1.765677452671747, + "language_loss": 0.6659416, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68837744, + "num_input_tokens_seen": 204152590, + "step": 9477, + "time_per_iteration": 2.640087127685547 + }, + { + "auxiliary_loss_clip": 0.01116137, + "auxiliary_loss_mlp": 0.01109834, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00063872, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.5895956018497273, + "language_loss": 0.71119583, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73345554, + "num_input_tokens_seen": 204171815, + "step": 9478, + "time_per_iteration": 2.6874313354492188 + }, + { + "auxiliary_loss_clip": 0.01121184, + "auxiliary_loss_mlp": 0.01108743, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00069129, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.8829677433597256, + "language_loss": 0.69625407, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.7185533, + "num_input_tokens_seen": 204188535, + "step": 9479, + "time_per_iteration": 2.613947868347168 + }, + { + "auxiliary_loss_clip": 0.01118077, + "auxiliary_loss_mlp": 0.01108892, + "balance_loss_clip": 1.001845, + "balance_loss_mlp": 1.00065017, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.4556135344638985, + "language_loss": 0.71418494, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73645467, + "num_input_tokens_seen": 204208365, + "step": 9480, + "time_per_iteration": 2.661410093307495 + }, + { + "auxiliary_loss_clip": 0.01119862, + "auxiliary_loss_mlp": 0.00747274, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00035644, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.4329567960109695, + "language_loss": 0.72037876, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.73905015, + "num_input_tokens_seen": 204226560, + "step": 9481, + "time_per_iteration": 2.6335458755493164 + }, + { + "auxiliary_loss_clip": 0.01150787, + "auxiliary_loss_mlp": 0.01110073, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.00059164, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 2.0014739620152127, + "language_loss": 0.78622675, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80883539, + "num_input_tokens_seen": 204245410, + "step": 9482, + "time_per_iteration": 3.9682695865631104 + }, + { + "auxiliary_loss_clip": 0.0113445, + "auxiliary_loss_mlp": 0.01108662, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00051546, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 1.516119119050325, + "language_loss": 0.77843255, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80086368, + "num_input_tokens_seen": 204264840, + "step": 9483, + "time_per_iteration": 2.6265792846679688 + }, + { + "auxiliary_loss_clip": 0.0113274, + "auxiliary_loss_mlp": 0.01109708, + "balance_loss_clip": 1.00185502, + "balance_loss_mlp": 1.00070262, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 1.798957243683217, + "language_loss": 0.81272936, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.83515382, + "num_input_tokens_seen": 204284335, + "step": 9484, + "time_per_iteration": 2.6416258811950684 + }, + { + "auxiliary_loss_clip": 0.01166244, + "auxiliary_loss_mlp": 0.00747455, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00041747, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 3.132102808045226, + "language_loss": 0.60839403, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62753105, + "num_input_tokens_seen": 204302590, + "step": 9485, + "time_per_iteration": 2.567758083343506 + }, + { + "auxiliary_loss_clip": 0.01149454, + "auxiliary_loss_mlp": 0.01109468, + "balance_loss_clip": 1.00195849, + "balance_loss_mlp": 1.00065339, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.8091752103696235, + "language_loss": 0.653781, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.6763702, + "num_input_tokens_seen": 204323055, + "step": 9486, + "time_per_iteration": 2.5949971675872803 + }, + { + "auxiliary_loss_clip": 0.01131324, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_clip": 1.00120258, + "balance_loss_mlp": 0.99997133, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6630756941542227, + "language_loss": 0.47992715, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50209081, + "num_input_tokens_seen": 204386160, + "step": 9487, + "time_per_iteration": 3.2327077388763428 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.00747438, + "balance_loss_clip": 1.00186884, + "balance_loss_mlp": 1.00043082, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 1.4983572743955795, + "language_loss": 0.85701501, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.87566328, + "num_input_tokens_seen": 204406315, + "step": 9488, + "time_per_iteration": 2.680253505706787 + }, + { + "auxiliary_loss_clip": 0.01117779, + "auxiliary_loss_mlp": 0.01109825, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00062895, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.4842599668159286, + "language_loss": 0.79352438, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81580043, + "num_input_tokens_seen": 204427645, + "step": 9489, + "time_per_iteration": 4.042750835418701 + }, + { + "auxiliary_loss_clip": 0.01133975, + "auxiliary_loss_mlp": 0.01108713, + "balance_loss_clip": 1.00190473, + "balance_loss_mlp": 1.00056577, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.7303818624892173, + "language_loss": 0.69709289, + "learning_rate": 1.641884454927604e-06, + "loss": 0.71951973, + "num_input_tokens_seen": 204445910, + "step": 9490, + "time_per_iteration": 4.045556306838989 + }, + { + "auxiliary_loss_clip": 0.0111763, + "auxiliary_loss_mlp": 0.01109172, + "balance_loss_clip": 1.00189149, + "balance_loss_mlp": 1.00054848, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 2.0171134537800106, + "language_loss": 0.76312554, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78539354, + "num_input_tokens_seen": 204464680, + "step": 9491, + "time_per_iteration": 2.635246992111206 + }, + { + "auxiliary_loss_clip": 0.01110544, + "auxiliary_loss_mlp": 0.007457, + "balance_loss_clip": 1.00117719, + "balance_loss_mlp": 0.99994355, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7951309953002534, + "language_loss": 0.57433772, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59290016, + "num_input_tokens_seen": 204525580, + "step": 9492, + "time_per_iteration": 4.5629096031188965 + }, + { + "auxiliary_loss_clip": 0.01133133, + "auxiliary_loss_mlp": 0.00747406, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00036442, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.8452246793006977, + "language_loss": 0.71964705, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73845249, + "num_input_tokens_seen": 204541320, + "step": 9493, + "time_per_iteration": 2.606139898300171 + }, + { + "auxiliary_loss_clip": 0.01166346, + "auxiliary_loss_mlp": 0.01109692, + "balance_loss_clip": 1.00209868, + "balance_loss_mlp": 1.00049627, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.6864610513371525, + "language_loss": 0.78054059, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80330098, + "num_input_tokens_seen": 204560275, + "step": 9494, + "time_per_iteration": 2.5290727615356445 + }, + { + "auxiliary_loss_clip": 0.0116647, + "auxiliary_loss_mlp": 0.01110839, + "balance_loss_clip": 1.00206363, + "balance_loss_mlp": 1.00059402, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.478650172680569, + "language_loss": 0.80053204, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82330519, + "num_input_tokens_seen": 204579430, + "step": 9495, + "time_per_iteration": 2.5418879985809326 + }, + { + "auxiliary_loss_clip": 0.01109925, + "auxiliary_loss_mlp": 0.01111203, + "balance_loss_clip": 1.0019604, + "balance_loss_mlp": 1.00076723, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 1.9948403370713763, + "language_loss": 0.66844118, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.69065243, + "num_input_tokens_seen": 204597710, + "step": 9496, + "time_per_iteration": 2.6863796710968018 + }, + { + "auxiliary_loss_clip": 0.01166319, + "auxiliary_loss_mlp": 0.01110456, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00068831, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 3.1928453059569604, + "language_loss": 0.69849873, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.72126645, + "num_input_tokens_seen": 204616140, + "step": 9497, + "time_per_iteration": 2.5023410320281982 + }, + { + "auxiliary_loss_clip": 0.01149903, + "auxiliary_loss_mlp": 0.00747473, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00044167, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 5.3494153883642355, + "language_loss": 0.81664217, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83561587, + "num_input_tokens_seen": 204636470, + "step": 9498, + "time_per_iteration": 2.6132187843322754 + }, + { + "auxiliary_loss_clip": 0.01166182, + "auxiliary_loss_mlp": 0.01109444, + "balance_loss_clip": 1.00206029, + "balance_loss_mlp": 1.00062943, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 2.00244152736852, + "language_loss": 0.66555619, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68831253, + "num_input_tokens_seen": 204656640, + "step": 9499, + "time_per_iteration": 2.5245347023010254 + }, + { + "auxiliary_loss_clip": 0.01134847, + "auxiliary_loss_mlp": 0.01110182, + "balance_loss_clip": 1.00197589, + "balance_loss_mlp": 1.00060511, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 1.5251727434210234, + "language_loss": 0.71749902, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73994929, + "num_input_tokens_seen": 204675475, + "step": 9500, + "time_per_iteration": 2.5692148208618164 + }, + { + "auxiliary_loss_clip": 0.01120065, + "auxiliary_loss_mlp": 0.0111024, + "balance_loss_clip": 1.00187957, + "balance_loss_mlp": 1.00047207, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 2.322851206508187, + "language_loss": 0.76393008, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78623307, + "num_input_tokens_seen": 204695385, + "step": 9501, + "time_per_iteration": 2.6750123500823975 + }, + { + "auxiliary_loss_clip": 0.01134691, + "auxiliary_loss_mlp": 0.01109723, + "balance_loss_clip": 1.00196934, + "balance_loss_mlp": 1.00062251, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.7536933245543909, + "language_loss": 0.75017864, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77262276, + "num_input_tokens_seen": 204714730, + "step": 9502, + "time_per_iteration": 2.5994913578033447 + }, + { + "auxiliary_loss_clip": 0.01138498, + "auxiliary_loss_mlp": 0.0110942, + "balance_loss_clip": 1.00203872, + "balance_loss_mlp": 1.00060582, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 2.1264300936879943, + "language_loss": 0.82529593, + "learning_rate": 1.636904431275105e-06, + "loss": 0.8477751, + "num_input_tokens_seen": 204735025, + "step": 9503, + "time_per_iteration": 2.607219934463501 + }, + { + "auxiliary_loss_clip": 0.01116113, + "auxiliary_loss_mlp": 0.01109283, + "balance_loss_clip": 1.00190854, + "balance_loss_mlp": 1.00075495, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.2391931107819514, + "language_loss": 0.85756546, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.87981945, + "num_input_tokens_seen": 204751365, + "step": 9504, + "time_per_iteration": 2.600414276123047 + }, + { + "auxiliary_loss_clip": 0.0111792, + "auxiliary_loss_mlp": 0.01109051, + "balance_loss_clip": 1.00188935, + "balance_loss_mlp": 1.00052249, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 22.366373036235867, + "language_loss": 0.75085312, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77312285, + "num_input_tokens_seen": 204768980, + "step": 9505, + "time_per_iteration": 2.6363749504089355 + }, + { + "auxiliary_loss_clip": 0.0116598, + "auxiliary_loss_mlp": 0.01109492, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.00058246, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.521230984802183, + "language_loss": 0.8201772, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84293193, + "num_input_tokens_seen": 204788110, + "step": 9506, + "time_per_iteration": 2.5156261920928955 + }, + { + "auxiliary_loss_clip": 0.01117811, + "auxiliary_loss_mlp": 0.00747447, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00038648, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.6509075608887773, + "language_loss": 0.77184331, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79049587, + "num_input_tokens_seen": 204807240, + "step": 9507, + "time_per_iteration": 2.6503968238830566 + }, + { + "auxiliary_loss_clip": 0.01134787, + "auxiliary_loss_mlp": 0.01109887, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00059569, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 1.9323625512313132, + "language_loss": 0.68603128, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70847797, + "num_input_tokens_seen": 204826415, + "step": 9508, + "time_per_iteration": 2.6305501461029053 + }, + { + "auxiliary_loss_clip": 0.01166353, + "auxiliary_loss_mlp": 0.0111023, + "balance_loss_clip": 1.00210381, + "balance_loss_mlp": 1.00055754, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 1.9322337786069017, + "language_loss": 0.79592121, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81868708, + "num_input_tokens_seen": 204844305, + "step": 9509, + "time_per_iteration": 2.6310958862304688 + }, + { + "auxiliary_loss_clip": 0.01149121, + "auxiliary_loss_mlp": 0.0110979, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00059402, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 2.6006999250570635, + "language_loss": 0.71982765, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74241674, + "num_input_tokens_seen": 204861765, + "step": 9510, + "time_per_iteration": 2.7072319984436035 + }, + { + "auxiliary_loss_clip": 0.01132916, + "auxiliary_loss_mlp": 0.01109051, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00061798, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.3442547578555923, + "language_loss": 0.69082326, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71324295, + "num_input_tokens_seen": 204882505, + "step": 9511, + "time_per_iteration": 2.6690011024475098 + }, + { + "auxiliary_loss_clip": 0.01134834, + "auxiliary_loss_mlp": 0.01109699, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00078905, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 2.2775317737285956, + "language_loss": 0.61538863, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63783395, + "num_input_tokens_seen": 204899830, + "step": 9512, + "time_per_iteration": 2.6297667026519775 + }, + { + "auxiliary_loss_clip": 0.01133942, + "auxiliary_loss_mlp": 0.01108499, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00054312, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 2.4689517358989925, + "language_loss": 0.75919998, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78162444, + "num_input_tokens_seen": 204918100, + "step": 9513, + "time_per_iteration": 2.5927889347076416 + }, + { + "auxiliary_loss_clip": 0.01146234, + "auxiliary_loss_mlp": 0.01085081, + "balance_loss_clip": 1.00124288, + "balance_loss_mlp": 1.00001359, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8964738402429954, + "language_loss": 0.66788244, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.69019556, + "num_input_tokens_seen": 204972925, + "step": 9514, + "time_per_iteration": 3.1178061962127686 + }, + { + "auxiliary_loss_clip": 0.01151542, + "auxiliary_loss_mlp": 0.01110553, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00078487, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 1.8436584351287875, + "language_loss": 0.81296718, + "learning_rate": 1.63230955093099e-06, + "loss": 0.83558816, + "num_input_tokens_seen": 204990910, + "step": 9515, + "time_per_iteration": 2.6097233295440674 + }, + { + "auxiliary_loss_clip": 0.01151255, + "auxiliary_loss_mlp": 0.01109145, + "balance_loss_clip": 1.00196266, + "balance_loss_mlp": 1.00052142, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.533556559554755, + "language_loss": 0.85965455, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88225853, + "num_input_tokens_seen": 205010500, + "step": 9516, + "time_per_iteration": 2.6003811359405518 + }, + { + "auxiliary_loss_clip": 0.01133456, + "auxiliary_loss_mlp": 0.01110156, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00048375, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.758426000354053, + "language_loss": 0.87212145, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.8945576, + "num_input_tokens_seen": 205028560, + "step": 9517, + "time_per_iteration": 2.674258232116699 + }, + { + "auxiliary_loss_clip": 0.01103001, + "auxiliary_loss_mlp": 0.01109052, + "balance_loss_clip": 1.00182188, + "balance_loss_mlp": 1.00042832, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.8099731790021278, + "language_loss": 0.85361952, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87574005, + "num_input_tokens_seen": 205048650, + "step": 9518, + "time_per_iteration": 2.7585558891296387 + }, + { + "auxiliary_loss_clip": 0.01149342, + "auxiliary_loss_mlp": 0.01108646, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00049877, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.848357109333999, + "language_loss": 0.78533041, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.80791026, + "num_input_tokens_seen": 205066480, + "step": 9519, + "time_per_iteration": 2.533216714859009 + }, + { + "auxiliary_loss_clip": 0.01166031, + "auxiliary_loss_mlp": 0.01108884, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00045133, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.9700040470851434, + "language_loss": 0.83004689, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85279602, + "num_input_tokens_seen": 205087475, + "step": 9520, + "time_per_iteration": 4.007467985153198 + }, + { + "auxiliary_loss_clip": 0.01132891, + "auxiliary_loss_mlp": 0.01109878, + "balance_loss_clip": 1.0017935, + "balance_loss_mlp": 1.00068235, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 1.9501301578370618, + "language_loss": 0.72474867, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74717641, + "num_input_tokens_seen": 205106495, + "step": 9521, + "time_per_iteration": 2.5994656085968018 + }, + { + "auxiliary_loss_clip": 0.01166048, + "auxiliary_loss_mlp": 0.00747315, + "balance_loss_clip": 1.00203133, + "balance_loss_mlp": 1.00041306, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 2.025775953112719, + "language_loss": 0.77969015, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.79882383, + "num_input_tokens_seen": 205128285, + "step": 9522, + "time_per_iteration": 2.59824275970459 + }, + { + "auxiliary_loss_clip": 0.01133982, + "auxiliary_loss_mlp": 0.01108424, + "balance_loss_clip": 1.001917, + "balance_loss_mlp": 1.00065899, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.6356007587281345, + "language_loss": 0.71658409, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73900819, + "num_input_tokens_seen": 205146595, + "step": 9523, + "time_per_iteration": 2.598921298980713 + }, + { + "auxiliary_loss_clip": 0.01133599, + "auxiliary_loss_mlp": 0.01108279, + "balance_loss_clip": 1.00183141, + "balance_loss_mlp": 1.00060964, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.79998489633621, + "language_loss": 0.7045728, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72699159, + "num_input_tokens_seen": 205164295, + "step": 9524, + "time_per_iteration": 2.572793483734131 + }, + { + "auxiliary_loss_clip": 0.01150769, + "auxiliary_loss_mlp": 0.01109327, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00051248, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.7068584123709478, + "language_loss": 0.65260541, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67520642, + "num_input_tokens_seen": 205185380, + "step": 9525, + "time_per_iteration": 2.65437650680542 + }, + { + "auxiliary_loss_clip": 0.01132691, + "auxiliary_loss_mlp": 0.01108633, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00048637, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.5406180547828903, + "language_loss": 0.72245431, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74486756, + "num_input_tokens_seen": 205204895, + "step": 9526, + "time_per_iteration": 2.6078686714172363 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01108269, + "balance_loss_clip": 1.00201178, + "balance_loss_mlp": 1.00059962, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.6091058227232333, + "language_loss": 0.80221653, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82480526, + "num_input_tokens_seen": 205223440, + "step": 9527, + "time_per_iteration": 3.9982173442840576 + }, + { + "auxiliary_loss_clip": 0.01150998, + "auxiliary_loss_mlp": 0.01109895, + "balance_loss_clip": 1.00205696, + "balance_loss_mlp": 1.00069952, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 2.2406410181896206, + "language_loss": 0.72314405, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.74575293, + "num_input_tokens_seen": 205242800, + "step": 9528, + "time_per_iteration": 3.9383645057678223 + }, + { + "auxiliary_loss_clip": 0.01166224, + "auxiliary_loss_mlp": 0.01109316, + "balance_loss_clip": 1.00209856, + "balance_loss_mlp": 1.00040686, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 2.0932982363489208, + "language_loss": 0.85877228, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.88152766, + "num_input_tokens_seen": 205259465, + "step": 9529, + "time_per_iteration": 2.508639097213745 + }, + { + "auxiliary_loss_clip": 0.0112914, + "auxiliary_loss_mlp": 0.01085066, + "balance_loss_clip": 1.00127792, + "balance_loss_mlp": 0.99999809, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7641997973837953, + "language_loss": 0.5612216, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58336365, + "num_input_tokens_seen": 205314100, + "step": 9530, + "time_per_iteration": 4.547295570373535 + }, + { + "auxiliary_loss_clip": 0.01134241, + "auxiliary_loss_mlp": 0.01109302, + "balance_loss_clip": 1.00193369, + "balance_loss_mlp": 1.00048733, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 1.728694410588564, + "language_loss": 0.66254115, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68497658, + "num_input_tokens_seen": 205333420, + "step": 9531, + "time_per_iteration": 2.682037353515625 + }, + { + "auxiliary_loss_clip": 0.0115104, + "auxiliary_loss_mlp": 0.01110137, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00065494, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.2807290936536884, + "language_loss": 0.75921553, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.78182733, + "num_input_tokens_seen": 205350995, + "step": 9532, + "time_per_iteration": 2.671870470046997 + }, + { + "auxiliary_loss_clip": 0.01165947, + "auxiliary_loss_mlp": 0.01108594, + "balance_loss_clip": 1.00198066, + "balance_loss_mlp": 1.00054312, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 1.2584363056992338, + "language_loss": 0.78931242, + "learning_rate": 1.625421002822686e-06, + "loss": 0.81205785, + "num_input_tokens_seen": 205372675, + "step": 9533, + "time_per_iteration": 2.5986223220825195 + }, + { + "auxiliary_loss_clip": 0.01149314, + "auxiliary_loss_mlp": 0.01108472, + "balance_loss_clip": 1.0020299, + "balance_loss_mlp": 1.00061119, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 1.6108144700995688, + "language_loss": 0.8562187, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87879652, + "num_input_tokens_seen": 205392590, + "step": 9534, + "time_per_iteration": 2.5568032264709473 + }, + { + "auxiliary_loss_clip": 0.01134566, + "auxiliary_loss_mlp": 0.01109396, + "balance_loss_clip": 1.00195599, + "balance_loss_mlp": 1.0005815, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.5948449140190653, + "language_loss": 0.74997425, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77241385, + "num_input_tokens_seen": 205414885, + "step": 9535, + "time_per_iteration": 2.647383689880371 + }, + { + "auxiliary_loss_clip": 0.01136829, + "auxiliary_loss_mlp": 0.01110096, + "balance_loss_clip": 1.00213695, + "balance_loss_mlp": 1.00061417, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.5323695192676563, + "language_loss": 0.71200073, + "learning_rate": 1.624273356614346e-06, + "loss": 0.73446989, + "num_input_tokens_seen": 205434440, + "step": 9536, + "time_per_iteration": 2.6269586086273193 + }, + { + "auxiliary_loss_clip": 0.01119202, + "auxiliary_loss_mlp": 0.01109221, + "balance_loss_clip": 1.00189102, + "balance_loss_mlp": 1.00059795, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 2.049492892361694, + "language_loss": 0.70066088, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.72294509, + "num_input_tokens_seen": 205454225, + "step": 9537, + "time_per_iteration": 2.678752899169922 + }, + { + "auxiliary_loss_clip": 0.01165935, + "auxiliary_loss_mlp": 0.01108546, + "balance_loss_clip": 1.00199091, + "balance_loss_mlp": 1.00059009, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.9439999503470464, + "language_loss": 0.62214792, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64489275, + "num_input_tokens_seen": 205474750, + "step": 9538, + "time_per_iteration": 2.6064155101776123 + }, + { + "auxiliary_loss_clip": 0.01149197, + "auxiliary_loss_mlp": 0.01109361, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00064206, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.6849695072320094, + "language_loss": 0.82788473, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85047036, + "num_input_tokens_seen": 205495495, + "step": 9539, + "time_per_iteration": 2.669851779937744 + }, + { + "auxiliary_loss_clip": 0.01102828, + "auxiliary_loss_mlp": 0.01109643, + "balance_loss_clip": 1.00187874, + "balance_loss_mlp": 1.00054312, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 1.965565615477479, + "language_loss": 0.72817826, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75030297, + "num_input_tokens_seen": 205510070, + "step": 9540, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.01149253, + "auxiliary_loss_mlp": 0.00747379, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.0003612, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 3.180656249655076, + "language_loss": 0.79861081, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.81757712, + "num_input_tokens_seen": 205530190, + "step": 9541, + "time_per_iteration": 2.6056904792785645 + }, + { + "auxiliary_loss_clip": 0.01132485, + "auxiliary_loss_mlp": 0.01109899, + "balance_loss_clip": 1.00189483, + "balance_loss_mlp": 1.00051236, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 3.2119964506685394, + "language_loss": 0.64828819, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.67071199, + "num_input_tokens_seen": 205547380, + "step": 9542, + "time_per_iteration": 2.5629568099975586 + }, + { + "auxiliary_loss_clip": 0.01133268, + "auxiliary_loss_mlp": 0.01109268, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00054944, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 1.9627393764505638, + "language_loss": 0.82956076, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.85198617, + "num_input_tokens_seen": 205566540, + "step": 9543, + "time_per_iteration": 2.5785210132598877 + }, + { + "auxiliary_loss_clip": 0.01117653, + "auxiliary_loss_mlp": 0.01109784, + "balance_loss_clip": 1.0019486, + "balance_loss_mlp": 1.0005883, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 3.5204458488621255, + "language_loss": 0.73199761, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75427198, + "num_input_tokens_seen": 205584200, + "step": 9544, + "time_per_iteration": 2.637033700942993 + }, + { + "auxiliary_loss_clip": 0.01103533, + "auxiliary_loss_mlp": 0.01109697, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00040627, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 2.2229896467309462, + "language_loss": 0.76188803, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78402036, + "num_input_tokens_seen": 205604675, + "step": 9545, + "time_per_iteration": 2.7033302783966064 + }, + { + "auxiliary_loss_clip": 0.01132409, + "auxiliary_loss_mlp": 0.01108907, + "balance_loss_clip": 1.00184059, + "balance_loss_mlp": 1.00066495, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 3.6523743419300145, + "language_loss": 0.56876338, + "learning_rate": 1.620448797546459e-06, + "loss": 0.59117651, + "num_input_tokens_seen": 205624680, + "step": 9546, + "time_per_iteration": 2.6494367122650146 + }, + { + "auxiliary_loss_clip": 0.01134863, + "auxiliary_loss_mlp": 0.01108696, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00054944, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.35456681049174, + "language_loss": 0.76441514, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78685069, + "num_input_tokens_seen": 205641950, + "step": 9547, + "time_per_iteration": 2.5624289512634277 + }, + { + "auxiliary_loss_clip": 0.01151248, + "auxiliary_loss_mlp": 0.01108941, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00069857, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 84.77760373630501, + "language_loss": 0.74502778, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76762974, + "num_input_tokens_seen": 205660130, + "step": 9548, + "time_per_iteration": 2.530625104904175 + }, + { + "auxiliary_loss_clip": 0.01121498, + "auxiliary_loss_mlp": 0.01109734, + "balance_loss_clip": 1.00186265, + "balance_loss_mlp": 1.00063348, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 2.1562350717711305, + "language_loss": 0.69422102, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71653342, + "num_input_tokens_seen": 205678895, + "step": 9549, + "time_per_iteration": 2.655564069747925 + }, + { + "auxiliary_loss_clip": 0.01105098, + "auxiliary_loss_mlp": 0.01109136, + "balance_loss_clip": 1.00197625, + "balance_loss_mlp": 1.00060844, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 2.4211244653937762, + "language_loss": 0.79472893, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81687129, + "num_input_tokens_seen": 205698450, + "step": 9550, + "time_per_iteration": 2.707139253616333 + }, + { + "auxiliary_loss_clip": 0.01134138, + "auxiliary_loss_mlp": 0.01109219, + "balance_loss_clip": 1.00190628, + "balance_loss_mlp": 1.00059497, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 2.003119884804776, + "language_loss": 0.67421877, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.69665229, + "num_input_tokens_seen": 205714870, + "step": 9551, + "time_per_iteration": 2.5736165046691895 + }, + { + "auxiliary_loss_clip": 0.01115855, + "auxiliary_loss_mlp": 0.01110209, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00053692, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 1.7792927997192507, + "language_loss": 0.71876895, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.74102962, + "num_input_tokens_seen": 205736045, + "step": 9552, + "time_per_iteration": 2.674099922180176 + }, + { + "auxiliary_loss_clip": 0.01149762, + "auxiliary_loss_mlp": 0.01109241, + "balance_loss_clip": 1.00216269, + "balance_loss_mlp": 1.00052214, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 1.8525629646029438, + "language_loss": 0.79943252, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82202256, + "num_input_tokens_seen": 205754445, + "step": 9553, + "time_per_iteration": 2.6024372577667236 + }, + { + "auxiliary_loss_clip": 0.01149852, + "auxiliary_loss_mlp": 0.01110021, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00044429, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.6848355919184996, + "language_loss": 0.83328605, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85588479, + "num_input_tokens_seen": 205770595, + "step": 9554, + "time_per_iteration": 2.548692464828491 + }, + { + "auxiliary_loss_clip": 0.01150869, + "auxiliary_loss_mlp": 0.00747384, + "balance_loss_clip": 1.00208056, + "balance_loss_mlp": 1.00050449, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 8.720729949192155, + "language_loss": 0.70949483, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.7284773, + "num_input_tokens_seen": 205791935, + "step": 9555, + "time_per_iteration": 2.6553590297698975 + }, + { + "auxiliary_loss_clip": 0.01132595, + "auxiliary_loss_mlp": 0.01109616, + "balance_loss_clip": 1.00196636, + "balance_loss_mlp": 1.00042033, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.1070895598332307, + "language_loss": 0.72290462, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.7453267, + "num_input_tokens_seen": 205807260, + "step": 9556, + "time_per_iteration": 2.658456563949585 + }, + { + "auxiliary_loss_clip": 0.01149369, + "auxiliary_loss_mlp": 0.01108617, + "balance_loss_clip": 1.00202703, + "balance_loss_mlp": 1.00066113, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.7332718222962469, + "language_loss": 0.73914748, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76172733, + "num_input_tokens_seen": 205826885, + "step": 9557, + "time_per_iteration": 2.634653091430664 + }, + { + "auxiliary_loss_clip": 0.01151165, + "auxiliary_loss_mlp": 0.01109121, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00068808, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.740661529396576, + "language_loss": 0.68007803, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.70268089, + "num_input_tokens_seen": 205844630, + "step": 9558, + "time_per_iteration": 3.9094691276550293 + }, + { + "auxiliary_loss_clip": 0.01134895, + "auxiliary_loss_mlp": 0.01111528, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00071049, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 2.370489800171474, + "language_loss": 0.7135964, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73606062, + "num_input_tokens_seen": 205860960, + "step": 9559, + "time_per_iteration": 2.5948238372802734 + }, + { + "auxiliary_loss_clip": 0.01132907, + "auxiliary_loss_mlp": 0.00747117, + "balance_loss_clip": 1.00203598, + "balance_loss_mlp": 1.00040555, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.5273750729197002, + "language_loss": 0.79104745, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.80984771, + "num_input_tokens_seen": 205880675, + "step": 9560, + "time_per_iteration": 2.6358728408813477 + }, + { + "auxiliary_loss_clip": 0.01084387, + "auxiliary_loss_mlp": 0.01109105, + "balance_loss_clip": 1.00174904, + "balance_loss_mlp": 1.00057709, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 1.8673824247727737, + "language_loss": 0.63691396, + "learning_rate": 1.614714662090588e-06, + "loss": 0.65884888, + "num_input_tokens_seen": 205900050, + "step": 9561, + "time_per_iteration": 2.733124017715454 + }, + { + "auxiliary_loss_clip": 0.01151499, + "auxiliary_loss_mlp": 0.0111021, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00072849, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.6848639011867552, + "language_loss": 0.71420062, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73681772, + "num_input_tokens_seen": 205918855, + "step": 9562, + "time_per_iteration": 2.5813984870910645 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.01109078, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00074017, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.4958271510937657, + "language_loss": 0.84238696, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86449027, + "num_input_tokens_seen": 205936970, + "step": 9563, + "time_per_iteration": 2.6970231533050537 + }, + { + "auxiliary_loss_clip": 0.01088049, + "auxiliary_loss_mlp": 0.01111299, + "balance_loss_clip": 1.00183761, + "balance_loss_mlp": 1.00067317, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 4.971188433136866, + "language_loss": 0.57329822, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59529173, + "num_input_tokens_seen": 205954630, + "step": 9564, + "time_per_iteration": 2.727949619293213 + }, + { + "auxiliary_loss_clip": 0.01135241, + "auxiliary_loss_mlp": 0.01108474, + "balance_loss_clip": 1.00191927, + "balance_loss_mlp": 1.00042307, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 4.462941085783999, + "language_loss": 0.76372111, + "learning_rate": 1.613186112465078e-06, + "loss": 0.78615832, + "num_input_tokens_seen": 205971510, + "step": 9565, + "time_per_iteration": 5.615516424179077 + }, + { + "auxiliary_loss_clip": 0.01098226, + "auxiliary_loss_mlp": 0.0108472, + "balance_loss_clip": 1.00113237, + "balance_loss_mlp": 1.00003326, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7662307865367224, + "language_loss": 0.6075871, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62941658, + "num_input_tokens_seen": 206035125, + "step": 9566, + "time_per_iteration": 3.321516752243042 + }, + { + "auxiliary_loss_clip": 0.01134788, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_clip": 1.00197935, + "balance_loss_mlp": 1.00056565, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 3.4494505003440485, + "language_loss": 0.75443643, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77687716, + "num_input_tokens_seen": 206052075, + "step": 9567, + "time_per_iteration": 4.059280157089233 + }, + { + "auxiliary_loss_clip": 0.01148956, + "auxiliary_loss_mlp": 0.01109304, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00048947, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.4864986797050803, + "language_loss": 0.74701542, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76959801, + "num_input_tokens_seen": 206069970, + "step": 9568, + "time_per_iteration": 2.561472177505493 + }, + { + "auxiliary_loss_clip": 0.01166086, + "auxiliary_loss_mlp": 0.01109573, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00047266, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.6089634714713987, + "language_loss": 0.7115283, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73428494, + "num_input_tokens_seen": 206088950, + "step": 9569, + "time_per_iteration": 2.5281026363372803 + }, + { + "auxiliary_loss_clip": 0.01149292, + "auxiliary_loss_mlp": 0.01109799, + "balance_loss_clip": 1.002002, + "balance_loss_mlp": 1.00060308, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 2.156301306942989, + "language_loss": 0.55882621, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.58141714, + "num_input_tokens_seen": 206107780, + "step": 9570, + "time_per_iteration": 2.5862762928009033 + }, + { + "auxiliary_loss_clip": 0.01165798, + "auxiliary_loss_mlp": 0.01108648, + "balance_loss_clip": 1.00194716, + "balance_loss_mlp": 1.00078785, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.5019067111503734, + "language_loss": 0.64236021, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66510469, + "num_input_tokens_seen": 206127445, + "step": 9571, + "time_per_iteration": 2.5410375595092773 + }, + { + "auxiliary_loss_clip": 0.01151141, + "auxiliary_loss_mlp": 0.01108842, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00050473, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 2.148275776804908, + "language_loss": 0.67327023, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69587004, + "num_input_tokens_seen": 206152005, + "step": 9572, + "time_per_iteration": 2.8487367630004883 + }, + { + "auxiliary_loss_clip": 0.01133593, + "auxiliary_loss_mlp": 0.01109151, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00062299, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 1.867186329249288, + "language_loss": 0.72450328, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74693072, + "num_input_tokens_seen": 206169875, + "step": 9573, + "time_per_iteration": 2.621626377105713 + }, + { + "auxiliary_loss_clip": 0.01165892, + "auxiliary_loss_mlp": 0.01107925, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00063705, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.8257988769266056, + "language_loss": 0.76599735, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78873551, + "num_input_tokens_seen": 206192635, + "step": 9574, + "time_per_iteration": 2.6642580032348633 + }, + { + "auxiliary_loss_clip": 0.01102862, + "auxiliary_loss_mlp": 0.01110248, + "balance_loss_clip": 1.00193787, + "balance_loss_mlp": 1.0005753, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.694217214506011, + "language_loss": 0.66984189, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.69197303, + "num_input_tokens_seen": 206211485, + "step": 9575, + "time_per_iteration": 2.7186107635498047 + }, + { + "auxiliary_loss_clip": 0.01134536, + "auxiliary_loss_mlp": 0.01108528, + "balance_loss_clip": 1.00200105, + "balance_loss_mlp": 1.00066733, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5377112215509436, + "language_loss": 0.79760867, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.82003933, + "num_input_tokens_seen": 206231740, + "step": 9576, + "time_per_iteration": 2.732616662979126 + }, + { + "auxiliary_loss_clip": 0.01132596, + "auxiliary_loss_mlp": 0.01108945, + "balance_loss_clip": 1.00204897, + "balance_loss_mlp": 1.00051212, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.8209711747338788, + "language_loss": 0.69559789, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71801335, + "num_input_tokens_seen": 206250975, + "step": 9577, + "time_per_iteration": 2.5977649688720703 + }, + { + "auxiliary_loss_clip": 0.0114948, + "auxiliary_loss_mlp": 0.01108476, + "balance_loss_clip": 1.00198388, + "balance_loss_mlp": 1.00051975, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.648771829021899, + "language_loss": 0.66408235, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68666196, + "num_input_tokens_seen": 206268800, + "step": 9578, + "time_per_iteration": 2.597088098526001 + }, + { + "auxiliary_loss_clip": 0.01137627, + "auxiliary_loss_mlp": 0.01107769, + "balance_loss_clip": 1.00208485, + "balance_loss_mlp": 1.00048041, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 2.0039094730720812, + "language_loss": 0.72860652, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.75106049, + "num_input_tokens_seen": 206287190, + "step": 9579, + "time_per_iteration": 2.5942983627319336 + }, + { + "auxiliary_loss_clip": 0.01132822, + "auxiliary_loss_mlp": 0.01110229, + "balance_loss_clip": 1.0020479, + "balance_loss_mlp": 1.00055647, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.244783925805768, + "language_loss": 0.64518321, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66761369, + "num_input_tokens_seen": 206307020, + "step": 9580, + "time_per_iteration": 2.647970676422119 + }, + { + "auxiliary_loss_clip": 0.01136111, + "auxiliary_loss_mlp": 0.01109247, + "balance_loss_clip": 1.00200796, + "balance_loss_mlp": 1.00062323, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.885759143466501, + "language_loss": 0.85607886, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87853241, + "num_input_tokens_seen": 206324095, + "step": 9581, + "time_per_iteration": 2.5884742736816406 + }, + { + "auxiliary_loss_clip": 0.01166342, + "auxiliary_loss_mlp": 0.0111001, + "balance_loss_clip": 1.00222027, + "balance_loss_mlp": 1.00071836, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 5.600979573654476, + "language_loss": 0.67469263, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69745612, + "num_input_tokens_seen": 206343210, + "step": 9582, + "time_per_iteration": 2.502168655395508 + }, + { + "auxiliary_loss_clip": 0.01131077, + "auxiliary_loss_mlp": 0.01085007, + "balance_loss_clip": 1.00118303, + "balance_loss_mlp": 0.99993896, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.64294951222157, + "language_loss": 0.57228929, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59445018, + "num_input_tokens_seen": 206415935, + "step": 9583, + "time_per_iteration": 3.3341224193573 + }, + { + "auxiliary_loss_clip": 0.01134881, + "auxiliary_loss_mlp": 0.01109668, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.00056767, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 2.047937541397578, + "language_loss": 0.82597077, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84841627, + "num_input_tokens_seen": 206431900, + "step": 9584, + "time_per_iteration": 2.577014446258545 + }, + { + "auxiliary_loss_clip": 0.01160615, + "auxiliary_loss_mlp": 0.01084635, + "balance_loss_clip": 1.00120294, + "balance_loss_mlp": 0.99994832, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6243958121887375, + "language_loss": 0.49646187, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.5189144, + "num_input_tokens_seen": 206501200, + "step": 9585, + "time_per_iteration": 3.1211884021759033 + }, + { + "auxiliary_loss_clip": 0.01131772, + "auxiliary_loss_mlp": 0.01107222, + "balance_loss_clip": 1.00172365, + "balance_loss_mlp": 1.0004108, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 1.6251991917114745, + "language_loss": 0.84649694, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86888689, + "num_input_tokens_seen": 206520575, + "step": 9586, + "time_per_iteration": 2.601734161376953 + }, + { + "auxiliary_loss_clip": 0.01149293, + "auxiliary_loss_mlp": 0.01108412, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.0005517, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.7487037093114455, + "language_loss": 0.80000913, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82258618, + "num_input_tokens_seen": 206538060, + "step": 9587, + "time_per_iteration": 2.5492441654205322 + }, + { + "auxiliary_loss_clip": 0.01134433, + "auxiliary_loss_mlp": 0.01109292, + "balance_loss_clip": 1.00197017, + "balance_loss_mlp": 1.00057268, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.694983364319035, + "language_loss": 0.65830034, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68073756, + "num_input_tokens_seen": 206557320, + "step": 9588, + "time_per_iteration": 2.583209753036499 + }, + { + "auxiliary_loss_clip": 0.01135035, + "auxiliary_loss_mlp": 0.01109058, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00062537, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 3.0458156447549296, + "language_loss": 0.78383994, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80628091, + "num_input_tokens_seen": 206575780, + "step": 9589, + "time_per_iteration": 2.6013801097869873 + }, + { + "auxiliary_loss_clip": 0.01165793, + "auxiliary_loss_mlp": 0.01107873, + "balance_loss_clip": 1.00197077, + "balance_loss_mlp": 1.00039423, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 2.1852477763284135, + "language_loss": 0.79148883, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81422549, + "num_input_tokens_seen": 206594100, + "step": 9590, + "time_per_iteration": 2.5135021209716797 + }, + { + "auxiliary_loss_clip": 0.01084443, + "auxiliary_loss_mlp": 0.00747314, + "balance_loss_clip": 1.00171232, + "balance_loss_mlp": 1.00048292, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 4.599220788485463, + "language_loss": 0.63127935, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.64959693, + "num_input_tokens_seen": 206613325, + "step": 9591, + "time_per_iteration": 2.7345430850982666 + }, + { + "auxiliary_loss_clip": 0.01166154, + "auxiliary_loss_mlp": 0.00747336, + "balance_loss_clip": 1.00217652, + "balance_loss_mlp": 1.00050855, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.6434233728116467, + "language_loss": 0.78060937, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.79974431, + "num_input_tokens_seen": 206634265, + "step": 9592, + "time_per_iteration": 2.563899040222168 + }, + { + "auxiliary_loss_clip": 0.0108388, + "auxiliary_loss_mlp": 0.01085093, + "balance_loss_clip": 1.00103438, + "balance_loss_mlp": 1.00002563, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7336163062236376, + "language_loss": 0.59654093, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61823058, + "num_input_tokens_seen": 206696990, + "step": 9593, + "time_per_iteration": 3.5092720985412598 + }, + { + "auxiliary_loss_clip": 0.01151267, + "auxiliary_loss_mlp": 0.01109644, + "balance_loss_clip": 1.00204313, + "balance_loss_mlp": 1.00063956, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 2.755876086632889, + "language_loss": 0.71014076, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73274988, + "num_input_tokens_seen": 206717815, + "step": 9594, + "time_per_iteration": 2.91306471824646 + }, + { + "auxiliary_loss_clip": 0.01115887, + "auxiliary_loss_mlp": 0.011083, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00053501, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.810665184546381, + "language_loss": 0.71009195, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73233378, + "num_input_tokens_seen": 206735985, + "step": 9595, + "time_per_iteration": 2.637047290802002 + }, + { + "auxiliary_loss_clip": 0.01166012, + "auxiliary_loss_mlp": 0.01108794, + "balance_loss_clip": 1.0020076, + "balance_loss_mlp": 1.00055194, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.85189449556319, + "language_loss": 0.6953758, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.71812379, + "num_input_tokens_seen": 206753370, + "step": 9596, + "time_per_iteration": 3.8742940425872803 + }, + { + "auxiliary_loss_clip": 0.01132644, + "auxiliary_loss_mlp": 0.01109379, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00046968, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 2.0098600363367956, + "language_loss": 0.67520601, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69762623, + "num_input_tokens_seen": 206777645, + "step": 9597, + "time_per_iteration": 2.745511293411255 + }, + { + "auxiliary_loss_clip": 0.01132623, + "auxiliary_loss_mlp": 0.01108357, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00068724, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 1.7536405033217626, + "language_loss": 0.81962931, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.84203911, + "num_input_tokens_seen": 206794865, + "step": 9598, + "time_per_iteration": 2.618631362915039 + }, + { + "auxiliary_loss_clip": 0.01121478, + "auxiliary_loss_mlp": 0.01108225, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00065017, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.4582393063423582, + "language_loss": 0.72692609, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74922311, + "num_input_tokens_seen": 206814095, + "step": 9599, + "time_per_iteration": 2.6337668895721436 + }, + { + "auxiliary_loss_clip": 0.0116591, + "auxiliary_loss_mlp": 0.01107632, + "balance_loss_clip": 1.00209403, + "balance_loss_mlp": 1.00043941, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 1.9904430512228, + "language_loss": 0.77891111, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80164653, + "num_input_tokens_seen": 206832245, + "step": 9600, + "time_per_iteration": 2.511575222015381 + }, + { + "auxiliary_loss_clip": 0.01151415, + "auxiliary_loss_mlp": 0.00747416, + "balance_loss_clip": 1.00209844, + "balance_loss_mlp": 1.00050628, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.5863007336670594, + "language_loss": 0.72360563, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.742594, + "num_input_tokens_seen": 206851535, + "step": 9601, + "time_per_iteration": 2.6269001960754395 + }, + { + "auxiliary_loss_clip": 0.01087769, + "auxiliary_loss_mlp": 0.01108619, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00056815, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.6902589337721488, + "language_loss": 0.68391675, + "learning_rate": 1.599058274973348e-06, + "loss": 0.70588064, + "num_input_tokens_seen": 206870595, + "step": 9602, + "time_per_iteration": 2.784205675125122 + }, + { + "auxiliary_loss_clip": 0.01134135, + "auxiliary_loss_mlp": 0.01107622, + "balance_loss_clip": 1.00204456, + "balance_loss_mlp": 1.00071537, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4880450344790752, + "language_loss": 0.73370624, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75612378, + "num_input_tokens_seen": 206892320, + "step": 9603, + "time_per_iteration": 5.470494031906128 + }, + { + "auxiliary_loss_clip": 0.01149254, + "auxiliary_loss_mlp": 0.01107862, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00047803, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.7181088562953633, + "language_loss": 0.76385528, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78642642, + "num_input_tokens_seen": 206912485, + "step": 9604, + "time_per_iteration": 2.5543265342712402 + }, + { + "auxiliary_loss_clip": 0.011177, + "auxiliary_loss_mlp": 0.01108922, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00058436, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 2.2956419647031474, + "language_loss": 0.83519483, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85746098, + "num_input_tokens_seen": 206929100, + "step": 9605, + "time_per_iteration": 4.103900909423828 + }, + { + "auxiliary_loss_clip": 0.01118143, + "auxiliary_loss_mlp": 0.01111149, + "balance_loss_clip": 1.00198615, + "balance_loss_mlp": 1.00033236, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.7635072393367557, + "language_loss": 0.77905226, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80134523, + "num_input_tokens_seen": 206947020, + "step": 9606, + "time_per_iteration": 2.6656394004821777 + }, + { + "auxiliary_loss_clip": 0.01166122, + "auxiliary_loss_mlp": 0.01108971, + "balance_loss_clip": 1.00214219, + "balance_loss_mlp": 1.00053835, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.6994782940737885, + "language_loss": 0.74024588, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76299679, + "num_input_tokens_seen": 206964065, + "step": 9607, + "time_per_iteration": 2.522176504135132 + }, + { + "auxiliary_loss_clip": 0.01118501, + "auxiliary_loss_mlp": 0.01109283, + "balance_loss_clip": 1.00192571, + "balance_loss_mlp": 1.00056386, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.7582842276358894, + "language_loss": 0.69271451, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.7149924, + "num_input_tokens_seen": 206981940, + "step": 9608, + "time_per_iteration": 2.6368191242218018 + }, + { + "auxiliary_loss_clip": 0.01117246, + "auxiliary_loss_mlp": 0.01109059, + "balance_loss_clip": 1.00189734, + "balance_loss_mlp": 1.00053132, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 2.417022178167743, + "language_loss": 0.76787001, + "learning_rate": 1.596387759940665e-06, + "loss": 0.790133, + "num_input_tokens_seen": 207002365, + "step": 9609, + "time_per_iteration": 2.6957597732543945 + }, + { + "auxiliary_loss_clip": 0.01118306, + "auxiliary_loss_mlp": 0.01107676, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.00048304, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.634160576880898, + "language_loss": 0.7726016, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79486144, + "num_input_tokens_seen": 207021195, + "step": 9610, + "time_per_iteration": 2.653326988220215 + }, + { + "auxiliary_loss_clip": 0.01119272, + "auxiliary_loss_mlp": 0.01108598, + "balance_loss_clip": 1.00199306, + "balance_loss_mlp": 1.00045133, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.2691346658618805, + "language_loss": 0.68669951, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70897824, + "num_input_tokens_seen": 207037465, + "step": 9611, + "time_per_iteration": 2.624748468399048 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01108484, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.00043273, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 2.1148810583733066, + "language_loss": 0.83119255, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85378993, + "num_input_tokens_seen": 207054230, + "step": 9612, + "time_per_iteration": 2.6306886672973633 + }, + { + "auxiliary_loss_clip": 0.01166097, + "auxiliary_loss_mlp": 0.01108952, + "balance_loss_clip": 1.00221705, + "balance_loss_mlp": 1.00051928, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 3.526936856257733, + "language_loss": 0.79625678, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81900728, + "num_input_tokens_seen": 207073150, + "step": 9613, + "time_per_iteration": 2.5410234928131104 + }, + { + "auxiliary_loss_clip": 0.01150384, + "auxiliary_loss_mlp": 0.01107153, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00053239, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.9976265687393473, + "language_loss": 0.77651882, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.7990942, + "num_input_tokens_seen": 207090375, + "step": 9614, + "time_per_iteration": 2.557596445083618 + }, + { + "auxiliary_loss_clip": 0.01115826, + "auxiliary_loss_mlp": 0.0110818, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00051022, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.88074670476126, + "language_loss": 0.81301379, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.83525389, + "num_input_tokens_seen": 207106030, + "step": 9615, + "time_per_iteration": 2.609196662902832 + }, + { + "auxiliary_loss_clip": 0.01151267, + "auxiliary_loss_mlp": 0.01109581, + "balance_loss_clip": 1.00197971, + "balance_loss_mlp": 1.00067091, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.7266938026525471, + "language_loss": 0.66933095, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69193941, + "num_input_tokens_seen": 207125435, + "step": 9616, + "time_per_iteration": 2.6239845752716064 + }, + { + "auxiliary_loss_clip": 0.01150244, + "auxiliary_loss_mlp": 0.01108408, + "balance_loss_clip": 1.0019623, + "balance_loss_mlp": 1.00054741, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.8133543121540578, + "language_loss": 0.77614403, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79873055, + "num_input_tokens_seen": 207145095, + "step": 9617, + "time_per_iteration": 2.554659366607666 + }, + { + "auxiliary_loss_clip": 0.01132439, + "auxiliary_loss_mlp": 0.01107918, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00053418, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 3.055111358930792, + "language_loss": 0.74981892, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77222246, + "num_input_tokens_seen": 207166045, + "step": 9618, + "time_per_iteration": 2.6319100856781006 + }, + { + "auxiliary_loss_clip": 0.01165917, + "auxiliary_loss_mlp": 0.01107607, + "balance_loss_clip": 1.00208485, + "balance_loss_mlp": 1.00041413, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.794770794823841, + "language_loss": 0.81218576, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.834921, + "num_input_tokens_seen": 207185290, + "step": 9619, + "time_per_iteration": 2.5368599891662598 + }, + { + "auxiliary_loss_clip": 0.01134319, + "auxiliary_loss_mlp": 0.01107957, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.00047803, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 3.587930179943035, + "language_loss": 0.72411019, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.74653292, + "num_input_tokens_seen": 207205505, + "step": 9620, + "time_per_iteration": 2.6499359607696533 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01108427, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00056648, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.5849091869500185, + "language_loss": 0.77352899, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79595423, + "num_input_tokens_seen": 207225315, + "step": 9621, + "time_per_iteration": 2.6141109466552734 + }, + { + "auxiliary_loss_clip": 0.01088106, + "auxiliary_loss_mlp": 0.01109145, + "balance_loss_clip": 1.00185645, + "balance_loss_mlp": 1.0005213, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.9492281615506215, + "language_loss": 0.70613307, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.7281056, + "num_input_tokens_seen": 207247690, + "step": 9622, + "time_per_iteration": 2.7504336833953857 + }, + { + "auxiliary_loss_clip": 0.01127572, + "auxiliary_loss_mlp": 0.01084622, + "balance_loss_clip": 1.00144911, + "balance_loss_mlp": 0.99993593, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7665974032626854, + "language_loss": 0.55978799, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58190989, + "num_input_tokens_seen": 207301735, + "step": 9623, + "time_per_iteration": 3.18428373336792 + }, + { + "auxiliary_loss_clip": 0.01115767, + "auxiliary_loss_mlp": 0.01108513, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00055754, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 2.229166184446626, + "language_loss": 0.71325928, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73550212, + "num_input_tokens_seen": 207321240, + "step": 9624, + "time_per_iteration": 2.7238926887512207 + }, + { + "auxiliary_loss_clip": 0.01119145, + "auxiliary_loss_mlp": 0.01108337, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00066745, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 1.900275231314409, + "language_loss": 0.82166517, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84394002, + "num_input_tokens_seen": 207339540, + "step": 9625, + "time_per_iteration": 2.682541847229004 + }, + { + "auxiliary_loss_clip": 0.01165847, + "auxiliary_loss_mlp": 0.0110857, + "balance_loss_clip": 1.00209141, + "balance_loss_mlp": 1.0005188, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 2.0068877230833224, + "language_loss": 0.70082819, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72357237, + "num_input_tokens_seen": 207360470, + "step": 9626, + "time_per_iteration": 2.558656930923462 + }, + { + "auxiliary_loss_clip": 0.01133622, + "auxiliary_loss_mlp": 0.01107473, + "balance_loss_clip": 1.00183427, + "balance_loss_mlp": 1.00056577, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.4721854769362919, + "language_loss": 0.71833223, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.74074316, + "num_input_tokens_seen": 207383080, + "step": 9627, + "time_per_iteration": 2.665503740310669 + }, + { + "auxiliary_loss_clip": 0.01151184, + "auxiliary_loss_mlp": 0.01107502, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.0005002, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.8532624464882026, + "language_loss": 0.83995134, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86253822, + "num_input_tokens_seen": 207401000, + "step": 9628, + "time_per_iteration": 2.6039886474609375 + }, + { + "auxiliary_loss_clip": 0.01134017, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.00051093, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.541520282674656, + "language_loss": 0.72303307, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74545598, + "num_input_tokens_seen": 207419230, + "step": 9629, + "time_per_iteration": 2.633772373199463 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.01109124, + "balance_loss_clip": 1.00200856, + "balance_loss_mlp": 1.00050008, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 2.014808020863201, + "language_loss": 0.74031216, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76274592, + "num_input_tokens_seen": 207437615, + "step": 9630, + "time_per_iteration": 2.578075885772705 + }, + { + "auxiliary_loss_clip": 0.01118416, + "auxiliary_loss_mlp": 0.00747238, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00051665, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.6194381973023666, + "language_loss": 0.78726792, + "learning_rate": 1.587999618060523e-06, + "loss": 0.80592442, + "num_input_tokens_seen": 207457270, + "step": 9631, + "time_per_iteration": 2.681760311126709 + }, + { + "auxiliary_loss_clip": 0.01166057, + "auxiliary_loss_mlp": 0.01108284, + "balance_loss_clip": 1.00209951, + "balance_loss_mlp": 1.00051904, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 2.027289791465858, + "language_loss": 0.75056553, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77330899, + "num_input_tokens_seen": 207477890, + "step": 9632, + "time_per_iteration": 2.5520401000976562 + }, + { + "auxiliary_loss_clip": 0.01138013, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.0004071, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 1.8400907912258953, + "language_loss": 0.79513311, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81759971, + "num_input_tokens_seen": 207497670, + "step": 9633, + "time_per_iteration": 4.03522801399231 + }, + { + "auxiliary_loss_clip": 0.01117861, + "auxiliary_loss_mlp": 0.01110702, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00074339, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.6299376712425238, + "language_loss": 0.77444273, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79672831, + "num_input_tokens_seen": 207516105, + "step": 9634, + "time_per_iteration": 2.664456844329834 + }, + { + "auxiliary_loss_clip": 0.01135534, + "auxiliary_loss_mlp": 0.01109472, + "balance_loss_clip": 1.00194871, + "balance_loss_mlp": 1.00056195, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.3409918682763395, + "language_loss": 0.63055813, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65300822, + "num_input_tokens_seen": 207533685, + "step": 9635, + "time_per_iteration": 2.6005518436431885 + }, + { + "auxiliary_loss_clip": 0.01136037, + "auxiliary_loss_mlp": 0.01107727, + "balance_loss_clip": 1.00208449, + "balance_loss_mlp": 1.00053382, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.6767050169787803, + "language_loss": 0.76968265, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79212028, + "num_input_tokens_seen": 207552840, + "step": 9636, + "time_per_iteration": 2.634751081466675 + }, + { + "auxiliary_loss_clip": 0.01134121, + "auxiliary_loss_mlp": 0.01106677, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00053275, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.6958820586722272, + "language_loss": 0.68290317, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70531118, + "num_input_tokens_seen": 207572095, + "step": 9637, + "time_per_iteration": 2.6061649322509766 + }, + { + "auxiliary_loss_clip": 0.01100528, + "auxiliary_loss_mlp": 0.01108212, + "balance_loss_clip": 1.00179517, + "balance_loss_mlp": 1.0005424, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.3466987641970687, + "language_loss": 0.72818148, + "learning_rate": 1.585332242234043e-06, + "loss": 0.75026882, + "num_input_tokens_seen": 207587495, + "step": 9638, + "time_per_iteration": 2.6335272789001465 + }, + { + "auxiliary_loss_clip": 0.01149288, + "auxiliary_loss_mlp": 0.01108069, + "balance_loss_clip": 1.00214005, + "balance_loss_mlp": 1.00058973, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 1.9077287411894925, + "language_loss": 0.72556478, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74813831, + "num_input_tokens_seen": 207606795, + "step": 9639, + "time_per_iteration": 2.5341665744781494 + }, + { + "auxiliary_loss_clip": 0.01134463, + "auxiliary_loss_mlp": 0.01108542, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00058603, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 2.319490207860426, + "language_loss": 0.69517154, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71760166, + "num_input_tokens_seen": 207623620, + "step": 9640, + "time_per_iteration": 4.114201545715332 + }, + { + "auxiliary_loss_clip": 0.01134359, + "auxiliary_loss_mlp": 0.01109485, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00057566, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 5.598643037709018, + "language_loss": 0.77941674, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.80185521, + "num_input_tokens_seen": 207639380, + "step": 9641, + "time_per_iteration": 3.956347942352295 + }, + { + "auxiliary_loss_clip": 0.01165893, + "auxiliary_loss_mlp": 0.01107967, + "balance_loss_clip": 1.00209737, + "balance_loss_mlp": 1.0004878, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.8244319228387118, + "language_loss": 0.74105835, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.76379693, + "num_input_tokens_seen": 207657915, + "step": 9642, + "time_per_iteration": 2.5362255573272705 + }, + { + "auxiliary_loss_clip": 0.01135508, + "auxiliary_loss_mlp": 0.01108244, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00047898, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.6583552195414628, + "language_loss": 0.73385406, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75629157, + "num_input_tokens_seen": 207678620, + "step": 9643, + "time_per_iteration": 4.101254940032959 + }, + { + "auxiliary_loss_clip": 0.01166127, + "auxiliary_loss_mlp": 0.0110894, + "balance_loss_clip": 1.00208271, + "balance_loss_mlp": 1.00050712, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.953812653187021, + "language_loss": 0.67477471, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69752538, + "num_input_tokens_seen": 207696980, + "step": 9644, + "time_per_iteration": 2.5335466861724854 + }, + { + "auxiliary_loss_clip": 0.01151257, + "auxiliary_loss_mlp": 0.01109196, + "balance_loss_clip": 1.00215197, + "balance_loss_mlp": 1.00047672, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.397350299831126, + "language_loss": 0.85639656, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87900114, + "num_input_tokens_seen": 207714065, + "step": 9645, + "time_per_iteration": 2.55277156829834 + }, + { + "auxiliary_loss_clip": 0.01166045, + "auxiliary_loss_mlp": 0.01108849, + "balance_loss_clip": 1.00219107, + "balance_loss_mlp": 1.00051212, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.8537073614850612, + "language_loss": 0.75179571, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77454466, + "num_input_tokens_seen": 207734720, + "step": 9646, + "time_per_iteration": 2.611149311065674 + }, + { + "auxiliary_loss_clip": 0.01132355, + "auxiliary_loss_mlp": 0.01110295, + "balance_loss_clip": 1.00200093, + "balance_loss_mlp": 1.00052738, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.7730232476274377, + "language_loss": 0.59213626, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61456275, + "num_input_tokens_seen": 207755435, + "step": 9647, + "time_per_iteration": 2.7283623218536377 + }, + { + "auxiliary_loss_clip": 0.01104049, + "auxiliary_loss_mlp": 0.01108793, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.00074172, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.8027370903056001, + "language_loss": 0.84091139, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86303985, + "num_input_tokens_seen": 207773570, + "step": 9648, + "time_per_iteration": 2.751878499984741 + }, + { + "auxiliary_loss_clip": 0.01144671, + "auxiliary_loss_mlp": 0.01084678, + "balance_loss_clip": 1.00140548, + "balance_loss_mlp": 0.99999219, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8362920973324297, + "language_loss": 0.63063395, + "learning_rate": 1.581142210256242e-06, + "loss": 0.65292746, + "num_input_tokens_seen": 207830095, + "step": 9649, + "time_per_iteration": 3.1861321926116943 + }, + { + "auxiliary_loss_clip": 0.01121469, + "auxiliary_loss_mlp": 0.01107542, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00053942, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 1.9305133600321218, + "language_loss": 0.82007933, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.8423695, + "num_input_tokens_seen": 207848555, + "step": 9650, + "time_per_iteration": 2.6321585178375244 + }, + { + "auxiliary_loss_clip": 0.01117302, + "auxiliary_loss_mlp": 0.01108681, + "balance_loss_clip": 1.00182533, + "balance_loss_mlp": 1.00043905, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.750487421183117, + "language_loss": 0.7763809, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79864073, + "num_input_tokens_seen": 207867060, + "step": 9651, + "time_per_iteration": 2.620744466781616 + }, + { + "auxiliary_loss_clip": 0.01134242, + "auxiliary_loss_mlp": 0.01109706, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00070143, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 3.1690763831653723, + "language_loss": 0.74465698, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.7670964, + "num_input_tokens_seen": 207884520, + "step": 9652, + "time_per_iteration": 2.631983995437622 + }, + { + "auxiliary_loss_clip": 0.01137546, + "auxiliary_loss_mlp": 0.01108485, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.00043321, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 2.885595097333626, + "language_loss": 0.76613069, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78859091, + "num_input_tokens_seen": 207905370, + "step": 9653, + "time_per_iteration": 2.6355392932891846 + }, + { + "auxiliary_loss_clip": 0.01166043, + "auxiliary_loss_mlp": 0.01108829, + "balance_loss_clip": 1.00210798, + "balance_loss_mlp": 1.00049198, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 2.4882738628353622, + "language_loss": 0.74514258, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76789129, + "num_input_tokens_seen": 207923790, + "step": 9654, + "time_per_iteration": 2.5147814750671387 + }, + { + "auxiliary_loss_clip": 0.01085019, + "auxiliary_loss_mlp": 0.01107776, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00077355, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.787085444417341, + "language_loss": 0.69988108, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72180903, + "num_input_tokens_seen": 207942335, + "step": 9655, + "time_per_iteration": 2.7355082035064697 + }, + { + "auxiliary_loss_clip": 0.0116603, + "auxiliary_loss_mlp": 0.01108757, + "balance_loss_clip": 1.00200975, + "balance_loss_mlp": 1.00051522, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 2.2494034975590145, + "language_loss": 0.69662607, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71937394, + "num_input_tokens_seen": 207961975, + "step": 9656, + "time_per_iteration": 2.544492483139038 + }, + { + "auxiliary_loss_clip": 0.01148937, + "auxiliary_loss_mlp": 0.01107374, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.00065804, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.4956237550820493, + "language_loss": 0.7159006, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73846376, + "num_input_tokens_seen": 207979520, + "step": 9657, + "time_per_iteration": 2.5483663082122803 + }, + { + "auxiliary_loss_clip": 0.01149198, + "auxiliary_loss_mlp": 0.01109772, + "balance_loss_clip": 1.00203037, + "balance_loss_mlp": 1.00048089, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 1.8828656758365196, + "language_loss": 0.70991093, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.73250067, + "num_input_tokens_seen": 207998375, + "step": 9658, + "time_per_iteration": 2.5773603916168213 + }, + { + "auxiliary_loss_clip": 0.01144295, + "auxiliary_loss_mlp": 0.01084296, + "balance_loss_clip": 1.00130057, + "balance_loss_mlp": 0.9999916, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.645911472483649, + "language_loss": 0.53549761, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55778354, + "num_input_tokens_seen": 208060605, + "step": 9659, + "time_per_iteration": 3.15458345413208 + }, + { + "auxiliary_loss_clip": 0.0115085, + "auxiliary_loss_mlp": 0.01108497, + "balance_loss_clip": 1.00209975, + "balance_loss_mlp": 1.00073147, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 2.14071335008914, + "language_loss": 0.62249815, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64509159, + "num_input_tokens_seen": 208080320, + "step": 9660, + "time_per_iteration": 2.7013964653015137 + }, + { + "auxiliary_loss_clip": 0.01151202, + "auxiliary_loss_mlp": 0.01108825, + "balance_loss_clip": 1.0018996, + "balance_loss_mlp": 1.00048792, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.5268494268268553, + "language_loss": 0.65093029, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67353058, + "num_input_tokens_seen": 208099305, + "step": 9661, + "time_per_iteration": 2.57873797416687 + }, + { + "auxiliary_loss_clip": 0.01103143, + "auxiliary_loss_mlp": 0.01107268, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00055158, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.5014030506390503, + "language_loss": 0.74338889, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76549304, + "num_input_tokens_seen": 208116960, + "step": 9662, + "time_per_iteration": 2.6684839725494385 + }, + { + "auxiliary_loss_clip": 0.01160669, + "auxiliary_loss_mlp": 0.01084179, + "balance_loss_clip": 1.00125027, + "balance_loss_mlp": 0.99987394, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8748567311144871, + "language_loss": 0.58385968, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60630816, + "num_input_tokens_seen": 208182190, + "step": 9663, + "time_per_iteration": 3.16213321685791 + }, + { + "auxiliary_loss_clip": 0.01134466, + "auxiliary_loss_mlp": 0.01108369, + "balance_loss_clip": 1.00199246, + "balance_loss_mlp": 1.00050831, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.2879409155178467, + "language_loss": 0.81401002, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.83643842, + "num_input_tokens_seen": 208197015, + "step": 9664, + "time_per_iteration": 2.6055843830108643 + }, + { + "auxiliary_loss_clip": 0.01139349, + "auxiliary_loss_mlp": 0.00747334, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00041425, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.6558093659808482, + "language_loss": 0.81779045, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.83665729, + "num_input_tokens_seen": 208215795, + "step": 9665, + "time_per_iteration": 2.670412063598633 + }, + { + "auxiliary_loss_clip": 0.01133229, + "auxiliary_loss_mlp": 0.01109227, + "balance_loss_clip": 1.0020169, + "balance_loss_mlp": 1.00069904, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.5982392237409027, + "language_loss": 0.81052327, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83294785, + "num_input_tokens_seen": 208234655, + "step": 9666, + "time_per_iteration": 2.6655898094177246 + }, + { + "auxiliary_loss_clip": 0.01149445, + "auxiliary_loss_mlp": 0.01108448, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.0005877, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.8534160400054924, + "language_loss": 0.80041957, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82299852, + "num_input_tokens_seen": 208251300, + "step": 9667, + "time_per_iteration": 2.5303003787994385 + }, + { + "auxiliary_loss_clip": 0.01150567, + "auxiliary_loss_mlp": 0.01109768, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.00057268, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.8325841382612669, + "language_loss": 0.78613359, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80873698, + "num_input_tokens_seen": 208272685, + "step": 9668, + "time_per_iteration": 2.601724147796631 + }, + { + "auxiliary_loss_clip": 0.01132609, + "auxiliary_loss_mlp": 0.01108557, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00060081, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 2.0115964158464936, + "language_loss": 0.64341617, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66582787, + "num_input_tokens_seen": 208294315, + "step": 9669, + "time_per_iteration": 2.783102035522461 + }, + { + "auxiliary_loss_clip": 0.01101691, + "auxiliary_loss_mlp": 0.01108725, + "balance_loss_clip": 1.00193965, + "balance_loss_mlp": 1.00057817, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 2.3379726168549655, + "language_loss": 0.73310083, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75520498, + "num_input_tokens_seen": 208315610, + "step": 9670, + "time_per_iteration": 4.089112758636475 + }, + { + "auxiliary_loss_clip": 0.01117779, + "auxiliary_loss_mlp": 0.0110916, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00082302, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 2.6408783689980395, + "language_loss": 0.78876781, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81103718, + "num_input_tokens_seen": 208334725, + "step": 9671, + "time_per_iteration": 2.635514736175537 + }, + { + "auxiliary_loss_clip": 0.01101763, + "auxiliary_loss_mlp": 0.01110323, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00065005, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 2.136894915403681, + "language_loss": 0.6098516, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.63197243, + "num_input_tokens_seen": 208353825, + "step": 9672, + "time_per_iteration": 2.7290194034576416 + }, + { + "auxiliary_loss_clip": 0.01107464, + "auxiliary_loss_mlp": 0.01108476, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00051999, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.7924370259285154, + "language_loss": 0.81356978, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83572918, + "num_input_tokens_seen": 208374160, + "step": 9673, + "time_per_iteration": 2.7093491554260254 + }, + { + "auxiliary_loss_clip": 0.01117786, + "auxiliary_loss_mlp": 0.01109412, + "balance_loss_clip": 1.00193155, + "balance_loss_mlp": 1.00050271, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 2.0211873911186204, + "language_loss": 0.87901008, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.90128207, + "num_input_tokens_seen": 208392105, + "step": 9674, + "time_per_iteration": 2.661578893661499 + }, + { + "auxiliary_loss_clip": 0.01165936, + "auxiliary_loss_mlp": 0.00747141, + "balance_loss_clip": 1.00210857, + "balance_loss_mlp": 1.00037086, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.602125422370855, + "language_loss": 0.79035634, + "learning_rate": 1.571246172811984e-06, + "loss": 0.8094871, + "num_input_tokens_seen": 208411755, + "step": 9675, + "time_per_iteration": 2.6140949726104736 + }, + { + "auxiliary_loss_clip": 0.01151223, + "auxiliary_loss_mlp": 0.01108439, + "balance_loss_clip": 1.0021081, + "balance_loss_mlp": 1.00067401, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.6171906302579995, + "language_loss": 0.702124, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.7247206, + "num_input_tokens_seen": 208429995, + "step": 9676, + "time_per_iteration": 2.576101779937744 + }, + { + "auxiliary_loss_clip": 0.0109402, + "auxiliary_loss_mlp": 0.0110896, + "balance_loss_clip": 1.00204062, + "balance_loss_mlp": 1.00062287, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.1791889699699944, + "language_loss": 0.63482702, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65685683, + "num_input_tokens_seen": 208443655, + "step": 9677, + "time_per_iteration": 4.2231011390686035 + }, + { + "auxiliary_loss_clip": 0.01129567, + "auxiliary_loss_mlp": 0.01085607, + "balance_loss_clip": 1.00163817, + "balance_loss_mlp": 1.00015795, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.802284417555618, + "language_loss": 0.54204863, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.5642004, + "num_input_tokens_seen": 208498405, + "step": 9678, + "time_per_iteration": 3.2604258060455322 + }, + { + "auxiliary_loss_clip": 0.01129469, + "auxiliary_loss_mlp": 0.01084215, + "balance_loss_clip": 1.00124145, + "balance_loss_mlp": 0.99990982, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7504954207835065, + "language_loss": 0.56213707, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58427393, + "num_input_tokens_seen": 208559075, + "step": 9679, + "time_per_iteration": 4.58592414855957 + }, + { + "auxiliary_loss_clip": 0.01166036, + "auxiliary_loss_mlp": 0.01108078, + "balance_loss_clip": 1.00206733, + "balance_loss_mlp": 1.0005033, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 2.029770342376121, + "language_loss": 0.65436637, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67710751, + "num_input_tokens_seen": 208577770, + "step": 9680, + "time_per_iteration": 2.5212767124176025 + }, + { + "auxiliary_loss_clip": 0.01132603, + "auxiliary_loss_mlp": 0.0110874, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00059319, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 2.0708115344595317, + "language_loss": 0.83502746, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85744083, + "num_input_tokens_seen": 208595110, + "step": 9681, + "time_per_iteration": 4.102574348449707 + }, + { + "auxiliary_loss_clip": 0.01166001, + "auxiliary_loss_mlp": 0.01107956, + "balance_loss_clip": 1.00206995, + "balance_loss_mlp": 1.00038171, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.8062119275801396, + "language_loss": 0.75944972, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.78218925, + "num_input_tokens_seen": 208612080, + "step": 9682, + "time_per_iteration": 2.504910469055176 + }, + { + "auxiliary_loss_clip": 0.01092638, + "auxiliary_loss_mlp": 0.01108486, + "balance_loss_clip": 1.00189233, + "balance_loss_mlp": 1.00043488, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 1.8949840970551195, + "language_loss": 0.75358725, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77559853, + "num_input_tokens_seen": 208630235, + "step": 9683, + "time_per_iteration": 2.7308640480041504 + }, + { + "auxiliary_loss_clip": 0.01133383, + "auxiliary_loss_mlp": 0.01108396, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.0004406, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 3.7889701054355394, + "language_loss": 0.73542678, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.75784457, + "num_input_tokens_seen": 208647925, + "step": 9684, + "time_per_iteration": 2.6193580627441406 + }, + { + "auxiliary_loss_clip": 0.01137765, + "auxiliary_loss_mlp": 0.01107759, + "balance_loss_clip": 1.00198913, + "balance_loss_mlp": 1.00056577, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.459109324258949, + "language_loss": 0.78226191, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80471712, + "num_input_tokens_seen": 208666180, + "step": 9685, + "time_per_iteration": 2.6240296363830566 + }, + { + "auxiliary_loss_clip": 0.01165964, + "auxiliary_loss_mlp": 0.01109156, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00072312, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.8206493600403946, + "language_loss": 0.75404119, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77679241, + "num_input_tokens_seen": 208684240, + "step": 9686, + "time_per_iteration": 2.5079760551452637 + }, + { + "auxiliary_loss_clip": 0.01144172, + "auxiliary_loss_mlp": 0.01083919, + "balance_loss_clip": 1.00126839, + "balance_loss_mlp": 0.99999565, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8244663617599673, + "language_loss": 0.57437247, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.5966534, + "num_input_tokens_seen": 208736090, + "step": 9687, + "time_per_iteration": 2.9662723541259766 + }, + { + "auxiliary_loss_clip": 0.01118878, + "auxiliary_loss_mlp": 0.01108805, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.0004673, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.9930756402755971, + "language_loss": 0.69796801, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72024488, + "num_input_tokens_seen": 208754600, + "step": 9688, + "time_per_iteration": 2.6399827003479004 + }, + { + "auxiliary_loss_clip": 0.01151163, + "auxiliary_loss_mlp": 0.01108102, + "balance_loss_clip": 1.00208855, + "balance_loss_mlp": 1.00052738, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 3.2580031711933253, + "language_loss": 0.65215015, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67474276, + "num_input_tokens_seen": 208773140, + "step": 9689, + "time_per_iteration": 2.5651791095733643 + }, + { + "auxiliary_loss_clip": 0.0113393, + "auxiliary_loss_mlp": 0.00747111, + "balance_loss_clip": 1.00207734, + "balance_loss_mlp": 1.00037837, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.53412259953759, + "language_loss": 0.73183846, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75064886, + "num_input_tokens_seen": 208793410, + "step": 9690, + "time_per_iteration": 2.631432056427002 + }, + { + "auxiliary_loss_clip": 0.01134234, + "auxiliary_loss_mlp": 0.01108691, + "balance_loss_clip": 1.00201774, + "balance_loss_mlp": 1.00073457, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.7406531664671665, + "language_loss": 0.7600143, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78244352, + "num_input_tokens_seen": 208811920, + "step": 9691, + "time_per_iteration": 2.6350553035736084 + }, + { + "auxiliary_loss_clip": 0.01150501, + "auxiliary_loss_mlp": 0.01107576, + "balance_loss_clip": 1.00189924, + "balance_loss_mlp": 1.00047898, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.559098628199713, + "language_loss": 0.80209476, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82467556, + "num_input_tokens_seen": 208834720, + "step": 9692, + "time_per_iteration": 2.655062675476074 + }, + { + "auxiliary_loss_clip": 0.0114422, + "auxiliary_loss_mlp": 0.01083955, + "balance_loss_clip": 1.00128615, + "balance_loss_mlp": 1.00003147, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7563390397525241, + "language_loss": 0.5695051, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.59178686, + "num_input_tokens_seen": 208898415, + "step": 9693, + "time_per_iteration": 3.0991721153259277 + }, + { + "auxiliary_loss_clip": 0.01149224, + "auxiliary_loss_mlp": 0.0074725, + "balance_loss_clip": 1.00193536, + "balance_loss_mlp": 1.00039077, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 1.8935457957043327, + "language_loss": 0.7916491, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81061387, + "num_input_tokens_seen": 208919045, + "step": 9694, + "time_per_iteration": 2.6066854000091553 + }, + { + "auxiliary_loss_clip": 0.01134343, + "auxiliary_loss_mlp": 0.01107168, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00054765, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.6221847300718741, + "language_loss": 0.75986451, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78227961, + "num_input_tokens_seen": 208939375, + "step": 9695, + "time_per_iteration": 2.6198363304138184 + }, + { + "auxiliary_loss_clip": 0.01144076, + "auxiliary_loss_mlp": 0.01083871, + "balance_loss_clip": 1.00120234, + "balance_loss_mlp": 0.99994797, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7764036723786949, + "language_loss": 0.55069727, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57297671, + "num_input_tokens_seen": 209004760, + "step": 9696, + "time_per_iteration": 3.230412006378174 + }, + { + "auxiliary_loss_clip": 0.01117455, + "auxiliary_loss_mlp": 0.01108371, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00051069, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 2.039468297330503, + "language_loss": 0.76214522, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78440356, + "num_input_tokens_seen": 209022930, + "step": 9697, + "time_per_iteration": 2.6229398250579834 + }, + { + "auxiliary_loss_clip": 0.01165975, + "auxiliary_loss_mlp": 0.01109116, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00058782, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 4.4221946927184055, + "language_loss": 0.7766369, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79938781, + "num_input_tokens_seen": 209043740, + "step": 9698, + "time_per_iteration": 2.549607038497925 + }, + { + "auxiliary_loss_clip": 0.01103436, + "auxiliary_loss_mlp": 0.01109, + "balance_loss_clip": 1.00188434, + "balance_loss_mlp": 1.00075793, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.7646968418290032, + "language_loss": 0.83718008, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85930443, + "num_input_tokens_seen": 209068885, + "step": 9699, + "time_per_iteration": 2.7992515563964844 + }, + { + "auxiliary_loss_clip": 0.01136058, + "auxiliary_loss_mlp": 0.01108435, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00047946, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.305769803554409, + "language_loss": 0.66124994, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68369484, + "num_input_tokens_seen": 209087340, + "step": 9700, + "time_per_iteration": 2.612548828125 + }, + { + "auxiliary_loss_clip": 0.01151083, + "auxiliary_loss_mlp": 0.01108358, + "balance_loss_clip": 1.00204706, + "balance_loss_mlp": 1.00049722, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.6561103009243092, + "language_loss": 0.71600735, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.7386018, + "num_input_tokens_seen": 209108840, + "step": 9701, + "time_per_iteration": 2.614755153656006 + }, + { + "auxiliary_loss_clip": 0.01132454, + "auxiliary_loss_mlp": 0.0110778, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00058746, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.7322568949010826, + "language_loss": 0.84809154, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87049389, + "num_input_tokens_seen": 209127985, + "step": 9702, + "time_per_iteration": 2.6258909702301025 + }, + { + "auxiliary_loss_clip": 0.0114888, + "auxiliary_loss_mlp": 0.01107232, + "balance_loss_clip": 1.00186992, + "balance_loss_mlp": 1.0005157, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.4900101416593465, + "language_loss": 0.7802875, + "learning_rate": 1.560601200301392e-06, + "loss": 0.80284864, + "num_input_tokens_seen": 209146885, + "step": 9703, + "time_per_iteration": 2.5439374446868896 + }, + { + "auxiliary_loss_clip": 0.01165954, + "auxiliary_loss_mlp": 0.01108087, + "balance_loss_clip": 1.00205469, + "balance_loss_mlp": 1.00060797, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.619099155933946, + "language_loss": 0.71229947, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73503983, + "num_input_tokens_seen": 209166130, + "step": 9704, + "time_per_iteration": 2.5138492584228516 + }, + { + "auxiliary_loss_clip": 0.01132496, + "auxiliary_loss_mlp": 0.01108133, + "balance_loss_clip": 1.00186872, + "balance_loss_mlp": 1.00055897, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 1.7967687275721123, + "language_loss": 0.81072259, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83312881, + "num_input_tokens_seen": 209183350, + "step": 9705, + "time_per_iteration": 2.569518566131592 + }, + { + "auxiliary_loss_clip": 0.01120118, + "auxiliary_loss_mlp": 0.01107141, + "balance_loss_clip": 1.00208366, + "balance_loss_mlp": 1.00052071, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 1.7085812396267521, + "language_loss": 0.80287045, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82514298, + "num_input_tokens_seen": 209203945, + "step": 9706, + "time_per_iteration": 2.6375856399536133 + }, + { + "auxiliary_loss_clip": 0.0115108, + "auxiliary_loss_mlp": 0.01108197, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.00052774, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 1.9803032340934403, + "language_loss": 0.75254774, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77514058, + "num_input_tokens_seen": 209227080, + "step": 9707, + "time_per_iteration": 2.780822515487671 + }, + { + "auxiliary_loss_clip": 0.01119057, + "auxiliary_loss_mlp": 0.0110717, + "balance_loss_clip": 1.00179267, + "balance_loss_mlp": 1.00054908, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.7817490797634943, + "language_loss": 0.8134011, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83566344, + "num_input_tokens_seen": 209248170, + "step": 9708, + "time_per_iteration": 4.058682203292847 + }, + { + "auxiliary_loss_clip": 0.01149598, + "auxiliary_loss_mlp": 0.01108162, + "balance_loss_clip": 1.00214791, + "balance_loss_mlp": 1.00058746, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 2.265726325709053, + "language_loss": 0.78654778, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.8091253, + "num_input_tokens_seen": 209267730, + "step": 9709, + "time_per_iteration": 2.5398101806640625 + }, + { + "auxiliary_loss_clip": 0.01129698, + "auxiliary_loss_mlp": 0.01084231, + "balance_loss_clip": 1.00123787, + "balance_loss_mlp": 0.99992621, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7758239117898353, + "language_loss": 0.56528687, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58742619, + "num_input_tokens_seen": 209332510, + "step": 9710, + "time_per_iteration": 3.1839005947113037 + }, + { + "auxiliary_loss_clip": 0.0111785, + "auxiliary_loss_mlp": 0.01106703, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00065458, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 2.1861839658361757, + "language_loss": 0.65442371, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.6766693, + "num_input_tokens_seen": 209353355, + "step": 9711, + "time_per_iteration": 2.6724183559417725 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.0110944, + "balance_loss_clip": 1.00209486, + "balance_loss_mlp": 1.00053048, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.8743899491266218, + "language_loss": 0.78435993, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80711454, + "num_input_tokens_seen": 209370960, + "step": 9712, + "time_per_iteration": 2.53210186958313 + }, + { + "auxiliary_loss_clip": 0.01121262, + "auxiliary_loss_mlp": 0.00747151, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00033069, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 2.7099600065135228, + "language_loss": 0.73341769, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75210178, + "num_input_tokens_seen": 209390955, + "step": 9713, + "time_per_iteration": 2.6581482887268066 + }, + { + "auxiliary_loss_clip": 0.0113282, + "auxiliary_loss_mlp": 0.01108651, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00050414, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 1.7439563080407317, + "language_loss": 0.69755566, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71997035, + "num_input_tokens_seen": 209410260, + "step": 9714, + "time_per_iteration": 2.6004958152770996 + }, + { + "auxiliary_loss_clip": 0.01165947, + "auxiliary_loss_mlp": 0.01107801, + "balance_loss_clip": 1.00206602, + "balance_loss_mlp": 1.00041747, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.8881621930715307, + "language_loss": 0.80347681, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82621431, + "num_input_tokens_seen": 209429920, + "step": 9715, + "time_per_iteration": 3.892279624938965 + }, + { + "auxiliary_loss_clip": 0.0113568, + "auxiliary_loss_mlp": 0.0110784, + "balance_loss_clip": 1.00192487, + "balance_loss_mlp": 1.00045633, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 1.880398979971782, + "language_loss": 0.72781551, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.7502507, + "num_input_tokens_seen": 209449470, + "step": 9716, + "time_per_iteration": 3.9910309314727783 + }, + { + "auxiliary_loss_clip": 0.0113429, + "auxiliary_loss_mlp": 0.01107135, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.00041866, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.6119460714618312, + "language_loss": 0.75256568, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.77497995, + "num_input_tokens_seen": 209467695, + "step": 9717, + "time_per_iteration": 2.6617376804351807 + }, + { + "auxiliary_loss_clip": 0.01150495, + "auxiliary_loss_mlp": 0.01107942, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00065386, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 1.9571571585791558, + "language_loss": 0.80352426, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82610863, + "num_input_tokens_seen": 209484250, + "step": 9718, + "time_per_iteration": 2.5790562629699707 + }, + { + "auxiliary_loss_clip": 0.01134204, + "auxiliary_loss_mlp": 0.01108386, + "balance_loss_clip": 1.00191414, + "balance_loss_mlp": 1.00052571, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.8229490933986163, + "language_loss": 0.67792445, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.7003504, + "num_input_tokens_seen": 209502830, + "step": 9719, + "time_per_iteration": 4.143615245819092 + }, + { + "auxiliary_loss_clip": 0.01165705, + "auxiliary_loss_mlp": 0.011084, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.00063503, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 1.8319913767623204, + "language_loss": 0.75607216, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.77881324, + "num_input_tokens_seen": 209525995, + "step": 9720, + "time_per_iteration": 2.58370304107666 + }, + { + "auxiliary_loss_clip": 0.01121007, + "auxiliary_loss_mlp": 0.01108466, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00060582, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.8484872502079461, + "language_loss": 0.82997894, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.8522737, + "num_input_tokens_seen": 209545895, + "step": 9721, + "time_per_iteration": 2.621725559234619 + }, + { + "auxiliary_loss_clip": 0.01160508, + "auxiliary_loss_mlp": 0.01084228, + "balance_loss_clip": 1.00122988, + "balance_loss_mlp": 0.99992293, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9323525403152445, + "language_loss": 0.71357203, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73601937, + "num_input_tokens_seen": 209602315, + "step": 9722, + "time_per_iteration": 3.1172168254852295 + }, + { + "auxiliary_loss_clip": 0.0115131, + "auxiliary_loss_mlp": 0.01108251, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00067687, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.2242996384338456, + "language_loss": 0.89418602, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91678166, + "num_input_tokens_seen": 209617615, + "step": 9723, + "time_per_iteration": 2.511650800704956 + }, + { + "auxiliary_loss_clip": 0.01132751, + "auxiliary_loss_mlp": 0.01107533, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00053132, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.585304369051887, + "language_loss": 0.68691599, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70931876, + "num_input_tokens_seen": 209637005, + "step": 9724, + "time_per_iteration": 2.6077845096588135 + }, + { + "auxiliary_loss_clip": 0.01149147, + "auxiliary_loss_mlp": 0.01107889, + "balance_loss_clip": 1.00199151, + "balance_loss_mlp": 1.00050545, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.7809451738908215, + "language_loss": 0.86408639, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88665676, + "num_input_tokens_seen": 209653170, + "step": 9725, + "time_per_iteration": 2.5042035579681396 + }, + { + "auxiliary_loss_clip": 0.0113725, + "auxiliary_loss_mlp": 0.01108868, + "balance_loss_clip": 1.00208497, + "balance_loss_mlp": 1.00062597, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 4.280045255174135, + "language_loss": 0.82228816, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.84474933, + "num_input_tokens_seen": 209671275, + "step": 9726, + "time_per_iteration": 2.5906238555908203 + }, + { + "auxiliary_loss_clip": 0.0108774, + "auxiliary_loss_mlp": 0.00747295, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.0004797, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.8455114586950505, + "language_loss": 0.66753596, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.68588626, + "num_input_tokens_seen": 209690380, + "step": 9727, + "time_per_iteration": 2.760219097137451 + }, + { + "auxiliary_loss_clip": 0.01121704, + "auxiliary_loss_mlp": 0.0110903, + "balance_loss_clip": 1.0021503, + "balance_loss_mlp": 1.00078845, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 2.0066883535904916, + "language_loss": 0.81893444, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84124184, + "num_input_tokens_seen": 209708845, + "step": 9728, + "time_per_iteration": 2.623049020767212 + }, + { + "auxiliary_loss_clip": 0.01150981, + "auxiliary_loss_mlp": 0.01107339, + "balance_loss_clip": 1.00201643, + "balance_loss_mlp": 1.00062263, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 2.0764217878651907, + "language_loss": 0.77649248, + "learning_rate": 1.550728272957027e-06, + "loss": 0.7990756, + "num_input_tokens_seen": 209729000, + "step": 9729, + "time_per_iteration": 2.588040351867676 + }, + { + "auxiliary_loss_clip": 0.01151345, + "auxiliary_loss_mlp": 0.01108723, + "balance_loss_clip": 1.00204027, + "balance_loss_mlp": 1.00057626, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 2.5019004783179493, + "language_loss": 0.70594448, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72854513, + "num_input_tokens_seen": 209747435, + "step": 9730, + "time_per_iteration": 2.5847277641296387 + }, + { + "auxiliary_loss_clip": 0.01165967, + "auxiliary_loss_mlp": 0.01109962, + "balance_loss_clip": 1.00210714, + "balance_loss_mlp": 1.00057602, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 1.9323107316413792, + "language_loss": 0.78857195, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.81133127, + "num_input_tokens_seen": 209764910, + "step": 9731, + "time_per_iteration": 2.528827428817749 + }, + { + "auxiliary_loss_clip": 0.01150831, + "auxiliary_loss_mlp": 0.01108464, + "balance_loss_clip": 1.00204563, + "balance_loss_mlp": 1.00060368, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 2.2863454063326376, + "language_loss": 0.70027065, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72286355, + "num_input_tokens_seen": 209786115, + "step": 9732, + "time_per_iteration": 2.5877695083618164 + }, + { + "auxiliary_loss_clip": 0.01104468, + "auxiliary_loss_mlp": 0.01111215, + "balance_loss_clip": 1.0020864, + "balance_loss_mlp": 1.00058889, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 1.7965980333283496, + "language_loss": 0.52659529, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54875213, + "num_input_tokens_seen": 209806095, + "step": 9733, + "time_per_iteration": 2.6833102703094482 + }, + { + "auxiliary_loss_clip": 0.01151378, + "auxiliary_loss_mlp": 0.01108808, + "balance_loss_clip": 1.00217748, + "balance_loss_mlp": 1.0006609, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 5.245909719130256, + "language_loss": 0.8762598, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89886165, + "num_input_tokens_seen": 209823650, + "step": 9734, + "time_per_iteration": 2.5695619583129883 + }, + { + "auxiliary_loss_clip": 0.01135775, + "auxiliary_loss_mlp": 0.01107257, + "balance_loss_clip": 1.0021894, + "balance_loss_mlp": 1.00073147, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.6788928378944539, + "language_loss": 0.72357106, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74600142, + "num_input_tokens_seen": 209843220, + "step": 9735, + "time_per_iteration": 2.604396104812622 + }, + { + "auxiliary_loss_clip": 0.01149227, + "auxiliary_loss_mlp": 0.0110943, + "balance_loss_clip": 1.00200331, + "balance_loss_mlp": 1.00071073, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.548294909276045, + "language_loss": 0.74325025, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76583678, + "num_input_tokens_seen": 209854880, + "step": 9736, + "time_per_iteration": 2.4903712272644043 + }, + { + "auxiliary_loss_clip": 0.01121286, + "auxiliary_loss_mlp": 0.01108529, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00047755, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 1.5603663675508137, + "language_loss": 0.70281601, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72511411, + "num_input_tokens_seen": 209877870, + "step": 9737, + "time_per_iteration": 2.8263745307922363 + }, + { + "auxiliary_loss_clip": 0.01117388, + "auxiliary_loss_mlp": 0.011081, + "balance_loss_clip": 1.00198376, + "balance_loss_mlp": 1.00062084, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 2.6889772843274433, + "language_loss": 0.82262707, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84488201, + "num_input_tokens_seen": 209896690, + "step": 9738, + "time_per_iteration": 2.6377856731414795 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.00747329, + "balance_loss_clip": 1.00214684, + "balance_loss_mlp": 1.00033212, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.673189153347782, + "language_loss": 0.68311483, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70224881, + "num_input_tokens_seen": 209914640, + "step": 9739, + "time_per_iteration": 2.5495972633361816 + }, + { + "auxiliary_loss_clip": 0.0116589, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00047088, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 3.3607087769360224, + "language_loss": 0.58972502, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.61247671, + "num_input_tokens_seen": 209933375, + "step": 9740, + "time_per_iteration": 2.543144702911377 + }, + { + "auxiliary_loss_clip": 0.01132804, + "auxiliary_loss_mlp": 0.01109184, + "balance_loss_clip": 1.00180542, + "balance_loss_mlp": 1.00056088, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 1.9457952747376497, + "language_loss": 0.7540943, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77651423, + "num_input_tokens_seen": 209952055, + "step": 9741, + "time_per_iteration": 2.607889175415039 + }, + { + "auxiliary_loss_clip": 0.01119779, + "auxiliary_loss_mlp": 0.01108723, + "balance_loss_clip": 1.0020076, + "balance_loss_mlp": 1.00057578, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 1.5273805533560894, + "language_loss": 0.75891733, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.78120232, + "num_input_tokens_seen": 209971190, + "step": 9742, + "time_per_iteration": 2.6777005195617676 + }, + { + "auxiliary_loss_clip": 0.01132418, + "auxiliary_loss_mlp": 0.01107892, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.0006032, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.8051000428565063, + "language_loss": 0.75092208, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77332515, + "num_input_tokens_seen": 209990695, + "step": 9743, + "time_per_iteration": 2.6207447052001953 + }, + { + "auxiliary_loss_clip": 0.01132968, + "auxiliary_loss_mlp": 0.01107836, + "balance_loss_clip": 1.00200808, + "balance_loss_mlp": 1.00064278, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.7730214020580193, + "language_loss": 0.80984908, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83225715, + "num_input_tokens_seen": 210010210, + "step": 9744, + "time_per_iteration": 2.6596338748931885 + }, + { + "auxiliary_loss_clip": 0.0113504, + "auxiliary_loss_mlp": 0.01109095, + "balance_loss_clip": 1.00203514, + "balance_loss_mlp": 1.00066197, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.6341268780234934, + "language_loss": 0.71725279, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73969412, + "num_input_tokens_seen": 210030030, + "step": 9745, + "time_per_iteration": 2.6340770721435547 + }, + { + "auxiliary_loss_clip": 0.01126697, + "auxiliary_loss_mlp": 0.01084398, + "balance_loss_clip": 1.00091577, + "balance_loss_mlp": 1.00009298, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7223718504116176, + "language_loss": 0.53345585, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55556679, + "num_input_tokens_seen": 210094840, + "step": 9746, + "time_per_iteration": 4.61349630355835 + }, + { + "auxiliary_loss_clip": 0.01137075, + "auxiliary_loss_mlp": 0.01109276, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00046217, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.3388059389080156, + "language_loss": 0.73034, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75280356, + "num_input_tokens_seen": 210114660, + "step": 9747, + "time_per_iteration": 2.5995495319366455 + }, + { + "auxiliary_loss_clip": 0.01136235, + "auxiliary_loss_mlp": 0.01109194, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.0005703, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 1.9944323807505342, + "language_loss": 0.80964285, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83209717, + "num_input_tokens_seen": 210132770, + "step": 9748, + "time_per_iteration": 2.569474697113037 + }, + { + "auxiliary_loss_clip": 0.0114927, + "auxiliary_loss_mlp": 0.01109413, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00059867, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.6044308137344705, + "language_loss": 0.72116911, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74375594, + "num_input_tokens_seen": 210151895, + "step": 9749, + "time_per_iteration": 2.5613174438476562 + }, + { + "auxiliary_loss_clip": 0.01133193, + "auxiliary_loss_mlp": 0.01107899, + "balance_loss_clip": 1.00206184, + "balance_loss_mlp": 1.0004195, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.4137100417506017, + "language_loss": 0.74660933, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.7690202, + "num_input_tokens_seen": 210168040, + "step": 9750, + "time_per_iteration": 2.548689365386963 + }, + { + "auxiliary_loss_clip": 0.01116262, + "auxiliary_loss_mlp": 0.01108067, + "balance_loss_clip": 1.00188816, + "balance_loss_mlp": 1.00058746, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.8977589142421234, + "language_loss": 0.70688242, + "learning_rate": 1.542383242598344e-06, + "loss": 0.72912568, + "num_input_tokens_seen": 210187720, + "step": 9751, + "time_per_iteration": 2.6492760181427 + }, + { + "auxiliary_loss_clip": 0.01166079, + "auxiliary_loss_mlp": 0.01110193, + "balance_loss_clip": 1.00209069, + "balance_loss_mlp": 1.00061607, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.6817947052587876, + "language_loss": 0.74671435, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76947707, + "num_input_tokens_seen": 210206080, + "step": 9752, + "time_per_iteration": 2.5584957599639893 + }, + { + "auxiliary_loss_clip": 0.01149311, + "auxiliary_loss_mlp": 0.01109116, + "balance_loss_clip": 1.00213027, + "balance_loss_mlp": 1.00049257, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.8460519790747083, + "language_loss": 0.77596283, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79854709, + "num_input_tokens_seen": 210225660, + "step": 9753, + "time_per_iteration": 3.948873519897461 + }, + { + "auxiliary_loss_clip": 0.01165738, + "auxiliary_loss_mlp": 0.01107653, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00055575, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.9382863581330507, + "language_loss": 0.70889556, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73162943, + "num_input_tokens_seen": 210242725, + "step": 9754, + "time_per_iteration": 3.9754278659820557 + }, + { + "auxiliary_loss_clip": 0.01134372, + "auxiliary_loss_mlp": 0.01109657, + "balance_loss_clip": 1.00191391, + "balance_loss_mlp": 1.00055695, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 1.868575221253906, + "language_loss": 0.72351617, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74595648, + "num_input_tokens_seen": 210263225, + "step": 9755, + "time_per_iteration": 2.5791518688201904 + }, + { + "auxiliary_loss_clip": 0.01111917, + "auxiliary_loss_mlp": 0.01084435, + "balance_loss_clip": 1.0010972, + "balance_loss_mlp": 1.00013053, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7401298261983187, + "language_loss": 0.56964183, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59160542, + "num_input_tokens_seen": 210322310, + "step": 9756, + "time_per_iteration": 4.583248138427734 + }, + { + "auxiliary_loss_clip": 0.01166081, + "auxiliary_loss_mlp": 0.01108833, + "balance_loss_clip": 1.00210977, + "balance_loss_mlp": 1.00068629, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 2.994290695183014, + "language_loss": 0.7634604, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78620946, + "num_input_tokens_seen": 210340845, + "step": 9757, + "time_per_iteration": 2.5566892623901367 + }, + { + "auxiliary_loss_clip": 0.01129393, + "auxiliary_loss_mlp": 0.01084807, + "balance_loss_clip": 1.00114083, + "balance_loss_mlp": 1.00012064, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8664601618663331, + "language_loss": 0.60502297, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62716496, + "num_input_tokens_seen": 210397815, + "step": 9758, + "time_per_iteration": 3.12506103515625 + }, + { + "auxiliary_loss_clip": 0.01166139, + "auxiliary_loss_mlp": 0.01109528, + "balance_loss_clip": 1.00202966, + "balance_loss_mlp": 1.0006187, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.2337164691821094, + "language_loss": 0.71476293, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.73751962, + "num_input_tokens_seen": 210413900, + "step": 9759, + "time_per_iteration": 2.534907817840576 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01109416, + "balance_loss_clip": 1.00194645, + "balance_loss_mlp": 1.00069666, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5799681545277502, + "language_loss": 0.72926867, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.75170672, + "num_input_tokens_seen": 210434110, + "step": 9760, + "time_per_iteration": 2.7039918899536133 + }, + { + "auxiliary_loss_clip": 0.01149113, + "auxiliary_loss_mlp": 0.0110908, + "balance_loss_clip": 1.00198972, + "balance_loss_mlp": 1.00055158, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 4.17209418458108, + "language_loss": 0.7240175, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74659944, + "num_input_tokens_seen": 210451685, + "step": 9761, + "time_per_iteration": 2.514263153076172 + }, + { + "auxiliary_loss_clip": 0.01134161, + "auxiliary_loss_mlp": 0.01109029, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00050092, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.7201843812604167, + "language_loss": 0.74819756, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77062953, + "num_input_tokens_seen": 210470825, + "step": 9762, + "time_per_iteration": 2.648556709289551 + }, + { + "auxiliary_loss_clip": 0.01119528, + "auxiliary_loss_mlp": 0.01108958, + "balance_loss_clip": 1.00206578, + "balance_loss_mlp": 1.00062013, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.2570999509045708, + "language_loss": 0.72310644, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74539125, + "num_input_tokens_seen": 210500075, + "step": 9763, + "time_per_iteration": 3.045839548110962 + }, + { + "auxiliary_loss_clip": 0.01151111, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00060654, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.5465954047003667, + "language_loss": 0.79889071, + "learning_rate": 1.53745602625755e-06, + "loss": 0.82149029, + "num_input_tokens_seen": 210518150, + "step": 9764, + "time_per_iteration": 2.5615322589874268 + }, + { + "auxiliary_loss_clip": 0.01132868, + "auxiliary_loss_mlp": 0.01109354, + "balance_loss_clip": 1.00197291, + "balance_loss_mlp": 1.00073028, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 1.96813015319404, + "language_loss": 0.79000664, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81242889, + "num_input_tokens_seen": 210537760, + "step": 9765, + "time_per_iteration": 2.576737403869629 + }, + { + "auxiliary_loss_clip": 0.01135077, + "auxiliary_loss_mlp": 0.01108481, + "balance_loss_clip": 1.00212383, + "balance_loss_mlp": 1.00071573, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 2.19089706626037, + "language_loss": 0.83192325, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.85435879, + "num_input_tokens_seen": 210555515, + "step": 9766, + "time_per_iteration": 2.55930495262146 + }, + { + "auxiliary_loss_clip": 0.01149629, + "auxiliary_loss_mlp": 0.01109399, + "balance_loss_clip": 1.00215089, + "balance_loss_mlp": 1.0005846, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 1.5602112635400436, + "language_loss": 0.69430637, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71689671, + "num_input_tokens_seen": 210575000, + "step": 9767, + "time_per_iteration": 2.6069393157958984 + }, + { + "auxiliary_loss_clip": 0.01151374, + "auxiliary_loss_mlp": 0.00747431, + "balance_loss_clip": 1.00199223, + "balance_loss_mlp": 1.00051415, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 1.8321169965606279, + "language_loss": 0.63526499, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65425313, + "num_input_tokens_seen": 210595185, + "step": 9768, + "time_per_iteration": 2.6253862380981445 + }, + { + "auxiliary_loss_clip": 0.01160472, + "auxiliary_loss_mlp": 0.00745673, + "balance_loss_clip": 1.00108123, + "balance_loss_mlp": 1.00008953, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7202448117587019, + "language_loss": 0.53938895, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.5584504, + "num_input_tokens_seen": 210653210, + "step": 9769, + "time_per_iteration": 3.1153459548950195 + }, + { + "auxiliary_loss_clip": 0.01117733, + "auxiliary_loss_mlp": 0.01108206, + "balance_loss_clip": 1.00176239, + "balance_loss_mlp": 1.00072682, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.5811788240617435, + "language_loss": 0.70426512, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.72652447, + "num_input_tokens_seen": 210673750, + "step": 9770, + "time_per_iteration": 2.639693260192871 + }, + { + "auxiliary_loss_clip": 0.01100445, + "auxiliary_loss_mlp": 0.01107853, + "balance_loss_clip": 1.0017035, + "balance_loss_mlp": 1.00066006, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 2.0248367948585875, + "language_loss": 0.67910373, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.70118672, + "num_input_tokens_seen": 210692960, + "step": 9771, + "time_per_iteration": 2.7108137607574463 + }, + { + "auxiliary_loss_clip": 0.01102269, + "auxiliary_loss_mlp": 0.01110119, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00054169, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.5470601142654956, + "language_loss": 0.65952784, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68165171, + "num_input_tokens_seen": 210714040, + "step": 9772, + "time_per_iteration": 2.722744941711426 + }, + { + "auxiliary_loss_clip": 0.01166053, + "auxiliary_loss_mlp": 0.01110148, + "balance_loss_clip": 1.00207794, + "balance_loss_mlp": 1.00076175, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.658310523735364, + "language_loss": 0.73994887, + "learning_rate": 1.534046611017519e-06, + "loss": 0.76271093, + "num_input_tokens_seen": 210733710, + "step": 9773, + "time_per_iteration": 2.614689350128174 + }, + { + "auxiliary_loss_clip": 0.01116805, + "auxiliary_loss_mlp": 0.01109769, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00066864, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 3.2714327776606047, + "language_loss": 0.53550106, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.5577668, + "num_input_tokens_seen": 210753580, + "step": 9774, + "time_per_iteration": 2.6980018615722656 + }, + { + "auxiliary_loss_clip": 0.01150389, + "auxiliary_loss_mlp": 0.01109158, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00062978, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.414308986345861, + "language_loss": 0.64979684, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.67239225, + "num_input_tokens_seen": 210773495, + "step": 9775, + "time_per_iteration": 2.6781160831451416 + }, + { + "auxiliary_loss_clip": 0.01149235, + "auxiliary_loss_mlp": 0.01109063, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.0006299, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.6553336714937539, + "language_loss": 0.7395243, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.76210731, + "num_input_tokens_seen": 210793645, + "step": 9776, + "time_per_iteration": 2.6049017906188965 + }, + { + "auxiliary_loss_clip": 0.01166112, + "auxiliary_loss_mlp": 0.01109681, + "balance_loss_clip": 1.00207913, + "balance_loss_mlp": 1.00058043, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 1.8466783149655734, + "language_loss": 0.74762154, + "learning_rate": 1.532531774126821e-06, + "loss": 0.77037942, + "num_input_tokens_seen": 210813415, + "step": 9777, + "time_per_iteration": 2.5405943393707275 + }, + { + "auxiliary_loss_clip": 0.01117269, + "auxiliary_loss_mlp": 0.01107566, + "balance_loss_clip": 1.00186634, + "balance_loss_mlp": 1.0005635, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.51658638071149, + "language_loss": 0.74444163, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76668996, + "num_input_tokens_seen": 210833850, + "step": 9778, + "time_per_iteration": 2.694397449493408 + }, + { + "auxiliary_loss_clip": 0.01124246, + "auxiliary_loss_mlp": 0.01108952, + "balance_loss_clip": 1.00201154, + "balance_loss_mlp": 1.00061417, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.865550968323832, + "language_loss": 0.69973129, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.7220633, + "num_input_tokens_seen": 210853115, + "step": 9779, + "time_per_iteration": 2.6348862648010254 + }, + { + "auxiliary_loss_clip": 0.0116614, + "auxiliary_loss_mlp": 0.0074751, + "balance_loss_clip": 1.00209975, + "balance_loss_mlp": 1.00058377, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 5.928299405798898, + "language_loss": 0.67055249, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.68968898, + "num_input_tokens_seen": 210872090, + "step": 9780, + "time_per_iteration": 2.5567145347595215 + }, + { + "auxiliary_loss_clip": 0.01133368, + "auxiliary_loss_mlp": 0.01109384, + "balance_loss_clip": 1.00198162, + "balance_loss_mlp": 1.00076103, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 1.8879765140699083, + "language_loss": 0.72652936, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74895692, + "num_input_tokens_seen": 210888490, + "step": 9781, + "time_per_iteration": 2.5623152256011963 + }, + { + "auxiliary_loss_clip": 0.01137411, + "auxiliary_loss_mlp": 0.00747454, + "balance_loss_clip": 1.00201118, + "balance_loss_mlp": 1.00061226, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.3845050951764049, + "language_loss": 0.70210892, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72095752, + "num_input_tokens_seen": 210908220, + "step": 9782, + "time_per_iteration": 2.6044342517852783 + }, + { + "auxiliary_loss_clip": 0.0115094, + "auxiliary_loss_mlp": 0.01109871, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00067544, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 3.4998871281797497, + "language_loss": 0.70097136, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72357947, + "num_input_tokens_seen": 210923945, + "step": 9783, + "time_per_iteration": 2.500248670578003 + }, + { + "auxiliary_loss_clip": 0.01121133, + "auxiliary_loss_mlp": 0.01109388, + "balance_loss_clip": 1.00203502, + "balance_loss_mlp": 1.00057352, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 5.190247378156329, + "language_loss": 0.69204211, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71434736, + "num_input_tokens_seen": 210941955, + "step": 9784, + "time_per_iteration": 4.117636442184448 + }, + { + "auxiliary_loss_clip": 0.0110196, + "auxiliary_loss_mlp": 0.01110039, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.00055742, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 2.2873811782134306, + "language_loss": 0.69110847, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71322846, + "num_input_tokens_seen": 210963105, + "step": 9785, + "time_per_iteration": 2.784492254257202 + }, + { + "auxiliary_loss_clip": 0.01149508, + "auxiliary_loss_mlp": 0.01107915, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00043607, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.0174407659570384, + "language_loss": 0.77483839, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79741263, + "num_input_tokens_seen": 210978720, + "step": 9786, + "time_per_iteration": 2.5152106285095215 + }, + { + "auxiliary_loss_clip": 0.01135974, + "auxiliary_loss_mlp": 0.01108954, + "balance_loss_clip": 1.00192499, + "balance_loss_mlp": 1.00061703, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.4963196476989176, + "language_loss": 0.79266727, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81511652, + "num_input_tokens_seen": 210998750, + "step": 9787, + "time_per_iteration": 2.593782901763916 + }, + { + "auxiliary_loss_clip": 0.01116615, + "auxiliary_loss_mlp": 0.01109225, + "balance_loss_clip": 1.0018369, + "balance_loss_mlp": 1.00050592, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.568543537317635, + "language_loss": 0.66337359, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68563199, + "num_input_tokens_seen": 211017550, + "step": 9788, + "time_per_iteration": 2.6535449028015137 + }, + { + "auxiliary_loss_clip": 0.01134091, + "auxiliary_loss_mlp": 0.01109352, + "balance_loss_clip": 1.0020144, + "balance_loss_mlp": 1.00072861, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 3.134651189199853, + "language_loss": 0.80902308, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.83145756, + "num_input_tokens_seen": 211034135, + "step": 9789, + "time_per_iteration": 2.6055164337158203 + }, + { + "auxiliary_loss_clip": 0.01134474, + "auxiliary_loss_mlp": 0.00747345, + "balance_loss_clip": 1.0020678, + "balance_loss_mlp": 1.00060856, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 2.010251204634706, + "language_loss": 0.70753694, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.7263552, + "num_input_tokens_seen": 211053850, + "step": 9790, + "time_per_iteration": 4.0289483070373535 + }, + { + "auxiliary_loss_clip": 0.01117662, + "auxiliary_loss_mlp": 0.01108914, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00057673, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 9.082657259417397, + "language_loss": 0.83097148, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85323715, + "num_input_tokens_seen": 211072165, + "step": 9791, + "time_per_iteration": 2.668951988220215 + }, + { + "auxiliary_loss_clip": 0.01151098, + "auxiliary_loss_mlp": 0.01109485, + "balance_loss_clip": 1.00215983, + "balance_loss_mlp": 1.00067067, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.640959892068242, + "language_loss": 0.76371467, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78632057, + "num_input_tokens_seen": 211089630, + "step": 9792, + "time_per_iteration": 2.575819492340088 + }, + { + "auxiliary_loss_clip": 0.01104756, + "auxiliary_loss_mlp": 0.01108809, + "balance_loss_clip": 1.00181818, + "balance_loss_mlp": 1.0005666, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 2.1167971562559647, + "language_loss": 0.69140673, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71354234, + "num_input_tokens_seen": 211106120, + "step": 9793, + "time_per_iteration": 4.17664361000061 + }, + { + "auxiliary_loss_clip": 0.01165957, + "auxiliary_loss_mlp": 0.01108639, + "balance_loss_clip": 1.00206077, + "balance_loss_mlp": 1.00058746, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 1.8273644295714584, + "language_loss": 0.59964263, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62238854, + "num_input_tokens_seen": 211122450, + "step": 9794, + "time_per_iteration": 3.9045543670654297 + }, + { + "auxiliary_loss_clip": 0.01118091, + "auxiliary_loss_mlp": 0.01109087, + "balance_loss_clip": 1.00198054, + "balance_loss_mlp": 1.00084507, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 4.776456216179591, + "language_loss": 0.65200943, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67428124, + "num_input_tokens_seen": 211141765, + "step": 9795, + "time_per_iteration": 2.667034149169922 + }, + { + "auxiliary_loss_clip": 0.01123896, + "auxiliary_loss_mlp": 0.01109236, + "balance_loss_clip": 1.00202155, + "balance_loss_mlp": 1.00070739, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 2.083247922099452, + "language_loss": 0.74165398, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76398528, + "num_input_tokens_seen": 211160475, + "step": 9796, + "time_per_iteration": 2.630596160888672 + }, + { + "auxiliary_loss_clip": 0.01136337, + "auxiliary_loss_mlp": 0.01108997, + "balance_loss_clip": 1.00213289, + "balance_loss_mlp": 1.00046909, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.5758040382550034, + "language_loss": 0.83334571, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85579908, + "num_input_tokens_seen": 211180480, + "step": 9797, + "time_per_iteration": 2.6182758808135986 + }, + { + "auxiliary_loss_clip": 0.01134535, + "auxiliary_loss_mlp": 0.01108656, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00070024, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 4.897345008595726, + "language_loss": 0.79284096, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81527281, + "num_input_tokens_seen": 211198000, + "step": 9798, + "time_per_iteration": 2.5788233280181885 + }, + { + "auxiliary_loss_clip": 0.01165997, + "auxiliary_loss_mlp": 0.01108132, + "balance_loss_clip": 1.00212455, + "balance_loss_mlp": 1.0005579, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.158981022220851, + "language_loss": 0.74192345, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76466477, + "num_input_tokens_seen": 211214765, + "step": 9799, + "time_per_iteration": 2.531158208847046 + }, + { + "auxiliary_loss_clip": 0.0111925, + "auxiliary_loss_mlp": 0.01110753, + "balance_loss_clip": 1.00209904, + "balance_loss_mlp": 1.00060403, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 2.550759547170997, + "language_loss": 0.76326001, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78556001, + "num_input_tokens_seen": 211232335, + "step": 9800, + "time_per_iteration": 2.608334541320801 + }, + { + "auxiliary_loss_clip": 0.01103313, + "auxiliary_loss_mlp": 0.01109089, + "balance_loss_clip": 1.00194848, + "balance_loss_mlp": 1.00056112, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 3.2187097369769346, + "language_loss": 0.7838909, + "learning_rate": 1.523448741022722e-06, + "loss": 0.80601496, + "num_input_tokens_seen": 211249985, + "step": 9801, + "time_per_iteration": 2.6417911052703857 + }, + { + "auxiliary_loss_clip": 0.01118408, + "auxiliary_loss_mlp": 0.01110467, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.00050843, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 2.071585842856102, + "language_loss": 0.66201687, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68430561, + "num_input_tokens_seen": 211268425, + "step": 9802, + "time_per_iteration": 2.684551954269409 + }, + { + "auxiliary_loss_clip": 0.0114922, + "auxiliary_loss_mlp": 0.01108568, + "balance_loss_clip": 1.00201714, + "balance_loss_mlp": 1.00051713, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 3.9548750673577393, + "language_loss": 0.78264898, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80522692, + "num_input_tokens_seen": 211286680, + "step": 9803, + "time_per_iteration": 2.5629611015319824 + }, + { + "auxiliary_loss_clip": 0.01149469, + "auxiliary_loss_mlp": 0.01109783, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.00077844, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.8886979765467502, + "language_loss": 0.73389304, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75648558, + "num_input_tokens_seen": 211307700, + "step": 9804, + "time_per_iteration": 2.5889182090759277 + }, + { + "auxiliary_loss_clip": 0.01134934, + "auxiliary_loss_mlp": 0.01108477, + "balance_loss_clip": 1.00209963, + "balance_loss_mlp": 1.00061619, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 2.540315835757177, + "language_loss": 0.74782962, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.77026367, + "num_input_tokens_seen": 211324835, + "step": 9805, + "time_per_iteration": 2.5739166736602783 + }, + { + "auxiliary_loss_clip": 0.01151032, + "auxiliary_loss_mlp": 0.00747635, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00066018, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 3.589939809761361, + "language_loss": 0.78320867, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.80219531, + "num_input_tokens_seen": 211344130, + "step": 9806, + "time_per_iteration": 2.567075490951538 + }, + { + "auxiliary_loss_clip": 0.01165957, + "auxiliary_loss_mlp": 0.01109634, + "balance_loss_clip": 1.00206757, + "balance_loss_mlp": 1.00043833, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 1.8968968623577804, + "language_loss": 0.76770258, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.7904585, + "num_input_tokens_seen": 211362915, + "step": 9807, + "time_per_iteration": 2.522871494293213 + }, + { + "auxiliary_loss_clip": 0.0115053, + "auxiliary_loss_mlp": 0.01110097, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.00051975, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 2.420264031145529, + "language_loss": 0.7470566, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76966286, + "num_input_tokens_seen": 211380700, + "step": 9808, + "time_per_iteration": 2.530811071395874 + }, + { + "auxiliary_loss_clip": 0.01102646, + "auxiliary_loss_mlp": 0.0110975, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00055456, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 1.924122202100369, + "language_loss": 0.71761233, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.73973632, + "num_input_tokens_seen": 211400095, + "step": 9809, + "time_per_iteration": 2.674997329711914 + }, + { + "auxiliary_loss_clip": 0.01132778, + "auxiliary_loss_mlp": 0.01109434, + "balance_loss_clip": 1.00201941, + "balance_loss_mlp": 1.00061953, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 3.367232755719993, + "language_loss": 0.82593942, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84836149, + "num_input_tokens_seen": 211417810, + "step": 9810, + "time_per_iteration": 2.584484338760376 + }, + { + "auxiliary_loss_clip": 0.01151248, + "auxiliary_loss_mlp": 0.01108903, + "balance_loss_clip": 1.0021615, + "balance_loss_mlp": 1.00056601, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.913212250684781, + "language_loss": 0.81103277, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83363426, + "num_input_tokens_seen": 211436020, + "step": 9811, + "time_per_iteration": 2.5501980781555176 + }, + { + "auxiliary_loss_clip": 0.01149816, + "auxiliary_loss_mlp": 0.01109068, + "balance_loss_clip": 1.0020175, + "balance_loss_mlp": 1.00044501, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 2.093639889117898, + "language_loss": 0.7666682, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.78925705, + "num_input_tokens_seen": 211454335, + "step": 9812, + "time_per_iteration": 2.5971033573150635 + }, + { + "auxiliary_loss_clip": 0.01117443, + "auxiliary_loss_mlp": 0.0110907, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.00063753, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 1.968085472537494, + "language_loss": 0.70589477, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.7281599, + "num_input_tokens_seen": 211472775, + "step": 9813, + "time_per_iteration": 2.6201353073120117 + }, + { + "auxiliary_loss_clip": 0.01133023, + "auxiliary_loss_mlp": 0.01108844, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00060213, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.5958883243946402, + "language_loss": 0.72236049, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74477923, + "num_input_tokens_seen": 211492195, + "step": 9814, + "time_per_iteration": 2.5986645221710205 + }, + { + "auxiliary_loss_clip": 0.01137343, + "auxiliary_loss_mlp": 0.0110983, + "balance_loss_clip": 1.0020926, + "balance_loss_mlp": 1.00063419, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.9609637941423717, + "language_loss": 0.7802856, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80275732, + "num_input_tokens_seen": 211510220, + "step": 9815, + "time_per_iteration": 2.589895725250244 + }, + { + "auxiliary_loss_clip": 0.01119617, + "auxiliary_loss_mlp": 0.00747589, + "balance_loss_clip": 1.00201941, + "balance_loss_mlp": 1.00064743, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 1.8697931153288367, + "language_loss": 0.75953341, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.77820539, + "num_input_tokens_seen": 211526260, + "step": 9816, + "time_per_iteration": 2.6704485416412354 + }, + { + "auxiliary_loss_clip": 0.01166137, + "auxiliary_loss_mlp": 0.01109208, + "balance_loss_clip": 1.0021323, + "balance_loss_mlp": 1.00068021, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 2.256690389376948, + "language_loss": 0.80994046, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83269393, + "num_input_tokens_seen": 211542890, + "step": 9817, + "time_per_iteration": 2.4943220615386963 + }, + { + "auxiliary_loss_clip": 0.0110756, + "auxiliary_loss_mlp": 0.01109344, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.0006249, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.6010358435573984, + "language_loss": 0.76400602, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78617501, + "num_input_tokens_seen": 211562685, + "step": 9818, + "time_per_iteration": 2.6868910789489746 + }, + { + "auxiliary_loss_clip": 0.01115808, + "auxiliary_loss_mlp": 0.01109311, + "balance_loss_clip": 1.00179446, + "balance_loss_mlp": 1.00049675, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 5.391893929712254, + "language_loss": 0.66865569, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.69090688, + "num_input_tokens_seen": 211579960, + "step": 9819, + "time_per_iteration": 2.6110641956329346 + }, + { + "auxiliary_loss_clip": 0.01166167, + "auxiliary_loss_mlp": 0.01109455, + "balance_loss_clip": 1.00217342, + "balance_loss_mlp": 1.00073647, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 3.0251004224137397, + "language_loss": 0.78300017, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80575645, + "num_input_tokens_seen": 211599310, + "step": 9820, + "time_per_iteration": 2.5660030841827393 + }, + { + "auxiliary_loss_clip": 0.01112225, + "auxiliary_loss_mlp": 0.0108537, + "balance_loss_clip": 1.0012871, + "balance_loss_mlp": 1.00030184, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9323983983592711, + "language_loss": 0.65052974, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67250568, + "num_input_tokens_seen": 211658790, + "step": 9821, + "time_per_iteration": 4.50871205329895 + }, + { + "auxiliary_loss_clip": 0.01117927, + "auxiliary_loss_mlp": 0.01109033, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.0007906, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 2.1015724216492613, + "language_loss": 0.61760974, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63987935, + "num_input_tokens_seen": 211677240, + "step": 9822, + "time_per_iteration": 2.634442090988159 + }, + { + "auxiliary_loss_clip": 0.01165975, + "auxiliary_loss_mlp": 0.01110104, + "balance_loss_clip": 1.00199652, + "balance_loss_mlp": 1.00062203, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.827884100703792, + "language_loss": 0.82518595, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84794676, + "num_input_tokens_seen": 211695485, + "step": 9823, + "time_per_iteration": 2.4875435829162598 + }, + { + "auxiliary_loss_clip": 0.01134499, + "auxiliary_loss_mlp": 0.01108946, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00051343, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 2.8204977048053084, + "language_loss": 0.72859555, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75102997, + "num_input_tokens_seen": 211713090, + "step": 9824, + "time_per_iteration": 2.585761547088623 + }, + { + "auxiliary_loss_clip": 0.01119965, + "auxiliary_loss_mlp": 0.00747588, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.00060558, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.2462411496837036, + "language_loss": 0.82884943, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84752494, + "num_input_tokens_seen": 211732510, + "step": 9825, + "time_per_iteration": 2.6294617652893066 + }, + { + "auxiliary_loss_clip": 0.01149126, + "auxiliary_loss_mlp": 0.01106703, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00055933, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.6877185737720266, + "language_loss": 0.76415211, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78671038, + "num_input_tokens_seen": 211748695, + "step": 9826, + "time_per_iteration": 2.5421533584594727 + }, + { + "auxiliary_loss_clip": 0.01132284, + "auxiliary_loss_mlp": 0.01108421, + "balance_loss_clip": 1.00196254, + "balance_loss_mlp": 1.00056076, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 2.0256464719028022, + "language_loss": 0.71938467, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74179173, + "num_input_tokens_seen": 211768545, + "step": 9827, + "time_per_iteration": 2.5992000102996826 + }, + { + "auxiliary_loss_clip": 0.01099334, + "auxiliary_loss_mlp": 0.01109071, + "balance_loss_clip": 1.00173712, + "balance_loss_mlp": 1.00073314, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 2.4468658100269707, + "language_loss": 0.79773474, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81981874, + "num_input_tokens_seen": 211786665, + "step": 9828, + "time_per_iteration": 4.158663511276245 + }, + { + "auxiliary_loss_clip": 0.01104286, + "auxiliary_loss_mlp": 0.01109633, + "balance_loss_clip": 1.00196087, + "balance_loss_mlp": 1.00062799, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.1996348742248353, + "language_loss": 0.88094676, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90308601, + "num_input_tokens_seen": 211801215, + "step": 9829, + "time_per_iteration": 2.6663501262664795 + }, + { + "auxiliary_loss_clip": 0.01128268, + "auxiliary_loss_mlp": 0.01084019, + "balance_loss_clip": 1.00165844, + "balance_loss_mlp": 1.00009596, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7491686464554064, + "language_loss": 0.57839411, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.60051697, + "num_input_tokens_seen": 211857005, + "step": 9830, + "time_per_iteration": 3.1081998348236084 + }, + { + "auxiliary_loss_clip": 0.0114956, + "auxiliary_loss_mlp": 0.00747481, + "balance_loss_clip": 1.00216019, + "balance_loss_mlp": 1.00047696, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.677722428208373, + "language_loss": 0.75999963, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.77897, + "num_input_tokens_seen": 211876675, + "step": 9831, + "time_per_iteration": 5.357731819152832 + }, + { + "auxiliary_loss_clip": 0.01132746, + "auxiliary_loss_mlp": 0.01107283, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00056696, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.8909861885829478, + "language_loss": 0.77644169, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79884195, + "num_input_tokens_seen": 211895725, + "step": 9832, + "time_per_iteration": 2.6017906665802 + }, + { + "auxiliary_loss_clip": 0.01149172, + "auxiliary_loss_mlp": 0.01108122, + "balance_loss_clip": 1.0019691, + "balance_loss_mlp": 1.00054765, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 2.805166339009459, + "language_loss": 0.83601058, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85858351, + "num_input_tokens_seen": 211913860, + "step": 9833, + "time_per_iteration": 2.5206215381622314 + }, + { + "auxiliary_loss_clip": 0.01150796, + "auxiliary_loss_mlp": 0.01109568, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00065863, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.633045489232861, + "language_loss": 0.74109042, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76369405, + "num_input_tokens_seen": 211932880, + "step": 9834, + "time_per_iteration": 2.5878727436065674 + }, + { + "auxiliary_loss_clip": 0.01166078, + "auxiliary_loss_mlp": 0.01108412, + "balance_loss_clip": 1.00215423, + "balance_loss_mlp": 1.00064707, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.3020981298951884, + "language_loss": 0.78070652, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80345136, + "num_input_tokens_seen": 211948625, + "step": 9835, + "time_per_iteration": 2.5636188983917236 + }, + { + "auxiliary_loss_clip": 0.01136636, + "auxiliary_loss_mlp": 0.01108874, + "balance_loss_clip": 1.00208068, + "balance_loss_mlp": 1.00063181, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 2.68850421784318, + "language_loss": 0.73794544, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76040053, + "num_input_tokens_seen": 211965355, + "step": 9836, + "time_per_iteration": 2.645982503890991 + }, + { + "auxiliary_loss_clip": 0.0111972, + "auxiliary_loss_mlp": 0.01109514, + "balance_loss_clip": 1.00210059, + "balance_loss_mlp": 1.00041401, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.090440935107307, + "language_loss": 0.82233429, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84462667, + "num_input_tokens_seen": 211982245, + "step": 9837, + "time_per_iteration": 2.6092638969421387 + }, + { + "auxiliary_loss_clip": 0.01120244, + "auxiliary_loss_mlp": 0.01109611, + "balance_loss_clip": 1.00210476, + "balance_loss_mlp": 1.00051069, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.7165743075169675, + "language_loss": 0.79472125, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81701976, + "num_input_tokens_seen": 212000250, + "step": 9838, + "time_per_iteration": 2.6455397605895996 + }, + { + "auxiliary_loss_clip": 0.01098542, + "auxiliary_loss_mlp": 0.01108552, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00069106, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 2.044402454970723, + "language_loss": 0.69727832, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.71934921, + "num_input_tokens_seen": 212017505, + "step": 9839, + "time_per_iteration": 2.6716930866241455 + }, + { + "auxiliary_loss_clip": 0.01135086, + "auxiliary_loss_mlp": 0.01109998, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00070679, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.4141900738740203, + "language_loss": 0.65758896, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68003982, + "num_input_tokens_seen": 212034595, + "step": 9840, + "time_per_iteration": 2.6022324562072754 + }, + { + "auxiliary_loss_clip": 0.01134366, + "auxiliary_loss_mlp": 0.01109434, + "balance_loss_clip": 1.00204742, + "balance_loss_mlp": 1.00062013, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.7203008429202569, + "language_loss": 0.81670064, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83913863, + "num_input_tokens_seen": 212055775, + "step": 9841, + "time_per_iteration": 2.612638473510742 + }, + { + "auxiliary_loss_clip": 0.01134402, + "auxiliary_loss_mlp": 0.01108645, + "balance_loss_clip": 1.00205064, + "balance_loss_mlp": 1.00059426, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.5985927634941193, + "language_loss": 0.69417292, + "learning_rate": 1.507956080444291e-06, + "loss": 0.7166034, + "num_input_tokens_seen": 212074000, + "step": 9842, + "time_per_iteration": 2.570152521133423 + }, + { + "auxiliary_loss_clip": 0.01134438, + "auxiliary_loss_mlp": 0.01108909, + "balance_loss_clip": 1.00200427, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 1.7293583760501996, + "language_loss": 0.82346845, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84590197, + "num_input_tokens_seen": 212091415, + "step": 9843, + "time_per_iteration": 2.611501455307007 + }, + { + "auxiliary_loss_clip": 0.01136494, + "auxiliary_loss_mlp": 0.01109451, + "balance_loss_clip": 1.00210941, + "balance_loss_mlp": 1.00054121, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.325973851154417, + "language_loss": 0.82648188, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84894133, + "num_input_tokens_seen": 212105255, + "step": 9844, + "time_per_iteration": 2.621032953262329 + }, + { + "auxiliary_loss_clip": 0.01104856, + "auxiliary_loss_mlp": 0.01109352, + "balance_loss_clip": 1.00193989, + "balance_loss_mlp": 1.00053763, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 2.342135086299679, + "language_loss": 0.74795449, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.77009654, + "num_input_tokens_seen": 212122765, + "step": 9845, + "time_per_iteration": 2.649272918701172 + }, + { + "auxiliary_loss_clip": 0.01117856, + "auxiliary_loss_mlp": 0.01108858, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.00052047, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 1.7199480713955935, + "language_loss": 0.63972914, + "learning_rate": 1.506446264718213e-06, + "loss": 0.66199636, + "num_input_tokens_seen": 212143960, + "step": 9846, + "time_per_iteration": 2.773212432861328 + }, + { + "auxiliary_loss_clip": 0.01117006, + "auxiliary_loss_mlp": 0.00747138, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00042737, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.767130993970745, + "language_loss": 0.75876057, + "learning_rate": 1.506068857539931e-06, + "loss": 0.77740204, + "num_input_tokens_seen": 212162005, + "step": 9847, + "time_per_iteration": 2.660121440887451 + }, + { + "auxiliary_loss_clip": 0.01134487, + "auxiliary_loss_mlp": 0.01108396, + "balance_loss_clip": 1.00203502, + "balance_loss_mlp": 1.00044048, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.868785669297814, + "language_loss": 0.62212127, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64455009, + "num_input_tokens_seen": 212181635, + "step": 9848, + "time_per_iteration": 2.593470573425293 + }, + { + "auxiliary_loss_clip": 0.01149351, + "auxiliary_loss_mlp": 0.01109239, + "balance_loss_clip": 1.00201809, + "balance_loss_mlp": 1.0006156, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 2.2062030122538046, + "language_loss": 0.75808203, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.7806679, + "num_input_tokens_seen": 212201615, + "step": 9849, + "time_per_iteration": 2.561857223510742 + }, + { + "auxiliary_loss_clip": 0.01134416, + "auxiliary_loss_mlp": 0.01108153, + "balance_loss_clip": 1.00202048, + "balance_loss_mlp": 1.00057888, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 3.2295296322943883, + "language_loss": 0.75355971, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77598536, + "num_input_tokens_seen": 212219355, + "step": 9850, + "time_per_iteration": 2.608937978744507 + }, + { + "auxiliary_loss_clip": 0.01119455, + "auxiliary_loss_mlp": 0.01108675, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00081396, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.7884289131045912, + "language_loss": 0.76001596, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.78229725, + "num_input_tokens_seen": 212236710, + "step": 9851, + "time_per_iteration": 2.6249823570251465 + }, + { + "auxiliary_loss_clip": 0.01132624, + "auxiliary_loss_mlp": 0.01109338, + "balance_loss_clip": 1.00201237, + "balance_loss_mlp": 1.00061953, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.6995132404842583, + "language_loss": 0.70690691, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72932655, + "num_input_tokens_seen": 212256195, + "step": 9852, + "time_per_iteration": 2.623464345932007 + }, + { + "auxiliary_loss_clip": 0.01132745, + "auxiliary_loss_mlp": 0.00747579, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00063872, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 2.108153185095033, + "language_loss": 0.80489498, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82369822, + "num_input_tokens_seen": 212274085, + "step": 9853, + "time_per_iteration": 2.620929002761841 + }, + { + "auxiliary_loss_clip": 0.01132559, + "auxiliary_loss_mlp": 0.01107664, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00056612, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.5012134992079695, + "language_loss": 0.67354441, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69594663, + "num_input_tokens_seen": 212295530, + "step": 9854, + "time_per_iteration": 2.672618865966797 + }, + { + "auxiliary_loss_clip": 0.01124338, + "auxiliary_loss_mlp": 0.01108278, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00060773, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 2.1887635520261552, + "language_loss": 0.88960004, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.91192621, + "num_input_tokens_seen": 212313770, + "step": 9855, + "time_per_iteration": 2.6318507194519043 + }, + { + "auxiliary_loss_clip": 0.01153828, + "auxiliary_loss_mlp": 0.01107263, + "balance_loss_clip": 1.00214911, + "balance_loss_mlp": 1.0006423, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 2.2990873674405177, + "language_loss": 0.87205601, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.89466691, + "num_input_tokens_seen": 212331525, + "step": 9856, + "time_per_iteration": 2.564664363861084 + }, + { + "auxiliary_loss_clip": 0.01149392, + "auxiliary_loss_mlp": 0.01109248, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00071979, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 3.4553043715312834, + "language_loss": 0.77646011, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79904652, + "num_input_tokens_seen": 212347295, + "step": 9857, + "time_per_iteration": 2.5254807472229004 + }, + { + "auxiliary_loss_clip": 0.0112136, + "auxiliary_loss_mlp": 0.011088, + "balance_loss_clip": 1.00199592, + "balance_loss_mlp": 1.00074887, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 1.9604929760283447, + "language_loss": 0.64266539, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66496694, + "num_input_tokens_seen": 212365750, + "step": 9858, + "time_per_iteration": 2.633485794067383 + }, + { + "auxiliary_loss_clip": 0.01148748, + "auxiliary_loss_mlp": 0.01107616, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00061417, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 5.892383634031396, + "language_loss": 0.76862562, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79118925, + "num_input_tokens_seen": 212385300, + "step": 9859, + "time_per_iteration": 4.017362356185913 + }, + { + "auxiliary_loss_clip": 0.01099151, + "auxiliary_loss_mlp": 0.00747417, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00049996, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 6.940326350161811, + "language_loss": 0.7533108, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.7717765, + "num_input_tokens_seen": 212402140, + "step": 9860, + "time_per_iteration": 2.6908745765686035 + }, + { + "auxiliary_loss_clip": 0.01117583, + "auxiliary_loss_mlp": 0.01108018, + "balance_loss_clip": 1.00196517, + "balance_loss_mlp": 1.00073028, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 2.017261901989596, + "language_loss": 0.76282442, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78508043, + "num_input_tokens_seen": 212421790, + "step": 9861, + "time_per_iteration": 2.6892809867858887 + }, + { + "auxiliary_loss_clip": 0.01115892, + "auxiliary_loss_mlp": 0.01107283, + "balance_loss_clip": 1.00181317, + "balance_loss_mlp": 1.00047183, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.7551960900203532, + "language_loss": 0.7073456, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72957742, + "num_input_tokens_seen": 212442115, + "step": 9862, + "time_per_iteration": 2.6804187297821045 + }, + { + "auxiliary_loss_clip": 0.01103618, + "auxiliary_loss_mlp": 0.01107669, + "balance_loss_clip": 1.00184262, + "balance_loss_mlp": 1.0006671, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.8044140640253028, + "language_loss": 0.78331065, + "learning_rate": 1.500032899685832e-06, + "loss": 0.8054235, + "num_input_tokens_seen": 212459535, + "step": 9863, + "time_per_iteration": 2.721176862716675 + }, + { + "auxiliary_loss_clip": 0.01134437, + "auxiliary_loss_mlp": 0.01109535, + "balance_loss_clip": 1.00207448, + "balance_loss_mlp": 1.00072074, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 2.195493577556321, + "language_loss": 0.70372021, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72615993, + "num_input_tokens_seen": 212479385, + "step": 9864, + "time_per_iteration": 2.633334159851074 + }, + { + "auxiliary_loss_clip": 0.01134093, + "auxiliary_loss_mlp": 0.01108954, + "balance_loss_clip": 1.00188124, + "balance_loss_mlp": 1.00080776, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 1.658058055962844, + "language_loss": 0.67393196, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.6963625, + "num_input_tokens_seen": 212500060, + "step": 9865, + "time_per_iteration": 2.6528513431549072 + }, + { + "auxiliary_loss_clip": 0.01134698, + "auxiliary_loss_mlp": 0.01108583, + "balance_loss_clip": 1.00193179, + "balance_loss_mlp": 1.00072241, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 2.2138611238418573, + "language_loss": 0.78121924, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80365205, + "num_input_tokens_seen": 212518590, + "step": 9866, + "time_per_iteration": 3.971141815185547 + }, + { + "auxiliary_loss_clip": 0.0113273, + "auxiliary_loss_mlp": 0.01108551, + "balance_loss_clip": 1.00204659, + "balance_loss_mlp": 1.0005002, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 5.29337178000035, + "language_loss": 0.72052085, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74293363, + "num_input_tokens_seen": 212538190, + "step": 9867, + "time_per_iteration": 2.660033702850342 + }, + { + "auxiliary_loss_clip": 0.01132636, + "auxiliary_loss_mlp": 0.01108761, + "balance_loss_clip": 1.00205278, + "balance_loss_mlp": 1.0007093, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.4817457558865519, + "language_loss": 0.66258621, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68500018, + "num_input_tokens_seen": 212557820, + "step": 9868, + "time_per_iteration": 2.5764410495758057 + }, + { + "auxiliary_loss_clip": 0.01085901, + "auxiliary_loss_mlp": 0.00747388, + "balance_loss_clip": 1.00176966, + "balance_loss_mlp": 1.00049257, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.5811796440195223, + "language_loss": 0.75171554, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77004838, + "num_input_tokens_seen": 212577645, + "step": 9869, + "time_per_iteration": 4.226045846939087 + }, + { + "auxiliary_loss_clip": 0.0110571, + "auxiliary_loss_mlp": 0.01108655, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.0006988, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 2.5048548153680317, + "language_loss": 0.74192047, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76406413, + "num_input_tokens_seen": 212603430, + "step": 9870, + "time_per_iteration": 4.4233012199401855 + }, + { + "auxiliary_loss_clip": 0.01102129, + "auxiliary_loss_mlp": 0.01108418, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.00055718, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 2.052546263227005, + "language_loss": 0.71785051, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.7399559, + "num_input_tokens_seen": 212620730, + "step": 9871, + "time_per_iteration": 2.6975462436676025 + }, + { + "auxiliary_loss_clip": 0.01117744, + "auxiliary_loss_mlp": 0.01108118, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00054383, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 1.9715708254220805, + "language_loss": 0.74501377, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76727241, + "num_input_tokens_seen": 212639745, + "step": 9872, + "time_per_iteration": 2.6733992099761963 + }, + { + "auxiliary_loss_clip": 0.01151347, + "auxiliary_loss_mlp": 0.01109853, + "balance_loss_clip": 1.00211167, + "balance_loss_mlp": 1.00065708, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.2606894392083374, + "language_loss": 0.78828239, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.81089437, + "num_input_tokens_seen": 212655915, + "step": 9873, + "time_per_iteration": 2.538792371749878 + }, + { + "auxiliary_loss_clip": 0.01149223, + "auxiliary_loss_mlp": 0.01108904, + "balance_loss_clip": 1.00209761, + "balance_loss_mlp": 1.00066185, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.5424803184923375, + "language_loss": 0.84849191, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87107313, + "num_input_tokens_seen": 212676115, + "step": 9874, + "time_per_iteration": 2.590250015258789 + }, + { + "auxiliary_loss_clip": 0.01129776, + "auxiliary_loss_mlp": 0.01084122, + "balance_loss_clip": 1.00141656, + "balance_loss_mlp": 1.00019825, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.7132803413304359, + "language_loss": 0.6002959, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62243485, + "num_input_tokens_seen": 212737560, + "step": 9875, + "time_per_iteration": 3.234053373336792 + }, + { + "auxiliary_loss_clip": 0.01135896, + "auxiliary_loss_mlp": 0.01109137, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.00060892, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 2.2949694723663456, + "language_loss": 0.77839941, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.80084968, + "num_input_tokens_seen": 212755365, + "step": 9876, + "time_per_iteration": 2.5655181407928467 + }, + { + "auxiliary_loss_clip": 0.01151153, + "auxiliary_loss_mlp": 0.01107697, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00059891, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.4573694681056126, + "language_loss": 0.75935471, + "learning_rate": 1.494755415907243e-06, + "loss": 0.78194326, + "num_input_tokens_seen": 212773875, + "step": 9877, + "time_per_iteration": 2.5845677852630615 + }, + { + "auxiliary_loss_clip": 0.01149392, + "auxiliary_loss_mlp": 0.01109363, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00064456, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 2.7624423182264124, + "language_loss": 0.8115927, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83418024, + "num_input_tokens_seen": 212790590, + "step": 9878, + "time_per_iteration": 2.5296030044555664 + }, + { + "auxiliary_loss_clip": 0.01134753, + "auxiliary_loss_mlp": 0.00747381, + "balance_loss_clip": 1.00196421, + "balance_loss_mlp": 1.00054359, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.8780053953182336, + "language_loss": 0.71021038, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.72903168, + "num_input_tokens_seen": 212812265, + "step": 9879, + "time_per_iteration": 2.82135272026062 + }, + { + "auxiliary_loss_clip": 0.01149183, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.00084925, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 2.8779696445147547, + "language_loss": 0.57501876, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59759295, + "num_input_tokens_seen": 212831915, + "step": 9880, + "time_per_iteration": 2.59379506111145 + }, + { + "auxiliary_loss_clip": 0.01150735, + "auxiliary_loss_mlp": 0.01108372, + "balance_loss_clip": 1.00207543, + "balance_loss_mlp": 1.00089324, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.8816641038074213, + "language_loss": 0.77519941, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79779041, + "num_input_tokens_seen": 212851350, + "step": 9881, + "time_per_iteration": 2.634629249572754 + }, + { + "auxiliary_loss_clip": 0.01149322, + "auxiliary_loss_mlp": 0.01108421, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00056052, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.4540670427956326, + "language_loss": 0.8345108, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.85708821, + "num_input_tokens_seen": 212867995, + "step": 9882, + "time_per_iteration": 2.629917860031128 + }, + { + "auxiliary_loss_clip": 0.0114971, + "auxiliary_loss_mlp": 0.01108338, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.00057244, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.334264456858249, + "language_loss": 0.79031193, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81289232, + "num_input_tokens_seen": 212885220, + "step": 9883, + "time_per_iteration": 2.5141048431396484 + }, + { + "auxiliary_loss_clip": 0.01116462, + "auxiliary_loss_mlp": 0.0074753, + "balance_loss_clip": 1.00190854, + "balance_loss_mlp": 1.000543, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.8880197075605598, + "language_loss": 0.74415076, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76279068, + "num_input_tokens_seen": 212903195, + "step": 9884, + "time_per_iteration": 2.658222198486328 + }, + { + "auxiliary_loss_clip": 0.01166074, + "auxiliary_loss_mlp": 0.01109259, + "balance_loss_clip": 1.00221384, + "balance_loss_mlp": 1.00063515, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 1.9729132799674949, + "language_loss": 0.66421759, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68697095, + "num_input_tokens_seen": 212923340, + "step": 9885, + "time_per_iteration": 2.5746757984161377 + }, + { + "auxiliary_loss_clip": 0.01132618, + "auxiliary_loss_mlp": 0.01109479, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00076056, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.145603889389757, + "language_loss": 0.77274001, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79516095, + "num_input_tokens_seen": 212942755, + "step": 9886, + "time_per_iteration": 2.6286380290985107 + }, + { + "auxiliary_loss_clip": 0.01144, + "auxiliary_loss_mlp": 0.0108488, + "balance_loss_clip": 1.00146484, + "balance_loss_mlp": 1.0001936, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8341230615360664, + "language_loss": 0.64553332, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66782212, + "num_input_tokens_seen": 212999355, + "step": 9887, + "time_per_iteration": 3.00990891456604 + }, + { + "auxiliary_loss_clip": 0.01149611, + "auxiliary_loss_mlp": 0.01108906, + "balance_loss_clip": 1.00200307, + "balance_loss_mlp": 1.00066376, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 2.0190417359600197, + "language_loss": 0.6938653, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71645051, + "num_input_tokens_seen": 213018570, + "step": 9888, + "time_per_iteration": 2.5590710639953613 + }, + { + "auxiliary_loss_clip": 0.01136084, + "auxiliary_loss_mlp": 0.01108766, + "balance_loss_clip": 1.00213623, + "balance_loss_mlp": 1.00061929, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.4953263842487325, + "language_loss": 0.79872519, + "learning_rate": 1.490234845687366e-06, + "loss": 0.82117367, + "num_input_tokens_seen": 213037735, + "step": 9889, + "time_per_iteration": 2.6224751472473145 + }, + { + "auxiliary_loss_clip": 0.01117512, + "auxiliary_loss_mlp": 0.01107409, + "balance_loss_clip": 1.0018357, + "balance_loss_mlp": 1.00059724, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.6286940913135597, + "language_loss": 0.70715916, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72940838, + "num_input_tokens_seen": 213057160, + "step": 9890, + "time_per_iteration": 2.640429735183716 + }, + { + "auxiliary_loss_clip": 0.01116215, + "auxiliary_loss_mlp": 0.01109158, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00053394, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 2.1035717489467847, + "language_loss": 0.69039536, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71264911, + "num_input_tokens_seen": 213073630, + "step": 9891, + "time_per_iteration": 2.592681646347046 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.0110786, + "balance_loss_clip": 1.00194287, + "balance_loss_mlp": 1.00066674, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 2.0373495248269515, + "language_loss": 0.5353992, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55796987, + "num_input_tokens_seen": 213092450, + "step": 9892, + "time_per_iteration": 2.539241075515747 + }, + { + "auxiliary_loss_clip": 0.0111377, + "auxiliary_loss_mlp": 0.01084472, + "balance_loss_clip": 1.00158429, + "balance_loss_mlp": 1.00016713, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6572213628602458, + "language_loss": 0.54559606, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56757843, + "num_input_tokens_seen": 213155465, + "step": 9893, + "time_per_iteration": 3.2831192016601562 + }, + { + "auxiliary_loss_clip": 0.01116472, + "auxiliary_loss_mlp": 0.01107365, + "balance_loss_clip": 1.001876, + "balance_loss_mlp": 1.00055337, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.6242657864078989, + "language_loss": 0.75017381, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.77241218, + "num_input_tokens_seen": 213174875, + "step": 9894, + "time_per_iteration": 2.632706642150879 + }, + { + "auxiliary_loss_clip": 0.01117273, + "auxiliary_loss_mlp": 0.01108387, + "balance_loss_clip": 1.00188124, + "balance_loss_mlp": 1.00052679, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 2.602828769263834, + "language_loss": 0.77952087, + "learning_rate": 1.487975602873434e-06, + "loss": 0.80177748, + "num_input_tokens_seen": 213192695, + "step": 9895, + "time_per_iteration": 2.6510820388793945 + }, + { + "auxiliary_loss_clip": 0.01103812, + "auxiliary_loss_mlp": 0.01108758, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00061107, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.7427996092733593, + "language_loss": 0.78964078, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81176645, + "num_input_tokens_seen": 213211195, + "step": 9896, + "time_per_iteration": 4.027004957199097 + }, + { + "auxiliary_loss_clip": 0.01149362, + "auxiliary_loss_mlp": 0.01108149, + "balance_loss_clip": 1.00192618, + "balance_loss_mlp": 1.00047886, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.5283654760321135, + "language_loss": 0.83502364, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.85759878, + "num_input_tokens_seen": 213231975, + "step": 9897, + "time_per_iteration": 2.6056840419769287 + }, + { + "auxiliary_loss_clip": 0.01137502, + "auxiliary_loss_mlp": 0.01108396, + "balance_loss_clip": 1.0021385, + "balance_loss_mlp": 1.00053549, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 2.369269174206919, + "language_loss": 0.70603919, + "learning_rate": 1.486846243389939e-06, + "loss": 0.72849816, + "num_input_tokens_seen": 213249760, + "step": 9898, + "time_per_iteration": 2.5960378646850586 + }, + { + "auxiliary_loss_clip": 0.01151138, + "auxiliary_loss_mlp": 0.01110298, + "balance_loss_clip": 1.00204229, + "balance_loss_mlp": 1.00053048, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.2723968716213676, + "language_loss": 0.63997197, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66258633, + "num_input_tokens_seen": 213269890, + "step": 9899, + "time_per_iteration": 2.648210048675537 + }, + { + "auxiliary_loss_clip": 0.01165859, + "auxiliary_loss_mlp": 0.01108345, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.0004847, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.7774438086141717, + "language_loss": 0.72098887, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.7437309, + "num_input_tokens_seen": 213289400, + "step": 9900, + "time_per_iteration": 2.539491891860962 + }, + { + "auxiliary_loss_clip": 0.01165806, + "auxiliary_loss_mlp": 0.01108185, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00061011, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 2.5006890749880695, + "language_loss": 0.84626985, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86900979, + "num_input_tokens_seen": 213308040, + "step": 9901, + "time_per_iteration": 2.5284078121185303 + }, + { + "auxiliary_loss_clip": 0.0109891, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_clip": 1.00139856, + "balance_loss_mlp": 1.00001001, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.7970024331756552, + "language_loss": 0.58180773, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60363245, + "num_input_tokens_seen": 213358585, + "step": 9902, + "time_per_iteration": 3.078547716140747 + }, + { + "auxiliary_loss_clip": 0.01085447, + "auxiliary_loss_mlp": 0.01108304, + "balance_loss_clip": 1.00179064, + "balance_loss_mlp": 1.00044322, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 3.0939070127003494, + "language_loss": 0.77315569, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79509324, + "num_input_tokens_seen": 213379585, + "step": 9903, + "time_per_iteration": 4.102309703826904 + }, + { + "auxiliary_loss_clip": 0.01116104, + "auxiliary_loss_mlp": 0.01108036, + "balance_loss_clip": 1.00179124, + "balance_loss_mlp": 1.00065231, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.7329744966539595, + "language_loss": 0.77550912, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79775047, + "num_input_tokens_seen": 213401465, + "step": 9904, + "time_per_iteration": 2.7459633350372314 + }, + { + "auxiliary_loss_clip": 0.01149716, + "auxiliary_loss_mlp": 0.01109577, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00066757, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 1.5704513395744069, + "language_loss": 0.72348952, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74608243, + "num_input_tokens_seen": 213422720, + "step": 9905, + "time_per_iteration": 2.655602216720581 + }, + { + "auxiliary_loss_clip": 0.01150614, + "auxiliary_loss_mlp": 0.01107343, + "balance_loss_clip": 1.00201619, + "balance_loss_mlp": 1.00053167, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.6003318969410447, + "language_loss": 0.69877177, + "learning_rate": 1.483835475336295e-06, + "loss": 0.72135139, + "num_input_tokens_seen": 213439480, + "step": 9906, + "time_per_iteration": 4.002831697463989 + }, + { + "auxiliary_loss_clip": 0.01149267, + "auxiliary_loss_mlp": 0.01108481, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00062037, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 2.036613150379808, + "language_loss": 0.75175452, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77433205, + "num_input_tokens_seen": 213458895, + "step": 9907, + "time_per_iteration": 4.02740216255188 + }, + { + "auxiliary_loss_clip": 0.01134489, + "auxiliary_loss_mlp": 0.01109146, + "balance_loss_clip": 1.00199187, + "balance_loss_mlp": 1.00061798, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 3.052291697453, + "language_loss": 0.67234755, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69478387, + "num_input_tokens_seen": 213481730, + "step": 9908, + "time_per_iteration": 2.6894748210906982 + }, + { + "auxiliary_loss_clip": 0.01070756, + "auxiliary_loss_mlp": 0.01108102, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00062275, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 2.044581490564231, + "language_loss": 0.76420927, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78599787, + "num_input_tokens_seen": 213497225, + "step": 9909, + "time_per_iteration": 2.7360973358154297 + }, + { + "auxiliary_loss_clip": 0.01160544, + "auxiliary_loss_mlp": 0.01083861, + "balance_loss_clip": 1.00138617, + "balance_loss_mlp": 0.99993724, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9262895500120413, + "language_loss": 0.73404515, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75648928, + "num_input_tokens_seen": 213556890, + "step": 9910, + "time_per_iteration": 3.2144508361816406 + }, + { + "auxiliary_loss_clip": 0.01132518, + "auxiliary_loss_mlp": 0.01108449, + "balance_loss_clip": 1.00191176, + "balance_loss_mlp": 1.00058854, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.846848891331455, + "language_loss": 0.69588369, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71829331, + "num_input_tokens_seen": 213575800, + "step": 9911, + "time_per_iteration": 2.602926254272461 + }, + { + "auxiliary_loss_clip": 0.01150904, + "auxiliary_loss_mlp": 0.01110219, + "balance_loss_clip": 1.00210357, + "balance_loss_mlp": 1.0006423, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 2.157504217565463, + "language_loss": 0.65585601, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.67846727, + "num_input_tokens_seen": 213592740, + "step": 9912, + "time_per_iteration": 2.5300865173339844 + }, + { + "auxiliary_loss_clip": 0.01119247, + "auxiliary_loss_mlp": 0.01109065, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00063229, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 1.9135403199067889, + "language_loss": 0.73449773, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75678086, + "num_input_tokens_seen": 213611970, + "step": 9913, + "time_per_iteration": 2.701545476913452 + }, + { + "auxiliary_loss_clip": 0.01120941, + "auxiliary_loss_mlp": 0.00747257, + "balance_loss_clip": 1.0023582, + "balance_loss_mlp": 1.00034487, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 2.423030804013831, + "language_loss": 0.8004607, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.8191427, + "num_input_tokens_seen": 213632230, + "step": 9914, + "time_per_iteration": 2.7008793354034424 + }, + { + "auxiliary_loss_clip": 0.01117403, + "auxiliary_loss_mlp": 0.01107805, + "balance_loss_clip": 1.00187016, + "balance_loss_mlp": 1.00061238, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.9077743323263832, + "language_loss": 0.67948604, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.70173806, + "num_input_tokens_seen": 213649645, + "step": 9915, + "time_per_iteration": 2.6415457725524902 + }, + { + "auxiliary_loss_clip": 0.01132889, + "auxiliary_loss_mlp": 0.01108027, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.00064325, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.6621072385780415, + "language_loss": 0.78463054, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.80703968, + "num_input_tokens_seen": 213668850, + "step": 9916, + "time_per_iteration": 2.6299355030059814 + }, + { + "auxiliary_loss_clip": 0.01134719, + "auxiliary_loss_mlp": 0.01108337, + "balance_loss_clip": 1.00188744, + "balance_loss_mlp": 1.00057185, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.7816270647351418, + "language_loss": 0.82877338, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.85120404, + "num_input_tokens_seen": 213685695, + "step": 9917, + "time_per_iteration": 2.582760810852051 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01107789, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00050092, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 8.928894038194457, + "language_loss": 0.77284086, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79526508, + "num_input_tokens_seen": 213703515, + "step": 9918, + "time_per_iteration": 2.5884106159210205 + }, + { + "auxiliary_loss_clip": 0.01149276, + "auxiliary_loss_mlp": 0.01108425, + "balance_loss_clip": 1.00205016, + "balance_loss_mlp": 1.00075495, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.75019176489743, + "language_loss": 0.78646547, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.80904251, + "num_input_tokens_seen": 213724170, + "step": 9919, + "time_per_iteration": 2.620786428451538 + }, + { + "auxiliary_loss_clip": 0.0113283, + "auxiliary_loss_mlp": 0.01107576, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00047886, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 2.3787600664519952, + "language_loss": 0.77322203, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.7956261, + "num_input_tokens_seen": 213740620, + "step": 9920, + "time_per_iteration": 2.582327365875244 + }, + { + "auxiliary_loss_clip": 0.01151127, + "auxiliary_loss_mlp": 0.01108303, + "balance_loss_clip": 1.0022397, + "balance_loss_mlp": 1.0006336, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 3.0743573846570933, + "language_loss": 0.82149297, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84408724, + "num_input_tokens_seen": 213755390, + "step": 9921, + "time_per_iteration": 2.52533221244812 + }, + { + "auxiliary_loss_clip": 0.01151139, + "auxiliary_loss_mlp": 0.01108136, + "balance_loss_clip": 1.00216687, + "balance_loss_mlp": 1.00056124, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 2.4251690533328745, + "language_loss": 0.8032316, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82582438, + "num_input_tokens_seen": 213773225, + "step": 9922, + "time_per_iteration": 2.534247398376465 + }, + { + "auxiliary_loss_clip": 0.01150468, + "auxiliary_loss_mlp": 0.0074715, + "balance_loss_clip": 1.00198114, + "balance_loss_mlp": 1.00027359, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.9603522491296708, + "language_loss": 0.76929379, + "learning_rate": 1.477441761580111e-06, + "loss": 0.78826994, + "num_input_tokens_seen": 213791860, + "step": 9923, + "time_per_iteration": 2.572949171066284 + }, + { + "auxiliary_loss_clip": 0.01136028, + "auxiliary_loss_mlp": 0.01108988, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.000651, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 2.250734587996032, + "language_loss": 0.75867569, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.7811259, + "num_input_tokens_seen": 213809455, + "step": 9924, + "time_per_iteration": 2.576336145401001 + }, + { + "auxiliary_loss_clip": 0.01150668, + "auxiliary_loss_mlp": 0.01107641, + "balance_loss_clip": 1.00194371, + "balance_loss_mlp": 1.00054312, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 2.1171504598592352, + "language_loss": 0.66670644, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68928957, + "num_input_tokens_seen": 213826615, + "step": 9925, + "time_per_iteration": 2.5014171600341797 + }, + { + "auxiliary_loss_clip": 0.01134217, + "auxiliary_loss_mlp": 0.01107588, + "balance_loss_clip": 1.00216019, + "balance_loss_mlp": 1.00049043, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.079945070916879, + "language_loss": 0.71584451, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.73826253, + "num_input_tokens_seen": 213844495, + "step": 9926, + "time_per_iteration": 2.5706300735473633 + }, + { + "auxiliary_loss_clip": 0.01104717, + "auxiliary_loss_mlp": 0.00747357, + "balance_loss_clip": 1.00186634, + "balance_loss_mlp": 1.00044, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.8947354719254095, + "language_loss": 0.70347327, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.72199404, + "num_input_tokens_seen": 213869125, + "step": 9927, + "time_per_iteration": 2.881289482116699 + }, + { + "auxiliary_loss_clip": 0.01103425, + "auxiliary_loss_mlp": 0.01109063, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00043893, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.8472495799219926, + "language_loss": 0.63555831, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65768319, + "num_input_tokens_seen": 213891115, + "step": 9928, + "time_per_iteration": 2.8911292552948 + }, + { + "auxiliary_loss_clip": 0.01165801, + "auxiliary_loss_mlp": 0.01106939, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00050926, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.6677355590440988, + "language_loss": 0.69569582, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71842325, + "num_input_tokens_seen": 213911925, + "step": 9929, + "time_per_iteration": 2.601675271987915 + }, + { + "auxiliary_loss_clip": 0.01100291, + "auxiliary_loss_mlp": 0.01106971, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00054049, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.9380210763373846, + "language_loss": 0.76735282, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78942543, + "num_input_tokens_seen": 213930715, + "step": 9930, + "time_per_iteration": 2.6637637615203857 + }, + { + "auxiliary_loss_clip": 0.01133038, + "auxiliary_loss_mlp": 0.01109049, + "balance_loss_clip": 1.00202918, + "balance_loss_mlp": 1.00052047, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 2.3522213403567624, + "language_loss": 0.69128978, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71371067, + "num_input_tokens_seen": 213950015, + "step": 9931, + "time_per_iteration": 2.5811305046081543 + }, + { + "auxiliary_loss_clip": 0.01144492, + "auxiliary_loss_mlp": 0.01083546, + "balance_loss_clip": 1.00150073, + "balance_loss_mlp": 1.0000037, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.858594957700774, + "language_loss": 0.6419009, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66418135, + "num_input_tokens_seen": 214003330, + "step": 9932, + "time_per_iteration": 3.0384790897369385 + }, + { + "auxiliary_loss_clip": 0.01119069, + "auxiliary_loss_mlp": 0.01108419, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00046301, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 2.2012546724601836, + "language_loss": 0.73843461, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76070946, + "num_input_tokens_seen": 214021680, + "step": 9933, + "time_per_iteration": 4.03005313873291 + }, + { + "auxiliary_loss_clip": 0.01144706, + "auxiliary_loss_mlp": 0.01084298, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 0.99999362, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6594291621334837, + "language_loss": 0.52041721, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54270726, + "num_input_tokens_seen": 214090265, + "step": 9934, + "time_per_iteration": 3.2111411094665527 + }, + { + "auxiliary_loss_clip": 0.01160467, + "auxiliary_loss_mlp": 0.0108351, + "balance_loss_clip": 1.00136638, + "balance_loss_mlp": 0.99996835, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8280767764696331, + "language_loss": 0.5423575, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56479728, + "num_input_tokens_seen": 214146375, + "step": 9935, + "time_per_iteration": 3.0052120685577393 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01107476, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00056875, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.5905037409283227, + "language_loss": 0.66027009, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68268877, + "num_input_tokens_seen": 214165340, + "step": 9936, + "time_per_iteration": 2.625173330307007 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01108922, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00067973, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.2054837128787206, + "language_loss": 0.6793257, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.70143425, + "num_input_tokens_seen": 214181360, + "step": 9937, + "time_per_iteration": 2.659787893295288 + }, + { + "auxiliary_loss_clip": 0.01149345, + "auxiliary_loss_mlp": 0.01108796, + "balance_loss_clip": 1.00203526, + "balance_loss_mlp": 1.00064909, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 2.7401804099095, + "language_loss": 0.77323204, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79581344, + "num_input_tokens_seen": 214198525, + "step": 9938, + "time_per_iteration": 2.57462739944458 + }, + { + "auxiliary_loss_clip": 0.01151194, + "auxiliary_loss_mlp": 0.01107878, + "balance_loss_clip": 1.00202096, + "balance_loss_mlp": 1.00039887, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.825296641858551, + "language_loss": 0.7589432, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78153396, + "num_input_tokens_seen": 214218710, + "step": 9939, + "time_per_iteration": 4.1537697315216064 + }, + { + "auxiliary_loss_clip": 0.01106187, + "auxiliary_loss_mlp": 0.01108686, + "balance_loss_clip": 1.0021956, + "balance_loss_mlp": 1.00044382, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.0650115552389314, + "language_loss": 0.68549138, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70764011, + "num_input_tokens_seen": 214237800, + "step": 9940, + "time_per_iteration": 2.6635377407073975 + }, + { + "auxiliary_loss_clip": 0.01133113, + "auxiliary_loss_mlp": 0.0110703, + "balance_loss_clip": 1.0019182, + "balance_loss_mlp": 1.0006001, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.8217619089737809, + "language_loss": 0.70127124, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72367263, + "num_input_tokens_seen": 214260355, + "step": 9941, + "time_per_iteration": 2.7370123863220215 + }, + { + "auxiliary_loss_clip": 0.01133738, + "auxiliary_loss_mlp": 0.01107336, + "balance_loss_clip": 1.00174057, + "balance_loss_mlp": 1.00052476, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 1.8636494505930064, + "language_loss": 0.77516985, + "learning_rate": 1.470302626336386e-06, + "loss": 0.7975806, + "num_input_tokens_seen": 214277120, + "step": 9942, + "time_per_iteration": 2.562713384628296 + }, + { + "auxiliary_loss_clip": 0.01099031, + "auxiliary_loss_mlp": 0.01108209, + "balance_loss_clip": 1.00164676, + "balance_loss_mlp": 1.00053942, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.9363100757668852, + "language_loss": 0.75964177, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78171414, + "num_input_tokens_seen": 214295300, + "step": 9943, + "time_per_iteration": 2.6704914569854736 + }, + { + "auxiliary_loss_clip": 0.01088884, + "auxiliary_loss_mlp": 0.01106715, + "balance_loss_clip": 1.00207853, + "balance_loss_mlp": 1.00057149, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.8864013360836946, + "language_loss": 0.61939585, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64135182, + "num_input_tokens_seen": 214317050, + "step": 9944, + "time_per_iteration": 4.259161949157715 + }, + { + "auxiliary_loss_clip": 0.01132944, + "auxiliary_loss_mlp": 0.01108156, + "balance_loss_clip": 1.00200343, + "balance_loss_mlp": 1.00058126, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.784747083090356, + "language_loss": 0.72433281, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.7467438, + "num_input_tokens_seen": 214337470, + "step": 9945, + "time_per_iteration": 4.121801853179932 + }, + { + "auxiliary_loss_clip": 0.01098577, + "auxiliary_loss_mlp": 0.01107274, + "balance_loss_clip": 1.00164258, + "balance_loss_mlp": 1.00046289, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 1.867763055999552, + "language_loss": 0.67058718, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69264561, + "num_input_tokens_seen": 214357975, + "step": 9946, + "time_per_iteration": 2.7012264728546143 + }, + { + "auxiliary_loss_clip": 0.01151026, + "auxiliary_loss_mlp": 0.01108809, + "balance_loss_clip": 1.00207472, + "balance_loss_mlp": 1.00047183, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 1.9915731131981886, + "language_loss": 0.88419449, + "learning_rate": 1.468425107717461e-06, + "loss": 0.90679282, + "num_input_tokens_seen": 214374125, + "step": 9947, + "time_per_iteration": 2.5185060501098633 + }, + { + "auxiliary_loss_clip": 0.01165665, + "auxiliary_loss_mlp": 0.01107788, + "balance_loss_clip": 1.00203335, + "balance_loss_mlp": 1.00059485, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.7155587531240792, + "language_loss": 0.7194044, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74213898, + "num_input_tokens_seen": 214393395, + "step": 9948, + "time_per_iteration": 2.5189695358276367 + }, + { + "auxiliary_loss_clip": 0.01131864, + "auxiliary_loss_mlp": 0.0110786, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00047588, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 2.8339163249776047, + "language_loss": 0.89548719, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91788447, + "num_input_tokens_seen": 214411550, + "step": 9949, + "time_per_iteration": 2.63354754447937 + }, + { + "auxiliary_loss_clip": 0.0114942, + "auxiliary_loss_mlp": 0.0110742, + "balance_loss_clip": 1.00213385, + "balance_loss_mlp": 1.0006088, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 2.0815039268272955, + "language_loss": 0.70263398, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72520244, + "num_input_tokens_seen": 214429780, + "step": 9950, + "time_per_iteration": 2.525815486907959 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.01107972, + "balance_loss_clip": 1.00202763, + "balance_loss_mlp": 1.00049329, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.6550737342756796, + "language_loss": 0.78561139, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80818319, + "num_input_tokens_seen": 214447775, + "step": 9951, + "time_per_iteration": 2.60870623588562 + }, + { + "auxiliary_loss_clip": 0.01132113, + "auxiliary_loss_mlp": 0.01108303, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00072861, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 1.56571732787673, + "language_loss": 0.73755133, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.75995541, + "num_input_tokens_seen": 214467245, + "step": 9952, + "time_per_iteration": 2.594590425491333 + }, + { + "auxiliary_loss_clip": 0.01132543, + "auxiliary_loss_mlp": 0.00747471, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.0004468, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.4895102011304413, + "language_loss": 0.79056901, + "learning_rate": 1.466172750724613e-06, + "loss": 0.80936909, + "num_input_tokens_seen": 214484385, + "step": 9953, + "time_per_iteration": 2.5939643383026123 + }, + { + "auxiliary_loss_clip": 0.01118068, + "auxiliary_loss_mlp": 0.01107436, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.00052941, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.4297960730566572, + "language_loss": 0.69705468, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71930981, + "num_input_tokens_seen": 214503465, + "step": 9954, + "time_per_iteration": 2.6781907081604004 + }, + { + "auxiliary_loss_clip": 0.01132396, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00045347, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.883527862821195, + "language_loss": 0.73166388, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75406814, + "num_input_tokens_seen": 214520725, + "step": 9955, + "time_per_iteration": 2.5790908336639404 + }, + { + "auxiliary_loss_clip": 0.01165798, + "auxiliary_loss_mlp": 0.01107174, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.00055325, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 2.951467361153034, + "language_loss": 0.68722951, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70995927, + "num_input_tokens_seen": 214540675, + "step": 9956, + "time_per_iteration": 2.568183183670044 + }, + { + "auxiliary_loss_clip": 0.01165954, + "auxiliary_loss_mlp": 0.01108381, + "balance_loss_clip": 1.00214815, + "balance_loss_mlp": 1.00061607, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 3.575684856776777, + "language_loss": 0.73966777, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.76241106, + "num_input_tokens_seen": 214559910, + "step": 9957, + "time_per_iteration": 2.4981284141540527 + }, + { + "auxiliary_loss_clip": 0.01117172, + "auxiliary_loss_mlp": 0.01105961, + "balance_loss_clip": 1.00181282, + "balance_loss_mlp": 1.00058007, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.7888402287970528, + "language_loss": 0.84804094, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.87027222, + "num_input_tokens_seen": 214575960, + "step": 9958, + "time_per_iteration": 2.6203837394714355 + }, + { + "auxiliary_loss_clip": 0.01118785, + "auxiliary_loss_mlp": 0.00747389, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00040162, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 3.642954520645821, + "language_loss": 0.66289937, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68156105, + "num_input_tokens_seen": 214594230, + "step": 9959, + "time_per_iteration": 2.686375141143799 + }, + { + "auxiliary_loss_clip": 0.01151046, + "auxiliary_loss_mlp": 0.0110761, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.000512, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6885245322605211, + "language_loss": 0.83639801, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85898447, + "num_input_tokens_seen": 214613130, + "step": 9960, + "time_per_iteration": 2.5493509769439697 + }, + { + "auxiliary_loss_clip": 0.01135725, + "auxiliary_loss_mlp": 0.01108133, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00046325, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.505210473428543, + "language_loss": 0.79643846, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81887704, + "num_input_tokens_seen": 214634470, + "step": 9961, + "time_per_iteration": 2.660745859146118 + }, + { + "auxiliary_loss_clip": 0.0116581, + "auxiliary_loss_mlp": 0.01107561, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00046372, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.73319583857715, + "language_loss": 0.66866583, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69139951, + "num_input_tokens_seen": 214654030, + "step": 9962, + "time_per_iteration": 2.556274890899658 + }, + { + "auxiliary_loss_clip": 0.01150879, + "auxiliary_loss_mlp": 0.01107822, + "balance_loss_clip": 1.00205708, + "balance_loss_mlp": 1.0005337, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.4583165780725815, + "language_loss": 0.7429288, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.7655158, + "num_input_tokens_seen": 214676985, + "step": 9963, + "time_per_iteration": 2.6000070571899414 + }, + { + "auxiliary_loss_clip": 0.01149225, + "auxiliary_loss_mlp": 0.01106995, + "balance_loss_clip": 1.00203574, + "balance_loss_mlp": 1.00056517, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 1.7924725989031198, + "language_loss": 0.67569757, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.69825983, + "num_input_tokens_seen": 214700105, + "step": 9964, + "time_per_iteration": 2.692322015762329 + }, + { + "auxiliary_loss_clip": 0.0111656, + "auxiliary_loss_mlp": 0.01107959, + "balance_loss_clip": 1.00190008, + "balance_loss_mlp": 1.0005753, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 2.1353820494116538, + "language_loss": 0.77374327, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79598856, + "num_input_tokens_seen": 214717885, + "step": 9965, + "time_per_iteration": 2.6425399780273438 + }, + { + "auxiliary_loss_clip": 0.01149612, + "auxiliary_loss_mlp": 0.01107381, + "balance_loss_clip": 1.00195861, + "balance_loss_mlp": 1.00047457, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.9501537513440537, + "language_loss": 0.77242339, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79499334, + "num_input_tokens_seen": 214733680, + "step": 9966, + "time_per_iteration": 2.5059103965759277 + }, + { + "auxiliary_loss_clip": 0.01115749, + "auxiliary_loss_mlp": 0.01107624, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00052667, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.536409703264037, + "language_loss": 0.73279148, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75502527, + "num_input_tokens_seen": 214753285, + "step": 9967, + "time_per_iteration": 2.6682276725769043 + }, + { + "auxiliary_loss_clip": 0.01149321, + "auxiliary_loss_mlp": 0.01109033, + "balance_loss_clip": 1.00203061, + "balance_loss_mlp": 1.00059974, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 2.001865001915349, + "language_loss": 0.68820202, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.71078557, + "num_input_tokens_seen": 214767810, + "step": 9968, + "time_per_iteration": 2.499556303024292 + }, + { + "auxiliary_loss_clip": 0.01151189, + "auxiliary_loss_mlp": 0.01107911, + "balance_loss_clip": 1.00199318, + "balance_loss_mlp": 1.00052738, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 4.437709059676922, + "language_loss": 0.78921163, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81180263, + "num_input_tokens_seen": 214786040, + "step": 9969, + "time_per_iteration": 2.5447068214416504 + }, + { + "auxiliary_loss_clip": 0.01153454, + "auxiliary_loss_mlp": 0.01107023, + "balance_loss_clip": 1.0020709, + "balance_loss_mlp": 1.00040209, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 1.8020593290446372, + "language_loss": 0.81264383, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83524859, + "num_input_tokens_seen": 214803110, + "step": 9970, + "time_per_iteration": 3.9845941066741943 + }, + { + "auxiliary_loss_clip": 0.01103409, + "auxiliary_loss_mlp": 0.01108946, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00051284, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 2.1601019223481175, + "language_loss": 0.6193133, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64143682, + "num_input_tokens_seen": 214819945, + "step": 9971, + "time_per_iteration": 2.6411242485046387 + }, + { + "auxiliary_loss_clip": 0.01165763, + "auxiliary_loss_mlp": 0.01107812, + "balance_loss_clip": 1.00206017, + "balance_loss_mlp": 1.00042796, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.6614770451169283, + "language_loss": 0.79268754, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81542325, + "num_input_tokens_seen": 214838810, + "step": 9972, + "time_per_iteration": 2.591883659362793 + }, + { + "auxiliary_loss_clip": 0.01099246, + "auxiliary_loss_mlp": 0.01108854, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.0005163, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.078040746413895, + "language_loss": 0.76321065, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78529167, + "num_input_tokens_seen": 214857040, + "step": 9973, + "time_per_iteration": 2.726726531982422 + }, + { + "auxiliary_loss_clip": 0.01115825, + "auxiliary_loss_mlp": 0.01107228, + "balance_loss_clip": 1.00174749, + "balance_loss_mlp": 1.00060701, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.1707247319516636, + "language_loss": 0.65455246, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67678303, + "num_input_tokens_seen": 214873375, + "step": 9974, + "time_per_iteration": 2.6204731464385986 + }, + { + "auxiliary_loss_clip": 0.01150834, + "auxiliary_loss_mlp": 0.011079, + "balance_loss_clip": 1.00191164, + "balance_loss_mlp": 1.00051665, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.4195013307289257, + "language_loss": 0.74679601, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76938331, + "num_input_tokens_seen": 214893900, + "step": 9975, + "time_per_iteration": 2.634909152984619 + }, + { + "auxiliary_loss_clip": 0.01165899, + "auxiliary_loss_mlp": 0.01108199, + "balance_loss_clip": 1.00207806, + "balance_loss_mlp": 1.00043392, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 4.176918382820259, + "language_loss": 0.77378321, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79652417, + "num_input_tokens_seen": 214912110, + "step": 9976, + "time_per_iteration": 2.5894856452941895 + }, + { + "auxiliary_loss_clip": 0.01135199, + "auxiliary_loss_mlp": 0.01107999, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00061536, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 5.444188032225885, + "language_loss": 0.74734896, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76978099, + "num_input_tokens_seen": 214930140, + "step": 9977, + "time_per_iteration": 4.0109968185424805 + }, + { + "auxiliary_loss_clip": 0.01119072, + "auxiliary_loss_mlp": 0.01108832, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00059021, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.641249101615916, + "language_loss": 0.6886245, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.71090353, + "num_input_tokens_seen": 214949200, + "step": 9978, + "time_per_iteration": 2.646562099456787 + }, + { + "auxiliary_loss_clip": 0.01166048, + "auxiliary_loss_mlp": 0.01109733, + "balance_loss_clip": 1.00211978, + "balance_loss_mlp": 1.00063312, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.2912672264582383, + "language_loss": 0.81374323, + "learning_rate": 1.456420997543594e-06, + "loss": 0.836501, + "num_input_tokens_seen": 214965775, + "step": 9979, + "time_per_iteration": 2.507242202758789 + }, + { + "auxiliary_loss_clip": 0.01165528, + "auxiliary_loss_mlp": 0.01106644, + "balance_loss_clip": 1.00195158, + "balance_loss_mlp": 1.00059545, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 2.1107512340729153, + "language_loss": 0.70336097, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72608268, + "num_input_tokens_seen": 214982480, + "step": 9980, + "time_per_iteration": 2.5044727325439453 + }, + { + "auxiliary_loss_clip": 0.0115111, + "auxiliary_loss_mlp": 0.01108716, + "balance_loss_clip": 1.00208807, + "balance_loss_mlp": 1.00047374, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 2.915818837587539, + "language_loss": 0.68355286, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70615119, + "num_input_tokens_seen": 214998110, + "step": 9981, + "time_per_iteration": 3.908691883087158 + }, + { + "auxiliary_loss_clip": 0.01149075, + "auxiliary_loss_mlp": 0.01107075, + "balance_loss_clip": 1.0020026, + "balance_loss_mlp": 1.00054979, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 2.4533604245723994, + "language_loss": 0.78867257, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.81123412, + "num_input_tokens_seen": 215017995, + "step": 9982, + "time_per_iteration": 2.597843647003174 + }, + { + "auxiliary_loss_clip": 0.01103206, + "auxiliary_loss_mlp": 0.01107831, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00054276, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.5056905253547914, + "language_loss": 0.73096383, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.75307417, + "num_input_tokens_seen": 215038285, + "step": 9983, + "time_per_iteration": 4.127289533615112 + }, + { + "auxiliary_loss_clip": 0.01117701, + "auxiliary_loss_mlp": 0.01108753, + "balance_loss_clip": 1.00175738, + "balance_loss_mlp": 1.00060678, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 2.238997531403864, + "language_loss": 0.78028357, + "learning_rate": 1.454547250154447e-06, + "loss": 0.80254811, + "num_input_tokens_seen": 215057825, + "step": 9984, + "time_per_iteration": 2.64994215965271 + }, + { + "auxiliary_loss_clip": 0.01149654, + "auxiliary_loss_mlp": 0.01108234, + "balance_loss_clip": 1.00202572, + "balance_loss_mlp": 1.00046837, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.5993813621186657, + "language_loss": 0.83165598, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85423481, + "num_input_tokens_seen": 215077790, + "step": 9985, + "time_per_iteration": 2.633229970932007 + }, + { + "auxiliary_loss_clip": 0.0114917, + "auxiliary_loss_mlp": 0.01108044, + "balance_loss_clip": 1.00196755, + "balance_loss_mlp": 1.00065994, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 1.8998682167094414, + "language_loss": 0.71297014, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.7355423, + "num_input_tokens_seen": 215097650, + "step": 9986, + "time_per_iteration": 2.596794605255127 + }, + { + "auxiliary_loss_clip": 0.01166005, + "auxiliary_loss_mlp": 0.00747529, + "balance_loss_clip": 1.00215316, + "balance_loss_mlp": 1.00050855, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 1.6839499841627612, + "language_loss": 0.7184239, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73755926, + "num_input_tokens_seen": 215118235, + "step": 9987, + "time_per_iteration": 2.5637190341949463 + }, + { + "auxiliary_loss_clip": 0.01132748, + "auxiliary_loss_mlp": 0.0110741, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.00050259, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.9474618069190497, + "language_loss": 0.84781736, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.87021899, + "num_input_tokens_seen": 215136755, + "step": 9988, + "time_per_iteration": 2.6625523567199707 + }, + { + "auxiliary_loss_clip": 0.01151188, + "auxiliary_loss_mlp": 0.01108063, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.00058413, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.918134355271818, + "language_loss": 0.66032267, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.68291515, + "num_input_tokens_seen": 215155225, + "step": 9989, + "time_per_iteration": 2.520946502685547 + }, + { + "auxiliary_loss_clip": 0.01149245, + "auxiliary_loss_mlp": 0.01107828, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.00063467, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.8660807819000456, + "language_loss": 0.8033694, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82594019, + "num_input_tokens_seen": 215174815, + "step": 9990, + "time_per_iteration": 2.534177541732788 + }, + { + "auxiliary_loss_clip": 0.01116398, + "auxiliary_loss_mlp": 0.01108646, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00059485, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 2.6131731407733274, + "language_loss": 0.8245275, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84677792, + "num_input_tokens_seen": 215192045, + "step": 9991, + "time_per_iteration": 2.654808521270752 + }, + { + "auxiliary_loss_clip": 0.01101732, + "auxiliary_loss_mlp": 0.01107892, + "balance_loss_clip": 1.00175261, + "balance_loss_mlp": 1.00050807, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 2.282252823339493, + "language_loss": 0.8265934, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84868968, + "num_input_tokens_seen": 215209885, + "step": 9992, + "time_per_iteration": 2.6502819061279297 + }, + { + "auxiliary_loss_clip": 0.0113264, + "auxiliary_loss_mlp": 0.00747503, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.00049114, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 2.5950192427623135, + "language_loss": 0.66687304, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.68567449, + "num_input_tokens_seen": 215228150, + "step": 9993, + "time_per_iteration": 2.621432304382324 + }, + { + "auxiliary_loss_clip": 0.01117768, + "auxiliary_loss_mlp": 0.01107987, + "balance_loss_clip": 1.00189471, + "balance_loss_mlp": 1.00060332, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 3.107107286457601, + "language_loss": 0.80734885, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.82960641, + "num_input_tokens_seen": 215243755, + "step": 9994, + "time_per_iteration": 2.6128482818603516 + }, + { + "auxiliary_loss_clip": 0.01119099, + "auxiliary_loss_mlp": 0.01106232, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00046968, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 2.3626740558320893, + "language_loss": 0.72430396, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74655718, + "num_input_tokens_seen": 215262130, + "step": 9995, + "time_per_iteration": 2.639857769012451 + }, + { + "auxiliary_loss_clip": 0.01134246, + "auxiliary_loss_mlp": 0.01108387, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00043118, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 1.934805764956555, + "language_loss": 0.80914712, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83157349, + "num_input_tokens_seen": 215281785, + "step": 9996, + "time_per_iteration": 2.6315805912017822 + }, + { + "auxiliary_loss_clip": 0.01087681, + "auxiliary_loss_mlp": 0.01107776, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00067878, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 2.881433288414343, + "language_loss": 0.78325164, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80520618, + "num_input_tokens_seen": 215297550, + "step": 9997, + "time_per_iteration": 2.723787307739258 + }, + { + "auxiliary_loss_clip": 0.01149403, + "auxiliary_loss_mlp": 0.01108553, + "balance_loss_clip": 1.00196552, + "balance_loss_mlp": 1.00059748, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 1.91308644447225, + "language_loss": 0.72773272, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75031233, + "num_input_tokens_seen": 215316360, + "step": 9998, + "time_per_iteration": 2.5626933574676514 + }, + { + "auxiliary_loss_clip": 0.0113453, + "auxiliary_loss_mlp": 0.01107718, + "balance_loss_clip": 1.0019722, + "balance_loss_mlp": 1.00052547, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.533071119726425, + "language_loss": 0.72327477, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74569726, + "num_input_tokens_seen": 215336405, + "step": 9999, + "time_per_iteration": 2.6577014923095703 + }, + { + "auxiliary_loss_clip": 0.01101244, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_clip": 1.00174165, + "balance_loss_mlp": 1.00062513, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 1.747889563483842, + "language_loss": 0.78394455, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80604851, + "num_input_tokens_seen": 215356590, + "step": 10000, + "time_per_iteration": 2.690432548522949 + }, + { + "auxiliary_loss_clip": 0.01166162, + "auxiliary_loss_mlp": 0.01108603, + "balance_loss_clip": 1.00219536, + "balance_loss_mlp": 1.00055194, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 1.7554613751986787, + "language_loss": 0.77264345, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79539108, + "num_input_tokens_seen": 215374295, + "step": 10001, + "time_per_iteration": 2.5296499729156494 + }, + { + "auxiliary_loss_clip": 0.01149394, + "auxiliary_loss_mlp": 0.01108607, + "balance_loss_clip": 1.00205791, + "balance_loss_mlp": 1.00046003, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 1.6721032520079344, + "language_loss": 0.59366906, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.61624902, + "num_input_tokens_seen": 215394535, + "step": 10002, + "time_per_iteration": 2.6689207553863525 + }, + { + "auxiliary_loss_clip": 0.01134244, + "auxiliary_loss_mlp": 0.01108366, + "balance_loss_clip": 1.00200629, + "balance_loss_mlp": 1.00050545, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.4647170830602017, + "language_loss": 0.77786899, + "learning_rate": 1.447431741055314e-06, + "loss": 0.800295, + "num_input_tokens_seen": 215414355, + "step": 10003, + "time_per_iteration": 2.6353652477264404 + }, + { + "auxiliary_loss_clip": 0.01165904, + "auxiliary_loss_mlp": 0.01108987, + "balance_loss_clip": 1.0020653, + "balance_loss_mlp": 1.00064981, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 3.539498300995466, + "language_loss": 0.77803445, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.80078328, + "num_input_tokens_seen": 215428280, + "step": 10004, + "time_per_iteration": 2.520646333694458 + }, + { + "auxiliary_loss_clip": 0.01149095, + "auxiliary_loss_mlp": 0.01107969, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00058508, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.4422864156998325, + "language_loss": 0.72517902, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74774963, + "num_input_tokens_seen": 215448970, + "step": 10005, + "time_per_iteration": 2.590883255004883 + }, + { + "auxiliary_loss_clip": 0.01165742, + "auxiliary_loss_mlp": 0.01107944, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00056028, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 2.0476665686492264, + "language_loss": 0.74705529, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.76979208, + "num_input_tokens_seen": 215465260, + "step": 10006, + "time_per_iteration": 2.4918465614318848 + }, + { + "auxiliary_loss_clip": 0.0113243, + "auxiliary_loss_mlp": 0.01108357, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00059199, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 2.736910997015976, + "language_loss": 0.74017537, + "learning_rate": 1.445934699732685e-06, + "loss": 0.76258326, + "num_input_tokens_seen": 215482725, + "step": 10007, + "time_per_iteration": 2.5721564292907715 + }, + { + "auxiliary_loss_clip": 0.01134023, + "auxiliary_loss_mlp": 0.01108016, + "balance_loss_clip": 1.00202847, + "balance_loss_mlp": 1.00044203, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 1.7843992903049333, + "language_loss": 0.7026577, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72507811, + "num_input_tokens_seen": 215500420, + "step": 10008, + "time_per_iteration": 4.035009145736694 + }, + { + "auxiliary_loss_clip": 0.0115104, + "auxiliary_loss_mlp": 0.01107921, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00044227, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.6481192312170072, + "language_loss": 0.76276183, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78535146, + "num_input_tokens_seen": 215522260, + "step": 10009, + "time_per_iteration": 2.5938379764556885 + }, + { + "auxiliary_loss_clip": 0.01133695, + "auxiliary_loss_mlp": 0.00747315, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00044298, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.286990228368338, + "language_loss": 0.74098045, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.75979054, + "num_input_tokens_seen": 215541715, + "step": 10010, + "time_per_iteration": 2.6260251998901367 + }, + { + "auxiliary_loss_clip": 0.01145951, + "auxiliary_loss_mlp": 0.01083893, + "balance_loss_clip": 1.00129199, + "balance_loss_mlp": 0.99996978, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.8059708183101509, + "language_loss": 0.55086946, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57316792, + "num_input_tokens_seen": 215603020, + "step": 10011, + "time_per_iteration": 3.1893420219421387 + }, + { + "auxiliary_loss_clip": 0.01150637, + "auxiliary_loss_mlp": 0.01107584, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00058186, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.5042805441742038, + "language_loss": 0.6231662, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64574838, + "num_input_tokens_seen": 215625115, + "step": 10012, + "time_per_iteration": 2.725064992904663 + }, + { + "auxiliary_loss_clip": 0.0111794, + "auxiliary_loss_mlp": 0.0110785, + "balance_loss_clip": 1.00205183, + "balance_loss_mlp": 1.00046647, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.6979739988881482, + "language_loss": 0.75229847, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.7745564, + "num_input_tokens_seen": 215643730, + "step": 10013, + "time_per_iteration": 2.6387381553649902 + }, + { + "auxiliary_loss_clip": 0.01165622, + "auxiliary_loss_mlp": 0.01106197, + "balance_loss_clip": 1.00203204, + "balance_loss_mlp": 1.00062513, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.6072602044454567, + "language_loss": 0.81605053, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83876872, + "num_input_tokens_seen": 215664425, + "step": 10014, + "time_per_iteration": 4.088608980178833 + }, + { + "auxiliary_loss_clip": 0.01134183, + "auxiliary_loss_mlp": 0.01106267, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.00050497, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.4499384499668055, + "language_loss": 0.7219584, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74436295, + "num_input_tokens_seen": 215684280, + "step": 10015, + "time_per_iteration": 2.6474578380584717 + }, + { + "auxiliary_loss_clip": 0.01128813, + "auxiliary_loss_mlp": 0.01083494, + "balance_loss_clip": 1.00120139, + "balance_loss_mlp": 0.99995244, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8234120458487836, + "language_loss": 0.54787076, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56999379, + "num_input_tokens_seen": 215739780, + "step": 10016, + "time_per_iteration": 3.0462639331817627 + }, + { + "auxiliary_loss_clip": 0.0113276, + "auxiliary_loss_mlp": 0.01106757, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00051808, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.477973008923002, + "language_loss": 0.8288275, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85122269, + "num_input_tokens_seen": 215757885, + "step": 10017, + "time_per_iteration": 3.9774529933929443 + }, + { + "auxiliary_loss_clip": 0.01133614, + "auxiliary_loss_mlp": 0.01107214, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00059307, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.814846497883773, + "language_loss": 0.83639216, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85880041, + "num_input_tokens_seen": 215776415, + "step": 10018, + "time_per_iteration": 2.6293182373046875 + }, + { + "auxiliary_loss_clip": 0.01134438, + "auxiliary_loss_mlp": 0.01108908, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00066626, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 1.5900399221333243, + "language_loss": 0.78977621, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.81220973, + "num_input_tokens_seen": 215794865, + "step": 10019, + "time_per_iteration": 2.6165902614593506 + }, + { + "auxiliary_loss_clip": 0.01115275, + "auxiliary_loss_mlp": 0.00747307, + "balance_loss_clip": 1.00164866, + "balance_loss_mlp": 1.00052285, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.7235884461834459, + "language_loss": 0.73719162, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75581741, + "num_input_tokens_seen": 215816840, + "step": 10020, + "time_per_iteration": 4.114797115325928 + }, + { + "auxiliary_loss_clip": 0.01134341, + "auxiliary_loss_mlp": 0.01107704, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00051105, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.722437612150098, + "language_loss": 0.64129347, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66371393, + "num_input_tokens_seen": 215836100, + "step": 10021, + "time_per_iteration": 2.6017680168151855 + }, + { + "auxiliary_loss_clip": 0.0114865, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00054932, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.3904954527521443, + "language_loss": 0.80589497, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82846177, + "num_input_tokens_seen": 215858480, + "step": 10022, + "time_per_iteration": 2.5958383083343506 + }, + { + "auxiliary_loss_clip": 0.0115403, + "auxiliary_loss_mlp": 0.01108935, + "balance_loss_clip": 1.0023427, + "balance_loss_mlp": 1.00050211, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.4943184338026525, + "language_loss": 0.66448051, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68711013, + "num_input_tokens_seen": 215879950, + "step": 10023, + "time_per_iteration": 2.663621425628662 + }, + { + "auxiliary_loss_clip": 0.01150951, + "auxiliary_loss_mlp": 0.0110773, + "balance_loss_clip": 1.0019753, + "balance_loss_mlp": 1.0005374, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 1.8916144388263452, + "language_loss": 0.74472684, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76731366, + "num_input_tokens_seen": 215899830, + "step": 10024, + "time_per_iteration": 2.634451389312744 + }, + { + "auxiliary_loss_clip": 0.01149067, + "auxiliary_loss_mlp": 0.01107364, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00055218, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.8124086210039372, + "language_loss": 0.73022091, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75278521, + "num_input_tokens_seen": 215920440, + "step": 10025, + "time_per_iteration": 2.5764143466949463 + }, + { + "auxiliary_loss_clip": 0.01165953, + "auxiliary_loss_mlp": 0.01109138, + "balance_loss_clip": 1.00203788, + "balance_loss_mlp": 1.00051451, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.406386933064542, + "language_loss": 0.67258722, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69533813, + "num_input_tokens_seen": 215940535, + "step": 10026, + "time_per_iteration": 2.5364246368408203 + }, + { + "auxiliary_loss_clip": 0.01165582, + "auxiliary_loss_mlp": 0.01106275, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00051212, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 1.9074116155655334, + "language_loss": 0.79924119, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82195973, + "num_input_tokens_seen": 215958045, + "step": 10027, + "time_per_iteration": 2.52168869972229 + }, + { + "auxiliary_loss_clip": 0.01119666, + "auxiliary_loss_mlp": 0.01108584, + "balance_loss_clip": 1.00197685, + "balance_loss_mlp": 1.00053239, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 3.0455617307610874, + "language_loss": 0.70951712, + "learning_rate": 1.438080769071171e-06, + "loss": 0.7317996, + "num_input_tokens_seen": 215977330, + "step": 10028, + "time_per_iteration": 2.650297164916992 + }, + { + "auxiliary_loss_clip": 0.01102846, + "auxiliary_loss_mlp": 0.0110741, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.00059855, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 3.0216624363749482, + "language_loss": 0.84471714, + "learning_rate": 1.437707005721669e-06, + "loss": 0.86681962, + "num_input_tokens_seen": 215997865, + "step": 10029, + "time_per_iteration": 2.690739631652832 + }, + { + "auxiliary_loss_clip": 0.01134198, + "auxiliary_loss_mlp": 0.0110721, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00058913, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 2.7467763921658452, + "language_loss": 0.80026847, + "learning_rate": 1.437333263694373e-06, + "loss": 0.8226825, + "num_input_tokens_seen": 216016230, + "step": 10030, + "time_per_iteration": 2.571723699569702 + }, + { + "auxiliary_loss_clip": 0.01090217, + "auxiliary_loss_mlp": 0.01107375, + "balance_loss_clip": 1.00191295, + "balance_loss_mlp": 1.00046802, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.6404767102625892, + "language_loss": 0.71273267, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73470867, + "num_input_tokens_seen": 216035785, + "step": 10031, + "time_per_iteration": 2.7367630004882812 + }, + { + "auxiliary_loss_clip": 0.0110063, + "auxiliary_loss_mlp": 0.0110836, + "balance_loss_clip": 1.00172758, + "balance_loss_mlp": 1.00049973, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.5413108697754168, + "language_loss": 0.73219138, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75428122, + "num_input_tokens_seen": 216059555, + "step": 10032, + "time_per_iteration": 2.7505366802215576 + }, + { + "auxiliary_loss_clip": 0.01134859, + "auxiliary_loss_mlp": 0.01107429, + "balance_loss_clip": 1.00215149, + "balance_loss_mlp": 1.00052166, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 1.8553115895853032, + "language_loss": 0.68432868, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70675153, + "num_input_tokens_seen": 216077235, + "step": 10033, + "time_per_iteration": 2.5893733501434326 + }, + { + "auxiliary_loss_clip": 0.01134774, + "auxiliary_loss_mlp": 0.01106164, + "balance_loss_clip": 1.00209033, + "balance_loss_mlp": 1.00059199, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 2.1123645496303407, + "language_loss": 0.75874352, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.78115284, + "num_input_tokens_seen": 216094985, + "step": 10034, + "time_per_iteration": 2.592560291290283 + }, + { + "auxiliary_loss_clip": 0.01132306, + "auxiliary_loss_mlp": 0.01108749, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00060272, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 1.9621918882800156, + "language_loss": 0.74865234, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.77106285, + "num_input_tokens_seen": 216115905, + "step": 10035, + "time_per_iteration": 2.8041908740997314 + }, + { + "auxiliary_loss_clip": 0.01134369, + "auxiliary_loss_mlp": 0.0110701, + "balance_loss_clip": 1.00200689, + "balance_loss_mlp": 1.00038934, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.7285955350378537, + "language_loss": 0.86467755, + "learning_rate": 1.435091260090536e-06, + "loss": 0.8870914, + "num_input_tokens_seen": 216132420, + "step": 10036, + "time_per_iteration": 2.614593505859375 + }, + { + "auxiliary_loss_clip": 0.01115553, + "auxiliary_loss_mlp": 0.01107762, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00047398, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 2.1734669696612823, + "language_loss": 0.70473897, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.7269721, + "num_input_tokens_seen": 216149800, + "step": 10037, + "time_per_iteration": 2.6374292373657227 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01107731, + "balance_loss_clip": 1.0020659, + "balance_loss_mlp": 1.00053823, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 2.306658208653264, + "language_loss": 0.85602438, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87860918, + "num_input_tokens_seen": 216168200, + "step": 10038, + "time_per_iteration": 2.5698323249816895 + }, + { + "auxiliary_loss_clip": 0.01135352, + "auxiliary_loss_mlp": 0.01107138, + "balance_loss_clip": 1.00192034, + "balance_loss_mlp": 1.00051737, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 2.09015056511822, + "language_loss": 0.76040643, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78283137, + "num_input_tokens_seen": 216187105, + "step": 10039, + "time_per_iteration": 2.626534938812256 + }, + { + "auxiliary_loss_clip": 0.01149058, + "auxiliary_loss_mlp": 0.01106686, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.00054169, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.5708906272157206, + "language_loss": 0.71328938, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73584688, + "num_input_tokens_seen": 216205440, + "step": 10040, + "time_per_iteration": 2.5869498252868652 + }, + { + "auxiliary_loss_clip": 0.01148963, + "auxiliary_loss_mlp": 0.01108582, + "balance_loss_clip": 1.00197244, + "balance_loss_mlp": 1.00043523, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 1.9093549463168127, + "language_loss": 0.77986187, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80243731, + "num_input_tokens_seen": 216223130, + "step": 10041, + "time_per_iteration": 2.5530214309692383 + }, + { + "auxiliary_loss_clip": 0.01133505, + "auxiliary_loss_mlp": 0.01107501, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00049901, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.8751135848649867, + "language_loss": 0.75790596, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.780316, + "num_input_tokens_seen": 216240260, + "step": 10042, + "time_per_iteration": 2.555208683013916 + }, + { + "auxiliary_loss_clip": 0.0110181, + "auxiliary_loss_mlp": 0.01107622, + "balance_loss_clip": 1.00175369, + "balance_loss_mlp": 1.00052476, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 1.9340295999788828, + "language_loss": 0.84468269, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.866777, + "num_input_tokens_seen": 216258510, + "step": 10043, + "time_per_iteration": 2.649857521057129 + }, + { + "auxiliary_loss_clip": 0.01117415, + "auxiliary_loss_mlp": 0.0110867, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00061846, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.9737596529782762, + "language_loss": 0.69498688, + "learning_rate": 1.432103122078974e-06, + "loss": 0.71724772, + "num_input_tokens_seen": 216277550, + "step": 10044, + "time_per_iteration": 2.629370927810669 + }, + { + "auxiliary_loss_clip": 0.01149004, + "auxiliary_loss_mlp": 0.01108286, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00052094, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 1.700256839492909, + "language_loss": 0.77936202, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80193496, + "num_input_tokens_seen": 216296690, + "step": 10045, + "time_per_iteration": 4.004278898239136 + }, + { + "auxiliary_loss_clip": 0.01087251, + "auxiliary_loss_mlp": 0.01106738, + "balance_loss_clip": 1.0017699, + "balance_loss_mlp": 1.00049877, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.8723219380455525, + "language_loss": 0.77098858, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.79292846, + "num_input_tokens_seen": 216316110, + "step": 10046, + "time_per_iteration": 2.7311761379241943 + }, + { + "auxiliary_loss_clip": 0.01104006, + "auxiliary_loss_mlp": 0.01106697, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.00055301, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.5999179215974833, + "language_loss": 0.8699773, + "learning_rate": 1.430982925257827e-06, + "loss": 0.8920843, + "num_input_tokens_seen": 216333855, + "step": 10047, + "time_per_iteration": 2.7004926204681396 + }, + { + "auxiliary_loss_clip": 0.01151027, + "auxiliary_loss_mlp": 0.01106646, + "balance_loss_clip": 1.00202405, + "balance_loss_mlp": 1.00050235, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.5736955413201401, + "language_loss": 0.75645328, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77902997, + "num_input_tokens_seen": 216354890, + "step": 10048, + "time_per_iteration": 2.631563186645508 + }, + { + "auxiliary_loss_clip": 0.01153578, + "auxiliary_loss_mlp": 0.01109177, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.00045848, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 2.384683802714721, + "language_loss": 0.66342998, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68605751, + "num_input_tokens_seen": 216376055, + "step": 10049, + "time_per_iteration": 2.657257318496704 + }, + { + "auxiliary_loss_clip": 0.0113542, + "auxiliary_loss_mlp": 0.01107901, + "balance_loss_clip": 1.00197899, + "balance_loss_mlp": 1.0006125, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.5585226053708485, + "language_loss": 0.66552758, + "learning_rate": 1.429862922631336e-06, + "loss": 0.68796074, + "num_input_tokens_seen": 216396295, + "step": 10050, + "time_per_iteration": 2.607726573944092 + }, + { + "auxiliary_loss_clip": 0.0111757, + "auxiliary_loss_mlp": 0.01107785, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00049639, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 2.0105841959507105, + "language_loss": 0.70029223, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.7225458, + "num_input_tokens_seen": 216416605, + "step": 10051, + "time_per_iteration": 4.126757621765137 + }, + { + "auxiliary_loss_clip": 0.01148307, + "auxiliary_loss_mlp": 0.01107189, + "balance_loss_clip": 1.00170362, + "balance_loss_mlp": 1.00047243, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 2.293799094069772, + "language_loss": 0.64308691, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66564184, + "num_input_tokens_seen": 216435130, + "step": 10052, + "time_per_iteration": 2.526200771331787 + }, + { + "auxiliary_loss_clip": 0.01132456, + "auxiliary_loss_mlp": 0.01108329, + "balance_loss_clip": 1.00181019, + "balance_loss_mlp": 1.00065947, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 2.4727526143900906, + "language_loss": 0.68866599, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71107376, + "num_input_tokens_seen": 216455640, + "step": 10053, + "time_per_iteration": 2.6397759914398193 + }, + { + "auxiliary_loss_clip": 0.0112922, + "auxiliary_loss_mlp": 0.01083885, + "balance_loss_clip": 1.00117683, + "balance_loss_mlp": 0.99996179, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7323434305338334, + "language_loss": 0.60409707, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62622815, + "num_input_tokens_seen": 216518130, + "step": 10054, + "time_per_iteration": 3.2687628269195557 + }, + { + "auxiliary_loss_clip": 0.01084246, + "auxiliary_loss_mlp": 0.01106297, + "balance_loss_clip": 1.00170255, + "balance_loss_mlp": 1.00053477, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 3.321462537655228, + "language_loss": 0.85900366, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.88090909, + "num_input_tokens_seen": 216536845, + "step": 10055, + "time_per_iteration": 4.129164218902588 + }, + { + "auxiliary_loss_clip": 0.01135505, + "auxiliary_loss_mlp": 0.01108258, + "balance_loss_clip": 1.00212097, + "balance_loss_mlp": 1.00068355, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.548627179506917, + "language_loss": 0.73757946, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.7600171, + "num_input_tokens_seen": 216551860, + "step": 10056, + "time_per_iteration": 2.578240156173706 + }, + { + "auxiliary_loss_clip": 0.01119318, + "auxiliary_loss_mlp": 0.01108277, + "balance_loss_clip": 1.00205362, + "balance_loss_mlp": 1.00060725, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.5449784703134453, + "language_loss": 0.80153024, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82380623, + "num_input_tokens_seen": 216574775, + "step": 10057, + "time_per_iteration": 2.7247421741485596 + }, + { + "auxiliary_loss_clip": 0.01165657, + "auxiliary_loss_mlp": 0.0074729, + "balance_loss_clip": 1.00201058, + "balance_loss_mlp": 1.00052571, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.385615314618719, + "language_loss": 0.75111556, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.77024502, + "num_input_tokens_seen": 216590100, + "step": 10058, + "time_per_iteration": 3.9512581825256348 + }, + { + "auxiliary_loss_clip": 0.01148993, + "auxiliary_loss_mlp": 0.01106862, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00052702, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 1.952623434130673, + "language_loss": 0.71015286, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73271143, + "num_input_tokens_seen": 216610145, + "step": 10059, + "time_per_iteration": 2.593386173248291 + }, + { + "auxiliary_loss_clip": 0.01132251, + "auxiliary_loss_mlp": 0.01107339, + "balance_loss_clip": 1.00184011, + "balance_loss_mlp": 1.00052738, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.5944298168878654, + "language_loss": 0.76268393, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78507984, + "num_input_tokens_seen": 216630625, + "step": 10060, + "time_per_iteration": 2.6148173809051514 + }, + { + "auxiliary_loss_clip": 0.01150461, + "auxiliary_loss_mlp": 0.01107555, + "balance_loss_clip": 1.00197339, + "balance_loss_mlp": 1.00045753, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 2.0184280462818003, + "language_loss": 0.73663366, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75921381, + "num_input_tokens_seen": 216649255, + "step": 10061, + "time_per_iteration": 2.5656940937042236 + }, + { + "auxiliary_loss_clip": 0.01104087, + "auxiliary_loss_mlp": 0.00747411, + "balance_loss_clip": 1.00170112, + "balance_loss_mlp": 1.0004859, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 1.8031863730058353, + "language_loss": 0.67237794, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69089293, + "num_input_tokens_seen": 216668100, + "step": 10062, + "time_per_iteration": 2.6783089637756348 + }, + { + "auxiliary_loss_clip": 0.01151108, + "auxiliary_loss_mlp": 0.01107147, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00062132, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.0605435160175043, + "language_loss": 0.71703219, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73961473, + "num_input_tokens_seen": 216686125, + "step": 10063, + "time_per_iteration": 2.5317649841308594 + }, + { + "auxiliary_loss_clip": 0.01165701, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_clip": 1.00202036, + "balance_loss_mlp": 1.000561, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 10.332117665450218, + "language_loss": 0.84799653, + "learning_rate": 1.424638822621926e-06, + "loss": 0.87072241, + "num_input_tokens_seen": 216704265, + "step": 10064, + "time_per_iteration": 2.498258590698242 + }, + { + "auxiliary_loss_clip": 0.01150081, + "auxiliary_loss_mlp": 0.0110788, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00059175, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.4053557654766973, + "language_loss": 0.80077606, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.82335567, + "num_input_tokens_seen": 216721765, + "step": 10065, + "time_per_iteration": 2.548765182495117 + }, + { + "auxiliary_loss_clip": 0.0110299, + "auxiliary_loss_mlp": 0.01109445, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00053573, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 2.2364395486970303, + "language_loss": 0.78805459, + "learning_rate": 1.423892870799226e-06, + "loss": 0.810179, + "num_input_tokens_seen": 216738295, + "step": 10066, + "time_per_iteration": 2.6327977180480957 + }, + { + "auxiliary_loss_clip": 0.01091652, + "auxiliary_loss_mlp": 0.0110754, + "balance_loss_clip": 1.0018518, + "balance_loss_mlp": 1.00053787, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.9104480786427207, + "language_loss": 0.73106086, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75305271, + "num_input_tokens_seen": 216759875, + "step": 10067, + "time_per_iteration": 2.736419677734375 + }, + { + "auxiliary_loss_clip": 0.01132292, + "auxiliary_loss_mlp": 0.00747268, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00053847, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.3689411140203087, + "language_loss": 0.68871725, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.70751286, + "num_input_tokens_seen": 216780705, + "step": 10068, + "time_per_iteration": 2.653188467025757 + }, + { + "auxiliary_loss_clip": 0.01149394, + "auxiliary_loss_mlp": 0.01107849, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00046504, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 2.247309164689066, + "language_loss": 0.86976099, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89233351, + "num_input_tokens_seen": 216797625, + "step": 10069, + "time_per_iteration": 2.538856267929077 + }, + { + "auxiliary_loss_clip": 0.01116921, + "auxiliary_loss_mlp": 0.01107753, + "balance_loss_clip": 1.00180185, + "balance_loss_mlp": 1.00055981, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.7456430005198822, + "language_loss": 0.83405304, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85629982, + "num_input_tokens_seen": 216817610, + "step": 10070, + "time_per_iteration": 2.6732265949249268 + }, + { + "auxiliary_loss_clip": 0.01136327, + "auxiliary_loss_mlp": 0.01107784, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00059104, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.6302179993172614, + "language_loss": 0.85971749, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88215852, + "num_input_tokens_seen": 216836835, + "step": 10071, + "time_per_iteration": 2.6038637161254883 + }, + { + "auxiliary_loss_clip": 0.01149171, + "auxiliary_loss_mlp": 0.01108495, + "balance_loss_clip": 1.00199699, + "balance_loss_mlp": 1.00063431, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.6494875836722453, + "language_loss": 0.76822472, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79080135, + "num_input_tokens_seen": 216856760, + "step": 10072, + "time_per_iteration": 2.6292710304260254 + }, + { + "auxiliary_loss_clip": 0.01135299, + "auxiliary_loss_mlp": 0.01107807, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00032759, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.6463490798620868, + "language_loss": 0.7422626, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76469374, + "num_input_tokens_seen": 216878795, + "step": 10073, + "time_per_iteration": 2.6509222984313965 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01083548, + "balance_loss_clip": 1.00121379, + "balance_loss_mlp": 1.0000062, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7778755755097329, + "language_loss": 0.55178159, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57362068, + "num_input_tokens_seen": 216937800, + "step": 10074, + "time_per_iteration": 3.298438787460327 + }, + { + "auxiliary_loss_clip": 0.01117228, + "auxiliary_loss_mlp": 0.01107474, + "balance_loss_clip": 1.00182879, + "balance_loss_mlp": 1.00047207, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.7450869477117672, + "language_loss": 0.81760943, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.83985645, + "num_input_tokens_seen": 216955280, + "step": 10075, + "time_per_iteration": 2.633186101913452 + }, + { + "auxiliary_loss_clip": 0.01150488, + "auxiliary_loss_mlp": 0.01107902, + "balance_loss_clip": 1.00201035, + "balance_loss_mlp": 1.00042319, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 2.0155586182588423, + "language_loss": 0.77816176, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80074561, + "num_input_tokens_seen": 216976950, + "step": 10076, + "time_per_iteration": 2.6158199310302734 + }, + { + "auxiliary_loss_clip": 0.01151072, + "auxiliary_loss_mlp": 0.01107829, + "balance_loss_clip": 1.00194418, + "balance_loss_mlp": 1.00054097, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.852792643837907, + "language_loss": 0.72396129, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74655026, + "num_input_tokens_seen": 216996945, + "step": 10077, + "time_per_iteration": 2.5590295791625977 + }, + { + "auxiliary_loss_clip": 0.01165691, + "auxiliary_loss_mlp": 0.0110842, + "balance_loss_clip": 1.00202894, + "balance_loss_mlp": 1.00055921, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 2.348870874022758, + "language_loss": 0.5536958, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57643688, + "num_input_tokens_seen": 217016580, + "step": 10078, + "time_per_iteration": 2.518836259841919 + }, + { + "auxiliary_loss_clip": 0.01102032, + "auxiliary_loss_mlp": 0.0110747, + "balance_loss_clip": 1.00169468, + "balance_loss_mlp": 1.0004679, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 1.617067582730835, + "language_loss": 0.7040205, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72611558, + "num_input_tokens_seen": 217037300, + "step": 10079, + "time_per_iteration": 2.7392690181732178 + }, + { + "auxiliary_loss_clip": 0.01132299, + "auxiliary_loss_mlp": 0.01108347, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.00067699, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.8786232867643957, + "language_loss": 0.62371403, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64612049, + "num_input_tokens_seen": 217055805, + "step": 10080, + "time_per_iteration": 2.6311230659484863 + }, + { + "auxiliary_loss_clip": 0.01134485, + "auxiliary_loss_mlp": 0.0110772, + "balance_loss_clip": 1.00194061, + "balance_loss_mlp": 1.00062191, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 2.0048767157070118, + "language_loss": 0.71157712, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73399913, + "num_input_tokens_seen": 217074175, + "step": 10081, + "time_per_iteration": 2.5993540287017822 + }, + { + "auxiliary_loss_clip": 0.01133806, + "auxiliary_loss_mlp": 0.01107398, + "balance_loss_clip": 1.001966, + "balance_loss_mlp": 1.0004909, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.766301271710489, + "language_loss": 0.68937588, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71178794, + "num_input_tokens_seen": 217095695, + "step": 10082, + "time_per_iteration": 2.7020561695098877 + }, + { + "auxiliary_loss_clip": 0.01165797, + "auxiliary_loss_mlp": 0.01107522, + "balance_loss_clip": 1.00204587, + "balance_loss_mlp": 1.00061512, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.4999843446344503, + "language_loss": 0.6585263, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.68125951, + "num_input_tokens_seen": 217116260, + "step": 10083, + "time_per_iteration": 3.948542594909668 + }, + { + "auxiliary_loss_clip": 0.01149074, + "auxiliary_loss_mlp": 0.01107701, + "balance_loss_clip": 1.00190735, + "balance_loss_mlp": 1.00050759, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 1.985610270922878, + "language_loss": 0.74006051, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.7626282, + "num_input_tokens_seen": 217134465, + "step": 10084, + "time_per_iteration": 2.5398738384246826 + }, + { + "auxiliary_loss_clip": 0.0113407, + "auxiliary_loss_mlp": 0.01107854, + "balance_loss_clip": 1.00194025, + "balance_loss_mlp": 1.00066113, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 2.257058304321894, + "language_loss": 0.71998274, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74240196, + "num_input_tokens_seen": 217149920, + "step": 10085, + "time_per_iteration": 2.5682404041290283 + }, + { + "auxiliary_loss_clip": 0.01165632, + "auxiliary_loss_mlp": 0.01107645, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00064278, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.10591663013036, + "language_loss": 0.76660621, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78933895, + "num_input_tokens_seen": 217168165, + "step": 10086, + "time_per_iteration": 2.515038013458252 + }, + { + "auxiliary_loss_clip": 0.01134695, + "auxiliary_loss_mlp": 0.01106912, + "balance_loss_clip": 1.00194323, + "balance_loss_mlp": 1.00057769, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.2718508080527584, + "language_loss": 0.72594756, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74836361, + "num_input_tokens_seen": 217190070, + "step": 10087, + "time_per_iteration": 2.6277072429656982 + }, + { + "auxiliary_loss_clip": 0.01148951, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.00188887, + "balance_loss_mlp": 1.0008142, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.648806220825558, + "language_loss": 0.83565795, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.85821134, + "num_input_tokens_seen": 217209370, + "step": 10088, + "time_per_iteration": 2.5712006092071533 + }, + { + "auxiliary_loss_clip": 0.01102772, + "auxiliary_loss_mlp": 0.00747293, + "balance_loss_clip": 1.00178826, + "balance_loss_mlp": 1.00040531, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.4022794943179124, + "language_loss": 0.71186197, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73036253, + "num_input_tokens_seen": 217226990, + "step": 10089, + "time_per_iteration": 4.0813422203063965 + }, + { + "auxiliary_loss_clip": 0.0114908, + "auxiliary_loss_mlp": 0.01107346, + "balance_loss_clip": 1.00200582, + "balance_loss_mlp": 1.00072563, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 5.425723271950715, + "language_loss": 0.82666987, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.84923416, + "num_input_tokens_seen": 217244585, + "step": 10090, + "time_per_iteration": 2.543982744216919 + }, + { + "auxiliary_loss_clip": 0.01116061, + "auxiliary_loss_mlp": 0.01108952, + "balance_loss_clip": 1.00170219, + "balance_loss_mlp": 1.00071001, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.112674571077342, + "language_loss": 0.75671768, + "learning_rate": 1.4145758826341e-06, + "loss": 0.7789678, + "num_input_tokens_seen": 217263435, + "step": 10091, + "time_per_iteration": 2.6229138374328613 + }, + { + "auxiliary_loss_clip": 0.01165732, + "auxiliary_loss_mlp": 0.01106782, + "balance_loss_clip": 1.00206387, + "balance_loss_mlp": 1.00044775, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 2.081667964216401, + "language_loss": 0.79647875, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81920391, + "num_input_tokens_seen": 217283725, + "step": 10092, + "time_per_iteration": 2.5396885871887207 + }, + { + "auxiliary_loss_clip": 0.01132373, + "auxiliary_loss_mlp": 0.01107593, + "balance_loss_clip": 1.00188005, + "balance_loss_mlp": 1.00068593, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.799903678031133, + "language_loss": 0.75901902, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78141868, + "num_input_tokens_seen": 217301120, + "step": 10093, + "time_per_iteration": 4.02031683921814 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00056767, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 1.725054463574911, + "language_loss": 0.87230504, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89470834, + "num_input_tokens_seen": 217319585, + "step": 10094, + "time_per_iteration": 2.6078145503997803 + }, + { + "auxiliary_loss_clip": 0.01151039, + "auxiliary_loss_mlp": 0.01107778, + "balance_loss_clip": 1.00203907, + "balance_loss_mlp": 1.00048983, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 2.3972564428935557, + "language_loss": 0.72171724, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74430537, + "num_input_tokens_seen": 217338880, + "step": 10095, + "time_per_iteration": 2.541546106338501 + }, + { + "auxiliary_loss_clip": 0.01134283, + "auxiliary_loss_mlp": 0.01106734, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00049496, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.8902911527021895, + "language_loss": 0.76697761, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.78938782, + "num_input_tokens_seen": 217357480, + "step": 10096, + "time_per_iteration": 4.024595022201538 + }, + { + "auxiliary_loss_clip": 0.01165795, + "auxiliary_loss_mlp": 0.01108054, + "balance_loss_clip": 1.00198829, + "balance_loss_mlp": 1.00066996, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 1.8154620963844263, + "language_loss": 0.79521263, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.81795108, + "num_input_tokens_seen": 217374575, + "step": 10097, + "time_per_iteration": 2.486654043197632 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01106627, + "balance_loss_clip": 1.00190902, + "balance_loss_mlp": 1.00048363, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.5174168202631537, + "language_loss": 0.669649, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69205999, + "num_input_tokens_seen": 217392950, + "step": 10098, + "time_per_iteration": 2.605496644973755 + }, + { + "auxiliary_loss_clip": 0.01165565, + "auxiliary_loss_mlp": 0.01107169, + "balance_loss_clip": 1.00201678, + "balance_loss_mlp": 1.00054836, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 1.7500458485375074, + "language_loss": 0.80553436, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82826167, + "num_input_tokens_seen": 217412145, + "step": 10099, + "time_per_iteration": 2.5738110542297363 + }, + { + "auxiliary_loss_clip": 0.01115813, + "auxiliary_loss_mlp": 0.01108049, + "balance_loss_clip": 1.00173354, + "balance_loss_mlp": 1.0005703, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 3.8493996269576622, + "language_loss": 0.70620465, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.72844326, + "num_input_tokens_seen": 217432080, + "step": 10100, + "time_per_iteration": 2.640150308609009 + }, + { + "auxiliary_loss_clip": 0.01118156, + "auxiliary_loss_mlp": 0.01107798, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00060523, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 2.22642653083365, + "language_loss": 0.7086401, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.73089969, + "num_input_tokens_seen": 217450945, + "step": 10101, + "time_per_iteration": 2.665144681930542 + }, + { + "auxiliary_loss_clip": 0.01134504, + "auxiliary_loss_mlp": 0.0110622, + "balance_loss_clip": 1.00181746, + "balance_loss_mlp": 1.00045753, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.9078503915265064, + "language_loss": 0.69718802, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71959531, + "num_input_tokens_seen": 217473105, + "step": 10102, + "time_per_iteration": 2.6777682304382324 + }, + { + "auxiliary_loss_clip": 0.01165869, + "auxiliary_loss_mlp": 0.01108232, + "balance_loss_clip": 1.00206995, + "balance_loss_mlp": 1.00056231, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 2.802731725956036, + "language_loss": 0.73160183, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75434291, + "num_input_tokens_seen": 217491780, + "step": 10103, + "time_per_iteration": 2.5354535579681396 + }, + { + "auxiliary_loss_clip": 0.01120473, + "auxiliary_loss_mlp": 0.01108614, + "balance_loss_clip": 1.00197959, + "balance_loss_mlp": 1.00065768, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.7246704513368525, + "language_loss": 0.76743436, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.78972518, + "num_input_tokens_seen": 217510605, + "step": 10104, + "time_per_iteration": 2.631432056427002 + }, + { + "auxiliary_loss_clip": 0.01113477, + "auxiliary_loss_mlp": 0.01083342, + "balance_loss_clip": 1.0014019, + "balance_loss_mlp": 1.00018203, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7073742866055381, + "language_loss": 0.55935615, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58132434, + "num_input_tokens_seen": 217574815, + "step": 10105, + "time_per_iteration": 3.205322504043579 + }, + { + "auxiliary_loss_clip": 0.01143717, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_clip": 1.00122142, + "balance_loss_mlp": 1.00027072, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7569415248594072, + "language_loss": 0.56876516, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.59103668, + "num_input_tokens_seen": 217632375, + "step": 10106, + "time_per_iteration": 3.0745391845703125 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.0110677, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00072145, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.5537044432102234, + "language_loss": 0.68638903, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70848763, + "num_input_tokens_seen": 217653055, + "step": 10107, + "time_per_iteration": 2.735037088394165 + }, + { + "auxiliary_loss_clip": 0.01151043, + "auxiliary_loss_mlp": 0.01107908, + "balance_loss_clip": 1.00194871, + "balance_loss_mlp": 1.00052452, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 1.7693954378657295, + "language_loss": 0.81025577, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83284533, + "num_input_tokens_seen": 217671520, + "step": 10108, + "time_per_iteration": 2.546841621398926 + }, + { + "auxiliary_loss_clip": 0.01134638, + "auxiliary_loss_mlp": 0.01107924, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 1.00063598, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 2.4666095754944997, + "language_loss": 0.71313357, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73555917, + "num_input_tokens_seen": 217691880, + "step": 10109, + "time_per_iteration": 2.707095146179199 + }, + { + "auxiliary_loss_clip": 0.01131622, + "auxiliary_loss_mlp": 0.01104959, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.00062656, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.7308828087018502, + "language_loss": 0.80501139, + "learning_rate": 1.407504239132653e-06, + "loss": 0.8273772, + "num_input_tokens_seen": 217710530, + "step": 10110, + "time_per_iteration": 2.6070728302001953 + }, + { + "auxiliary_loss_clip": 0.01134451, + "auxiliary_loss_mlp": 0.01107456, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00045383, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.158823089639164, + "language_loss": 0.70097423, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72339332, + "num_input_tokens_seen": 217728650, + "step": 10111, + "time_per_iteration": 2.6262245178222656 + }, + { + "auxiliary_loss_clip": 0.01116904, + "auxiliary_loss_mlp": 0.01107915, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00043547, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 1.7783957442412235, + "language_loss": 0.65298069, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67522883, + "num_input_tokens_seen": 217747135, + "step": 10112, + "time_per_iteration": 2.6613852977752686 + }, + { + "auxiliary_loss_clip": 0.011458, + "auxiliary_loss_mlp": 0.01082958, + "balance_loss_clip": 1.00117266, + "balance_loss_mlp": 1.00017929, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6468182841467011, + "language_loss": 0.49612877, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51841629, + "num_input_tokens_seen": 217811860, + "step": 10113, + "time_per_iteration": 3.1788480281829834 + }, + { + "auxiliary_loss_clip": 0.01145829, + "auxiliary_loss_mlp": 0.01083061, + "balance_loss_clip": 1.00115466, + "balance_loss_mlp": 1.00028253, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8394445008484889, + "language_loss": 0.5696125, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.59190142, + "num_input_tokens_seen": 217866510, + "step": 10114, + "time_per_iteration": 3.033296823501587 + }, + { + "auxiliary_loss_clip": 0.01165862, + "auxiliary_loss_mlp": 0.01107055, + "balance_loss_clip": 1.00209272, + "balance_loss_mlp": 1.00052977, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.8386017787197984, + "language_loss": 0.70200205, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72473115, + "num_input_tokens_seen": 217885650, + "step": 10115, + "time_per_iteration": 2.520920515060425 + }, + { + "auxiliary_loss_clip": 0.01119428, + "auxiliary_loss_mlp": 0.01107067, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00063682, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 2.0392162149884103, + "language_loss": 0.72701991, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74928486, + "num_input_tokens_seen": 217905300, + "step": 10116, + "time_per_iteration": 2.6585237979888916 + }, + { + "auxiliary_loss_clip": 0.0113476, + "auxiliary_loss_mlp": 0.01108267, + "balance_loss_clip": 1.00195456, + "balance_loss_mlp": 1.000597, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.7826457831802047, + "language_loss": 0.53572923, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.55815947, + "num_input_tokens_seen": 217927845, + "step": 10117, + "time_per_iteration": 2.7332890033721924 + }, + { + "auxiliary_loss_clip": 0.01133613, + "auxiliary_loss_mlp": 0.01106795, + "balance_loss_clip": 1.00188887, + "balance_loss_mlp": 1.0003655, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.8819698858120304, + "language_loss": 0.70208836, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.72449243, + "num_input_tokens_seen": 217946145, + "step": 10118, + "time_per_iteration": 2.594391345977783 + }, + { + "auxiliary_loss_clip": 0.01090358, + "auxiliary_loss_mlp": 0.0110646, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00050664, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.5977043367851522, + "language_loss": 0.74926579, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.77123392, + "num_input_tokens_seen": 217965190, + "step": 10119, + "time_per_iteration": 2.7395336627960205 + }, + { + "auxiliary_loss_clip": 0.01148861, + "auxiliary_loss_mlp": 0.01107251, + "balance_loss_clip": 1.00191081, + "balance_loss_mlp": 1.00053453, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 1.803734218910701, + "language_loss": 0.67532218, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69788331, + "num_input_tokens_seen": 217983625, + "step": 10120, + "time_per_iteration": 2.605146646499634 + }, + { + "auxiliary_loss_clip": 0.01149011, + "auxiliary_loss_mlp": 0.01108376, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00051522, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.6923322196182369, + "language_loss": 0.74097532, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76354921, + "num_input_tokens_seen": 218006005, + "step": 10121, + "time_per_iteration": 3.978818655014038 + }, + { + "auxiliary_loss_clip": 0.01150794, + "auxiliary_loss_mlp": 0.01106519, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00056553, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.7685144331383884, + "language_loss": 0.80226028, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82483339, + "num_input_tokens_seen": 218024195, + "step": 10122, + "time_per_iteration": 2.532858371734619 + }, + { + "auxiliary_loss_clip": 0.01151305, + "auxiliary_loss_mlp": 0.01107824, + "balance_loss_clip": 1.00223064, + "balance_loss_mlp": 1.00063086, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.7769331499840648, + "language_loss": 0.5587399, + "learning_rate": 1.402670413578284e-06, + "loss": 0.58133119, + "num_input_tokens_seen": 218047190, + "step": 10123, + "time_per_iteration": 2.6692490577697754 + }, + { + "auxiliary_loss_clip": 0.01150078, + "auxiliary_loss_mlp": 0.01106743, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00059891, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.7685214060472383, + "language_loss": 0.73901242, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.76158065, + "num_input_tokens_seen": 218065945, + "step": 10124, + "time_per_iteration": 2.592177629470825 + }, + { + "auxiliary_loss_clip": 0.01136038, + "auxiliary_loss_mlp": 0.01107153, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00062728, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 2.5244885821340914, + "language_loss": 0.65310627, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67553818, + "num_input_tokens_seen": 218085285, + "step": 10125, + "time_per_iteration": 2.581850528717041 + }, + { + "auxiliary_loss_clip": 0.01165662, + "auxiliary_loss_mlp": 0.01106593, + "balance_loss_clip": 1.00204849, + "balance_loss_mlp": 1.00063968, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 1.9865777840126253, + "language_loss": 0.76172453, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78444701, + "num_input_tokens_seen": 218104735, + "step": 10126, + "time_per_iteration": 2.535651922225952 + }, + { + "auxiliary_loss_clip": 0.01115783, + "auxiliary_loss_mlp": 0.01107741, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.00054836, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 3.3450584604936284, + "language_loss": 0.7115382, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73377347, + "num_input_tokens_seen": 218121855, + "step": 10127, + "time_per_iteration": 4.015663146972656 + }, + { + "auxiliary_loss_clip": 0.01165814, + "auxiliary_loss_mlp": 0.01109266, + "balance_loss_clip": 1.00210929, + "balance_loss_mlp": 1.00045192, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 3.015376957709703, + "language_loss": 0.73037803, + "learning_rate": 1.400812267497691e-06, + "loss": 0.75312877, + "num_input_tokens_seen": 218137325, + "step": 10128, + "time_per_iteration": 2.499600410461426 + }, + { + "auxiliary_loss_clip": 0.01105026, + "auxiliary_loss_mlp": 0.01107091, + "balance_loss_clip": 1.001858, + "balance_loss_mlp": 1.00056553, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 2.0306195623554757, + "language_loss": 0.73106146, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75318259, + "num_input_tokens_seen": 218155530, + "step": 10129, + "time_per_iteration": 2.8200201988220215 + }, + { + "auxiliary_loss_clip": 0.01165747, + "auxiliary_loss_mlp": 0.01107915, + "balance_loss_clip": 1.00205457, + "balance_loss_mlp": 1.00053167, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 5.7933576696057765, + "language_loss": 0.65429246, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67702907, + "num_input_tokens_seen": 218182535, + "step": 10130, + "time_per_iteration": 4.084508180618286 + }, + { + "auxiliary_loss_clip": 0.01131911, + "auxiliary_loss_mlp": 0.0110639, + "balance_loss_clip": 1.00168657, + "balance_loss_mlp": 1.00043654, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.7668687434066093, + "language_loss": 0.7727356, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79511857, + "num_input_tokens_seen": 218201740, + "step": 10131, + "time_per_iteration": 2.5704638957977295 + }, + { + "auxiliary_loss_clip": 0.01117543, + "auxiliary_loss_mlp": 0.01106019, + "balance_loss_clip": 1.00179303, + "balance_loss_mlp": 1.00054288, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.7311984837629835, + "language_loss": 0.76878488, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.79102051, + "num_input_tokens_seen": 218219800, + "step": 10132, + "time_per_iteration": 2.63974928855896 + }, + { + "auxiliary_loss_clip": 0.01165596, + "auxiliary_loss_mlp": 0.01106082, + "balance_loss_clip": 1.00201583, + "balance_loss_mlp": 1.00051069, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.910136801168735, + "language_loss": 0.75450552, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77722234, + "num_input_tokens_seen": 218237585, + "step": 10133, + "time_per_iteration": 2.5374300479888916 + }, + { + "auxiliary_loss_clip": 0.01149022, + "auxiliary_loss_mlp": 0.01107628, + "balance_loss_clip": 1.00187194, + "balance_loss_mlp": 1.00062585, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 2.8136491738326748, + "language_loss": 0.63411808, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.65668464, + "num_input_tokens_seen": 218258700, + "step": 10134, + "time_per_iteration": 4.043710470199585 + }, + { + "auxiliary_loss_clip": 0.01132246, + "auxiliary_loss_mlp": 0.01106272, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00050914, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 2.0365311312397263, + "language_loss": 0.78782231, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.81020749, + "num_input_tokens_seen": 218275655, + "step": 10135, + "time_per_iteration": 2.594055414199829 + }, + { + "auxiliary_loss_clip": 0.01131904, + "auxiliary_loss_mlp": 0.01106794, + "balance_loss_clip": 1.00178754, + "balance_loss_mlp": 1.00045967, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 1.7749634864856856, + "language_loss": 0.72042561, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74281257, + "num_input_tokens_seen": 218295720, + "step": 10136, + "time_per_iteration": 2.644587278366089 + }, + { + "auxiliary_loss_clip": 0.01165805, + "auxiliary_loss_mlp": 0.01107471, + "balance_loss_clip": 1.00211084, + "balance_loss_mlp": 1.00046873, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 1.703859910175036, + "language_loss": 0.74493778, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76767051, + "num_input_tokens_seen": 218316745, + "step": 10137, + "time_per_iteration": 2.6487104892730713 + }, + { + "auxiliary_loss_clip": 0.01150694, + "auxiliary_loss_mlp": 0.01107542, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00073028, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 3.4363088160917408, + "language_loss": 0.80300319, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82558554, + "num_input_tokens_seen": 218335385, + "step": 10138, + "time_per_iteration": 2.570112705230713 + }, + { + "auxiliary_loss_clip": 0.01132107, + "auxiliary_loss_mlp": 0.01106686, + "balance_loss_clip": 1.00189435, + "balance_loss_mlp": 1.00054169, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.8632986557579572, + "language_loss": 0.8078903, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83027822, + "num_input_tokens_seen": 218353320, + "step": 10139, + "time_per_iteration": 2.586578369140625 + }, + { + "auxiliary_loss_clip": 0.01121765, + "auxiliary_loss_mlp": 0.01108195, + "balance_loss_clip": 1.00195599, + "balance_loss_mlp": 1.0006206, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.530486569310219, + "language_loss": 0.83300596, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85530555, + "num_input_tokens_seen": 218365620, + "step": 10140, + "time_per_iteration": 2.5729401111602783 + }, + { + "auxiliary_loss_clip": 0.0114887, + "auxiliary_loss_mlp": 0.01107684, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00058579, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 2.3520096422154997, + "language_loss": 0.75762177, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.78018737, + "num_input_tokens_seen": 218383785, + "step": 10141, + "time_per_iteration": 2.579399585723877 + }, + { + "auxiliary_loss_clip": 0.01133634, + "auxiliary_loss_mlp": 0.01106708, + "balance_loss_clip": 1.00175905, + "balance_loss_mlp": 1.00046897, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 2.143811089714149, + "language_loss": 0.76568836, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78809178, + "num_input_tokens_seen": 218399055, + "step": 10142, + "time_per_iteration": 2.5567173957824707 + }, + { + "auxiliary_loss_clip": 0.01165656, + "auxiliary_loss_mlp": 0.01107196, + "balance_loss_clip": 1.00195682, + "balance_loss_mlp": 1.00047946, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.8996656336248705, + "language_loss": 0.76702356, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.78975201, + "num_input_tokens_seen": 218419120, + "step": 10143, + "time_per_iteration": 2.5281009674072266 + }, + { + "auxiliary_loss_clip": 0.01150653, + "auxiliary_loss_mlp": 0.01107613, + "balance_loss_clip": 1.00195658, + "balance_loss_mlp": 1.00051546, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.8787652018280632, + "language_loss": 0.75114048, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77372313, + "num_input_tokens_seen": 218435290, + "step": 10144, + "time_per_iteration": 2.5165207386016846 + }, + { + "auxiliary_loss_clip": 0.01134172, + "auxiliary_loss_mlp": 0.01108515, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.0004642, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 2.2669260658048866, + "language_loss": 0.72958517, + "learning_rate": 1.394498830235383e-06, + "loss": 0.75201201, + "num_input_tokens_seen": 218457880, + "step": 10145, + "time_per_iteration": 2.7852790355682373 + }, + { + "auxiliary_loss_clip": 0.01135988, + "auxiliary_loss_mlp": 0.01107079, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00055385, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 2.0799722486361474, + "language_loss": 0.69344193, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71587259, + "num_input_tokens_seen": 218475930, + "step": 10146, + "time_per_iteration": 2.6036927700042725 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.00747126, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00039077, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.8293921883433746, + "language_loss": 0.77176058, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.79025841, + "num_input_tokens_seen": 218493675, + "step": 10147, + "time_per_iteration": 2.689298152923584 + }, + { + "auxiliary_loss_clip": 0.01133948, + "auxiliary_loss_mlp": 0.01106393, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00044, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 3.184625101522772, + "language_loss": 0.77935475, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80175817, + "num_input_tokens_seen": 218511780, + "step": 10148, + "time_per_iteration": 2.588907480239868 + }, + { + "auxiliary_loss_clip": 0.01119676, + "auxiliary_loss_mlp": 0.01108479, + "balance_loss_clip": 1.00180328, + "balance_loss_mlp": 1.00061905, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.2860024692349974, + "language_loss": 0.53956062, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56184214, + "num_input_tokens_seen": 218531850, + "step": 10149, + "time_per_iteration": 2.7081727981567383 + }, + { + "auxiliary_loss_clip": 0.01132363, + "auxiliary_loss_mlp": 0.01106478, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00052476, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 2.115988383814908, + "language_loss": 0.79980326, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.8221916, + "num_input_tokens_seen": 218551245, + "step": 10150, + "time_per_iteration": 2.6086347103118896 + }, + { + "auxiliary_loss_clip": 0.0113248, + "auxiliary_loss_mlp": 0.01107887, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00050354, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 2.185596630427409, + "language_loss": 0.68992305, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.71232665, + "num_input_tokens_seen": 218571365, + "step": 10151, + "time_per_iteration": 2.6126863956451416 + }, + { + "auxiliary_loss_clip": 0.01165579, + "auxiliary_loss_mlp": 0.01105953, + "balance_loss_clip": 1.00196993, + "balance_loss_mlp": 1.00038099, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.7891430642533777, + "language_loss": 0.70514673, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.727862, + "num_input_tokens_seen": 218588315, + "step": 10152, + "time_per_iteration": 2.5678884983062744 + }, + { + "auxiliary_loss_clip": 0.01120948, + "auxiliary_loss_mlp": 0.01107123, + "balance_loss_clip": 1.00196385, + "balance_loss_mlp": 1.00050247, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 1.9197345046228653, + "language_loss": 0.7837255, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80600625, + "num_input_tokens_seen": 218605940, + "step": 10153, + "time_per_iteration": 2.624967098236084 + }, + { + "auxiliary_loss_clip": 0.01133711, + "auxiliary_loss_mlp": 0.01107193, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00066733, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.9736502627423185, + "language_loss": 0.79276586, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.81517482, + "num_input_tokens_seen": 218626100, + "step": 10154, + "time_per_iteration": 2.6284945011138916 + }, + { + "auxiliary_loss_clip": 0.01149319, + "auxiliary_loss_mlp": 0.01106642, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00049782, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.5656442580282022, + "language_loss": 0.7059716, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72853124, + "num_input_tokens_seen": 218645060, + "step": 10155, + "time_per_iteration": 2.5787627696990967 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01107437, + "balance_loss_clip": 1.00196826, + "balance_loss_mlp": 1.00043428, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.6819250667962646, + "language_loss": 0.71452272, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73708618, + "num_input_tokens_seen": 218667690, + "step": 10156, + "time_per_iteration": 2.6284656524658203 + }, + { + "auxiliary_loss_clip": 0.01132187, + "auxiliary_loss_mlp": 0.01106394, + "balance_loss_clip": 1.00190091, + "balance_loss_mlp": 1.0005362, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 4.312847050684939, + "language_loss": 0.67497933, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69736511, + "num_input_tokens_seen": 218687505, + "step": 10157, + "time_per_iteration": 2.60056471824646 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01107006, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.00057578, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 3.485631268832003, + "language_loss": 0.72646153, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74872148, + "num_input_tokens_seen": 218705315, + "step": 10158, + "time_per_iteration": 2.6297683715820312 + }, + { + "auxiliary_loss_clip": 0.0114947, + "auxiliary_loss_mlp": 0.01107532, + "balance_loss_clip": 1.00195575, + "balance_loss_mlp": 1.00062561, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.5440603455488673, + "language_loss": 0.69231045, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71488047, + "num_input_tokens_seen": 218725735, + "step": 10159, + "time_per_iteration": 4.031926155090332 + }, + { + "auxiliary_loss_clip": 0.01165643, + "auxiliary_loss_mlp": 0.01107259, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.000543, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 1.9151690457537562, + "language_loss": 0.78701425, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.80974329, + "num_input_tokens_seen": 218743215, + "step": 10160, + "time_per_iteration": 2.5412533283233643 + }, + { + "auxiliary_loss_clip": 0.01143586, + "auxiliary_loss_mlp": 0.01083099, + "balance_loss_clip": 1.00112152, + "balance_loss_mlp": 0.99993819, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8309443567754301, + "language_loss": 0.6153965, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63766336, + "num_input_tokens_seen": 218806440, + "step": 10161, + "time_per_iteration": 3.268770694732666 + }, + { + "auxiliary_loss_clip": 0.01132779, + "auxiliary_loss_mlp": 0.00747342, + "balance_loss_clip": 1.00184989, + "balance_loss_mlp": 1.00036311, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 2.302919761233453, + "language_loss": 0.75984335, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.77864456, + "num_input_tokens_seen": 218825720, + "step": 10162, + "time_per_iteration": 2.614403009414673 + }, + { + "auxiliary_loss_clip": 0.01165657, + "auxiliary_loss_mlp": 0.0110686, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.00052547, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 1.5858842810991083, + "language_loss": 0.71423209, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73695725, + "num_input_tokens_seen": 218847735, + "step": 10163, + "time_per_iteration": 2.5735864639282227 + }, + { + "auxiliary_loss_clip": 0.01165517, + "auxiliary_loss_mlp": 0.01106252, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.0005846, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 2.1394702455941004, + "language_loss": 0.59660196, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61931968, + "num_input_tokens_seen": 218866585, + "step": 10164, + "time_per_iteration": 3.9204578399658203 + }, + { + "auxiliary_loss_clip": 0.01149265, + "auxiliary_loss_mlp": 0.01107333, + "balance_loss_clip": 1.00197518, + "balance_loss_mlp": 1.00052178, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.865034855450257, + "language_loss": 0.75876772, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.7813338, + "num_input_tokens_seen": 218885560, + "step": 10165, + "time_per_iteration": 2.5853610038757324 + }, + { + "auxiliary_loss_clip": 0.01133778, + "auxiliary_loss_mlp": 0.01107373, + "balance_loss_clip": 1.00205743, + "balance_loss_mlp": 1.000561, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.6081563582045506, + "language_loss": 0.79164338, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81405497, + "num_input_tokens_seen": 218905055, + "step": 10166, + "time_per_iteration": 2.5808513164520264 + }, + { + "auxiliary_loss_clip": 0.01134611, + "auxiliary_loss_mlp": 0.01107105, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00048423, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 1.733664267002746, + "language_loss": 0.67438114, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69679832, + "num_input_tokens_seen": 218924030, + "step": 10167, + "time_per_iteration": 2.6249260902404785 + }, + { + "auxiliary_loss_clip": 0.01165723, + "auxiliary_loss_mlp": 0.01106353, + "balance_loss_clip": 1.00211859, + "balance_loss_mlp": 1.00049543, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.669552612507146, + "language_loss": 0.7909438, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.81366456, + "num_input_tokens_seen": 218943750, + "step": 10168, + "time_per_iteration": 3.921856641769409 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01108846, + "balance_loss_clip": 1.00207233, + "balance_loss_mlp": 1.00060391, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 2.6902381973738962, + "language_loss": 0.85115165, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87389946, + "num_input_tokens_seen": 218957585, + "step": 10169, + "time_per_iteration": 2.488311529159546 + }, + { + "auxiliary_loss_clip": 0.01165593, + "auxiliary_loss_mlp": 0.01106396, + "balance_loss_clip": 1.00197959, + "balance_loss_mlp": 1.00053799, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 1.7297110479279307, + "language_loss": 0.78585792, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.80857778, + "num_input_tokens_seen": 218980025, + "step": 10170, + "time_per_iteration": 2.671638011932373 + }, + { + "auxiliary_loss_clip": 0.0113183, + "auxiliary_loss_mlp": 0.01108003, + "balance_loss_clip": 1.00170803, + "balance_loss_mlp": 1.00042892, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 2.0666954797988315, + "language_loss": 0.69203776, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.71443605, + "num_input_tokens_seen": 218998200, + "step": 10171, + "time_per_iteration": 2.5852694511413574 + }, + { + "auxiliary_loss_clip": 0.01118894, + "auxiliary_loss_mlp": 0.0110874, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00059283, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 3.4964196362626616, + "language_loss": 0.79007304, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81234938, + "num_input_tokens_seen": 219017910, + "step": 10172, + "time_per_iteration": 4.198727369308472 + }, + { + "auxiliary_loss_clip": 0.01117323, + "auxiliary_loss_mlp": 0.01107892, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.0006038, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.8815352314946612, + "language_loss": 0.67274809, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69500023, + "num_input_tokens_seen": 219037730, + "step": 10173, + "time_per_iteration": 2.6508641242980957 + }, + { + "auxiliary_loss_clip": 0.01134322, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_clip": 1.00212777, + "balance_loss_mlp": 1.00059724, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.6496450853949771, + "language_loss": 0.5597952, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.58221924, + "num_input_tokens_seen": 219056755, + "step": 10174, + "time_per_iteration": 2.5747241973876953 + }, + { + "auxiliary_loss_clip": 0.01134158, + "auxiliary_loss_mlp": 0.01108021, + "balance_loss_clip": 1.00201416, + "balance_loss_mlp": 1.0005424, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 1.9196313570398678, + "language_loss": 0.65914333, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68156505, + "num_input_tokens_seen": 219076985, + "step": 10175, + "time_per_iteration": 2.6542816162109375 + }, + { + "auxiliary_loss_clip": 0.01148884, + "auxiliary_loss_mlp": 0.00747346, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00044227, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 1.9245060563562926, + "language_loss": 0.82451057, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84347284, + "num_input_tokens_seen": 219096050, + "step": 10176, + "time_per_iteration": 2.6537511348724365 + }, + { + "auxiliary_loss_clip": 0.011334, + "auxiliary_loss_mlp": 0.01107421, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.0006094, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 1.822491884733675, + "language_loss": 0.77451444, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79692262, + "num_input_tokens_seen": 219112665, + "step": 10177, + "time_per_iteration": 2.6071977615356445 + }, + { + "auxiliary_loss_clip": 0.01150723, + "auxiliary_loss_mlp": 0.00747268, + "balance_loss_clip": 1.0019995, + "balance_loss_mlp": 1.00036049, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 1.8656589536929655, + "language_loss": 0.75286138, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77184123, + "num_input_tokens_seen": 219129120, + "step": 10178, + "time_per_iteration": 2.540459156036377 + }, + { + "auxiliary_loss_clip": 0.01134024, + "auxiliary_loss_mlp": 0.01108116, + "balance_loss_clip": 1.00192392, + "balance_loss_mlp": 1.00054181, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.6345050902671834, + "language_loss": 0.66911232, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.69153368, + "num_input_tokens_seen": 219148950, + "step": 10179, + "time_per_iteration": 2.6353275775909424 + }, + { + "auxiliary_loss_clip": 0.01133524, + "auxiliary_loss_mlp": 0.01107983, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00059986, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 3.2327171830072707, + "language_loss": 0.8335945, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.8560096, + "num_input_tokens_seen": 219165585, + "step": 10180, + "time_per_iteration": 2.58209228515625 + }, + { + "auxiliary_loss_clip": 0.01165649, + "auxiliary_loss_mlp": 0.01107899, + "balance_loss_clip": 1.00210083, + "balance_loss_mlp": 1.00042021, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.6024286059190471, + "language_loss": 0.77598786, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79872346, + "num_input_tokens_seen": 219183280, + "step": 10181, + "time_per_iteration": 2.5003535747528076 + }, + { + "auxiliary_loss_clip": 0.01165824, + "auxiliary_loss_mlp": 0.01107475, + "balance_loss_clip": 1.00204587, + "balance_loss_mlp": 1.0006634, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 3.090156976893177, + "language_loss": 0.80197132, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82470429, + "num_input_tokens_seen": 219197200, + "step": 10182, + "time_per_iteration": 2.451214551925659 + }, + { + "auxiliary_loss_clip": 0.01120407, + "auxiliary_loss_mlp": 0.01106015, + "balance_loss_clip": 1.00182235, + "balance_loss_mlp": 1.00053883, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.6663029753832799, + "language_loss": 0.8311913, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85345554, + "num_input_tokens_seen": 219216825, + "step": 10183, + "time_per_iteration": 2.6381192207336426 + }, + { + "auxiliary_loss_clip": 0.01143519, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_clip": 1.00111556, + "balance_loss_mlp": 0.9999575, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7052261462008653, + "language_loss": 0.62878126, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.65104765, + "num_input_tokens_seen": 219283795, + "step": 10184, + "time_per_iteration": 3.2222626209259033 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.01106293, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.00053084, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 2.1823199617751374, + "language_loss": 0.82177126, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84433877, + "num_input_tokens_seen": 219302385, + "step": 10185, + "time_per_iteration": 2.5436885356903076 + }, + { + "auxiliary_loss_clip": 0.01134574, + "auxiliary_loss_mlp": 0.01107634, + "balance_loss_clip": 1.00207579, + "balance_loss_mlp": 1.00063205, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 2.126764857868121, + "language_loss": 0.74681044, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76923251, + "num_input_tokens_seen": 219319765, + "step": 10186, + "time_per_iteration": 2.586516857147217 + }, + { + "auxiliary_loss_clip": 0.01150789, + "auxiliary_loss_mlp": 0.0110653, + "balance_loss_clip": 1.00185251, + "balance_loss_mlp": 1.00048113, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.563782280902906, + "language_loss": 0.78343189, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80600512, + "num_input_tokens_seen": 219337440, + "step": 10187, + "time_per_iteration": 2.564772367477417 + }, + { + "auxiliary_loss_clip": 0.01165689, + "auxiliary_loss_mlp": 0.01107154, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00053275, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 3.449049785884147, + "language_loss": 0.82844245, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.8511709, + "num_input_tokens_seen": 219357525, + "step": 10188, + "time_per_iteration": 2.532179594039917 + }, + { + "auxiliary_loss_clip": 0.01117333, + "auxiliary_loss_mlp": 0.01106817, + "balance_loss_clip": 1.00178015, + "balance_loss_mlp": 1.00057817, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 1.8039845998401673, + "language_loss": 0.75422186, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77646339, + "num_input_tokens_seen": 219374855, + "step": 10189, + "time_per_iteration": 2.6201157569885254 + }, + { + "auxiliary_loss_clip": 0.0115081, + "auxiliary_loss_mlp": 0.01107026, + "balance_loss_clip": 1.0020628, + "balance_loss_mlp": 1.00059569, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.499810814764405, + "language_loss": 0.74344367, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76602209, + "num_input_tokens_seen": 219394740, + "step": 10190, + "time_per_iteration": 2.551560163497925 + }, + { + "auxiliary_loss_clip": 0.01148409, + "auxiliary_loss_mlp": 0.01107462, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00045979, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 2.065099083128639, + "language_loss": 0.68396503, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70652378, + "num_input_tokens_seen": 219413755, + "step": 10191, + "time_per_iteration": 2.59443998336792 + }, + { + "auxiliary_loss_clip": 0.01150827, + "auxiliary_loss_mlp": 0.01107235, + "balance_loss_clip": 1.00200295, + "balance_loss_mlp": 1.00051904, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 1.910054617479616, + "language_loss": 0.73756099, + "learning_rate": 1.377078777445467e-06, + "loss": 0.76014161, + "num_input_tokens_seen": 219433560, + "step": 10192, + "time_per_iteration": 2.590507984161377 + }, + { + "auxiliary_loss_clip": 0.01119805, + "auxiliary_loss_mlp": 0.0110607, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00049853, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 2.966014491513544, + "language_loss": 0.83453178, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85679054, + "num_input_tokens_seen": 219452640, + "step": 10193, + "time_per_iteration": 2.664768695831299 + }, + { + "auxiliary_loss_clip": 0.01115665, + "auxiliary_loss_mlp": 0.01107091, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.0004704, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 1.9621104778086569, + "language_loss": 0.70033062, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.7225582, + "num_input_tokens_seen": 219468585, + "step": 10194, + "time_per_iteration": 2.672454595565796 + }, + { + "auxiliary_loss_clip": 0.01111573, + "auxiliary_loss_mlp": 0.01082745, + "balance_loss_clip": 1.00111759, + "balance_loss_mlp": 0.99996656, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.8257652714402848, + "language_loss": 0.58696157, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60890472, + "num_input_tokens_seen": 219523015, + "step": 10195, + "time_per_iteration": 2.985032558441162 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01106886, + "balance_loss_clip": 1.00199223, + "balance_loss_mlp": 1.00055146, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 2.33641184054633, + "language_loss": 0.69729197, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71970356, + "num_input_tokens_seen": 219539980, + "step": 10196, + "time_per_iteration": 2.562934398651123 + }, + { + "auxiliary_loss_clip": 0.01135875, + "auxiliary_loss_mlp": 0.01106367, + "balance_loss_clip": 1.00197899, + "balance_loss_mlp": 1.00050902, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 1.832079242894308, + "language_loss": 0.71406931, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73649168, + "num_input_tokens_seen": 219556980, + "step": 10197, + "time_per_iteration": 3.9677016735076904 + }, + { + "auxiliary_loss_clip": 0.01150789, + "auxiliary_loss_mlp": 0.01106833, + "balance_loss_clip": 1.00207889, + "balance_loss_mlp": 1.00068903, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 2.363297825469991, + "language_loss": 0.78889805, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81147426, + "num_input_tokens_seen": 219576410, + "step": 10198, + "time_per_iteration": 2.551238536834717 + }, + { + "auxiliary_loss_clip": 0.01117366, + "auxiliary_loss_mlp": 0.01107241, + "balance_loss_clip": 1.00174725, + "balance_loss_mlp": 1.00061977, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 2.068668104639192, + "language_loss": 0.74696743, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76921356, + "num_input_tokens_seen": 219597180, + "step": 10199, + "time_per_iteration": 2.661208152770996 + }, + { + "auxiliary_loss_clip": 0.0113228, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_clip": 1.00176597, + "balance_loss_mlp": 1.00067568, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 1.5711174835936828, + "language_loss": 0.61538959, + "learning_rate": 1.374118818580993e-06, + "loss": 0.63778532, + "num_input_tokens_seen": 219617630, + "step": 10200, + "time_per_iteration": 2.6816565990448 + }, + { + "auxiliary_loss_clip": 0.01134606, + "auxiliary_loss_mlp": 0.01106769, + "balance_loss_clip": 1.00205839, + "balance_loss_mlp": 1.00052929, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 2.0914760656386897, + "language_loss": 0.68312097, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.7055347, + "num_input_tokens_seen": 219637025, + "step": 10201, + "time_per_iteration": 2.6337881088256836 + }, + { + "auxiliary_loss_clip": 0.01133961, + "auxiliary_loss_mlp": 0.01106634, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00049019, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 1.8461467667474296, + "language_loss": 0.83420634, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.85661227, + "num_input_tokens_seen": 219656625, + "step": 10202, + "time_per_iteration": 4.027469873428345 + }, + { + "auxiliary_loss_clip": 0.01160167, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_clip": 1.00111842, + "balance_loss_mlp": 0.99991626, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.8984559098814733, + "language_loss": 0.6713798, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69380844, + "num_input_tokens_seen": 219718090, + "step": 10203, + "time_per_iteration": 3.1107561588287354 + }, + { + "auxiliary_loss_clip": 0.01150958, + "auxiliary_loss_mlp": 0.01106415, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00046146, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 1.699860643897373, + "language_loss": 0.61312354, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63569725, + "num_input_tokens_seen": 219740100, + "step": 10204, + "time_per_iteration": 2.736250400543213 + }, + { + "auxiliary_loss_clip": 0.01116632, + "auxiliary_loss_mlp": 0.01106736, + "balance_loss_clip": 1.00172234, + "balance_loss_mlp": 1.00049627, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 2.33755578014355, + "language_loss": 0.72432387, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74655753, + "num_input_tokens_seen": 219761225, + "step": 10205, + "time_per_iteration": 2.64279842376709 + }, + { + "auxiliary_loss_clip": 0.01148807, + "auxiliary_loss_mlp": 0.01106091, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00042403, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.6879737749749864, + "language_loss": 0.7563417, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.77889073, + "num_input_tokens_seen": 219780085, + "step": 10206, + "time_per_iteration": 4.018507242202759 + }, + { + "auxiliary_loss_clip": 0.01104214, + "auxiliary_loss_mlp": 0.01106782, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00044763, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 2.2482942572539146, + "language_loss": 0.75576442, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77787441, + "num_input_tokens_seen": 219797895, + "step": 10207, + "time_per_iteration": 2.6913273334503174 + }, + { + "auxiliary_loss_clip": 0.01149037, + "auxiliary_loss_mlp": 0.01107547, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00064015, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.249786389132174, + "language_loss": 0.82565713, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84822297, + "num_input_tokens_seen": 219811295, + "step": 10208, + "time_per_iteration": 2.535338878631592 + }, + { + "auxiliary_loss_clip": 0.01133778, + "auxiliary_loss_mlp": 0.01107671, + "balance_loss_clip": 1.00193834, + "balance_loss_mlp": 1.00047803, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 2.1671692331180012, + "language_loss": 0.72341073, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74582517, + "num_input_tokens_seen": 219832735, + "step": 10209, + "time_per_iteration": 4.127934217453003 + }, + { + "auxiliary_loss_clip": 0.01165751, + "auxiliary_loss_mlp": 0.01107412, + "balance_loss_clip": 1.00208354, + "balance_loss_mlp": 1.00079119, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6153989594450668, + "language_loss": 0.74084318, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76357478, + "num_input_tokens_seen": 219852755, + "step": 10210, + "time_per_iteration": 2.5420634746551514 + }, + { + "auxiliary_loss_clip": 0.01116572, + "auxiliary_loss_mlp": 0.01082678, + "balance_loss_clip": 1.00108874, + "balance_loss_mlp": 0.99989909, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8568312022957678, + "language_loss": 0.64972508, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67171752, + "num_input_tokens_seen": 219922785, + "step": 10211, + "time_per_iteration": 3.382667303085327 + }, + { + "auxiliary_loss_clip": 0.01132304, + "auxiliary_loss_mlp": 0.00747246, + "balance_loss_clip": 1.00175047, + "balance_loss_mlp": 1.00029194, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.60568314047137, + "language_loss": 0.75864899, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77744448, + "num_input_tokens_seen": 219942215, + "step": 10212, + "time_per_iteration": 2.6588809490203857 + }, + { + "auxiliary_loss_clip": 0.01134016, + "auxiliary_loss_mlp": 0.01107374, + "balance_loss_clip": 1.00193775, + "balance_loss_mlp": 1.00065804, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.4763903872695185, + "language_loss": 0.73906535, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76147926, + "num_input_tokens_seen": 219963830, + "step": 10213, + "time_per_iteration": 2.649022340774536 + }, + { + "auxiliary_loss_clip": 0.01133579, + "auxiliary_loss_mlp": 0.0110788, + "balance_loss_clip": 1.00191152, + "balance_loss_mlp": 1.00059152, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.4846543756195014, + "language_loss": 0.72936952, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75178409, + "num_input_tokens_seen": 219983815, + "step": 10214, + "time_per_iteration": 2.632514476776123 + }, + { + "auxiliary_loss_clip": 0.01165703, + "auxiliary_loss_mlp": 0.0110695, + "balance_loss_clip": 1.00202131, + "balance_loss_mlp": 1.00042462, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.61874989222684, + "language_loss": 0.7441048, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.7668314, + "num_input_tokens_seen": 220003165, + "step": 10215, + "time_per_iteration": 2.5374372005462646 + }, + { + "auxiliary_loss_clip": 0.01148634, + "auxiliary_loss_mlp": 0.01106274, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00041616, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.7210636155596457, + "language_loss": 0.78076613, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80331528, + "num_input_tokens_seen": 220021015, + "step": 10216, + "time_per_iteration": 2.5654854774475098 + }, + { + "auxiliary_loss_clip": 0.01165711, + "auxiliary_loss_mlp": 0.01107632, + "balance_loss_clip": 1.00206196, + "balance_loss_mlp": 1.00053453, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 3.520400980089713, + "language_loss": 0.80120897, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.82394242, + "num_input_tokens_seen": 220035780, + "step": 10217, + "time_per_iteration": 2.4695096015930176 + }, + { + "auxiliary_loss_clip": 0.0113351, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_clip": 1.00182688, + "balance_loss_mlp": 1.00048649, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.604992196197951, + "language_loss": 0.78201145, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80440992, + "num_input_tokens_seen": 220054280, + "step": 10218, + "time_per_iteration": 2.604827880859375 + }, + { + "auxiliary_loss_clip": 0.01150357, + "auxiliary_loss_mlp": 0.01105889, + "balance_loss_clip": 1.00204921, + "balance_loss_mlp": 1.00041234, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.5665142606723528, + "language_loss": 0.82020336, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84276587, + "num_input_tokens_seen": 220074120, + "step": 10219, + "time_per_iteration": 2.582752227783203 + }, + { + "auxiliary_loss_clip": 0.0114922, + "auxiliary_loss_mlp": 0.01107538, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.0005362, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 2.419565128681954, + "language_loss": 0.66694462, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.68951219, + "num_input_tokens_seen": 220096320, + "step": 10220, + "time_per_iteration": 2.7407548427581787 + }, + { + "auxiliary_loss_clip": 0.01150655, + "auxiliary_loss_mlp": 0.01106548, + "balance_loss_clip": 1.00186837, + "balance_loss_mlp": 1.00040388, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 3.740463513747422, + "language_loss": 0.72138011, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.74395216, + "num_input_tokens_seen": 220114850, + "step": 10221, + "time_per_iteration": 2.5895330905914307 + }, + { + "auxiliary_loss_clip": 0.01099664, + "auxiliary_loss_mlp": 0.01106408, + "balance_loss_clip": 1.00165451, + "balance_loss_mlp": 1.00045466, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.8549155939409352, + "language_loss": 0.79304516, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81510586, + "num_input_tokens_seen": 220133395, + "step": 10222, + "time_per_iteration": 2.6698999404907227 + }, + { + "auxiliary_loss_clip": 0.01133957, + "auxiliary_loss_mlp": 0.01107387, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00048041, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 1.8899663601429797, + "language_loss": 0.76038808, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78280151, + "num_input_tokens_seen": 220152790, + "step": 10223, + "time_per_iteration": 2.676851272583008 + }, + { + "auxiliary_loss_clip": 0.0113414, + "auxiliary_loss_mlp": 0.0110763, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00053215, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 5.482597781497655, + "language_loss": 0.78348726, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80590498, + "num_input_tokens_seen": 220169535, + "step": 10224, + "time_per_iteration": 2.5965285301208496 + }, + { + "auxiliary_loss_clip": 0.01116856, + "auxiliary_loss_mlp": 0.0110606, + "balance_loss_clip": 1.0017662, + "balance_loss_mlp": 1.00048852, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.4533078603938134, + "language_loss": 0.66318059, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68540978, + "num_input_tokens_seen": 220195305, + "step": 10225, + "time_per_iteration": 2.962188243865967 + }, + { + "auxiliary_loss_clip": 0.0114933, + "auxiliary_loss_mlp": 0.0074724, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00026977, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.1415742840348506, + "language_loss": 0.62899196, + "learning_rate": 1.364509479649357e-06, + "loss": 0.64795762, + "num_input_tokens_seen": 220215040, + "step": 10226, + "time_per_iteration": 2.6615123748779297 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01107994, + "balance_loss_clip": 1.00194621, + "balance_loss_mlp": 1.00051522, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 2.9505463200265116, + "language_loss": 0.75561494, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77803731, + "num_input_tokens_seen": 220234205, + "step": 10227, + "time_per_iteration": 2.5759313106536865 + }, + { + "auxiliary_loss_clip": 0.01087879, + "auxiliary_loss_mlp": 0.01107629, + "balance_loss_clip": 1.00174916, + "balance_loss_mlp": 1.00053132, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 2.6207089888066006, + "language_loss": 0.61684173, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.63879681, + "num_input_tokens_seen": 220252730, + "step": 10228, + "time_per_iteration": 2.7037513256073 + }, + { + "auxiliary_loss_clip": 0.01136109, + "auxiliary_loss_mlp": 0.0110645, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00059199, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.3421992883687492, + "language_loss": 0.74535543, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76778102, + "num_input_tokens_seen": 220273345, + "step": 10229, + "time_per_iteration": 2.637260675430298 + }, + { + "auxiliary_loss_clip": 0.01165613, + "auxiliary_loss_mlp": 0.01107373, + "balance_loss_clip": 1.00209785, + "balance_loss_mlp": 1.00075197, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 2.4814431973411444, + "language_loss": 0.77608192, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.79881179, + "num_input_tokens_seen": 220293845, + "step": 10230, + "time_per_iteration": 2.520404577255249 + }, + { + "auxiliary_loss_clip": 0.01134069, + "auxiliary_loss_mlp": 0.01107387, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.00048029, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.4739649014405733, + "language_loss": 0.73270297, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75511754, + "num_input_tokens_seen": 220316070, + "step": 10231, + "time_per_iteration": 2.665297746658325 + }, + { + "auxiliary_loss_clip": 0.01133527, + "auxiliary_loss_mlp": 0.01106786, + "balance_loss_clip": 1.0018338, + "balance_loss_mlp": 1.00045121, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.6657771784233926, + "language_loss": 0.6981076, + "learning_rate": 1.362294244324858e-06, + "loss": 0.72051072, + "num_input_tokens_seen": 220335695, + "step": 10232, + "time_per_iteration": 2.6604878902435303 + }, + { + "auxiliary_loss_clip": 0.01150803, + "auxiliary_loss_mlp": 0.00747221, + "balance_loss_clip": 1.00199521, + "balance_loss_mlp": 1.00034547, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 2.6556345451555057, + "language_loss": 0.91660637, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.93558663, + "num_input_tokens_seen": 220353720, + "step": 10233, + "time_per_iteration": 2.5471267700195312 + }, + { + "auxiliary_loss_clip": 0.0113278, + "auxiliary_loss_mlp": 0.01106855, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00052083, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.7885140467070035, + "language_loss": 0.71420342, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.7365998, + "num_input_tokens_seen": 220372515, + "step": 10234, + "time_per_iteration": 3.9931957721710205 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.00747173, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00035238, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 2.34780878635787, + "language_loss": 0.67038763, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.68935347, + "num_input_tokens_seen": 220393490, + "step": 10235, + "time_per_iteration": 2.6615731716156006 + }, + { + "auxiliary_loss_clip": 0.01153139, + "auxiliary_loss_mlp": 0.01107595, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00049734, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 1.6832822137773575, + "language_loss": 0.81156212, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83416939, + "num_input_tokens_seen": 220412855, + "step": 10236, + "time_per_iteration": 2.565894365310669 + }, + { + "auxiliary_loss_clip": 0.01165602, + "auxiliary_loss_mlp": 0.01107295, + "balance_loss_clip": 1.00188375, + "balance_loss_mlp": 1.00038779, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 1.5815874535267018, + "language_loss": 0.80646986, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82919884, + "num_input_tokens_seen": 220433440, + "step": 10237, + "time_per_iteration": 2.5453763008117676 + }, + { + "auxiliary_loss_clip": 0.01149264, + "auxiliary_loss_mlp": 0.01107183, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00056219, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.624202503811714, + "language_loss": 0.75847983, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78104424, + "num_input_tokens_seen": 220453445, + "step": 10238, + "time_per_iteration": 2.6277525424957275 + }, + { + "auxiliary_loss_clip": 0.0108167, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_clip": 1.00106406, + "balance_loss_mlp": 0.99994242, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7600318532066646, + "language_loss": 0.57613426, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59777814, + "num_input_tokens_seen": 220509730, + "step": 10239, + "time_per_iteration": 4.762746572494507 + }, + { + "auxiliary_loss_clip": 0.01135175, + "auxiliary_loss_mlp": 0.01106653, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.00050926, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 1.9038324784368899, + "language_loss": 0.77923071, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.80164897, + "num_input_tokens_seen": 220527295, + "step": 10240, + "time_per_iteration": 3.223843574523926 + }, + { + "auxiliary_loss_clip": 0.01165709, + "auxiliary_loss_mlp": 0.01107549, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.0006423, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 3.395811234479653, + "language_loss": 0.72717339, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.74990594, + "num_input_tokens_seen": 220542730, + "step": 10241, + "time_per_iteration": 2.5261471271514893 + }, + { + "auxiliary_loss_clip": 0.01165443, + "auxiliary_loss_mlp": 0.01105972, + "balance_loss_clip": 1.00196517, + "balance_loss_mlp": 1.00049555, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.505058449937646, + "language_loss": 0.7183339, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.7410481, + "num_input_tokens_seen": 220562995, + "step": 10242, + "time_per_iteration": 2.5311686992645264 + }, + { + "auxiliary_loss_clip": 0.01150372, + "auxiliary_loss_mlp": 0.01106715, + "balance_loss_clip": 1.00203085, + "balance_loss_mlp": 1.00057161, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 1.9239481104858231, + "language_loss": 0.72515762, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74772853, + "num_input_tokens_seen": 220581775, + "step": 10243, + "time_per_iteration": 2.5715556144714355 + }, + { + "auxiliary_loss_clip": 0.011435, + "auxiliary_loss_mlp": 0.01082767, + "balance_loss_clip": 1.00103593, + "balance_loss_mlp": 0.9999882, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7589202874549705, + "language_loss": 0.56856507, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.5908277, + "num_input_tokens_seen": 220646395, + "step": 10244, + "time_per_iteration": 4.741209030151367 + }, + { + "auxiliary_loss_clip": 0.01165544, + "auxiliary_loss_mlp": 0.01107396, + "balance_loss_clip": 1.00206101, + "balance_loss_mlp": 1.00048876, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.6951780067170799, + "language_loss": 0.6366514, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65938079, + "num_input_tokens_seen": 220668335, + "step": 10245, + "time_per_iteration": 2.6569583415985107 + }, + { + "auxiliary_loss_clip": 0.01102758, + "auxiliary_loss_mlp": 0.01105901, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00051999, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 1.6901943462400364, + "language_loss": 0.79120475, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81329131, + "num_input_tokens_seen": 220688915, + "step": 10246, + "time_per_iteration": 2.709510087966919 + }, + { + "auxiliary_loss_clip": 0.01117983, + "auxiliary_loss_mlp": 0.00747384, + "balance_loss_clip": 1.00189579, + "balance_loss_mlp": 1.00036454, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.591984222658537, + "language_loss": 0.87548953, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89414322, + "num_input_tokens_seen": 220703465, + "step": 10247, + "time_per_iteration": 4.0270020961761475 + }, + { + "auxiliary_loss_clip": 0.01086471, + "auxiliary_loss_mlp": 0.0110793, + "balance_loss_clip": 1.00191224, + "balance_loss_mlp": 1.00064158, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 1.816819350164136, + "language_loss": 0.7966361, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.81858009, + "num_input_tokens_seen": 220722090, + "step": 10248, + "time_per_iteration": 2.7311348915100098 + }, + { + "auxiliary_loss_clip": 0.01103364, + "auxiliary_loss_mlp": 0.01105757, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00047183, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 2.5535283806986726, + "language_loss": 0.86799192, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89008313, + "num_input_tokens_seen": 220741075, + "step": 10249, + "time_per_iteration": 2.6878697872161865 + }, + { + "auxiliary_loss_clip": 0.01165631, + "auxiliary_loss_mlp": 0.01107088, + "balance_loss_clip": 1.00198841, + "balance_loss_mlp": 1.00037217, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 3.913582730707804, + "language_loss": 0.69257998, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.71530712, + "num_input_tokens_seen": 220763395, + "step": 10250, + "time_per_iteration": 2.677618980407715 + }, + { + "auxiliary_loss_clip": 0.01134282, + "auxiliary_loss_mlp": 0.01105545, + "balance_loss_clip": 1.00196779, + "balance_loss_mlp": 1.00035429, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 1.9700185248179682, + "language_loss": 0.74675065, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76914895, + "num_input_tokens_seen": 220780640, + "step": 10251, + "time_per_iteration": 2.566175699234009 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.0110657, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00033069, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.4725598376395426, + "language_loss": 0.67852098, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70109248, + "num_input_tokens_seen": 220797960, + "step": 10252, + "time_per_iteration": 2.5464935302734375 + }, + { + "auxiliary_loss_clip": 0.01062525, + "auxiliary_loss_mlp": 0.0108309, + "balance_loss_clip": 1.00109684, + "balance_loss_mlp": 0.99993002, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8939395771499651, + "language_loss": 0.57788134, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59933746, + "num_input_tokens_seen": 220856930, + "step": 10253, + "time_per_iteration": 3.6381421089172363 + }, + { + "auxiliary_loss_clip": 0.01133741, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00047922, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.430475962001082, + "language_loss": 0.79653454, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81893623, + "num_input_tokens_seen": 220877595, + "step": 10254, + "time_per_iteration": 3.0342087745666504 + }, + { + "auxiliary_loss_clip": 0.01133136, + "auxiliary_loss_mlp": 0.01107644, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00045156, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 1.6108493231231775, + "language_loss": 0.80525637, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82766414, + "num_input_tokens_seen": 220896880, + "step": 10255, + "time_per_iteration": 2.663743019104004 + }, + { + "auxiliary_loss_clip": 0.01133595, + "auxiliary_loss_mlp": 0.01106973, + "balance_loss_clip": 1.00192416, + "balance_loss_mlp": 1.00054264, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 3.1514550621558217, + "language_loss": 0.65284419, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67524993, + "num_input_tokens_seen": 220916425, + "step": 10256, + "time_per_iteration": 2.646873712539673 + }, + { + "auxiliary_loss_clip": 0.01148624, + "auxiliary_loss_mlp": 0.01106532, + "balance_loss_clip": 1.00196671, + "balance_loss_mlp": 1.0005784, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.8063178027113385, + "language_loss": 0.71833915, + "learning_rate": 1.353073501949825e-06, + "loss": 0.74089074, + "num_input_tokens_seen": 220935050, + "step": 10257, + "time_per_iteration": 2.5800867080688477 + }, + { + "auxiliary_loss_clip": 0.01134628, + "auxiliary_loss_mlp": 0.01107374, + "balance_loss_clip": 1.00188839, + "balance_loss_mlp": 1.0004673, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 2.559958691780804, + "language_loss": 0.7214148, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74383485, + "num_input_tokens_seen": 220953085, + "step": 10258, + "time_per_iteration": 2.5851993560791016 + }, + { + "auxiliary_loss_clip": 0.011359, + "auxiliary_loss_mlp": 0.01106476, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00042713, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.2185355498219805, + "language_loss": 0.63491601, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.65733975, + "num_input_tokens_seen": 220969050, + "step": 10259, + "time_per_iteration": 2.6253113746643066 + }, + { + "auxiliary_loss_clip": 0.01115133, + "auxiliary_loss_mlp": 0.01106862, + "balance_loss_clip": 1.0017879, + "balance_loss_mlp": 1.00052714, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 2.094239386887744, + "language_loss": 0.71167934, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.7338993, + "num_input_tokens_seen": 220985825, + "step": 10260, + "time_per_iteration": 2.6097090244293213 + }, + { + "auxiliary_loss_clip": 0.01149205, + "auxiliary_loss_mlp": 0.01108855, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 1.00061333, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 2.0437502828798975, + "language_loss": 0.68862408, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.71120465, + "num_input_tokens_seen": 221004465, + "step": 10261, + "time_per_iteration": 2.6092236042022705 + }, + { + "auxiliary_loss_clip": 0.01114573, + "auxiliary_loss_mlp": 0.01106421, + "balance_loss_clip": 1.00166154, + "balance_loss_mlp": 1.0005635, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.8397960324860885, + "language_loss": 0.70986521, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.7320751, + "num_input_tokens_seen": 221023260, + "step": 10262, + "time_per_iteration": 2.6340434551239014 + }, + { + "auxiliary_loss_clip": 0.01135689, + "auxiliary_loss_mlp": 0.01107949, + "balance_loss_clip": 1.00204456, + "balance_loss_mlp": 1.00037491, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.7835640380062114, + "language_loss": 0.69940221, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72183859, + "num_input_tokens_seen": 221043090, + "step": 10263, + "time_per_iteration": 2.636617422103882 + }, + { + "auxiliary_loss_clip": 0.01086025, + "auxiliary_loss_mlp": 0.01106962, + "balance_loss_clip": 1.00175846, + "balance_loss_mlp": 1.00053155, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 1.8985716353842033, + "language_loss": 0.76222312, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78415298, + "num_input_tokens_seen": 221061435, + "step": 10264, + "time_per_iteration": 2.6717395782470703 + }, + { + "auxiliary_loss_clip": 0.01165649, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_clip": 1.00212407, + "balance_loss_mlp": 1.00057411, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.5192306319539832, + "language_loss": 0.85229659, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87502408, + "num_input_tokens_seen": 221078705, + "step": 10265, + "time_per_iteration": 2.5084660053253174 + }, + { + "auxiliary_loss_clip": 0.0108892, + "auxiliary_loss_mlp": 0.01106223, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00055599, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 5.24335431288941, + "language_loss": 0.64473176, + "learning_rate": 1.349757776608153e-06, + "loss": 0.6666832, + "num_input_tokens_seen": 221099245, + "step": 10266, + "time_per_iteration": 2.771653175354004 + }, + { + "auxiliary_loss_clip": 0.01117427, + "auxiliary_loss_mlp": 0.01106411, + "balance_loss_clip": 1.00179887, + "balance_loss_mlp": 1.00055313, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.8639524351325778, + "language_loss": 0.7587834, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.78102177, + "num_input_tokens_seen": 221116930, + "step": 10267, + "time_per_iteration": 2.6789610385894775 + }, + { + "auxiliary_loss_clip": 0.01119532, + "auxiliary_loss_mlp": 0.01108361, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00040507, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 2.7558691477944284, + "language_loss": 0.74886119, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.77114016, + "num_input_tokens_seen": 221137660, + "step": 10268, + "time_per_iteration": 2.7038981914520264 + }, + { + "auxiliary_loss_clip": 0.01132264, + "auxiliary_loss_mlp": 0.01107476, + "balance_loss_clip": 1.001858, + "balance_loss_mlp": 1.00056958, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.7378970788898265, + "language_loss": 0.75522816, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77762556, + "num_input_tokens_seen": 221156225, + "step": 10269, + "time_per_iteration": 2.660259962081909 + }, + { + "auxiliary_loss_clip": 0.01165409, + "auxiliary_loss_mlp": 0.01106328, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.00046968, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.2343347333010417, + "language_loss": 0.7623108, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78502822, + "num_input_tokens_seen": 221173820, + "step": 10270, + "time_per_iteration": 2.5664279460906982 + }, + { + "auxiliary_loss_clip": 0.01133592, + "auxiliary_loss_mlp": 0.01106593, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00044942, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.8191782823928595, + "language_loss": 0.82468945, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84709132, + "num_input_tokens_seen": 221191815, + "step": 10271, + "time_per_iteration": 2.6037673950195312 + }, + { + "auxiliary_loss_clip": 0.01165651, + "auxiliary_loss_mlp": 0.00747327, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.0003953, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 1.620552783986809, + "language_loss": 0.77436918, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79349899, + "num_input_tokens_seen": 221211205, + "step": 10272, + "time_per_iteration": 3.9152121543884277 + }, + { + "auxiliary_loss_clip": 0.01113275, + "auxiliary_loss_mlp": 0.01082778, + "balance_loss_clip": 1.0012784, + "balance_loss_mlp": 0.99999946, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8013056524768247, + "language_loss": 0.59057045, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61253107, + "num_input_tokens_seen": 221268430, + "step": 10273, + "time_per_iteration": 3.070629358291626 + }, + { + "auxiliary_loss_clip": 0.01135706, + "auxiliary_loss_mlp": 0.01106334, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.0004766, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 3.1023003778029814, + "language_loss": 0.72866589, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75108635, + "num_input_tokens_seen": 221281930, + "step": 10274, + "time_per_iteration": 2.5657224655151367 + }, + { + "auxiliary_loss_clip": 0.01150409, + "auxiliary_loss_mlp": 0.00747349, + "balance_loss_clip": 1.002056, + "balance_loss_mlp": 1.0003705, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 4.410604091660404, + "language_loss": 0.76959622, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.7885738, + "num_input_tokens_seen": 221301605, + "step": 10275, + "time_per_iteration": 2.5668046474456787 + }, + { + "auxiliary_loss_clip": 0.01115513, + "auxiliary_loss_mlp": 0.01106562, + "balance_loss_clip": 1.00178242, + "balance_loss_mlp": 1.00051332, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.8636473650181522, + "language_loss": 0.79643893, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81865972, + "num_input_tokens_seen": 221320105, + "step": 10276, + "time_per_iteration": 2.6605827808380127 + }, + { + "auxiliary_loss_clip": 0.01092689, + "auxiliary_loss_mlp": 0.01106593, + "balance_loss_clip": 1.00193155, + "balance_loss_mlp": 1.00054431, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 2.010497893632778, + "language_loss": 0.80878216, + "learning_rate": 1.345707936733612e-06, + "loss": 0.83077502, + "num_input_tokens_seen": 221335915, + "step": 10277, + "time_per_iteration": 4.466737747192383 + }, + { + "auxiliary_loss_clip": 0.01119521, + "auxiliary_loss_mlp": 0.0110714, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00042355, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.6583886483755745, + "language_loss": 0.81577528, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.8380419, + "num_input_tokens_seen": 221353965, + "step": 10278, + "time_per_iteration": 2.7000954151153564 + }, + { + "auxiliary_loss_clip": 0.01099937, + "auxiliary_loss_mlp": 0.00747227, + "balance_loss_clip": 1.00169671, + "balance_loss_mlp": 1.00036538, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.505601102734118, + "language_loss": 0.74140394, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.7598756, + "num_input_tokens_seen": 221374080, + "step": 10279, + "time_per_iteration": 2.746105432510376 + }, + { + "auxiliary_loss_clip": 0.01150651, + "auxiliary_loss_mlp": 0.01106276, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00032258, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 5.090321760326199, + "language_loss": 0.71023268, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.73280197, + "num_input_tokens_seen": 221392910, + "step": 10280, + "time_per_iteration": 2.587078809738159 + }, + { + "auxiliary_loss_clip": 0.01165628, + "auxiliary_loss_mlp": 0.01106925, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00059056, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.5441933747889771, + "language_loss": 0.72933567, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.75206125, + "num_input_tokens_seen": 221410990, + "step": 10281, + "time_per_iteration": 2.60624361038208 + }, + { + "auxiliary_loss_clip": 0.01134011, + "auxiliary_loss_mlp": 0.01104743, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00060189, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.6301287349314528, + "language_loss": 0.76611531, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.78850287, + "num_input_tokens_seen": 221431020, + "step": 10282, + "time_per_iteration": 4.013160228729248 + }, + { + "auxiliary_loss_clip": 0.01134111, + "auxiliary_loss_mlp": 0.0110772, + "balance_loss_clip": 1.00194275, + "balance_loss_mlp": 1.00043154, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.9190666405927914, + "language_loss": 0.68845814, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71087641, + "num_input_tokens_seen": 221453235, + "step": 10283, + "time_per_iteration": 2.63468599319458 + }, + { + "auxiliary_loss_clip": 0.01150959, + "auxiliary_loss_mlp": 0.01107307, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.0004003, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.819744703541539, + "language_loss": 0.75197387, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77455652, + "num_input_tokens_seen": 221472560, + "step": 10284, + "time_per_iteration": 3.982553720474243 + }, + { + "auxiliary_loss_clip": 0.01148634, + "auxiliary_loss_mlp": 0.01105653, + "balance_loss_clip": 1.00201821, + "balance_loss_mlp": 1.0005579, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.8025194105922424, + "language_loss": 0.75444376, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77698672, + "num_input_tokens_seen": 221492835, + "step": 10285, + "time_per_iteration": 2.57196044921875 + }, + { + "auxiliary_loss_clip": 0.0111699, + "auxiliary_loss_mlp": 0.01106523, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00047469, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.4589323826308485, + "language_loss": 0.72679925, + "learning_rate": 1.342396663517503e-06, + "loss": 0.7490344, + "num_input_tokens_seen": 221511870, + "step": 10286, + "time_per_iteration": 2.6567795276641846 + }, + { + "auxiliary_loss_clip": 0.01165492, + "auxiliary_loss_mlp": 0.01106318, + "balance_loss_clip": 1.00198627, + "balance_loss_mlp": 1.00046015, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 1.565812180158607, + "language_loss": 0.75698566, + "learning_rate": 1.342028868767199e-06, + "loss": 0.7797038, + "num_input_tokens_seen": 221529915, + "step": 10287, + "time_per_iteration": 2.5295052528381348 + }, + { + "auxiliary_loss_clip": 0.01117613, + "auxiliary_loss_mlp": 0.01106169, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00059676, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 2.6840246238997194, + "language_loss": 0.72961915, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.75185704, + "num_input_tokens_seen": 221549745, + "step": 10288, + "time_per_iteration": 2.6473426818847656 + }, + { + "auxiliary_loss_clip": 0.01150497, + "auxiliary_loss_mlp": 0.01105591, + "balance_loss_clip": 1.00196028, + "balance_loss_mlp": 1.00049615, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.8817748280637094, + "language_loss": 0.72654629, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.74910712, + "num_input_tokens_seen": 221572455, + "step": 10289, + "time_per_iteration": 2.7673606872558594 + }, + { + "auxiliary_loss_clip": 0.01134367, + "auxiliary_loss_mlp": 0.01106297, + "balance_loss_clip": 1.0020355, + "balance_loss_mlp": 1.0005343, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.6340331504235235, + "language_loss": 0.79562294, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81802952, + "num_input_tokens_seen": 221591325, + "step": 10290, + "time_per_iteration": 2.637582302093506 + }, + { + "auxiliary_loss_clip": 0.01148448, + "auxiliary_loss_mlp": 0.01107052, + "balance_loss_clip": 1.00194275, + "balance_loss_mlp": 1.00043106, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 1.6665653091541042, + "language_loss": 0.81565344, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83820844, + "num_input_tokens_seen": 221611640, + "step": 10291, + "time_per_iteration": 2.5873498916625977 + }, + { + "auxiliary_loss_clip": 0.01165661, + "auxiliary_loss_mlp": 0.01106175, + "balance_loss_clip": 1.00204468, + "balance_loss_mlp": 1.00041246, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.837847984086958, + "language_loss": 0.77677202, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.79949045, + "num_input_tokens_seen": 221631225, + "step": 10292, + "time_per_iteration": 2.5513598918914795 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01107408, + "balance_loss_clip": 1.00219226, + "balance_loss_mlp": 1.000597, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 2.222518621097449, + "language_loss": 0.73420763, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75664628, + "num_input_tokens_seen": 221651035, + "step": 10293, + "time_per_iteration": 2.6269941329956055 + }, + { + "auxiliary_loss_clip": 0.01117035, + "auxiliary_loss_mlp": 0.00747197, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00032532, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 1.552108900990534, + "language_loss": 0.83019465, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.84883702, + "num_input_tokens_seen": 221671300, + "step": 10294, + "time_per_iteration": 2.6568686962127686 + }, + { + "auxiliary_loss_clip": 0.01134267, + "auxiliary_loss_mlp": 0.01106233, + "balance_loss_clip": 1.00190365, + "balance_loss_mlp": 1.00047088, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.5865515826537036, + "language_loss": 0.71027482, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.73267984, + "num_input_tokens_seen": 221687320, + "step": 10295, + "time_per_iteration": 2.577641010284424 + }, + { + "auxiliary_loss_clip": 0.01165598, + "auxiliary_loss_mlp": 0.01106063, + "balance_loss_clip": 1.00213456, + "balance_loss_mlp": 1.00068164, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.519356972238366, + "language_loss": 0.70240927, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72512591, + "num_input_tokens_seen": 221710175, + "step": 10296, + "time_per_iteration": 2.586245536804199 + }, + { + "auxiliary_loss_clip": 0.01117503, + "auxiliary_loss_mlp": 0.01107023, + "balance_loss_clip": 1.00205803, + "balance_loss_mlp": 1.00059247, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 1.998515361480981, + "language_loss": 0.71442688, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73667216, + "num_input_tokens_seen": 221728145, + "step": 10297, + "time_per_iteration": 2.6451215744018555 + }, + { + "auxiliary_loss_clip": 0.01160075, + "auxiliary_loss_mlp": 0.01081924, + "balance_loss_clip": 1.00117135, + "balance_loss_mlp": 0.99990797, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8847376458276898, + "language_loss": 0.64131951, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66373956, + "num_input_tokens_seen": 221786100, + "step": 10298, + "time_per_iteration": 2.9854655265808105 + }, + { + "auxiliary_loss_clip": 0.01165593, + "auxiliary_loss_mlp": 0.01106531, + "balance_loss_clip": 1.00202394, + "balance_loss_mlp": 1.00057793, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.6001488510643833, + "language_loss": 0.73941475, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76213598, + "num_input_tokens_seen": 221806450, + "step": 10299, + "time_per_iteration": 2.5494260787963867 + }, + { + "auxiliary_loss_clip": 0.01148969, + "auxiliary_loss_mlp": 0.01107598, + "balance_loss_clip": 1.00201416, + "balance_loss_mlp": 1.00050056, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.6219489927988215, + "language_loss": 0.68316031, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70572603, + "num_input_tokens_seen": 221823330, + "step": 10300, + "time_per_iteration": 2.679311752319336 + }, + { + "auxiliary_loss_clip": 0.01149253, + "auxiliary_loss_mlp": 0.0074716, + "balance_loss_clip": 1.00201488, + "balance_loss_mlp": 1.00031853, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.7542775816949707, + "language_loss": 0.66784424, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.68680841, + "num_input_tokens_seen": 221839360, + "step": 10301, + "time_per_iteration": 2.5390872955322266 + }, + { + "auxiliary_loss_clip": 0.01115793, + "auxiliary_loss_mlp": 0.01106674, + "balance_loss_clip": 1.00171757, + "balance_loss_mlp": 1.00053048, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 1.565316627293701, + "language_loss": 0.73127544, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.7535001, + "num_input_tokens_seen": 221859465, + "step": 10302, + "time_per_iteration": 2.7179250717163086 + }, + { + "auxiliary_loss_clip": 0.01132772, + "auxiliary_loss_mlp": 0.01106611, + "balance_loss_clip": 1.00187767, + "balance_loss_mlp": 1.00037193, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 1.8591975681460535, + "language_loss": 0.80408961, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82648343, + "num_input_tokens_seen": 221878555, + "step": 10303, + "time_per_iteration": 2.5632758140563965 + }, + { + "auxiliary_loss_clip": 0.0116574, + "auxiliary_loss_mlp": 0.01107034, + "balance_loss_clip": 1.00210702, + "balance_loss_mlp": 1.00041318, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.79432803955393, + "language_loss": 0.76875579, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.79148352, + "num_input_tokens_seen": 221898790, + "step": 10304, + "time_per_iteration": 2.534395694732666 + }, + { + "auxiliary_loss_clip": 0.01116787, + "auxiliary_loss_mlp": 0.01107807, + "balance_loss_clip": 1.00176239, + "balance_loss_mlp": 1.00061369, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 1.9525770517033074, + "language_loss": 0.76548719, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.78773308, + "num_input_tokens_seen": 221918875, + "step": 10305, + "time_per_iteration": 2.650935649871826 + }, + { + "auxiliary_loss_clip": 0.01151076, + "auxiliary_loss_mlp": 0.01107197, + "balance_loss_clip": 1.00211501, + "balance_loss_mlp": 1.00048137, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.8086733454866324, + "language_loss": 0.79044831, + "learning_rate": 1.335045524968045e-06, + "loss": 0.81303102, + "num_input_tokens_seen": 221937895, + "step": 10306, + "time_per_iteration": 2.5547995567321777 + }, + { + "auxiliary_loss_clip": 0.01101642, + "auxiliary_loss_mlp": 0.01104888, + "balance_loss_clip": 1.00191391, + "balance_loss_mlp": 1.0003649, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.797928886697817, + "language_loss": 0.80209041, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82415575, + "num_input_tokens_seen": 221955920, + "step": 10307, + "time_per_iteration": 2.722372055053711 + }, + { + "auxiliary_loss_clip": 0.01118572, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_clip": 1.00129747, + "balance_loss_mlp": 0.99995595, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8342241345094017, + "language_loss": 0.59411132, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61612058, + "num_input_tokens_seen": 222011405, + "step": 10308, + "time_per_iteration": 3.2104873657226562 + }, + { + "auxiliary_loss_clip": 0.01132233, + "auxiliary_loss_mlp": 0.01105682, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00049138, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.6841634185599976, + "language_loss": 0.67818618, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70056534, + "num_input_tokens_seen": 222034545, + "step": 10309, + "time_per_iteration": 4.026824712753296 + }, + { + "auxiliary_loss_clip": 0.01134267, + "auxiliary_loss_mlp": 0.01105803, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00070834, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 1.9528106413042683, + "language_loss": 0.72054905, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74294972, + "num_input_tokens_seen": 222052690, + "step": 10310, + "time_per_iteration": 2.5826573371887207 + }, + { + "auxiliary_loss_clip": 0.01133, + "auxiliary_loss_mlp": 0.01106295, + "balance_loss_clip": 1.00197601, + "balance_loss_mlp": 1.00053298, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 2.25723766124663, + "language_loss": 0.78805912, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81045204, + "num_input_tokens_seen": 222069095, + "step": 10311, + "time_per_iteration": 2.671748161315918 + }, + { + "auxiliary_loss_clip": 0.01118345, + "auxiliary_loss_mlp": 0.01106404, + "balance_loss_clip": 1.00178969, + "balance_loss_mlp": 1.00045061, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.993734720981554, + "language_loss": 0.72590196, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.74814945, + "num_input_tokens_seen": 222087360, + "step": 10312, + "time_per_iteration": 2.653524160385132 + }, + { + "auxiliary_loss_clip": 0.01099387, + "auxiliary_loss_mlp": 0.01107277, + "balance_loss_clip": 1.00171244, + "balance_loss_mlp": 1.00046563, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 2.3873435423339884, + "language_loss": 0.72095883, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.74302554, + "num_input_tokens_seen": 222106130, + "step": 10313, + "time_per_iteration": 2.6809518337249756 + }, + { + "auxiliary_loss_clip": 0.01151039, + "auxiliary_loss_mlp": 0.01107094, + "balance_loss_clip": 1.00201607, + "balance_loss_mlp": 1.00047302, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.8110220362445841, + "language_loss": 0.78557581, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80815715, + "num_input_tokens_seen": 222123125, + "step": 10314, + "time_per_iteration": 2.5675694942474365 + }, + { + "auxiliary_loss_clip": 0.01148737, + "auxiliary_loss_mlp": 0.01106565, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00051594, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.8461901158265739, + "language_loss": 0.78261554, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80516851, + "num_input_tokens_seen": 222140655, + "step": 10315, + "time_per_iteration": 4.027977466583252 + }, + { + "auxiliary_loss_clip": 0.01116078, + "auxiliary_loss_mlp": 0.01107305, + "balance_loss_clip": 1.00184953, + "balance_loss_mlp": 1.00058889, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 2.1317156827421715, + "language_loss": 0.76448464, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78671849, + "num_input_tokens_seen": 222160450, + "step": 10316, + "time_per_iteration": 2.663257598876953 + }, + { + "auxiliary_loss_clip": 0.01165536, + "auxiliary_loss_mlp": 0.01105946, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00046968, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 2.0471583516393266, + "language_loss": 0.77750897, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.80022383, + "num_input_tokens_seen": 222179170, + "step": 10317, + "time_per_iteration": 2.5707974433898926 + }, + { + "auxiliary_loss_clip": 0.01129303, + "auxiliary_loss_mlp": 0.01082311, + "balance_loss_clip": 1.00120676, + "balance_loss_mlp": 0.99991333, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6855251744484824, + "language_loss": 0.59102392, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61314011, + "num_input_tokens_seen": 222242660, + "step": 10318, + "time_per_iteration": 3.2299234867095947 + }, + { + "auxiliary_loss_clip": 0.01134596, + "auxiliary_loss_mlp": 0.01107539, + "balance_loss_clip": 1.0021807, + "balance_loss_mlp": 1.00072718, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.6511829595068908, + "language_loss": 0.78029674, + "learning_rate": 1.330272686582143e-06, + "loss": 0.8027181, + "num_input_tokens_seen": 222262170, + "step": 10319, + "time_per_iteration": 2.6184616088867188 + }, + { + "auxiliary_loss_clip": 0.01133421, + "auxiliary_loss_mlp": 0.01106296, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.00053358, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 2.0088098158924432, + "language_loss": 0.66294271, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68533993, + "num_input_tokens_seen": 222280375, + "step": 10320, + "time_per_iteration": 4.077246427536011 + }, + { + "auxiliary_loss_clip": 0.01117022, + "auxiliary_loss_mlp": 0.01104956, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00052893, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.8546087305426553, + "language_loss": 0.76188719, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78410697, + "num_input_tokens_seen": 222297325, + "step": 10321, + "time_per_iteration": 2.6108593940734863 + }, + { + "auxiliary_loss_clip": 0.01131928, + "auxiliary_loss_mlp": 0.011062, + "balance_loss_clip": 1.00181341, + "balance_loss_mlp": 1.00062776, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.773473871256983, + "language_loss": 0.73488498, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75726628, + "num_input_tokens_seen": 222317095, + "step": 10322, + "time_per_iteration": 4.029846668243408 + }, + { + "auxiliary_loss_clip": 0.01119492, + "auxiliary_loss_mlp": 0.01105785, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00049949, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 1.7201785645182344, + "language_loss": 0.7242313, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.74648404, + "num_input_tokens_seen": 222337055, + "step": 10323, + "time_per_iteration": 2.6661155223846436 + }, + { + "auxiliary_loss_clip": 0.01150543, + "auxiliary_loss_mlp": 0.0110729, + "balance_loss_clip": 1.00201368, + "balance_loss_mlp": 1.00047827, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.0384811537593786, + "language_loss": 0.58841282, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.61099118, + "num_input_tokens_seen": 222354515, + "step": 10324, + "time_per_iteration": 2.545560598373413 + }, + { + "auxiliary_loss_clip": 0.01117097, + "auxiliary_loss_mlp": 0.0110695, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00052011, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 1.9856895395833873, + "language_loss": 0.7653271, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78756756, + "num_input_tokens_seen": 222372755, + "step": 10325, + "time_per_iteration": 2.605428695678711 + }, + { + "auxiliary_loss_clip": 0.01149129, + "auxiliary_loss_mlp": 0.01107185, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.0003736, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 1.8349885779756647, + "language_loss": 0.72660267, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74916583, + "num_input_tokens_seen": 222391380, + "step": 10326, + "time_per_iteration": 2.5733144283294678 + }, + { + "auxiliary_loss_clip": 0.01149167, + "auxiliary_loss_mlp": 0.0110738, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00066435, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 2.297389504251453, + "language_loss": 0.74158871, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.7641542, + "num_input_tokens_seen": 222411165, + "step": 10327, + "time_per_iteration": 2.568295955657959 + }, + { + "auxiliary_loss_clip": 0.0113251, + "auxiliary_loss_mlp": 0.01107698, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00050521, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 2.1966747152105732, + "language_loss": 0.80019611, + "learning_rate": 1.326970926232066e-06, + "loss": 0.82259816, + "num_input_tokens_seen": 222428110, + "step": 10328, + "time_per_iteration": 2.5550084114074707 + }, + { + "auxiliary_loss_clip": 0.01117446, + "auxiliary_loss_mlp": 0.01107041, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00070608, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 2.6003981425617204, + "language_loss": 0.78066957, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.8029145, + "num_input_tokens_seen": 222446385, + "step": 10329, + "time_per_iteration": 2.641649007797241 + }, + { + "auxiliary_loss_clip": 0.01142687, + "auxiliary_loss_mlp": 0.01081518, + "balance_loss_clip": 1.00089312, + "balance_loss_mlp": 0.99988335, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8230659195295771, + "language_loss": 0.62161505, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64385712, + "num_input_tokens_seen": 222502150, + "step": 10330, + "time_per_iteration": 3.073317289352417 + }, + { + "auxiliary_loss_clip": 0.01150474, + "auxiliary_loss_mlp": 0.01107468, + "balance_loss_clip": 1.00199771, + "balance_loss_mlp": 1.00075197, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 2.496341101950045, + "language_loss": 0.77721167, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.7997911, + "num_input_tokens_seen": 222519880, + "step": 10331, + "time_per_iteration": 2.5765883922576904 + }, + { + "auxiliary_loss_clip": 0.01165879, + "auxiliary_loss_mlp": 0.01107281, + "balance_loss_clip": 1.00215101, + "balance_loss_mlp": 1.00056458, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 4.188932838197315, + "language_loss": 0.67398238, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.69671404, + "num_input_tokens_seen": 222538545, + "step": 10332, + "time_per_iteration": 2.528474807739258 + }, + { + "auxiliary_loss_clip": 0.01116925, + "auxiliary_loss_mlp": 0.01106882, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00054705, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.5799352847469486, + "language_loss": 0.76384097, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78607905, + "num_input_tokens_seen": 222556935, + "step": 10333, + "time_per_iteration": 2.670107841491699 + }, + { + "auxiliary_loss_clip": 0.01132403, + "auxiliary_loss_mlp": 0.01106436, + "balance_loss_clip": 1.00189269, + "balance_loss_mlp": 1.00048327, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.3901847639936484, + "language_loss": 0.69019091, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71257925, + "num_input_tokens_seen": 222574035, + "step": 10334, + "time_per_iteration": 2.565019369125366 + }, + { + "auxiliary_loss_clip": 0.01132438, + "auxiliary_loss_mlp": 0.00747189, + "balance_loss_clip": 1.00195611, + "balance_loss_mlp": 1.00041056, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 1.959478281442756, + "language_loss": 0.70193875, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72073501, + "num_input_tokens_seen": 222592290, + "step": 10335, + "time_per_iteration": 2.6067419052124023 + }, + { + "auxiliary_loss_clip": 0.01104109, + "auxiliary_loss_mlp": 0.01106643, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00049901, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.8650792165296086, + "language_loss": 0.79808432, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82019186, + "num_input_tokens_seen": 222612805, + "step": 10336, + "time_per_iteration": 2.720932960510254 + }, + { + "auxiliary_loss_clip": 0.01165452, + "auxiliary_loss_mlp": 0.0110571, + "balance_loss_clip": 1.00207627, + "balance_loss_mlp": 1.00051999, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.6483812726554685, + "language_loss": 0.73307717, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.7557888, + "num_input_tokens_seen": 222632260, + "step": 10337, + "time_per_iteration": 2.554361581802368 + }, + { + "auxiliary_loss_clip": 0.01165666, + "auxiliary_loss_mlp": 0.01107946, + "balance_loss_clip": 1.00201762, + "balance_loss_mlp": 1.00056219, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 2.096649287176841, + "language_loss": 0.62988067, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65261674, + "num_input_tokens_seen": 222653570, + "step": 10338, + "time_per_iteration": 2.5980069637298584 + }, + { + "auxiliary_loss_clip": 0.0114891, + "auxiliary_loss_mlp": 0.01107086, + "balance_loss_clip": 1.00208509, + "balance_loss_mlp": 1.00056028, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 1.6746571446128906, + "language_loss": 0.71224833, + "learning_rate": 1.322938249724991e-06, + "loss": 0.73480821, + "num_input_tokens_seen": 222672480, + "step": 10339, + "time_per_iteration": 2.575207471847534 + }, + { + "auxiliary_loss_clip": 0.01101683, + "auxiliary_loss_mlp": 0.01105902, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00042546, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 2.197656680770498, + "language_loss": 0.69259411, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71466994, + "num_input_tokens_seen": 222691200, + "step": 10340, + "time_per_iteration": 2.6697540283203125 + }, + { + "auxiliary_loss_clip": 0.01117769, + "auxiliary_loss_mlp": 0.01105083, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00036955, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 1.7900368437145093, + "language_loss": 0.68644261, + "learning_rate": 1.322205369037788e-06, + "loss": 0.70867115, + "num_input_tokens_seen": 222709975, + "step": 10341, + "time_per_iteration": 2.661137342453003 + }, + { + "auxiliary_loss_clip": 0.01148838, + "auxiliary_loss_mlp": 0.01107148, + "balance_loss_clip": 1.00204885, + "balance_loss_mlp": 1.00043237, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 2.032013047861295, + "language_loss": 0.80642921, + "learning_rate": 1.321838967240299e-06, + "loss": 0.82898909, + "num_input_tokens_seen": 222729005, + "step": 10342, + "time_per_iteration": 2.527327299118042 + }, + { + "auxiliary_loss_clip": 0.01131145, + "auxiliary_loss_mlp": 0.0108231, + "balance_loss_clip": 1.00118876, + "balance_loss_mlp": 0.99991244, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.7929202804463656, + "language_loss": 0.57392746, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.596062, + "num_input_tokens_seen": 222786090, + "step": 10343, + "time_per_iteration": 3.079373359680176 + }, + { + "auxiliary_loss_clip": 0.01119224, + "auxiliary_loss_mlp": 0.01105754, + "balance_loss_clip": 1.00199246, + "balance_loss_mlp": 1.00056374, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.8275764271490245, + "language_loss": 0.72841322, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75066292, + "num_input_tokens_seen": 222806100, + "step": 10344, + "time_per_iteration": 2.685389518737793 + }, + { + "auxiliary_loss_clip": 0.01148943, + "auxiliary_loss_mlp": 0.01106903, + "balance_loss_clip": 1.00209558, + "balance_loss_mlp": 1.00066328, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 2.1199287081269738, + "language_loss": 0.60284722, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62540567, + "num_input_tokens_seen": 222826575, + "step": 10345, + "time_per_iteration": 2.584947109222412 + }, + { + "auxiliary_loss_clip": 0.01088437, + "auxiliary_loss_mlp": 0.01105843, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00046182, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 1.9083544176930725, + "language_loss": 0.77650201, + "learning_rate": 1.320373617348614e-06, + "loss": 0.79844481, + "num_input_tokens_seen": 222845285, + "step": 10346, + "time_per_iteration": 2.68552565574646 + }, + { + "auxiliary_loss_clip": 0.01117901, + "auxiliary_loss_mlp": 0.01106563, + "balance_loss_clip": 1.0018698, + "balance_loss_mlp": 1.00051415, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 4.325057883192198, + "language_loss": 0.71235448, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73459911, + "num_input_tokens_seen": 222864575, + "step": 10347, + "time_per_iteration": 4.009669542312622 + }, + { + "auxiliary_loss_clip": 0.01148271, + "auxiliary_loss_mlp": 0.01105821, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00043964, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.9295265253149665, + "language_loss": 0.71347404, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.73601496, + "num_input_tokens_seen": 222884420, + "step": 10348, + "time_per_iteration": 2.572882652282715 + }, + { + "auxiliary_loss_clip": 0.01112486, + "auxiliary_loss_mlp": 0.01082411, + "balance_loss_clip": 1.00125504, + "balance_loss_mlp": 1.00001359, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8101453038447408, + "language_loss": 0.5420742, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56402314, + "num_input_tokens_seen": 222944690, + "step": 10349, + "time_per_iteration": 3.1757779121398926 + }, + { + "auxiliary_loss_clip": 0.01119302, + "auxiliary_loss_mlp": 0.01107162, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00035095, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 2.348174738473949, + "language_loss": 0.69710541, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71937007, + "num_input_tokens_seen": 222962990, + "step": 10350, + "time_per_iteration": 2.65193247795105 + }, + { + "auxiliary_loss_clip": 0.01165543, + "auxiliary_loss_mlp": 0.01106558, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00060439, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 2.758233567836919, + "language_loss": 0.56918269, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59190369, + "num_input_tokens_seen": 222980715, + "step": 10351, + "time_per_iteration": 2.512843370437622 + }, + { + "auxiliary_loss_clip": 0.01143508, + "auxiliary_loss_mlp": 0.01082382, + "balance_loss_clip": 1.00123811, + "balance_loss_mlp": 0.9999842, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.8357090325229907, + "language_loss": 0.61132449, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63358343, + "num_input_tokens_seen": 223040685, + "step": 10352, + "time_per_iteration": 4.468576192855835 + }, + { + "auxiliary_loss_clip": 0.01165517, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_clip": 1.00209641, + "balance_loss_mlp": 1.00044131, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.044568283061382, + "language_loss": 0.81804347, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84075403, + "num_input_tokens_seen": 223059000, + "step": 10353, + "time_per_iteration": 2.546116590499878 + }, + { + "auxiliary_loss_clip": 0.0114869, + "auxiliary_loss_mlp": 0.01105559, + "balance_loss_clip": 1.00203323, + "balance_loss_mlp": 1.00055957, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.8065009032222679, + "language_loss": 0.75805271, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.78059524, + "num_input_tokens_seen": 223079345, + "step": 10354, + "time_per_iteration": 2.584294319152832 + }, + { + "auxiliary_loss_clip": 0.01120874, + "auxiliary_loss_mlp": 0.01106086, + "balance_loss_clip": 1.00233102, + "balance_loss_mlp": 1.00051379, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.653745852185465, + "language_loss": 0.78770947, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80997902, + "num_input_tokens_seen": 223097880, + "step": 10355, + "time_per_iteration": 2.661229133605957 + }, + { + "auxiliary_loss_clip": 0.01149305, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_clip": 1.00198913, + "balance_loss_mlp": 1.00049758, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.5207495134161886, + "language_loss": 0.77930963, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80185676, + "num_input_tokens_seen": 223118185, + "step": 10356, + "time_per_iteration": 2.602870225906372 + }, + { + "auxiliary_loss_clip": 0.01133256, + "auxiliary_loss_mlp": 0.0074733, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00043488, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 2.5072452805893763, + "language_loss": 0.68071139, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69951725, + "num_input_tokens_seen": 223137600, + "step": 10357, + "time_per_iteration": 3.9957940578460693 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01107445, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00053811, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 2.651544760320225, + "language_loss": 0.76353943, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78595722, + "num_input_tokens_seen": 223154360, + "step": 10358, + "time_per_iteration": 2.5975794792175293 + }, + { + "auxiliary_loss_clip": 0.01132324, + "auxiliary_loss_mlp": 0.01106578, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00043404, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.479251154310991, + "language_loss": 0.81740773, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.83979678, + "num_input_tokens_seen": 223172255, + "step": 10359, + "time_per_iteration": 2.5656392574310303 + }, + { + "auxiliary_loss_clip": 0.01133187, + "auxiliary_loss_mlp": 0.01105606, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.00060642, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 2.1088990159461973, + "language_loss": 0.73702657, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75941443, + "num_input_tokens_seen": 223186965, + "step": 10360, + "time_per_iteration": 4.032537460327148 + }, + { + "auxiliary_loss_clip": 0.01150376, + "auxiliary_loss_mlp": 0.01106323, + "balance_loss_clip": 1.00201118, + "balance_loss_mlp": 1.00056064, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 5.1023305840768245, + "language_loss": 0.78157485, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.80414188, + "num_input_tokens_seen": 223206045, + "step": 10361, + "time_per_iteration": 2.6095352172851562 + }, + { + "auxiliary_loss_clip": 0.01117191, + "auxiliary_loss_mlp": 0.01105587, + "balance_loss_clip": 1.00200737, + "balance_loss_mlp": 1.00058746, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 1.5992056745791552, + "language_loss": 0.67556518, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69779295, + "num_input_tokens_seen": 223224820, + "step": 10362, + "time_per_iteration": 2.6524534225463867 + }, + { + "auxiliary_loss_clip": 0.01135614, + "auxiliary_loss_mlp": 0.01106581, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00053215, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 2.066486330842594, + "language_loss": 0.6710248, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.69344676, + "num_input_tokens_seen": 223243205, + "step": 10363, + "time_per_iteration": 2.6657192707061768 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.0110593, + "balance_loss_clip": 1.0020225, + "balance_loss_mlp": 1.00045371, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 2.008122607364357, + "language_loss": 0.8631894, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88529003, + "num_input_tokens_seen": 223261370, + "step": 10364, + "time_per_iteration": 2.642141103744507 + }, + { + "auxiliary_loss_clip": 0.01127341, + "auxiliary_loss_mlp": 0.01082367, + "balance_loss_clip": 1.00121379, + "balance_loss_mlp": 0.99996912, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.8964552234098188, + "language_loss": 0.60804808, + "learning_rate": 1.313418851605015e-06, + "loss": 0.63014513, + "num_input_tokens_seen": 223315050, + "step": 10365, + "time_per_iteration": 3.188446521759033 + }, + { + "auxiliary_loss_clip": 0.01116315, + "auxiliary_loss_mlp": 0.00747274, + "balance_loss_clip": 1.00174904, + "balance_loss_mlp": 1.00036538, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 1.6680644503651452, + "language_loss": 0.75070339, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.76933932, + "num_input_tokens_seen": 223332130, + "step": 10366, + "time_per_iteration": 2.625210762023926 + }, + { + "auxiliary_loss_clip": 0.01148937, + "auxiliary_loss_mlp": 0.01106902, + "balance_loss_clip": 1.00195646, + "balance_loss_mlp": 1.00056708, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 2.544176178953743, + "language_loss": 0.76024097, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78279936, + "num_input_tokens_seen": 223351605, + "step": 10367, + "time_per_iteration": 2.569756269454956 + }, + { + "auxiliary_loss_clip": 0.01150741, + "auxiliary_loss_mlp": 0.01105917, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.0005362, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.4771925515327746, + "language_loss": 0.7847684, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80733502, + "num_input_tokens_seen": 223372090, + "step": 10368, + "time_per_iteration": 2.5686089992523193 + }, + { + "auxiliary_loss_clip": 0.01086446, + "auxiliary_loss_mlp": 0.01105861, + "balance_loss_clip": 1.00193036, + "balance_loss_mlp": 1.00048006, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.7675472656059954, + "language_loss": 0.68406552, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70598853, + "num_input_tokens_seen": 223390110, + "step": 10369, + "time_per_iteration": 2.7224502563476562 + }, + { + "auxiliary_loss_clip": 0.01165736, + "auxiliary_loss_mlp": 0.01107122, + "balance_loss_clip": 1.00210083, + "balance_loss_mlp": 1.0005964, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.267315147093564, + "language_loss": 0.87550044, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.898229, + "num_input_tokens_seen": 223404205, + "step": 10370, + "time_per_iteration": 2.4887912273406982 + }, + { + "auxiliary_loss_clip": 0.01165571, + "auxiliary_loss_mlp": 0.01106029, + "balance_loss_clip": 1.00207591, + "balance_loss_mlp": 1.00055265, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.949406262045639, + "language_loss": 0.6614387, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68415469, + "num_input_tokens_seen": 223424855, + "step": 10371, + "time_per_iteration": 2.576728582382202 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01105124, + "balance_loss_clip": 1.00202608, + "balance_loss_mlp": 1.00050616, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.3951756938657214, + "language_loss": 0.77524936, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79778767, + "num_input_tokens_seen": 223447225, + "step": 10372, + "time_per_iteration": 2.652045965194702 + }, + { + "auxiliary_loss_clip": 0.0114912, + "auxiliary_loss_mlp": 0.01106333, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00047481, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.5571519252750259, + "language_loss": 0.77338839, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.7959429, + "num_input_tokens_seen": 223467520, + "step": 10373, + "time_per_iteration": 2.5956056118011475 + }, + { + "auxiliary_loss_clip": 0.01148254, + "auxiliary_loss_mlp": 0.01104883, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00045562, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.6226590721449978, + "language_loss": 0.69283891, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.7153703, + "num_input_tokens_seen": 223488130, + "step": 10374, + "time_per_iteration": 2.5832536220550537 + }, + { + "auxiliary_loss_clip": 0.01136243, + "auxiliary_loss_mlp": 0.01106147, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00048006, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.7576383633885557, + "language_loss": 0.76867318, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79109704, + "num_input_tokens_seen": 223505105, + "step": 10375, + "time_per_iteration": 2.596564769744873 + }, + { + "auxiliary_loss_clip": 0.01132641, + "auxiliary_loss_mlp": 0.01105144, + "balance_loss_clip": 1.00196934, + "balance_loss_mlp": 1.00052571, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.917553378349779, + "language_loss": 0.70021772, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72259557, + "num_input_tokens_seen": 223528065, + "step": 10376, + "time_per_iteration": 2.72625994682312 + }, + { + "auxiliary_loss_clip": 0.01118523, + "auxiliary_loss_mlp": 0.01107088, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00065756, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 1.5777214191646558, + "language_loss": 0.76907134, + "learning_rate": 1.309031204505301e-06, + "loss": 0.79132754, + "num_input_tokens_seen": 223547305, + "step": 10377, + "time_per_iteration": 2.6915602684020996 + }, + { + "auxiliary_loss_clip": 0.0113223, + "auxiliary_loss_mlp": 0.01106574, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.00052536, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 2.6854147706653175, + "language_loss": 0.68113458, + "learning_rate": 1.308665737227052e-06, + "loss": 0.70352256, + "num_input_tokens_seen": 223567205, + "step": 10378, + "time_per_iteration": 2.5950443744659424 + }, + { + "auxiliary_loss_clip": 0.01134358, + "auxiliary_loss_mlp": 0.01106307, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.00054431, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.8605333947703995, + "language_loss": 0.76541555, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78782225, + "num_input_tokens_seen": 223586560, + "step": 10379, + "time_per_iteration": 2.617316722869873 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01106141, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00037837, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.519396406123954, + "language_loss": 0.79269564, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81508905, + "num_input_tokens_seen": 223610595, + "step": 10380, + "time_per_iteration": 2.6795079708099365 + }, + { + "auxiliary_loss_clip": 0.01149151, + "auxiliary_loss_mlp": 0.01105292, + "balance_loss_clip": 1.00203776, + "balance_loss_mlp": 1.00057888, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 2.213618545105738, + "language_loss": 0.79770434, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82024878, + "num_input_tokens_seen": 223630230, + "step": 10381, + "time_per_iteration": 2.6142592430114746 + }, + { + "auxiliary_loss_clip": 0.01133945, + "auxiliary_loss_mlp": 0.01106619, + "balance_loss_clip": 1.00190246, + "balance_loss_mlp": 1.0005703, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.1392136288955386, + "language_loss": 0.74971879, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.77212441, + "num_input_tokens_seen": 223648360, + "step": 10382, + "time_per_iteration": 2.566453218460083 + }, + { + "auxiliary_loss_clip": 0.01148809, + "auxiliary_loss_mlp": 0.01105919, + "balance_loss_clip": 1.00196183, + "balance_loss_mlp": 1.0004425, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 2.285321881802312, + "language_loss": 0.78082258, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80336982, + "num_input_tokens_seen": 223671255, + "step": 10383, + "time_per_iteration": 2.6234641075134277 + }, + { + "auxiliary_loss_clip": 0.01121208, + "auxiliary_loss_mlp": 0.01105141, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.0004276, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 1.796689039044966, + "language_loss": 0.75391001, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77617347, + "num_input_tokens_seen": 223689860, + "step": 10384, + "time_per_iteration": 3.9663589000701904 + }, + { + "auxiliary_loss_clip": 0.01135462, + "auxiliary_loss_mlp": 0.01107365, + "balance_loss_clip": 1.00203824, + "balance_loss_mlp": 1.00045812, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 1.7215604448597077, + "language_loss": 0.66663879, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68906713, + "num_input_tokens_seen": 223707835, + "step": 10385, + "time_per_iteration": 2.566371440887451 + }, + { + "auxiliary_loss_clip": 0.01130835, + "auxiliary_loss_mlp": 0.010819, + "balance_loss_clip": 1.00121522, + "balance_loss_mlp": 0.99988431, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7513598441262741, + "language_loss": 0.62016201, + "learning_rate": 1.305742943921692e-06, + "loss": 0.6422894, + "num_input_tokens_seen": 223771875, + "step": 10386, + "time_per_iteration": 3.183272361755371 + }, + { + "auxiliary_loss_clip": 0.01148787, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00040472, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.488318383782787, + "language_loss": 0.72002536, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.74257398, + "num_input_tokens_seen": 223788895, + "step": 10387, + "time_per_iteration": 2.5799903869628906 + }, + { + "auxiliary_loss_clip": 0.01149405, + "auxiliary_loss_mlp": 0.01107828, + "balance_loss_clip": 1.00191808, + "balance_loss_mlp": 1.00063503, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.3918927722745664, + "language_loss": 0.6542477, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67682004, + "num_input_tokens_seen": 223810385, + "step": 10388, + "time_per_iteration": 2.6643245220184326 + }, + { + "auxiliary_loss_clip": 0.01115321, + "auxiliary_loss_mlp": 0.01106043, + "balance_loss_clip": 1.00172734, + "balance_loss_mlp": 1.00047135, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.5917856819462202, + "language_loss": 0.79287624, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81508988, + "num_input_tokens_seen": 223826040, + "step": 10389, + "time_per_iteration": 2.605201005935669 + }, + { + "auxiliary_loss_clip": 0.01133869, + "auxiliary_loss_mlp": 0.0110629, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00043225, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 2.065826447073587, + "language_loss": 0.60468781, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62708938, + "num_input_tokens_seen": 223842300, + "step": 10390, + "time_per_iteration": 4.008006572723389 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01106439, + "balance_loss_clip": 1.00192559, + "balance_loss_mlp": 1.00048566, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 2.3226543354302627, + "language_loss": 0.77231085, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79471791, + "num_input_tokens_seen": 223858320, + "step": 10391, + "time_per_iteration": 2.5796282291412354 + }, + { + "auxiliary_loss_clip": 0.01132497, + "auxiliary_loss_mlp": 0.01106744, + "balance_loss_clip": 1.00195169, + "balance_loss_mlp": 1.0006001, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.668916512660018, + "language_loss": 0.64526272, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.66765517, + "num_input_tokens_seen": 223883545, + "step": 10392, + "time_per_iteration": 2.7847514152526855 + }, + { + "auxiliary_loss_clip": 0.01132914, + "auxiliary_loss_mlp": 0.01106286, + "balance_loss_clip": 1.00193036, + "balance_loss_mlp": 1.00061941, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.899492855868048, + "language_loss": 0.76916361, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.79155564, + "num_input_tokens_seen": 223901445, + "step": 10393, + "time_per_iteration": 2.5895004272460938 + }, + { + "auxiliary_loss_clip": 0.01121772, + "auxiliary_loss_mlp": 0.00747267, + "balance_loss_clip": 1.00215101, + "balance_loss_mlp": 1.00041008, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.7863168532181413, + "language_loss": 0.82523763, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84392798, + "num_input_tokens_seen": 223920170, + "step": 10394, + "time_per_iteration": 2.639996290206909 + }, + { + "auxiliary_loss_clip": 0.01133723, + "auxiliary_loss_mlp": 0.01106339, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00057614, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 2.991373890856779, + "language_loss": 0.7497282, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77212882, + "num_input_tokens_seen": 223936495, + "step": 10395, + "time_per_iteration": 3.935110092163086 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.0110735, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00063348, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.451675719673486, + "language_loss": 0.72904456, + "learning_rate": 1.302091822487119e-06, + "loss": 0.75147307, + "num_input_tokens_seen": 223950070, + "step": 10396, + "time_per_iteration": 2.5498743057250977 + }, + { + "auxiliary_loss_clip": 0.01119594, + "auxiliary_loss_mlp": 0.01105905, + "balance_loss_clip": 1.00211704, + "balance_loss_mlp": 1.00052392, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 2.0026906757371847, + "language_loss": 0.76313567, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.78539062, + "num_input_tokens_seen": 223970065, + "step": 10397, + "time_per_iteration": 2.6427180767059326 + }, + { + "auxiliary_loss_clip": 0.01132049, + "auxiliary_loss_mlp": 0.011057, + "balance_loss_clip": 1.00180066, + "balance_loss_mlp": 1.00070071, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 1.9510426079959473, + "language_loss": 0.75161523, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77399272, + "num_input_tokens_seen": 223990315, + "step": 10398, + "time_per_iteration": 4.124382019042969 + }, + { + "auxiliary_loss_clip": 0.01165408, + "auxiliary_loss_mlp": 0.0110589, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00050902, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 1.7589587562111748, + "language_loss": 0.74293065, + "learning_rate": 1.300997001489483e-06, + "loss": 0.7656436, + "num_input_tokens_seen": 224009960, + "step": 10399, + "time_per_iteration": 2.602351665496826 + }, + { + "auxiliary_loss_clip": 0.01117926, + "auxiliary_loss_mlp": 0.01106719, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.00057518, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 2.3566534859719503, + "language_loss": 0.74682975, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76907623, + "num_input_tokens_seen": 224028870, + "step": 10400, + "time_per_iteration": 2.646385669708252 + }, + { + "auxiliary_loss_clip": 0.01116208, + "auxiliary_loss_mlp": 0.01081584, + "balance_loss_clip": 1.00114989, + "balance_loss_mlp": 0.99994987, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8400623820815781, + "language_loss": 0.56471121, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58668923, + "num_input_tokens_seen": 224094140, + "step": 10401, + "time_per_iteration": 3.422144651412964 + }, + { + "auxiliary_loss_clip": 0.01150008, + "auxiliary_loss_mlp": 0.01106317, + "balance_loss_clip": 1.0019114, + "balance_loss_mlp": 1.00045896, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.2280313177842945, + "language_loss": 0.82978612, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.8523494, + "num_input_tokens_seen": 224113235, + "step": 10402, + "time_per_iteration": 3.0799355506896973 + }, + { + "auxiliary_loss_clip": 0.0106871, + "auxiliary_loss_mlp": 0.0110555, + "balance_loss_clip": 1.00163972, + "balance_loss_mlp": 1.00055039, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 2.2632365436348856, + "language_loss": 0.689614, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71135664, + "num_input_tokens_seen": 224134530, + "step": 10403, + "time_per_iteration": 2.839364528656006 + }, + { + "auxiliary_loss_clip": 0.01118683, + "auxiliary_loss_mlp": 0.01105588, + "balance_loss_clip": 1.00198722, + "balance_loss_mlp": 1.00058854, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.6044981974187835, + "language_loss": 0.7205984, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.74284112, + "num_input_tokens_seen": 224154170, + "step": 10404, + "time_per_iteration": 2.698038101196289 + }, + { + "auxiliary_loss_clip": 0.01102101, + "auxiliary_loss_mlp": 0.01107004, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00057399, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 1.8991792098306024, + "language_loss": 0.69660598, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71869707, + "num_input_tokens_seen": 224172730, + "step": 10405, + "time_per_iteration": 2.6645002365112305 + }, + { + "auxiliary_loss_clip": 0.01134519, + "auxiliary_loss_mlp": 0.0110622, + "balance_loss_clip": 1.0021131, + "balance_loss_mlp": 1.00055289, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.8606900290160613, + "language_loss": 0.78856677, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81097418, + "num_input_tokens_seen": 224192620, + "step": 10406, + "time_per_iteration": 2.665454149246216 + }, + { + "auxiliary_loss_clip": 0.01121244, + "auxiliary_loss_mlp": 0.01105456, + "balance_loss_clip": 1.002033, + "balance_loss_mlp": 1.00045609, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 1.759597763798249, + "language_loss": 0.68741453, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.70968151, + "num_input_tokens_seen": 224214660, + "step": 10407, + "time_per_iteration": 2.697486400604248 + }, + { + "auxiliary_loss_clip": 0.01150565, + "auxiliary_loss_mlp": 0.0074718, + "balance_loss_clip": 1.0020175, + "balance_loss_mlp": 1.00051999, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 3.79180796665616, + "language_loss": 0.85302424, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87200171, + "num_input_tokens_seen": 224234170, + "step": 10408, + "time_per_iteration": 2.6537768840789795 + }, + { + "auxiliary_loss_clip": 0.01133544, + "auxiliary_loss_mlp": 0.00747179, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.0004164, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.754191927870974, + "language_loss": 0.7984336, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.81724083, + "num_input_tokens_seen": 224253115, + "step": 10409, + "time_per_iteration": 2.6206188201904297 + }, + { + "auxiliary_loss_clip": 0.01134074, + "auxiliary_loss_mlp": 0.01105758, + "balance_loss_clip": 1.00187254, + "balance_loss_mlp": 1.00056744, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.626790547830593, + "language_loss": 0.69770187, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.72010022, + "num_input_tokens_seen": 224271375, + "step": 10410, + "time_per_iteration": 2.626065254211426 + }, + { + "auxiliary_loss_clip": 0.0110401, + "auxiliary_loss_mlp": 0.01104932, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00040936, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 2.0902850993558046, + "language_loss": 0.67475897, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69684839, + "num_input_tokens_seen": 224290315, + "step": 10411, + "time_per_iteration": 2.6913235187530518 + }, + { + "auxiliary_loss_clip": 0.01100498, + "auxiliary_loss_mlp": 0.01106419, + "balance_loss_clip": 1.00172353, + "balance_loss_mlp": 1.00056171, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.693877753413702, + "language_loss": 0.6918205, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71388972, + "num_input_tokens_seen": 224310545, + "step": 10412, + "time_per_iteration": 2.7673180103302 + }, + { + "auxiliary_loss_clip": 0.01134106, + "auxiliary_loss_mlp": 0.01106065, + "balance_loss_clip": 1.00189924, + "balance_loss_mlp": 1.00049329, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.569079143103051, + "language_loss": 0.69587743, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.71827912, + "num_input_tokens_seen": 224331115, + "step": 10413, + "time_per_iteration": 2.61181640625 + }, + { + "auxiliary_loss_clip": 0.01117188, + "auxiliary_loss_mlp": 0.01106917, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00048709, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 3.3768536862110805, + "language_loss": 0.80898762, + "learning_rate": 1.295526482316796e-06, + "loss": 0.83122873, + "num_input_tokens_seen": 224347525, + "step": 10414, + "time_per_iteration": 2.6214258670806885 + }, + { + "auxiliary_loss_clip": 0.01148983, + "auxiliary_loss_mlp": 0.01106997, + "balance_loss_clip": 1.0021311, + "balance_loss_mlp": 1.0006628, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.9064128526568154, + "language_loss": 0.74808234, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.77064216, + "num_input_tokens_seen": 224367045, + "step": 10415, + "time_per_iteration": 2.5687062740325928 + }, + { + "auxiliary_loss_clip": 0.01100976, + "auxiliary_loss_mlp": 0.01106215, + "balance_loss_clip": 1.00184774, + "balance_loss_mlp": 1.000453, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.4772461944464021, + "language_loss": 0.74126172, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.76333362, + "num_input_tokens_seen": 224388860, + "step": 10416, + "time_per_iteration": 2.7053208351135254 + }, + { + "auxiliary_loss_clip": 0.01132301, + "auxiliary_loss_mlp": 0.0110438, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00042999, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 9.031943477663782, + "language_loss": 0.84455734, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86692417, + "num_input_tokens_seen": 224409645, + "step": 10417, + "time_per_iteration": 2.6910672187805176 + }, + { + "auxiliary_loss_clip": 0.01150722, + "auxiliary_loss_mlp": 0.01105445, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00054038, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 2.317808831052913, + "language_loss": 0.56935191, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.59191358, + "num_input_tokens_seen": 224428530, + "step": 10418, + "time_per_iteration": 2.5343575477600098 + }, + { + "auxiliary_loss_clip": 0.01150839, + "auxiliary_loss_mlp": 0.01106643, + "balance_loss_clip": 1.00200057, + "balance_loss_mlp": 1.00059438, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.9467168269293194, + "language_loss": 0.84662092, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.8691957, + "num_input_tokens_seen": 224447175, + "step": 10419, + "time_per_iteration": 2.5734198093414307 + }, + { + "auxiliary_loss_clip": 0.01165468, + "auxiliary_loss_mlp": 0.01106491, + "balance_loss_clip": 1.00204086, + "balance_loss_mlp": 1.00053799, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 2.0161567326155403, + "language_loss": 0.64663744, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66935706, + "num_input_tokens_seen": 224469445, + "step": 10420, + "time_per_iteration": 2.5852699279785156 + }, + { + "auxiliary_loss_clip": 0.0111774, + "auxiliary_loss_mlp": 0.01107163, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.0005424, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 1.8730745640426967, + "language_loss": 0.86333132, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88558042, + "num_input_tokens_seen": 224486590, + "step": 10421, + "time_per_iteration": 4.38111686706543 + }, + { + "auxiliary_loss_clip": 0.01118972, + "auxiliary_loss_mlp": 0.01105527, + "balance_loss_clip": 1.00195062, + "balance_loss_mlp": 1.00043201, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 6.56515551991564, + "language_loss": 0.7996676, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.82191253, + "num_input_tokens_seen": 224502795, + "step": 10422, + "time_per_iteration": 2.667569875717163 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01104966, + "balance_loss_clip": 1.00193584, + "balance_loss_mlp": 1.00034785, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 2.043306800025116, + "language_loss": 0.74266607, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76522195, + "num_input_tokens_seen": 224522300, + "step": 10423, + "time_per_iteration": 2.5931007862091064 + }, + { + "auxiliary_loss_clip": 0.01165481, + "auxiliary_loss_mlp": 0.01104716, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.00038433, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 1.7839616290160916, + "language_loss": 0.77606869, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79877073, + "num_input_tokens_seen": 224538260, + "step": 10424, + "time_per_iteration": 2.490058183670044 + }, + { + "auxiliary_loss_clip": 0.01165386, + "auxiliary_loss_mlp": 0.01105456, + "balance_loss_clip": 1.00201654, + "balance_loss_mlp": 1.00064719, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 1.853714253806342, + "language_loss": 0.6920476, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71475601, + "num_input_tokens_seen": 224559155, + "step": 10425, + "time_per_iteration": 2.5525777339935303 + }, + { + "auxiliary_loss_clip": 0.01133568, + "auxiliary_loss_mlp": 0.01104073, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00050354, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.4732773927167568, + "language_loss": 0.74320012, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76557648, + "num_input_tokens_seen": 224578660, + "step": 10426, + "time_per_iteration": 2.653134822845459 + }, + { + "auxiliary_loss_clip": 0.0114991, + "auxiliary_loss_mlp": 0.00747179, + "balance_loss_clip": 1.00194013, + "balance_loss_mlp": 1.00043738, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.4491789325551199, + "language_loss": 0.80545437, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82442522, + "num_input_tokens_seen": 224599080, + "step": 10427, + "time_per_iteration": 2.6155848503112793 + }, + { + "auxiliary_loss_clip": 0.01117358, + "auxiliary_loss_mlp": 0.01105915, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00053358, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 2.4216154187752976, + "language_loss": 0.68285227, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70508498, + "num_input_tokens_seen": 224614225, + "step": 10428, + "time_per_iteration": 4.018386363983154 + }, + { + "auxiliary_loss_clip": 0.01119204, + "auxiliary_loss_mlp": 0.01105646, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00074196, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.8624966550050714, + "language_loss": 0.71417892, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73642743, + "num_input_tokens_seen": 224632365, + "step": 10429, + "time_per_iteration": 2.59861421585083 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01105681, + "balance_loss_clip": 1.00201416, + "balance_loss_mlp": 1.00049043, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.6350996637465025, + "language_loss": 0.79815763, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82071793, + "num_input_tokens_seen": 224651125, + "step": 10430, + "time_per_iteration": 2.579676389694214 + }, + { + "auxiliary_loss_clip": 0.01159967, + "auxiliary_loss_mlp": 0.0108113, + "balance_loss_clip": 1.00109744, + "balance_loss_mlp": 0.99987662, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7668482893907046, + "language_loss": 0.59158099, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61399198, + "num_input_tokens_seen": 224716115, + "step": 10431, + "time_per_iteration": 3.21209979057312 + }, + { + "auxiliary_loss_clip": 0.01126613, + "auxiliary_loss_mlp": 0.010816, + "balance_loss_clip": 1.00106955, + "balance_loss_mlp": 0.99996513, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8749225421143852, + "language_loss": 0.63738483, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.65946692, + "num_input_tokens_seen": 224782930, + "step": 10432, + "time_per_iteration": 3.2448370456695557 + }, + { + "auxiliary_loss_clip": 0.01131953, + "auxiliary_loss_mlp": 0.01105323, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00051379, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 2.025322216734143, + "language_loss": 0.65116942, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.6735422, + "num_input_tokens_seen": 224802010, + "step": 10433, + "time_per_iteration": 4.088939189910889 + }, + { + "auxiliary_loss_clip": 0.01148819, + "auxiliary_loss_mlp": 0.01106025, + "balance_loss_clip": 1.00196648, + "balance_loss_mlp": 1.00045347, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 2.790410942465481, + "language_loss": 0.61678159, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.63933003, + "num_input_tokens_seen": 224818875, + "step": 10434, + "time_per_iteration": 2.539008140563965 + }, + { + "auxiliary_loss_clip": 0.01115269, + "auxiliary_loss_mlp": 0.01105834, + "balance_loss_clip": 1.00168574, + "balance_loss_mlp": 1.00045276, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 2.0107688784340976, + "language_loss": 0.84455603, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86676699, + "num_input_tokens_seen": 224837790, + "step": 10435, + "time_per_iteration": 4.14128303527832 + }, + { + "auxiliary_loss_clip": 0.01159965, + "auxiliary_loss_mlp": 0.01081193, + "balance_loss_clip": 1.00112128, + "balance_loss_mlp": 0.99993956, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7339078325808288, + "language_loss": 0.61573625, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63814783, + "num_input_tokens_seen": 224899685, + "step": 10436, + "time_per_iteration": 3.078784227371216 + }, + { + "auxiliary_loss_clip": 0.01132565, + "auxiliary_loss_mlp": 0.0110599, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00060952, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.6532772946835637, + "language_loss": 0.77417219, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79655772, + "num_input_tokens_seen": 224918650, + "step": 10437, + "time_per_iteration": 2.6300268173217773 + }, + { + "auxiliary_loss_clip": 0.01143541, + "auxiliary_loss_mlp": 0.01081623, + "balance_loss_clip": 1.00108862, + "balance_loss_mlp": 0.99998873, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7241438944341114, + "language_loss": 0.54322803, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.5654797, + "num_input_tokens_seen": 224981575, + "step": 10438, + "time_per_iteration": 3.0859639644622803 + }, + { + "auxiliary_loss_clip": 0.01100628, + "auxiliary_loss_mlp": 0.01105491, + "balance_loss_clip": 1.00179803, + "balance_loss_mlp": 1.00068212, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 1.7622533061533947, + "language_loss": 0.84106427, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86312544, + "num_input_tokens_seen": 225000820, + "step": 10439, + "time_per_iteration": 2.742032527923584 + }, + { + "auxiliary_loss_clip": 0.01120319, + "auxiliary_loss_mlp": 0.01106309, + "balance_loss_clip": 1.00196052, + "balance_loss_mlp": 1.0006423, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.1686050409976603, + "language_loss": 0.80133712, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.82360351, + "num_input_tokens_seen": 225017585, + "step": 10440, + "time_per_iteration": 2.6509668827056885 + }, + { + "auxiliary_loss_clip": 0.01100325, + "auxiliary_loss_mlp": 0.01105091, + "balance_loss_clip": 1.00179553, + "balance_loss_mlp": 1.00037706, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 3.3745067978815135, + "language_loss": 0.74422592, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76628006, + "num_input_tokens_seen": 225039085, + "step": 10441, + "time_per_iteration": 2.7694029808044434 + }, + { + "auxiliary_loss_clip": 0.01150756, + "auxiliary_loss_mlp": 0.01104949, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00042665, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.68940362400171, + "language_loss": 0.72468162, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74723864, + "num_input_tokens_seen": 225058105, + "step": 10442, + "time_per_iteration": 2.5867679119110107 + }, + { + "auxiliary_loss_clip": 0.01119318, + "auxiliary_loss_mlp": 0.01106041, + "balance_loss_clip": 1.00183332, + "balance_loss_mlp": 1.00056446, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.5754831398800204, + "language_loss": 0.716308, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73856163, + "num_input_tokens_seen": 225077605, + "step": 10443, + "time_per_iteration": 2.6610562801361084 + }, + { + "auxiliary_loss_clip": 0.01165362, + "auxiliary_loss_mlp": 0.01105387, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00057793, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 1.8119491822117562, + "language_loss": 0.73258269, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75529027, + "num_input_tokens_seen": 225097775, + "step": 10444, + "time_per_iteration": 2.5412487983703613 + }, + { + "auxiliary_loss_clip": 0.01118017, + "auxiliary_loss_mlp": 0.01104654, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00041699, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 2.6978579961772127, + "language_loss": 0.72175384, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74398053, + "num_input_tokens_seen": 225115585, + "step": 10445, + "time_per_iteration": 2.615783452987671 + }, + { + "auxiliary_loss_clip": 0.011485, + "auxiliary_loss_mlp": 0.01104904, + "balance_loss_clip": 1.00179267, + "balance_loss_mlp": 1.00047612, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 4.148527874381616, + "language_loss": 0.69178182, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71431589, + "num_input_tokens_seen": 225135575, + "step": 10446, + "time_per_iteration": 2.5681960582733154 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01106654, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.00051045, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 1.8012735323335993, + "language_loss": 0.73591101, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.75801647, + "num_input_tokens_seen": 225154230, + "step": 10447, + "time_per_iteration": 2.6600213050842285 + }, + { + "auxiliary_loss_clip": 0.01144747, + "auxiliary_loss_mlp": 0.01081149, + "balance_loss_clip": 1.00112176, + "balance_loss_mlp": 0.99989587, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6771314156880501, + "language_loss": 0.52380884, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54606777, + "num_input_tokens_seen": 225213650, + "step": 10448, + "time_per_iteration": 3.016840934753418 + }, + { + "auxiliary_loss_clip": 0.01133866, + "auxiliary_loss_mlp": 0.01106564, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00061035, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.153560234526097, + "language_loss": 0.91381991, + "learning_rate": 1.282785392633079e-06, + "loss": 0.93622422, + "num_input_tokens_seen": 225230135, + "step": 10449, + "time_per_iteration": 2.5468010902404785 + }, + { + "auxiliary_loss_clip": 0.01165377, + "auxiliary_loss_mlp": 0.01105342, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00043738, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.7219944121191504, + "language_loss": 0.60062432, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62333155, + "num_input_tokens_seen": 225253520, + "step": 10450, + "time_per_iteration": 2.791522264480591 + }, + { + "auxiliary_loss_clip": 0.01134136, + "auxiliary_loss_mlp": 0.011044, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00054526, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.636323410114565, + "language_loss": 0.76910806, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79149336, + "num_input_tokens_seen": 225272460, + "step": 10451, + "time_per_iteration": 2.5910873413085938 + }, + { + "auxiliary_loss_clip": 0.01134096, + "auxiliary_loss_mlp": 0.01106864, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00062513, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.6109002302595585, + "language_loss": 0.78094113, + "learning_rate": 1.281694841064566e-06, + "loss": 0.80335069, + "num_input_tokens_seen": 225291700, + "step": 10452, + "time_per_iteration": 2.611234188079834 + }, + { + "auxiliary_loss_clip": 0.01118208, + "auxiliary_loss_mlp": 0.01105112, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.000494, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 3.9957256136488706, + "language_loss": 0.72854722, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.75078046, + "num_input_tokens_seen": 225311470, + "step": 10453, + "time_per_iteration": 2.703406572341919 + }, + { + "auxiliary_loss_clip": 0.01102965, + "auxiliary_loss_mlp": 0.01105842, + "balance_loss_clip": 1.0017004, + "balance_loss_mlp": 1.00046086, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.7955548714367662, + "language_loss": 0.80284345, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.8249315, + "num_input_tokens_seen": 225328385, + "step": 10454, + "time_per_iteration": 2.673872232437134 + }, + { + "auxiliary_loss_clip": 0.01117495, + "auxiliary_loss_mlp": 0.01105746, + "balance_loss_clip": 1.00174391, + "balance_loss_mlp": 1.00046062, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 2.288505997011995, + "language_loss": 0.82084334, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84307575, + "num_input_tokens_seen": 225348415, + "step": 10455, + "time_per_iteration": 2.6494321823120117 + }, + { + "auxiliary_loss_clip": 0.01104222, + "auxiliary_loss_mlp": 0.00747281, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00049162, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 1.9111911024500867, + "language_loss": 0.81519985, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83371484, + "num_input_tokens_seen": 225367740, + "step": 10456, + "time_per_iteration": 2.7355668544769287 + }, + { + "auxiliary_loss_clip": 0.01135973, + "auxiliary_loss_mlp": 0.01106933, + "balance_loss_clip": 1.00210047, + "balance_loss_mlp": 1.0005033, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.5148794448734046, + "language_loss": 0.7215519, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74398088, + "num_input_tokens_seen": 225388405, + "step": 10457, + "time_per_iteration": 2.6281356811523438 + }, + { + "auxiliary_loss_clip": 0.01150758, + "auxiliary_loss_mlp": 0.0110648, + "balance_loss_clip": 1.00199592, + "balance_loss_mlp": 1.00052631, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 1.7053317873158735, + "language_loss": 0.80062419, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.82319653, + "num_input_tokens_seen": 225408360, + "step": 10458, + "time_per_iteration": 2.596035957336426 + }, + { + "auxiliary_loss_clip": 0.01149269, + "auxiliary_loss_mlp": 0.01106439, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.00058103, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.6386406490876815, + "language_loss": 0.61073124, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63328838, + "num_input_tokens_seen": 225431310, + "step": 10459, + "time_per_iteration": 3.9843528270721436 + }, + { + "auxiliary_loss_clip": 0.0113142, + "auxiliary_loss_mlp": 0.0110491, + "balance_loss_clip": 1.00179088, + "balance_loss_mlp": 1.00067389, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 1.796938227565854, + "language_loss": 0.78676069, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.80912399, + "num_input_tokens_seen": 225450385, + "step": 10460, + "time_per_iteration": 2.6210098266601562 + }, + { + "auxiliary_loss_clip": 0.01119603, + "auxiliary_loss_mlp": 0.01105587, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00049162, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 2.0926896079206134, + "language_loss": 0.74089181, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76314378, + "num_input_tokens_seen": 225467325, + "step": 10461, + "time_per_iteration": 2.6159098148345947 + }, + { + "auxiliary_loss_clip": 0.01133936, + "auxiliary_loss_mlp": 0.01105866, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.0004853, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.772193083915325, + "language_loss": 0.70404327, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72644132, + "num_input_tokens_seen": 225487370, + "step": 10462, + "time_per_iteration": 2.652421712875366 + }, + { + "auxiliary_loss_clip": 0.01165259, + "auxiliary_loss_mlp": 0.01103206, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.00049543, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 1.8509754949437431, + "language_loss": 0.72262239, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74530703, + "num_input_tokens_seen": 225506915, + "step": 10463, + "time_per_iteration": 2.584529399871826 + }, + { + "auxiliary_loss_clip": 0.01133158, + "auxiliary_loss_mlp": 0.01105578, + "balance_loss_clip": 1.00211596, + "balance_loss_mlp": 1.00067389, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 1.6595151409479396, + "language_loss": 0.72630727, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.74869466, + "num_input_tokens_seen": 225525670, + "step": 10464, + "time_per_iteration": 2.590932846069336 + }, + { + "auxiliary_loss_clip": 0.01135577, + "auxiliary_loss_mlp": 0.01105596, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.00050128, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.980175753911975, + "language_loss": 0.69564772, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71805942, + "num_input_tokens_seen": 225542235, + "step": 10465, + "time_per_iteration": 2.57167911529541 + }, + { + "auxiliary_loss_clip": 0.01144113, + "auxiliary_loss_mlp": 0.01081514, + "balance_loss_clip": 1.00114954, + "balance_loss_mlp": 0.99987966, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6761471085594757, + "language_loss": 0.59787881, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.62013507, + "num_input_tokens_seen": 225607185, + "step": 10466, + "time_per_iteration": 4.573511362075806 + }, + { + "auxiliary_loss_clip": 0.01116482, + "auxiliary_loss_mlp": 0.01104713, + "balance_loss_clip": 1.00164354, + "balance_loss_mlp": 1.00038075, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 2.3770661250838394, + "language_loss": 0.65039116, + "learning_rate": 1.276245767820154e-06, + "loss": 0.67260313, + "num_input_tokens_seen": 225628785, + "step": 10467, + "time_per_iteration": 2.773005962371826 + }, + { + "auxiliary_loss_clip": 0.01127776, + "auxiliary_loss_mlp": 0.01081555, + "balance_loss_clip": 1.00085485, + "balance_loss_mlp": 0.99992007, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7990006244660689, + "language_loss": 0.56890392, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.59099728, + "num_input_tokens_seen": 225678980, + "step": 10468, + "time_per_iteration": 2.899921417236328 + }, + { + "auxiliary_loss_clip": 0.01097445, + "auxiliary_loss_mlp": 0.01082435, + "balance_loss_clip": 1.0010128, + "balance_loss_mlp": 1.00003719, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7344278748793314, + "language_loss": 0.58011007, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.6019088, + "num_input_tokens_seen": 225740295, + "step": 10469, + "time_per_iteration": 3.1413447856903076 + }, + { + "auxiliary_loss_clip": 0.01144385, + "auxiliary_loss_mlp": 0.01082204, + "balance_loss_clip": 1.00167036, + "balance_loss_mlp": 1.00018775, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6782107489998093, + "language_loss": 0.52107632, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54334223, + "num_input_tokens_seen": 225805615, + "step": 10470, + "time_per_iteration": 3.156024932861328 + }, + { + "auxiliary_loss_clip": 0.01148556, + "auxiliary_loss_mlp": 0.01104524, + "balance_loss_clip": 1.00187671, + "balance_loss_mlp": 1.00057387, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.7332290298600919, + "language_loss": 0.74447477, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76700556, + "num_input_tokens_seen": 225826585, + "step": 10471, + "time_per_iteration": 4.161532163619995 + }, + { + "auxiliary_loss_clip": 0.01132411, + "auxiliary_loss_mlp": 0.01106338, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.0004797, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 2.283583948426186, + "language_loss": 0.6286689, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65105635, + "num_input_tokens_seen": 225844095, + "step": 10472, + "time_per_iteration": 4.023419380187988 + }, + { + "auxiliary_loss_clip": 0.01165713, + "auxiliary_loss_mlp": 0.01106291, + "balance_loss_clip": 1.00218463, + "balance_loss_mlp": 1.00052845, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 1.5760608218194796, + "language_loss": 0.69381338, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71653342, + "num_input_tokens_seen": 225864310, + "step": 10473, + "time_per_iteration": 2.5636038780212402 + }, + { + "auxiliary_loss_clip": 0.01133886, + "auxiliary_loss_mlp": 0.0110521, + "balance_loss_clip": 1.00183403, + "balance_loss_mlp": 1.00059152, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.5830776363806047, + "language_loss": 0.75026143, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.77265239, + "num_input_tokens_seen": 225883830, + "step": 10474, + "time_per_iteration": 2.65751314163208 + }, + { + "auxiliary_loss_clip": 0.01133903, + "auxiliary_loss_mlp": 0.0074728, + "balance_loss_clip": 1.00180638, + "balance_loss_mlp": 1.00053763, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 3.9132843959520325, + "language_loss": 0.66333979, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68215156, + "num_input_tokens_seen": 225905755, + "step": 10475, + "time_per_iteration": 2.668069839477539 + }, + { + "auxiliary_loss_clip": 0.01098464, + "auxiliary_loss_mlp": 0.01105039, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00051606, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 1.8507261367278143, + "language_loss": 0.90384936, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92588437, + "num_input_tokens_seen": 225922155, + "step": 10476, + "time_per_iteration": 2.6461429595947266 + }, + { + "auxiliary_loss_clip": 0.01165452, + "auxiliary_loss_mlp": 0.01105005, + "balance_loss_clip": 1.00204337, + "balance_loss_mlp": 1.00057769, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 2.5642200150298677, + "language_loss": 0.75747848, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.78018308, + "num_input_tokens_seen": 225941060, + "step": 10477, + "time_per_iteration": 2.5342137813568115 + }, + { + "auxiliary_loss_clip": 0.01149073, + "auxiliary_loss_mlp": 0.01105481, + "balance_loss_clip": 1.00191975, + "balance_loss_mlp": 1.00048125, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 3.509511072813596, + "language_loss": 0.70563728, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72818279, + "num_input_tokens_seen": 225960870, + "step": 10478, + "time_per_iteration": 2.572429895401001 + }, + { + "auxiliary_loss_clip": 0.0115106, + "auxiliary_loss_mlp": 0.01106401, + "balance_loss_clip": 1.00211132, + "balance_loss_mlp": 1.00044799, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.6025551788490757, + "language_loss": 0.6736511, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.6962257, + "num_input_tokens_seen": 225977895, + "step": 10479, + "time_per_iteration": 2.534986972808838 + }, + { + "auxiliary_loss_clip": 0.01133742, + "auxiliary_loss_mlp": 0.00747249, + "balance_loss_clip": 1.00181842, + "balance_loss_mlp": 1.00051963, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 2.029496278681394, + "language_loss": 0.73693168, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.7557416, + "num_input_tokens_seen": 225997835, + "step": 10480, + "time_per_iteration": 2.613161087036133 + }, + { + "auxiliary_loss_clip": 0.01148811, + "auxiliary_loss_mlp": 0.01106438, + "balance_loss_clip": 1.00192571, + "balance_loss_mlp": 1.00057995, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 1.9373317103910732, + "language_loss": 0.78806216, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81061471, + "num_input_tokens_seen": 226017620, + "step": 10481, + "time_per_iteration": 2.62394380569458 + }, + { + "auxiliary_loss_clip": 0.01130525, + "auxiliary_loss_mlp": 0.01081511, + "balance_loss_clip": 1.00107253, + "balance_loss_mlp": 0.99987692, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.8992740703716765, + "language_loss": 0.61804569, + "learning_rate": 1.2708028696588e-06, + "loss": 0.64016604, + "num_input_tokens_seen": 226068755, + "step": 10482, + "time_per_iteration": 2.9463822841644287 + }, + { + "auxiliary_loss_clip": 0.01148926, + "auxiliary_loss_mlp": 0.01106548, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.00049925, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.19804719909063, + "language_loss": 0.82909691, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.85165161, + "num_input_tokens_seen": 226084395, + "step": 10483, + "time_per_iteration": 2.5062673091888428 + }, + { + "auxiliary_loss_clip": 0.01148857, + "auxiliary_loss_mlp": 0.01104805, + "balance_loss_clip": 1.00198483, + "balance_loss_mlp": 1.00037813, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.7236676280766838, + "language_loss": 0.72975767, + "learning_rate": 1.270077618961487e-06, + "loss": 0.7522943, + "num_input_tokens_seen": 226105890, + "step": 10484, + "time_per_iteration": 2.61478590965271 + }, + { + "auxiliary_loss_clip": 0.01117554, + "auxiliary_loss_mlp": 0.01105749, + "balance_loss_clip": 1.00169206, + "balance_loss_mlp": 1.00036836, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.7463238914149364, + "language_loss": 0.74191451, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76414758, + "num_input_tokens_seen": 226126760, + "step": 10485, + "time_per_iteration": 2.6773977279663086 + }, + { + "auxiliary_loss_clip": 0.011325, + "auxiliary_loss_mlp": 0.00747325, + "balance_loss_clip": 1.00183213, + "balance_loss_mlp": 1.00053024, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.8694410988850674, + "language_loss": 0.81228364, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83108193, + "num_input_tokens_seen": 226147315, + "step": 10486, + "time_per_iteration": 2.6726343631744385 + }, + { + "auxiliary_loss_clip": 0.01131961, + "auxiliary_loss_mlp": 0.01105311, + "balance_loss_clip": 1.00191081, + "balance_loss_mlp": 1.00050199, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 1.6461610280395436, + "language_loss": 0.63530749, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.65768015, + "num_input_tokens_seen": 226165935, + "step": 10487, + "time_per_iteration": 2.583712100982666 + }, + { + "auxiliary_loss_clip": 0.01165366, + "auxiliary_loss_mlp": 0.01105405, + "balance_loss_clip": 1.00202036, + "balance_loss_mlp": 1.00069153, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.607993608739743, + "language_loss": 0.67249238, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69520009, + "num_input_tokens_seen": 226186890, + "step": 10488, + "time_per_iteration": 2.54135799407959 + }, + { + "auxiliary_loss_clip": 0.01133183, + "auxiliary_loss_mlp": 0.01105579, + "balance_loss_clip": 1.00192273, + "balance_loss_mlp": 1.00048435, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.72347246094187, + "language_loss": 0.67264009, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69502771, + "num_input_tokens_seen": 226206710, + "step": 10489, + "time_per_iteration": 2.6018364429473877 + }, + { + "auxiliary_loss_clip": 0.01117799, + "auxiliary_loss_mlp": 0.01107194, + "balance_loss_clip": 1.0018357, + "balance_loss_mlp": 1.00057316, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.7788735327332799, + "language_loss": 0.69551301, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71776301, + "num_input_tokens_seen": 226225565, + "step": 10490, + "time_per_iteration": 2.6297385692596436 + }, + { + "auxiliary_loss_clip": 0.01134553, + "auxiliary_loss_mlp": 0.01105992, + "balance_loss_clip": 1.00201392, + "balance_loss_mlp": 1.00061142, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 2.2779371746339137, + "language_loss": 0.78525376, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80765927, + "num_input_tokens_seen": 226243680, + "step": 10491, + "time_per_iteration": 2.59672212600708 + }, + { + "auxiliary_loss_clip": 0.01133766, + "auxiliary_loss_mlp": 0.01105271, + "balance_loss_clip": 1.00197947, + "balance_loss_mlp": 1.00046229, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 5.558054860887814, + "language_loss": 0.55834484, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.58073521, + "num_input_tokens_seen": 226264345, + "step": 10492, + "time_per_iteration": 2.615712881088257 + }, + { + "auxiliary_loss_clip": 0.01165515, + "auxiliary_loss_mlp": 0.01105331, + "balance_loss_clip": 1.00206065, + "balance_loss_mlp": 1.0005219, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 16.603594670070414, + "language_loss": 0.63705194, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.65976036, + "num_input_tokens_seen": 226283165, + "step": 10493, + "time_per_iteration": 2.502218246459961 + }, + { + "auxiliary_loss_clip": 0.0111657, + "auxiliary_loss_mlp": 0.01105224, + "balance_loss_clip": 1.00173986, + "balance_loss_mlp": 1.00041533, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.5642013600068632, + "language_loss": 0.82730174, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.84951961, + "num_input_tokens_seen": 226304080, + "step": 10494, + "time_per_iteration": 2.65335750579834 + }, + { + "auxiliary_loss_clip": 0.01135122, + "auxiliary_loss_mlp": 0.01105441, + "balance_loss_clip": 1.00193417, + "balance_loss_mlp": 1.00044131, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.6969398770970923, + "language_loss": 0.79171979, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81412542, + "num_input_tokens_seen": 226325925, + "step": 10495, + "time_per_iteration": 2.771840810775757 + }, + { + "auxiliary_loss_clip": 0.01134212, + "auxiliary_loss_mlp": 0.01106232, + "balance_loss_clip": 1.00194299, + "balance_loss_mlp": 1.00056458, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 2.0791101296960446, + "language_loss": 0.70814025, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.73054469, + "num_input_tokens_seen": 226344190, + "step": 10496, + "time_per_iteration": 2.565822124481201 + }, + { + "auxiliary_loss_clip": 0.01133956, + "auxiliary_loss_mlp": 0.01106431, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00066876, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 2.1304783411020973, + "language_loss": 0.79992455, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82232845, + "num_input_tokens_seen": 226361520, + "step": 10497, + "time_per_iteration": 4.026886463165283 + }, + { + "auxiliary_loss_clip": 0.01134618, + "auxiliary_loss_mlp": 0.01105132, + "balance_loss_clip": 1.00207114, + "balance_loss_mlp": 1.0006094, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 2.1210217203642645, + "language_loss": 0.74196708, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76436454, + "num_input_tokens_seen": 226381920, + "step": 10498, + "time_per_iteration": 2.6455748081207275 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01105287, + "balance_loss_clip": 1.00197148, + "balance_loss_mlp": 1.00047851, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 2.017905301172773, + "language_loss": 0.69418615, + "learning_rate": 1.264641775364217e-06, + "loss": 0.71674514, + "num_input_tokens_seen": 226400035, + "step": 10499, + "time_per_iteration": 2.6239519119262695 + }, + { + "auxiliary_loss_clip": 0.01149778, + "auxiliary_loss_mlp": 0.01105203, + "balance_loss_clip": 1.00206769, + "balance_loss_mlp": 1.00058532, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 1.8308122486569522, + "language_loss": 0.70105088, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72360069, + "num_input_tokens_seen": 226418280, + "step": 10500, + "time_per_iteration": 2.6043310165405273 + }, + { + "auxiliary_loss_clip": 0.01165328, + "auxiliary_loss_mlp": 0.01105078, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 1.00046039, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 7.617811902798675, + "language_loss": 0.74488258, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76758659, + "num_input_tokens_seen": 226436650, + "step": 10501, + "time_per_iteration": 2.6562564373016357 + }, + { + "auxiliary_loss_clip": 0.01148622, + "auxiliary_loss_mlp": 0.00747272, + "balance_loss_clip": 1.00201833, + "balance_loss_mlp": 1.0004679, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 1.875326062524401, + "language_loss": 0.75428939, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77324837, + "num_input_tokens_seen": 226456275, + "step": 10502, + "time_per_iteration": 2.6208841800689697 + }, + { + "auxiliary_loss_clip": 0.01148899, + "auxiliary_loss_mlp": 0.01107032, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.0007931, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 1.8823536311197255, + "language_loss": 0.85560298, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87816232, + "num_input_tokens_seen": 226473610, + "step": 10503, + "time_per_iteration": 3.9504263401031494 + }, + { + "auxiliary_loss_clip": 0.011363, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_clip": 1.00197935, + "balance_loss_mlp": 1.00047505, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 2.2563084954623056, + "language_loss": 0.86520523, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88762009, + "num_input_tokens_seen": 226493665, + "step": 10504, + "time_per_iteration": 2.5856590270996094 + }, + { + "auxiliary_loss_clip": 0.01115487, + "auxiliary_loss_mlp": 0.01106624, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00057566, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.6059411250870566, + "language_loss": 0.76561821, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78783935, + "num_input_tokens_seen": 226511625, + "step": 10505, + "time_per_iteration": 2.658522129058838 + }, + { + "auxiliary_loss_clip": 0.01104017, + "auxiliary_loss_mlp": 0.01105334, + "balance_loss_clip": 1.0017885, + "balance_loss_mlp": 1.00052547, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 1.8063666872874804, + "language_loss": 0.81891513, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.84100866, + "num_input_tokens_seen": 226530085, + "step": 10506, + "time_per_iteration": 2.7127060890197754 + }, + { + "auxiliary_loss_clip": 0.01165448, + "auxiliary_loss_mlp": 0.01105499, + "balance_loss_clip": 1.0019747, + "balance_loss_mlp": 1.00059509, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.9862689235430997, + "language_loss": 0.74540347, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.7681129, + "num_input_tokens_seen": 226548115, + "step": 10507, + "time_per_iteration": 2.5292866230010986 + }, + { + "auxiliary_loss_clip": 0.01132309, + "auxiliary_loss_mlp": 0.01106919, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00058401, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.599794306619317, + "language_loss": 0.67731112, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.6997034, + "num_input_tokens_seen": 226567955, + "step": 10508, + "time_per_iteration": 2.627187490463257 + }, + { + "auxiliary_loss_clip": 0.01120226, + "auxiliary_loss_mlp": 0.01105814, + "balance_loss_clip": 1.00201178, + "balance_loss_mlp": 1.00052881, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 2.478967241827473, + "language_loss": 0.70945346, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.73171383, + "num_input_tokens_seen": 226588205, + "step": 10509, + "time_per_iteration": 4.0131676197052 + }, + { + "auxiliary_loss_clip": 0.01149016, + "auxiliary_loss_mlp": 0.01105368, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00036812, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.824132515212355, + "language_loss": 0.79758257, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.82012641, + "num_input_tokens_seen": 226606965, + "step": 10510, + "time_per_iteration": 2.5452866554260254 + }, + { + "auxiliary_loss_clip": 0.01098759, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00050235, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.531910018034548, + "language_loss": 0.70496804, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.7234298, + "num_input_tokens_seen": 226627845, + "step": 10511, + "time_per_iteration": 4.204188108444214 + }, + { + "auxiliary_loss_clip": 0.0116546, + "auxiliary_loss_mlp": 0.01105509, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00050926, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 2.7340149599835697, + "language_loss": 0.8005147, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82322431, + "num_input_tokens_seen": 226645855, + "step": 10512, + "time_per_iteration": 2.556196451187134 + }, + { + "auxiliary_loss_clip": 0.01148259, + "auxiliary_loss_mlp": 0.01105933, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.0005517, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 1.9895369592185774, + "language_loss": 0.70910335, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.73164529, + "num_input_tokens_seen": 226665375, + "step": 10513, + "time_per_iteration": 2.6074259281158447 + }, + { + "auxiliary_loss_clip": 0.01150881, + "auxiliary_loss_mlp": 0.01106008, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00043583, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 1.6237648775382176, + "language_loss": 0.66278923, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68535817, + "num_input_tokens_seen": 226685270, + "step": 10514, + "time_per_iteration": 2.5894815921783447 + }, + { + "auxiliary_loss_clip": 0.01120934, + "auxiliary_loss_mlp": 0.01104679, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.0004425, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.7106091969895374, + "language_loss": 0.74216986, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76442599, + "num_input_tokens_seen": 226705325, + "step": 10515, + "time_per_iteration": 2.6773319244384766 + }, + { + "auxiliary_loss_clip": 0.01131753, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_clip": 1.00185883, + "balance_loss_mlp": 1.00046062, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 2.5559187431564823, + "language_loss": 0.90067804, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.92303967, + "num_input_tokens_seen": 226723815, + "step": 10516, + "time_per_iteration": 2.624986171722412 + }, + { + "auxiliary_loss_clip": 0.0116575, + "auxiliary_loss_mlp": 0.0110806, + "balance_loss_clip": 1.0021137, + "balance_loss_mlp": 1.00058067, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.790281252927573, + "language_loss": 0.81625277, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.83899087, + "num_input_tokens_seen": 226741550, + "step": 10517, + "time_per_iteration": 2.516171455383301 + }, + { + "auxiliary_loss_clip": 0.0108193, + "auxiliary_loss_mlp": 0.01105328, + "balance_loss_clip": 1.00164151, + "balance_loss_mlp": 1.0005188, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.6100460545403987, + "language_loss": 0.7775324, + "learning_rate": 1.257765386189541e-06, + "loss": 0.79940498, + "num_input_tokens_seen": 226761115, + "step": 10518, + "time_per_iteration": 2.7142457962036133 + }, + { + "auxiliary_loss_clip": 0.01148912, + "auxiliary_loss_mlp": 0.01105558, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00055814, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.6277937235064786, + "language_loss": 0.85031521, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87285995, + "num_input_tokens_seen": 226782225, + "step": 10519, + "time_per_iteration": 2.6196606159210205 + }, + { + "auxiliary_loss_clip": 0.01133737, + "auxiliary_loss_mlp": 0.01104665, + "balance_loss_clip": 1.00200474, + "balance_loss_mlp": 1.00052357, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.610701886592563, + "language_loss": 0.72166431, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.74404836, + "num_input_tokens_seen": 226802375, + "step": 10520, + "time_per_iteration": 2.6204516887664795 + }, + { + "auxiliary_loss_clip": 0.0114878, + "auxiliary_loss_mlp": 0.01104966, + "balance_loss_clip": 1.00192106, + "balance_loss_mlp": 1.0004431, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 2.0202292782244697, + "language_loss": 0.71832955, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.74086702, + "num_input_tokens_seen": 226822165, + "step": 10521, + "time_per_iteration": 2.567955493927002 + }, + { + "auxiliary_loss_clip": 0.01119497, + "auxiliary_loss_mlp": 0.01105859, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.00047755, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.9086285160047771, + "language_loss": 0.71789688, + "learning_rate": 1.256319016853377e-06, + "loss": 0.74015045, + "num_input_tokens_seen": 226841645, + "step": 10522, + "time_per_iteration": 2.6456551551818848 + }, + { + "auxiliary_loss_clip": 0.01100302, + "auxiliary_loss_mlp": 0.01104708, + "balance_loss_clip": 1.00166214, + "balance_loss_mlp": 1.00047135, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.8726078181892105, + "language_loss": 0.81933188, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.84138191, + "num_input_tokens_seen": 226860355, + "step": 10523, + "time_per_iteration": 2.677550792694092 + }, + { + "auxiliary_loss_clip": 0.01148689, + "auxiliary_loss_mlp": 0.01106201, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00053358, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.018583096081883, + "language_loss": 0.73902774, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76157665, + "num_input_tokens_seen": 226878390, + "step": 10524, + "time_per_iteration": 2.5507891178131104 + }, + { + "auxiliary_loss_clip": 0.01134029, + "auxiliary_loss_mlp": 0.01107341, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00071979, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 2.821100513904808, + "language_loss": 0.84292769, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86534131, + "num_input_tokens_seen": 226898420, + "step": 10525, + "time_per_iteration": 2.6790452003479004 + }, + { + "auxiliary_loss_clip": 0.01135592, + "auxiliary_loss_mlp": 0.01105018, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.0003041, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 1.616121533863458, + "language_loss": 0.66977674, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.69218278, + "num_input_tokens_seen": 226916305, + "step": 10526, + "time_per_iteration": 2.577017068862915 + }, + { + "auxiliary_loss_clip": 0.01148382, + "auxiliary_loss_mlp": 0.01106423, + "balance_loss_clip": 1.00195837, + "balance_loss_mlp": 1.00056529, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 1.6057820805013263, + "language_loss": 0.73549747, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75804555, + "num_input_tokens_seen": 226937705, + "step": 10527, + "time_per_iteration": 2.6140379905700684 + }, + { + "auxiliary_loss_clip": 0.01148774, + "auxiliary_loss_mlp": 0.01104775, + "balance_loss_clip": 1.0021354, + "balance_loss_mlp": 1.00044346, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 1.9738932877948268, + "language_loss": 0.71890473, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.74144024, + "num_input_tokens_seen": 226954880, + "step": 10528, + "time_per_iteration": 2.534550666809082 + }, + { + "auxiliary_loss_clip": 0.01148439, + "auxiliary_loss_mlp": 0.01105458, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00045896, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 2.3992105586230417, + "language_loss": 0.66065347, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68319249, + "num_input_tokens_seen": 226972595, + "step": 10529, + "time_per_iteration": 2.5181190967559814 + }, + { + "auxiliary_loss_clip": 0.01150896, + "auxiliary_loss_mlp": 0.0110689, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.00046027, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 1.8382520498624932, + "language_loss": 0.75084633, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.77342421, + "num_input_tokens_seen": 226991910, + "step": 10530, + "time_per_iteration": 2.550354242324829 + }, + { + "auxiliary_loss_clip": 0.01148728, + "auxiliary_loss_mlp": 0.00747068, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.00046623, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 1.896138541691783, + "language_loss": 0.74056542, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75952339, + "num_input_tokens_seen": 227010175, + "step": 10531, + "time_per_iteration": 2.581450939178467 + }, + { + "auxiliary_loss_clip": 0.0111527, + "auxiliary_loss_mlp": 0.01104235, + "balance_loss_clip": 1.00171483, + "balance_loss_mlp": 1.00047481, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 4.522498452450151, + "language_loss": 0.79636264, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81855768, + "num_input_tokens_seen": 227025540, + "step": 10532, + "time_per_iteration": 2.581289768218994 + }, + { + "auxiliary_loss_clip": 0.01148998, + "auxiliary_loss_mlp": 0.01104029, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00045955, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 1.7820411140954517, + "language_loss": 0.74878865, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.77131891, + "num_input_tokens_seen": 227045520, + "step": 10533, + "time_per_iteration": 2.5842392444610596 + }, + { + "auxiliary_loss_clip": 0.01131887, + "auxiliary_loss_mlp": 0.01106414, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00065136, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 3.139283297510585, + "language_loss": 0.77193582, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.7943188, + "num_input_tokens_seen": 227059420, + "step": 10534, + "time_per_iteration": 2.5332448482513428 + }, + { + "auxiliary_loss_clip": 0.01120389, + "auxiliary_loss_mlp": 0.01105481, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00048113, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.540495654348398, + "language_loss": 0.85564131, + "learning_rate": 1.251621437204777e-06, + "loss": 0.87789994, + "num_input_tokens_seen": 227081310, + "step": 10535, + "time_per_iteration": 2.706892251968384 + }, + { + "auxiliary_loss_clip": 0.01150663, + "auxiliary_loss_mlp": 0.01105085, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00037146, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 2.113684014247753, + "language_loss": 0.76418245, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78673995, + "num_input_tokens_seen": 227100365, + "step": 10536, + "time_per_iteration": 3.9312174320220947 + }, + { + "auxiliary_loss_clip": 0.01150431, + "auxiliary_loss_mlp": 0.01105464, + "balance_loss_clip": 1.00209594, + "balance_loss_mlp": 1.00046432, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.635440517190817, + "language_loss": 0.59872627, + "learning_rate": 1.250899157568855e-06, + "loss": 0.6212852, + "num_input_tokens_seen": 227119680, + "step": 10537, + "time_per_iteration": 2.6148898601531982 + }, + { + "auxiliary_loss_clip": 0.01110664, + "auxiliary_loss_mlp": 0.01081833, + "balance_loss_clip": 1.00098205, + "balance_loss_mlp": 1.00019825, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.770571016628155, + "language_loss": 0.52465785, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54658276, + "num_input_tokens_seen": 227184465, + "step": 10538, + "time_per_iteration": 3.304088830947876 + }, + { + "auxiliary_loss_clip": 0.01132366, + "auxiliary_loss_mlp": 0.01106848, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00051332, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.8621010897766237, + "language_loss": 0.83572102, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85811317, + "num_input_tokens_seen": 227202185, + "step": 10539, + "time_per_iteration": 2.6128673553466797 + }, + { + "auxiliary_loss_clip": 0.01134132, + "auxiliary_loss_mlp": 0.01105524, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00042915, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.8735098358829132, + "language_loss": 0.86655366, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88895023, + "num_input_tokens_seen": 227222020, + "step": 10540, + "time_per_iteration": 4.052729845046997 + }, + { + "auxiliary_loss_clip": 0.01131983, + "auxiliary_loss_mlp": 0.01105162, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00054407, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.5850205019183345, + "language_loss": 0.72455359, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.746925, + "num_input_tokens_seen": 227240885, + "step": 10541, + "time_per_iteration": 2.6969051361083984 + }, + { + "auxiliary_loss_clip": 0.01149119, + "auxiliary_loss_mlp": 0.01106596, + "balance_loss_clip": 1.00196397, + "balance_loss_mlp": 1.00045192, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 5.841634947420153, + "language_loss": 0.8463949, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.86895204, + "num_input_tokens_seen": 227257880, + "step": 10542, + "time_per_iteration": 2.679663896560669 + }, + { + "auxiliary_loss_clip": 0.01148486, + "auxiliary_loss_mlp": 0.01104627, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00039005, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 2.0239319014894486, + "language_loss": 0.77389395, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.7964251, + "num_input_tokens_seen": 227274840, + "step": 10543, + "time_per_iteration": 2.597903251647949 + }, + { + "auxiliary_loss_clip": 0.01098957, + "auxiliary_loss_mlp": 0.01104972, + "balance_loss_clip": 1.00164235, + "balance_loss_mlp": 1.00064039, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.7151126971736081, + "language_loss": 0.73403811, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75607741, + "num_input_tokens_seen": 227294835, + "step": 10544, + "time_per_iteration": 2.6711130142211914 + }, + { + "auxiliary_loss_clip": 0.01118942, + "auxiliary_loss_mlp": 0.01106651, + "balance_loss_clip": 1.00186372, + "balance_loss_mlp": 1.0006026, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 1.9076488249184504, + "language_loss": 0.68443429, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70669019, + "num_input_tokens_seen": 227314935, + "step": 10545, + "time_per_iteration": 2.6400842666625977 + }, + { + "auxiliary_loss_clip": 0.01133168, + "auxiliary_loss_mlp": 0.01104786, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00054908, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.4004141275414637, + "language_loss": 0.71377957, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73615915, + "num_input_tokens_seen": 227332905, + "step": 10546, + "time_per_iteration": 3.9185097217559814 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01104568, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.0005219, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.4554569505771238, + "language_loss": 0.77992296, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80245227, + "num_input_tokens_seen": 227354915, + "step": 10547, + "time_per_iteration": 2.6123948097229004 + }, + { + "auxiliary_loss_clip": 0.01103468, + "auxiliary_loss_mlp": 0.01106595, + "balance_loss_clip": 1.00177002, + "balance_loss_mlp": 1.00054622, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.8975887162468357, + "language_loss": 0.63250661, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.65460718, + "num_input_tokens_seen": 227372990, + "step": 10548, + "time_per_iteration": 4.118873357772827 + }, + { + "auxiliary_loss_clip": 0.01135988, + "auxiliary_loss_mlp": 0.01104973, + "balance_loss_clip": 1.00206566, + "balance_loss_mlp": 1.00045013, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.8399953571172671, + "language_loss": 0.62110645, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.643516, + "num_input_tokens_seen": 227393270, + "step": 10549, + "time_per_iteration": 2.6429567337036133 + }, + { + "auxiliary_loss_clip": 0.01098055, + "auxiliary_loss_mlp": 0.01105044, + "balance_loss_clip": 1.00155187, + "balance_loss_mlp": 1.00052142, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.5424401922287576, + "language_loss": 0.73307252, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75510347, + "num_input_tokens_seen": 227413630, + "step": 10550, + "time_per_iteration": 2.7052736282348633 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01082704, + "balance_loss_clip": 1.00073862, + "balance_loss_mlp": 1.00030649, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6937282975345089, + "language_loss": 0.57707167, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59901386, + "num_input_tokens_seen": 227476630, + "step": 10551, + "time_per_iteration": 3.2279393672943115 + }, + { + "auxiliary_loss_clip": 0.0111581, + "auxiliary_loss_mlp": 0.01104137, + "balance_loss_clip": 1.0017159, + "balance_loss_mlp": 1.00047231, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.7499260926051432, + "language_loss": 0.67188704, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69408649, + "num_input_tokens_seen": 227496060, + "step": 10552, + "time_per_iteration": 2.663801908493042 + }, + { + "auxiliary_loss_clip": 0.01118874, + "auxiliary_loss_mlp": 0.01106741, + "balance_loss_clip": 1.00185955, + "balance_loss_mlp": 1.00040674, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 2.240839698225508, + "language_loss": 0.82181752, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84407365, + "num_input_tokens_seen": 227513440, + "step": 10553, + "time_per_iteration": 2.635328531265259 + }, + { + "auxiliary_loss_clip": 0.01148707, + "auxiliary_loss_mlp": 0.01106882, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00045252, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 2.4783078113632957, + "language_loss": 0.54467589, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.56723177, + "num_input_tokens_seen": 227535395, + "step": 10554, + "time_per_iteration": 2.7221343517303467 + }, + { + "auxiliary_loss_clip": 0.01133502, + "auxiliary_loss_mlp": 0.01105822, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.00044096, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 1.8628198451245235, + "language_loss": 0.7088269, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.73122013, + "num_input_tokens_seen": 227554545, + "step": 10555, + "time_per_iteration": 2.6035349369049072 + }, + { + "auxiliary_loss_clip": 0.01128769, + "auxiliary_loss_mlp": 0.01081325, + "balance_loss_clip": 1.00117397, + "balance_loss_mlp": 1.00007248, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.7741164347698576, + "language_loss": 0.55354333, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57564425, + "num_input_tokens_seen": 227608575, + "step": 10556, + "time_per_iteration": 3.071367025375366 + }, + { + "auxiliary_loss_clip": 0.0113347, + "auxiliary_loss_mlp": 0.01106684, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00054038, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 2.0495981789271402, + "language_loss": 0.68398696, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70638847, + "num_input_tokens_seen": 227628175, + "step": 10557, + "time_per_iteration": 2.6250765323638916 + }, + { + "auxiliary_loss_clip": 0.01133954, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.0005306, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.6264561780158067, + "language_loss": 0.7024976, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72488862, + "num_input_tokens_seen": 227645330, + "step": 10558, + "time_per_iteration": 2.594965696334839 + }, + { + "auxiliary_loss_clip": 0.01136189, + "auxiliary_loss_mlp": 0.01105191, + "balance_loss_clip": 1.0020057, + "balance_loss_mlp": 1.00038171, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.6591020837132395, + "language_loss": 0.78181607, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80422986, + "num_input_tokens_seen": 227665250, + "step": 10559, + "time_per_iteration": 2.6142618656158447 + }, + { + "auxiliary_loss_clip": 0.01135986, + "auxiliary_loss_mlp": 0.0110638, + "balance_loss_clip": 1.00211978, + "balance_loss_mlp": 1.00052249, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 1.7664590041354646, + "language_loss": 0.68213999, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70456368, + "num_input_tokens_seen": 227685070, + "step": 10560, + "time_per_iteration": 2.609943151473999 + }, + { + "auxiliary_loss_clip": 0.01133709, + "auxiliary_loss_mlp": 0.01105324, + "balance_loss_clip": 1.00192344, + "balance_loss_mlp": 1.0005157, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.6324637067158774, + "language_loss": 0.76926816, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.79165846, + "num_input_tokens_seen": 227704430, + "step": 10561, + "time_per_iteration": 2.6698250770568848 + }, + { + "auxiliary_loss_clip": 0.01131982, + "auxiliary_loss_mlp": 0.01105695, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00060034, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 1.8451315447225496, + "language_loss": 0.72149599, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74387276, + "num_input_tokens_seen": 227724920, + "step": 10562, + "time_per_iteration": 2.6791982650756836 + }, + { + "auxiliary_loss_clip": 0.01149174, + "auxiliary_loss_mlp": 0.01106162, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.00049508, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 2.1502544960081145, + "language_loss": 0.80979663, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.83235002, + "num_input_tokens_seen": 227743400, + "step": 10563, + "time_per_iteration": 2.5968146324157715 + }, + { + "auxiliary_loss_clip": 0.01117906, + "auxiliary_loss_mlp": 0.01105792, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00060129, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.0901835006624823, + "language_loss": 0.8104431, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.83268011, + "num_input_tokens_seen": 227759990, + "step": 10564, + "time_per_iteration": 2.617060422897339 + }, + { + "auxiliary_loss_clip": 0.01134043, + "auxiliary_loss_mlp": 0.011062, + "balance_loss_clip": 1.00193346, + "balance_loss_mlp": 1.00053263, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.861074107385798, + "language_loss": 0.72268564, + "learning_rate": 1.240799222993407e-06, + "loss": 0.7450881, + "num_input_tokens_seen": 227780835, + "step": 10565, + "time_per_iteration": 2.7141263484954834 + }, + { + "auxiliary_loss_clip": 0.01149138, + "auxiliary_loss_mlp": 0.01106129, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.00055707, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 2.126497754317784, + "language_loss": 0.69164491, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71419764, + "num_input_tokens_seen": 227798580, + "step": 10566, + "time_per_iteration": 2.5518112182617188 + }, + { + "auxiliary_loss_clip": 0.01148476, + "auxiliary_loss_mlp": 0.01105822, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.00063205, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.645377405512652, + "language_loss": 0.696841, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71938401, + "num_input_tokens_seen": 227819210, + "step": 10567, + "time_per_iteration": 2.5973949432373047 + }, + { + "auxiliary_loss_clip": 0.01148614, + "auxiliary_loss_mlp": 0.0110452, + "balance_loss_clip": 1.00205207, + "balance_loss_mlp": 1.00047398, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 2.490980847577745, + "language_loss": 0.8458907, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86842203, + "num_input_tokens_seen": 227838340, + "step": 10568, + "time_per_iteration": 2.561067581176758 + }, + { + "auxiliary_loss_clip": 0.01104584, + "auxiliary_loss_mlp": 0.0110599, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00051332, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 2.1800428329524695, + "language_loss": 0.84056503, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86267078, + "num_input_tokens_seen": 227859170, + "step": 10569, + "time_per_iteration": 2.7266485691070557 + }, + { + "auxiliary_loss_clip": 0.01148159, + "auxiliary_loss_mlp": 0.01104332, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00047719, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.5894281870114098, + "language_loss": 0.69206786, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71459281, + "num_input_tokens_seen": 227878545, + "step": 10570, + "time_per_iteration": 2.5646016597747803 + }, + { + "auxiliary_loss_clip": 0.0114858, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00046158, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 2.873535535049416, + "language_loss": 0.66149807, + "learning_rate": 1.2386378775476e-06, + "loss": 0.68404043, + "num_input_tokens_seen": 227898875, + "step": 10571, + "time_per_iteration": 2.6157195568084717 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01106219, + "balance_loss_clip": 1.00208211, + "balance_loss_mlp": 1.00055242, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.8980031271644362, + "language_loss": 0.71484625, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73741657, + "num_input_tokens_seen": 227917130, + "step": 10572, + "time_per_iteration": 2.549812078475952 + }, + { + "auxiliary_loss_clip": 0.01119625, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_clip": 1.00184548, + "balance_loss_mlp": 1.00051022, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 1.4762688845839513, + "language_loss": 0.8137148, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83595759, + "num_input_tokens_seen": 227939550, + "step": 10573, + "time_per_iteration": 4.081108331680298 + }, + { + "auxiliary_loss_clip": 0.01133901, + "auxiliary_loss_mlp": 0.01106129, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00055683, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 2.131921845553344, + "language_loss": 0.6884867, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71088696, + "num_input_tokens_seen": 227962200, + "step": 10574, + "time_per_iteration": 2.8541719913482666 + }, + { + "auxiliary_loss_clip": 0.01165467, + "auxiliary_loss_mlp": 0.01105638, + "balance_loss_clip": 1.00210261, + "balance_loss_mlp": 1.00054336, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.2184945737951436, + "language_loss": 0.86937088, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.89208192, + "num_input_tokens_seen": 227979270, + "step": 10575, + "time_per_iteration": 2.5105113983154297 + }, + { + "auxiliary_loss_clip": 0.01165442, + "auxiliary_loss_mlp": 0.01105964, + "balance_loss_clip": 1.0020566, + "balance_loss_mlp": 1.00058341, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.990805306809448, + "language_loss": 0.72213221, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74484622, + "num_input_tokens_seen": 228000550, + "step": 10576, + "time_per_iteration": 2.609126567840576 + }, + { + "auxiliary_loss_clip": 0.01131581, + "auxiliary_loss_mlp": 0.0110563, + "balance_loss_clip": 1.00193357, + "balance_loss_mlp": 1.00043941, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.5330527965113296, + "language_loss": 0.69008696, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71245909, + "num_input_tokens_seen": 228022005, + "step": 10577, + "time_per_iteration": 4.052915334701538 + }, + { + "auxiliary_loss_clip": 0.01102238, + "auxiliary_loss_mlp": 0.01105985, + "balance_loss_clip": 1.00172031, + "balance_loss_mlp": 1.00050879, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.7635104257843504, + "language_loss": 0.71480548, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.73688775, + "num_input_tokens_seen": 228043770, + "step": 10578, + "time_per_iteration": 2.8295817375183105 + }, + { + "auxiliary_loss_clip": 0.01115275, + "auxiliary_loss_mlp": 0.00745577, + "balance_loss_clip": 1.00097561, + "balance_loss_mlp": 0.9999947, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7022112447706522, + "language_loss": 0.5452171, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56382561, + "num_input_tokens_seen": 228104985, + "step": 10579, + "time_per_iteration": 3.292970895767212 + }, + { + "auxiliary_loss_clip": 0.01133736, + "auxiliary_loss_mlp": 0.01105193, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00047946, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 1.7483319592256727, + "language_loss": 0.77018052, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79256988, + "num_input_tokens_seen": 228125620, + "step": 10580, + "time_per_iteration": 2.653064012527466 + }, + { + "auxiliary_loss_clip": 0.01118856, + "auxiliary_loss_mlp": 0.00747287, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00041723, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.2700474066208525, + "language_loss": 0.66707397, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68573546, + "num_input_tokens_seen": 228143495, + "step": 10581, + "time_per_iteration": 2.6641533374786377 + }, + { + "auxiliary_loss_clip": 0.01148692, + "auxiliary_loss_mlp": 0.01105033, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00050998, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.5197820646957974, + "language_loss": 0.68412089, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70665812, + "num_input_tokens_seen": 228166500, + "step": 10582, + "time_per_iteration": 2.6378586292266846 + }, + { + "auxiliary_loss_clip": 0.01132282, + "auxiliary_loss_mlp": 0.01105263, + "balance_loss_clip": 1.00170016, + "balance_loss_mlp": 1.00054967, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 1.9041205811432895, + "language_loss": 0.84845555, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.87083101, + "num_input_tokens_seen": 228185325, + "step": 10583, + "time_per_iteration": 2.6543612480163574 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01105341, + "balance_loss_clip": 1.001948, + "balance_loss_mlp": 1.00043678, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.5647514371396056, + "language_loss": 0.75274312, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77513558, + "num_input_tokens_seen": 228204050, + "step": 10584, + "time_per_iteration": 3.9639642238616943 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01105517, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00061297, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 2.274108259828446, + "language_loss": 0.73081887, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.7532441, + "num_input_tokens_seen": 228222430, + "step": 10585, + "time_per_iteration": 4.046297311782837 + }, + { + "auxiliary_loss_clip": 0.01116983, + "auxiliary_loss_mlp": 0.01105575, + "balance_loss_clip": 1.00181139, + "balance_loss_mlp": 1.00048006, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 1.7292423079124142, + "language_loss": 0.82987773, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.85210329, + "num_input_tokens_seen": 228241925, + "step": 10586, + "time_per_iteration": 2.644015073776245 + }, + { + "auxiliary_loss_clip": 0.01148541, + "auxiliary_loss_mlp": 0.01104012, + "balance_loss_clip": 1.00191605, + "balance_loss_mlp": 1.00044274, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 1.5160645291308936, + "language_loss": 0.72461808, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74714363, + "num_input_tokens_seen": 228262535, + "step": 10587, + "time_per_iteration": 2.6155588626861572 + }, + { + "auxiliary_loss_clip": 0.01132001, + "auxiliary_loss_mlp": 0.01105133, + "balance_loss_clip": 1.00185978, + "balance_loss_mlp": 1.00051498, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 2.1221798715153657, + "language_loss": 0.76679814, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.78916949, + "num_input_tokens_seen": 228281340, + "step": 10588, + "time_per_iteration": 2.5783376693725586 + }, + { + "auxiliary_loss_clip": 0.0109943, + "auxiliary_loss_mlp": 0.01104102, + "balance_loss_clip": 1.00166178, + "balance_loss_mlp": 1.0004375, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.9230822091311552, + "language_loss": 0.79859221, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82062757, + "num_input_tokens_seen": 228300865, + "step": 10589, + "time_per_iteration": 2.669719934463501 + }, + { + "auxiliary_loss_clip": 0.01135333, + "auxiliary_loss_mlp": 0.0110503, + "balance_loss_clip": 1.00201893, + "balance_loss_mlp": 1.00041163, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 1.898117193328137, + "language_loss": 0.67373598, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69613963, + "num_input_tokens_seen": 228320815, + "step": 10590, + "time_per_iteration": 2.6363658905029297 + }, + { + "auxiliary_loss_clip": 0.01148717, + "auxiliary_loss_mlp": 0.01105728, + "balance_loss_clip": 1.00181949, + "balance_loss_mlp": 1.00053787, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.9052231595053415, + "language_loss": 0.7913276, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.81387198, + "num_input_tokens_seen": 228339065, + "step": 10591, + "time_per_iteration": 2.554387092590332 + }, + { + "auxiliary_loss_clip": 0.01133719, + "auxiliary_loss_mlp": 0.01105768, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00048184, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.4489840738848825, + "language_loss": 0.88851535, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91091025, + "num_input_tokens_seen": 228359210, + "step": 10592, + "time_per_iteration": 2.7250008583068848 + }, + { + "auxiliary_loss_clip": 0.01150564, + "auxiliary_loss_mlp": 0.01105329, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00051975, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.3451016951614378, + "language_loss": 0.68404472, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70660365, + "num_input_tokens_seen": 228379630, + "step": 10593, + "time_per_iteration": 2.6055145263671875 + }, + { + "auxiliary_loss_clip": 0.01103774, + "auxiliary_loss_mlp": 0.01104434, + "balance_loss_clip": 1.00170994, + "balance_loss_mlp": 1.00038803, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.9462551326318611, + "language_loss": 0.63443351, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.6565156, + "num_input_tokens_seen": 228401410, + "step": 10594, + "time_per_iteration": 2.8674490451812744 + }, + { + "auxiliary_loss_clip": 0.0114329, + "auxiliary_loss_mlp": 0.01081588, + "balance_loss_clip": 1.00108743, + "balance_loss_mlp": 0.99995369, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7566010840134569, + "language_loss": 0.54594922, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56819797, + "num_input_tokens_seen": 228470335, + "step": 10595, + "time_per_iteration": 3.2650222778320312 + }, + { + "auxiliary_loss_clip": 0.01165537, + "auxiliary_loss_mlp": 0.0110584, + "balance_loss_clip": 1.00201964, + "balance_loss_mlp": 1.00064945, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 2.7654345076618543, + "language_loss": 0.66591585, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.68862957, + "num_input_tokens_seen": 228490765, + "step": 10596, + "time_per_iteration": 2.5805325508117676 + }, + { + "auxiliary_loss_clip": 0.01150856, + "auxiliary_loss_mlp": 0.01105609, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00060952, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 1.9977426557572004, + "language_loss": 0.79218078, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.81474543, + "num_input_tokens_seen": 228509700, + "step": 10597, + "time_per_iteration": 2.562812089920044 + }, + { + "auxiliary_loss_clip": 0.01148688, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.00060821, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.7347846264542965, + "language_loss": 0.75035119, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.77289701, + "num_input_tokens_seen": 228529050, + "step": 10598, + "time_per_iteration": 2.568802833557129 + }, + { + "auxiliary_loss_clip": 0.01117188, + "auxiliary_loss_mlp": 0.00747148, + "balance_loss_clip": 1.00184047, + "balance_loss_mlp": 1.00035858, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 2.312510070901527, + "language_loss": 0.68424511, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70288849, + "num_input_tokens_seen": 228544665, + "step": 10599, + "time_per_iteration": 2.651132106781006 + }, + { + "auxiliary_loss_clip": 0.0112117, + "auxiliary_loss_mlp": 0.01106369, + "balance_loss_clip": 1.00193226, + "balance_loss_mlp": 1.00051129, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.073023951918935, + "language_loss": 0.80449092, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82676631, + "num_input_tokens_seen": 228562060, + "step": 10600, + "time_per_iteration": 2.6584665775299072 + }, + { + "auxiliary_loss_clip": 0.01150449, + "auxiliary_loss_mlp": 0.01105759, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.00056899, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.5647079818481586, + "language_loss": 0.79891002, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82147211, + "num_input_tokens_seen": 228582550, + "step": 10601, + "time_per_iteration": 2.6088340282440186 + }, + { + "auxiliary_loss_clip": 0.01115617, + "auxiliary_loss_mlp": 0.01105226, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00051296, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 1.9387101061345622, + "language_loss": 0.67123151, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69343996, + "num_input_tokens_seen": 228604960, + "step": 10602, + "time_per_iteration": 2.704758882522583 + }, + { + "auxiliary_loss_clip": 0.01072483, + "auxiliary_loss_mlp": 0.01105502, + "balance_loss_clip": 1.0017426, + "balance_loss_mlp": 1.00040746, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.6327940969684283, + "language_loss": 0.79336101, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81514084, + "num_input_tokens_seen": 228622195, + "step": 10603, + "time_per_iteration": 2.7693772315979004 + }, + { + "auxiliary_loss_clip": 0.0110012, + "auxiliary_loss_mlp": 0.00747205, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.00043285, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 2.1796863598835006, + "language_loss": 0.76947147, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78794467, + "num_input_tokens_seen": 228639735, + "step": 10604, + "time_per_iteration": 2.675158977508545 + }, + { + "auxiliary_loss_clip": 0.01133576, + "auxiliary_loss_mlp": 0.01105862, + "balance_loss_clip": 1.00184751, + "balance_loss_mlp": 1.00038576, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.8742637140772016, + "language_loss": 0.76649213, + "learning_rate": 1.226409972197281e-06, + "loss": 0.78888655, + "num_input_tokens_seen": 228658195, + "step": 10605, + "time_per_iteration": 2.6112067699432373 + }, + { + "auxiliary_loss_clip": 0.01085083, + "auxiliary_loss_mlp": 0.01106177, + "balance_loss_clip": 1.00152946, + "balance_loss_mlp": 1.00041413, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.6834100329360047, + "language_loss": 0.659123, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.68103564, + "num_input_tokens_seen": 228677415, + "step": 10606, + "time_per_iteration": 2.703315019607544 + }, + { + "auxiliary_loss_clip": 0.01131827, + "auxiliary_loss_mlp": 0.01104298, + "balance_loss_clip": 1.00181115, + "balance_loss_mlp": 1.0006336, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.6692345584223103, + "language_loss": 0.75427401, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77663529, + "num_input_tokens_seen": 228696450, + "step": 10607, + "time_per_iteration": 2.6198744773864746 + }, + { + "auxiliary_loss_clip": 0.01131478, + "auxiliary_loss_mlp": 0.01106211, + "balance_loss_clip": 1.00186312, + "balance_loss_mlp": 1.00063956, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 2.7083155106385326, + "language_loss": 0.65568399, + "learning_rate": 1.225332659627278e-06, + "loss": 0.67806089, + "num_input_tokens_seen": 228721600, + "step": 10608, + "time_per_iteration": 2.8896265029907227 + }, + { + "auxiliary_loss_clip": 0.01067405, + "auxiliary_loss_mlp": 0.01081185, + "balance_loss_clip": 1.00092506, + "balance_loss_mlp": 0.99993181, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7087595595999079, + "language_loss": 0.51905119, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.54053706, + "num_input_tokens_seen": 228784535, + "step": 10609, + "time_per_iteration": 3.3210816383361816 + }, + { + "auxiliary_loss_clip": 0.01149966, + "auxiliary_loss_mlp": 0.0110385, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00047207, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.767747100112022, + "language_loss": 0.74344087, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.76597905, + "num_input_tokens_seen": 228804110, + "step": 10610, + "time_per_iteration": 4.301864862442017 + }, + { + "auxiliary_loss_clip": 0.0112609, + "auxiliary_loss_mlp": 0.0108131, + "balance_loss_clip": 1.00100052, + "balance_loss_mlp": 1.00005651, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8566290666556006, + "language_loss": 0.63090587, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65297985, + "num_input_tokens_seen": 228867705, + "step": 10611, + "time_per_iteration": 3.1736035346984863 + }, + { + "auxiliary_loss_clip": 0.01148655, + "auxiliary_loss_mlp": 0.01105014, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00039601, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 2.0740565788223635, + "language_loss": 0.72270823, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74524486, + "num_input_tokens_seen": 228889215, + "step": 10612, + "time_per_iteration": 2.6270041465759277 + }, + { + "auxiliary_loss_clip": 0.01129616, + "auxiliary_loss_mlp": 0.01081247, + "balance_loss_clip": 1.00093853, + "balance_loss_mlp": 0.9999941, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7094739052673953, + "language_loss": 0.57886803, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.60097671, + "num_input_tokens_seen": 228948465, + "step": 10613, + "time_per_iteration": 3.0506069660186768 + }, + { + "auxiliary_loss_clip": 0.01102428, + "auxiliary_loss_mlp": 0.01105143, + "balance_loss_clip": 1.00178397, + "balance_loss_mlp": 1.00062001, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 2.135257172319676, + "language_loss": 0.7576859, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77976167, + "num_input_tokens_seen": 228967955, + "step": 10614, + "time_per_iteration": 2.7141072750091553 + }, + { + "auxiliary_loss_clip": 0.01133922, + "auxiliary_loss_mlp": 0.00747188, + "balance_loss_clip": 1.00207329, + "balance_loss_mlp": 1.00039625, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 2.262352303236198, + "language_loss": 0.7933507, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.8121618, + "num_input_tokens_seen": 228985495, + "step": 10615, + "time_per_iteration": 4.559790134429932 + }, + { + "auxiliary_loss_clip": 0.01127383, + "auxiliary_loss_mlp": 0.01081633, + "balance_loss_clip": 1.00110245, + "balance_loss_mlp": 0.99999869, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6606015843713655, + "language_loss": 0.55619985, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57828999, + "num_input_tokens_seen": 229052995, + "step": 10616, + "time_per_iteration": 3.2905566692352295 + }, + { + "auxiliary_loss_clip": 0.01135788, + "auxiliary_loss_mlp": 0.01106133, + "balance_loss_clip": 1.00209749, + "balance_loss_mlp": 1.00065696, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 2.1611709823192293, + "language_loss": 0.83901787, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.86143708, + "num_input_tokens_seen": 229071030, + "step": 10617, + "time_per_iteration": 2.644831895828247 + }, + { + "auxiliary_loss_clip": 0.0114871, + "auxiliary_loss_mlp": 0.01105838, + "balance_loss_clip": 1.00193369, + "balance_loss_mlp": 1.00055218, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.9424098145044622, + "language_loss": 0.86841124, + "learning_rate": 1.221743529196936e-06, + "loss": 0.8909567, + "num_input_tokens_seen": 229088275, + "step": 10618, + "time_per_iteration": 2.5473453998565674 + }, + { + "auxiliary_loss_clip": 0.01086312, + "auxiliary_loss_mlp": 0.01104988, + "balance_loss_clip": 1.00170851, + "balance_loss_mlp": 1.000561, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.7236980097153898, + "language_loss": 0.73431498, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75622797, + "num_input_tokens_seen": 229105190, + "step": 10619, + "time_per_iteration": 2.72959041595459 + }, + { + "auxiliary_loss_clip": 0.01134194, + "auxiliary_loss_mlp": 0.01106897, + "balance_loss_clip": 1.00204396, + "balance_loss_mlp": 1.00056279, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 2.1366111506852077, + "language_loss": 0.76350194, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78591281, + "num_input_tokens_seen": 229122290, + "step": 10620, + "time_per_iteration": 2.596379518508911 + }, + { + "auxiliary_loss_clip": 0.01132008, + "auxiliary_loss_mlp": 0.01105513, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00041795, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 4.127323637600838, + "language_loss": 0.7046479, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.72702312, + "num_input_tokens_seen": 229141620, + "step": 10621, + "time_per_iteration": 4.058500289916992 + }, + { + "auxiliary_loss_clip": 0.01133293, + "auxiliary_loss_mlp": 0.01104236, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00066721, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.628531204309203, + "language_loss": 0.77675509, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79913038, + "num_input_tokens_seen": 229161570, + "step": 10622, + "time_per_iteration": 2.6305510997772217 + }, + { + "auxiliary_loss_clip": 0.01117442, + "auxiliary_loss_mlp": 0.01104928, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.00050056, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 2.1561181665143123, + "language_loss": 0.74544573, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.76766944, + "num_input_tokens_seen": 229178465, + "step": 10623, + "time_per_iteration": 4.068316698074341 + }, + { + "auxiliary_loss_clip": 0.0113355, + "auxiliary_loss_mlp": 0.01104175, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00051069, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 3.1669900197054384, + "language_loss": 0.76933563, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.79171288, + "num_input_tokens_seen": 229198975, + "step": 10624, + "time_per_iteration": 2.6260790824890137 + }, + { + "auxiliary_loss_clip": 0.01083846, + "auxiliary_loss_mlp": 0.01105586, + "balance_loss_clip": 1.00164819, + "balance_loss_mlp": 1.00049067, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.7639557809023767, + "language_loss": 0.80685341, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82874775, + "num_input_tokens_seen": 229218825, + "step": 10625, + "time_per_iteration": 2.7445690631866455 + }, + { + "auxiliary_loss_clip": 0.01131925, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00057912, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.9335208995015032, + "language_loss": 0.72567201, + "learning_rate": 1.218874349031654e-06, + "loss": 0.7480576, + "num_input_tokens_seen": 229236060, + "step": 10626, + "time_per_iteration": 2.5658321380615234 + }, + { + "auxiliary_loss_clip": 0.01133064, + "auxiliary_loss_mlp": 0.01106713, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00056887, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.6692370675915396, + "language_loss": 0.72431076, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74670851, + "num_input_tokens_seen": 229255160, + "step": 10627, + "time_per_iteration": 2.604968786239624 + }, + { + "auxiliary_loss_clip": 0.011198, + "auxiliary_loss_mlp": 0.01106192, + "balance_loss_clip": 1.00183332, + "balance_loss_mlp": 1.00052512, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 1.7389763706971257, + "language_loss": 0.66874772, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69100761, + "num_input_tokens_seen": 229278705, + "step": 10628, + "time_per_iteration": 2.720038890838623 + }, + { + "auxiliary_loss_clip": 0.01165329, + "auxiliary_loss_mlp": 0.01104369, + "balance_loss_clip": 1.00205493, + "balance_loss_mlp": 1.0004189, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 2.011096665528044, + "language_loss": 0.67918885, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.70188582, + "num_input_tokens_seen": 229299990, + "step": 10629, + "time_per_iteration": 2.5485787391662598 + }, + { + "auxiliary_loss_clip": 0.01117504, + "auxiliary_loss_mlp": 0.01107125, + "balance_loss_clip": 1.00182879, + "balance_loss_mlp": 1.00059986, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.6029473741267626, + "language_loss": 0.75245118, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77469742, + "num_input_tokens_seen": 229319230, + "step": 10630, + "time_per_iteration": 2.6324779987335205 + }, + { + "auxiliary_loss_clip": 0.01135394, + "auxiliary_loss_mlp": 0.01105277, + "balance_loss_clip": 1.00203896, + "balance_loss_mlp": 1.00046849, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.6802599717738438, + "language_loss": 0.70745224, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.72985893, + "num_input_tokens_seen": 229338600, + "step": 10631, + "time_per_iteration": 2.6036014556884766 + }, + { + "auxiliary_loss_clip": 0.01129103, + "auxiliary_loss_mlp": 0.01081058, + "balance_loss_clip": 1.00118279, + "balance_loss_mlp": 1.00018692, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7714064895039481, + "language_loss": 0.62923825, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.65133989, + "num_input_tokens_seen": 229402420, + "step": 10632, + "time_per_iteration": 3.2161037921905518 + }, + { + "auxiliary_loss_clip": 0.01132048, + "auxiliary_loss_mlp": 0.01104658, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.0006125, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 1.8171566279274234, + "language_loss": 0.66761374, + "learning_rate": 1.216365371217893e-06, + "loss": 0.6899808, + "num_input_tokens_seen": 229419185, + "step": 10633, + "time_per_iteration": 2.5944900512695312 + }, + { + "auxiliary_loss_clip": 0.0106724, + "auxiliary_loss_mlp": 0.01104606, + "balance_loss_clip": 1.00154865, + "balance_loss_mlp": 1.00036955, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 1.8779997752717563, + "language_loss": 0.82175863, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84347713, + "num_input_tokens_seen": 229436735, + "step": 10634, + "time_per_iteration": 2.747609853744507 + }, + { + "auxiliary_loss_clip": 0.01132462, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00069141, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 2.0753247828793513, + "language_loss": 0.7537595, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77614391, + "num_input_tokens_seen": 229455595, + "step": 10635, + "time_per_iteration": 2.622060537338257 + }, + { + "auxiliary_loss_clip": 0.01148983, + "auxiliary_loss_mlp": 0.01105649, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00055385, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.8649285995385407, + "language_loss": 0.71464014, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73718643, + "num_input_tokens_seen": 229476230, + "step": 10636, + "time_per_iteration": 2.6237645149230957 + }, + { + "auxiliary_loss_clip": 0.0113221, + "auxiliary_loss_mlp": 0.01106475, + "balance_loss_clip": 1.00192082, + "balance_loss_mlp": 1.00061727, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 1.9984978501634387, + "language_loss": 0.73174119, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75412804, + "num_input_tokens_seen": 229494300, + "step": 10637, + "time_per_iteration": 2.61539363861084 + }, + { + "auxiliary_loss_clip": 0.01149063, + "auxiliary_loss_mlp": 0.01106336, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00038302, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.7466073734826517, + "language_loss": 0.77444839, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.79700238, + "num_input_tokens_seen": 229512985, + "step": 10638, + "time_per_iteration": 2.5683040618896484 + }, + { + "auxiliary_loss_clip": 0.01132254, + "auxiliary_loss_mlp": 0.01105355, + "balance_loss_clip": 1.00193274, + "balance_loss_mlp": 1.00054646, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 1.5559214462490722, + "language_loss": 0.81652158, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83889771, + "num_input_tokens_seen": 229534270, + "step": 10639, + "time_per_iteration": 2.693230390548706 + }, + { + "auxiliary_loss_clip": 0.0112694, + "auxiliary_loss_mlp": 0.01080994, + "balance_loss_clip": 1.0010258, + "balance_loss_mlp": 1.00012231, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8142055677965616, + "language_loss": 0.59020221, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61228156, + "num_input_tokens_seen": 229596455, + "step": 10640, + "time_per_iteration": 3.1505661010742188 + }, + { + "auxiliary_loss_clip": 0.01136083, + "auxiliary_loss_mlp": 0.01105506, + "balance_loss_clip": 1.00201917, + "balance_loss_mlp": 1.0005064, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.7361900354140667, + "language_loss": 0.78651369, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80892956, + "num_input_tokens_seen": 229612860, + "step": 10641, + "time_per_iteration": 2.5926737785339355 + }, + { + "auxiliary_loss_clip": 0.01101371, + "auxiliary_loss_mlp": 0.01106994, + "balance_loss_clip": 1.0016818, + "balance_loss_mlp": 1.0005635, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 1.6980852183971336, + "language_loss": 0.6320045, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65408814, + "num_input_tokens_seen": 229633960, + "step": 10642, + "time_per_iteration": 2.7217047214508057 + }, + { + "auxiliary_loss_clip": 0.01132759, + "auxiliary_loss_mlp": 0.01081088, + "balance_loss_clip": 1.001158, + "balance_loss_mlp": 1.00021648, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.9137314619065234, + "language_loss": 0.55924571, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.58138418, + "num_input_tokens_seen": 229686730, + "step": 10643, + "time_per_iteration": 3.130256175994873 + }, + { + "auxiliary_loss_clip": 0.0111743, + "auxiliary_loss_mlp": 0.01106434, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00038528, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 4.5536348030380935, + "language_loss": 0.77095425, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.79319286, + "num_input_tokens_seen": 229704800, + "step": 10644, + "time_per_iteration": 2.658022880554199 + }, + { + "auxiliary_loss_clip": 0.01115314, + "auxiliary_loss_mlp": 0.01105789, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.00069416, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.5098299881538773, + "language_loss": 0.82668293, + "learning_rate": 1.212067656542203e-06, + "loss": 0.848894, + "num_input_tokens_seen": 229725265, + "step": 10645, + "time_per_iteration": 2.6798110008239746 + }, + { + "auxiliary_loss_clip": 0.01150888, + "auxiliary_loss_mlp": 0.01105182, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00046873, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 2.353473199156979, + "language_loss": 0.73602259, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75858331, + "num_input_tokens_seen": 229744840, + "step": 10646, + "time_per_iteration": 2.6098110675811768 + }, + { + "auxiliary_loss_clip": 0.01119528, + "auxiliary_loss_mlp": 0.01105746, + "balance_loss_clip": 1.0019809, + "balance_loss_mlp": 1.00045991, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.271168611450042, + "language_loss": 0.80133653, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82358921, + "num_input_tokens_seen": 229759095, + "step": 10647, + "time_per_iteration": 2.5955212116241455 + }, + { + "auxiliary_loss_clip": 0.01104348, + "auxiliary_loss_mlp": 0.011049, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00047255, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 1.7028436828054407, + "language_loss": 0.75847864, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.7805711, + "num_input_tokens_seen": 229777750, + "step": 10648, + "time_per_iteration": 4.0712504386901855 + }, + { + "auxiliary_loss_clip": 0.01134585, + "auxiliary_loss_mlp": 0.0110524, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00043166, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 1.9014534496857902, + "language_loss": 0.78882051, + "learning_rate": 1.210636039936138e-06, + "loss": 0.81121874, + "num_input_tokens_seen": 229796785, + "step": 10649, + "time_per_iteration": 2.629051685333252 + }, + { + "auxiliary_loss_clip": 0.01084754, + "auxiliary_loss_mlp": 0.01105863, + "balance_loss_clip": 1.00167716, + "balance_loss_mlp": 1.00048232, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 2.006237645593255, + "language_loss": 0.75158781, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77349395, + "num_input_tokens_seen": 229815425, + "step": 10650, + "time_per_iteration": 2.725663661956787 + }, + { + "auxiliary_loss_clip": 0.01165384, + "auxiliary_loss_mlp": 0.01105265, + "balance_loss_clip": 1.00202131, + "balance_loss_mlp": 1.00055134, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 2.090626952051307, + "language_loss": 0.71097273, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.73367918, + "num_input_tokens_seen": 229834545, + "step": 10651, + "time_per_iteration": 2.5514683723449707 + }, + { + "auxiliary_loss_clip": 0.01117229, + "auxiliary_loss_mlp": 0.01106391, + "balance_loss_clip": 1.0018034, + "balance_loss_mlp": 1.00062859, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 3.108516956027054, + "language_loss": 0.6361897, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.65842593, + "num_input_tokens_seen": 229849175, + "step": 10652, + "time_per_iteration": 4.07157564163208 + }, + { + "auxiliary_loss_clip": 0.01135939, + "auxiliary_loss_mlp": 0.01105168, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00045431, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 2.246654795655533, + "language_loss": 0.79284817, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81525922, + "num_input_tokens_seen": 229865400, + "step": 10653, + "time_per_iteration": 2.673711061477661 + }, + { + "auxiliary_loss_clip": 0.01135826, + "auxiliary_loss_mlp": 0.01108374, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00060844, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.786643169329793, + "language_loss": 0.70266938, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72511137, + "num_input_tokens_seen": 229882945, + "step": 10654, + "time_per_iteration": 2.5920708179473877 + }, + { + "auxiliary_loss_clip": 0.01148956, + "auxiliary_loss_mlp": 0.01107129, + "balance_loss_clip": 1.00197375, + "balance_loss_mlp": 1.00060356, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 2.3427777374022623, + "language_loss": 0.72975838, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.75231922, + "num_input_tokens_seen": 229901590, + "step": 10655, + "time_per_iteration": 2.5790226459503174 + }, + { + "auxiliary_loss_clip": 0.01115964, + "auxiliary_loss_mlp": 0.01105919, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00053835, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.696400692635525, + "language_loss": 0.829988, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.85220683, + "num_input_tokens_seen": 229922535, + "step": 10656, + "time_per_iteration": 2.7108871936798096 + }, + { + "auxiliary_loss_clip": 0.0110322, + "auxiliary_loss_mlp": 0.01105286, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00057292, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.3292781544709436, + "language_loss": 0.72470772, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74679279, + "num_input_tokens_seen": 229939575, + "step": 10657, + "time_per_iteration": 2.6531808376312256 + }, + { + "auxiliary_loss_clip": 0.01116404, + "auxiliary_loss_mlp": 0.01105635, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00063539, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 1.7183327326836404, + "language_loss": 0.77136952, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79358989, + "num_input_tokens_seen": 229958840, + "step": 10658, + "time_per_iteration": 2.6517648696899414 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01106002, + "balance_loss_clip": 1.0019809, + "balance_loss_mlp": 1.00062108, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 2.1787968101672317, + "language_loss": 0.75991309, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78262728, + "num_input_tokens_seen": 229979680, + "step": 10659, + "time_per_iteration": 3.9532976150512695 + }, + { + "auxiliary_loss_clip": 0.01148666, + "auxiliary_loss_mlp": 0.01106186, + "balance_loss_clip": 1.00190723, + "balance_loss_mlp": 1.00051856, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 2.398747416512377, + "language_loss": 0.78211373, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80466223, + "num_input_tokens_seen": 229996830, + "step": 10660, + "time_per_iteration": 2.617365837097168 + }, + { + "auxiliary_loss_clip": 0.01134384, + "auxiliary_loss_mlp": 0.01106947, + "balance_loss_clip": 1.00189233, + "balance_loss_mlp": 1.0006125, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 2.059281288948055, + "language_loss": 0.68306983, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70548308, + "num_input_tokens_seen": 230015115, + "step": 10661, + "time_per_iteration": 4.053146839141846 + }, + { + "auxiliary_loss_clip": 0.01165389, + "auxiliary_loss_mlp": 0.0110476, + "balance_loss_clip": 1.00214279, + "balance_loss_mlp": 1.00061905, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.5386148766196548, + "language_loss": 0.76023591, + "learning_rate": 1.205986598033362e-06, + "loss": 0.78293735, + "num_input_tokens_seen": 230035515, + "step": 10662, + "time_per_iteration": 2.550086736679077 + }, + { + "auxiliary_loss_clip": 0.01150205, + "auxiliary_loss_mlp": 0.0110558, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00058031, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 1.7028439998129792, + "language_loss": 0.69479108, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.71734893, + "num_input_tokens_seen": 230054355, + "step": 10663, + "time_per_iteration": 2.6231327056884766 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01106371, + "balance_loss_clip": 1.00181532, + "balance_loss_mlp": 1.00070357, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 2.107842582047652, + "language_loss": 0.67905354, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70126355, + "num_input_tokens_seen": 230074605, + "step": 10664, + "time_per_iteration": 2.661795139312744 + }, + { + "auxiliary_loss_clip": 0.01133616, + "auxiliary_loss_mlp": 0.01105027, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.00059938, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.8617945344694025, + "language_loss": 0.66215354, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68453997, + "num_input_tokens_seen": 230093820, + "step": 10665, + "time_per_iteration": 2.629085063934326 + }, + { + "auxiliary_loss_clip": 0.01150551, + "auxiliary_loss_mlp": 0.01105224, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00051093, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.7269817423745408, + "language_loss": 0.64218539, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66474313, + "num_input_tokens_seen": 230114285, + "step": 10666, + "time_per_iteration": 2.565166711807251 + }, + { + "auxiliary_loss_clip": 0.01148203, + "auxiliary_loss_mlp": 0.01104909, + "balance_loss_clip": 1.00194871, + "balance_loss_mlp": 1.00057662, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 3.053362156111195, + "language_loss": 0.70895594, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73148704, + "num_input_tokens_seen": 230132760, + "step": 10667, + "time_per_iteration": 2.5459930896759033 + }, + { + "auxiliary_loss_clip": 0.0108726, + "auxiliary_loss_mlp": 0.00747477, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00047314, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.2540778794979115, + "language_loss": 0.77251101, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79085839, + "num_input_tokens_seen": 230149690, + "step": 10668, + "time_per_iteration": 2.7015724182128906 + }, + { + "auxiliary_loss_clip": 0.0114999, + "auxiliary_loss_mlp": 0.01105858, + "balance_loss_clip": 1.00216401, + "balance_loss_mlp": 1.00057209, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.7296512813989322, + "language_loss": 0.68115711, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.70371556, + "num_input_tokens_seen": 230166950, + "step": 10669, + "time_per_iteration": 2.600341796875 + }, + { + "auxiliary_loss_clip": 0.01150095, + "auxiliary_loss_mlp": 0.01106488, + "balance_loss_clip": 1.00210643, + "balance_loss_mlp": 1.00063002, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 1.9835284109984375, + "language_loss": 0.78310287, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80566871, + "num_input_tokens_seen": 230184785, + "step": 10670, + "time_per_iteration": 2.545599937438965 + }, + { + "auxiliary_loss_clip": 0.01115223, + "auxiliary_loss_mlp": 0.01106173, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00060177, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.479990242980306, + "language_loss": 0.8883934, + "learning_rate": 1.20277073264638e-06, + "loss": 0.91060734, + "num_input_tokens_seen": 230201385, + "step": 10671, + "time_per_iteration": 2.613468647003174 + }, + { + "auxiliary_loss_clip": 0.01150583, + "auxiliary_loss_mlp": 0.01104334, + "balance_loss_clip": 1.00210559, + "balance_loss_mlp": 1.00047886, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.5442857389328404, + "language_loss": 0.69698066, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71952987, + "num_input_tokens_seen": 230220380, + "step": 10672, + "time_per_iteration": 2.5534768104553223 + }, + { + "auxiliary_loss_clip": 0.01149897, + "auxiliary_loss_mlp": 0.01106422, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00037372, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 2.1379376410627664, + "language_loss": 0.73931456, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76187772, + "num_input_tokens_seen": 230239845, + "step": 10673, + "time_per_iteration": 2.5872740745544434 + }, + { + "auxiliary_loss_clip": 0.01117146, + "auxiliary_loss_mlp": 0.01105863, + "balance_loss_clip": 1.00176632, + "balance_loss_mlp": 1.00057721, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 1.5929580484617636, + "language_loss": 0.69144863, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71367872, + "num_input_tokens_seen": 230262420, + "step": 10674, + "time_per_iteration": 2.6871376037597656 + }, + { + "auxiliary_loss_clip": 0.01165396, + "auxiliary_loss_mlp": 0.01106107, + "balance_loss_clip": 1.00191486, + "balance_loss_mlp": 1.00044, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 2.2287271914112408, + "language_loss": 0.66693735, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68965244, + "num_input_tokens_seen": 230279950, + "step": 10675, + "time_per_iteration": 2.5359392166137695 + }, + { + "auxiliary_loss_clip": 0.01165508, + "auxiliary_loss_mlp": 0.01105589, + "balance_loss_clip": 1.0022229, + "balance_loss_mlp": 1.000494, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 1.7852963236281019, + "language_loss": 0.66359842, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68630946, + "num_input_tokens_seen": 230299705, + "step": 10676, + "time_per_iteration": 2.5406124591827393 + }, + { + "auxiliary_loss_clip": 0.0116544, + "auxiliary_loss_mlp": 0.01106233, + "balance_loss_clip": 1.00207329, + "balance_loss_mlp": 1.0004704, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 2.0167923140807646, + "language_loss": 0.75455475, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.77727151, + "num_input_tokens_seen": 230320030, + "step": 10677, + "time_per_iteration": 2.5757923126220703 + }, + { + "auxiliary_loss_clip": 0.01126378, + "auxiliary_loss_mlp": 0.01080967, + "balance_loss_clip": 1.00107551, + "balance_loss_mlp": 1.00009525, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7728807718366301, + "language_loss": 0.60721892, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62929237, + "num_input_tokens_seen": 230381495, + "step": 10678, + "time_per_iteration": 3.2410569190979004 + }, + { + "auxiliary_loss_clip": 0.01150529, + "auxiliary_loss_mlp": 0.01104574, + "balance_loss_clip": 1.00202155, + "balance_loss_mlp": 1.00052786, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.752248628608047, + "language_loss": 0.67462695, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69717795, + "num_input_tokens_seen": 230401385, + "step": 10679, + "time_per_iteration": 2.562281608581543 + }, + { + "auxiliary_loss_clip": 0.01150576, + "auxiliary_loss_mlp": 0.01106377, + "balance_loss_clip": 1.00206339, + "balance_loss_mlp": 1.00042391, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.62620140004788, + "language_loss": 0.73080677, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75337636, + "num_input_tokens_seen": 230421340, + "step": 10680, + "time_per_iteration": 2.5789430141448975 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01104981, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00055397, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.920268850552374, + "language_loss": 0.67941809, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.70161891, + "num_input_tokens_seen": 230441270, + "step": 10681, + "time_per_iteration": 2.664168357849121 + }, + { + "auxiliary_loss_clip": 0.01165217, + "auxiliary_loss_mlp": 0.01105119, + "balance_loss_clip": 1.00192332, + "balance_loss_mlp": 1.00050116, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 2.013094193371321, + "language_loss": 0.74630749, + "learning_rate": 1.198843556910427e-06, + "loss": 0.7690109, + "num_input_tokens_seen": 230457455, + "step": 10682, + "time_per_iteration": 2.5050759315490723 + }, + { + "auxiliary_loss_clip": 0.01085287, + "auxiliary_loss_mlp": 0.01105306, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.00040221, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.5384733942755016, + "language_loss": 0.79288161, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81478751, + "num_input_tokens_seen": 230478955, + "step": 10683, + "time_per_iteration": 2.7616357803344727 + }, + { + "auxiliary_loss_clip": 0.0116544, + "auxiliary_loss_mlp": 0.01105695, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00050449, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.6660080972311109, + "language_loss": 0.67195541, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69466674, + "num_input_tokens_seen": 230496425, + "step": 10684, + "time_per_iteration": 2.5216658115386963 + }, + { + "auxiliary_loss_clip": 0.011487, + "auxiliary_loss_mlp": 0.01105625, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00052965, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 1.9964463335978173, + "language_loss": 0.71484149, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73738474, + "num_input_tokens_seen": 230516245, + "step": 10685, + "time_per_iteration": 2.6138663291931152 + }, + { + "auxiliary_loss_clip": 0.01117311, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_clip": 1.00179887, + "balance_loss_mlp": 1.00053096, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.844322290374164, + "language_loss": 0.75252301, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77474761, + "num_input_tokens_seen": 230534745, + "step": 10686, + "time_per_iteration": 4.096750020980835 + }, + { + "auxiliary_loss_clip": 0.01115174, + "auxiliary_loss_mlp": 0.01106515, + "balance_loss_clip": 1.00191963, + "balance_loss_mlp": 1.00046659, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 2.2999627942959635, + "language_loss": 0.68650603, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70872295, + "num_input_tokens_seen": 230555895, + "step": 10687, + "time_per_iteration": 2.7376174926757812 + }, + { + "auxiliary_loss_clip": 0.01134049, + "auxiliary_loss_mlp": 0.0110645, + "balance_loss_clip": 1.00208414, + "balance_loss_mlp": 1.0004971, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 1.7973140399053669, + "language_loss": 0.66493565, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68734062, + "num_input_tokens_seen": 230577460, + "step": 10688, + "time_per_iteration": 2.6497199535369873 + }, + { + "auxiliary_loss_clip": 0.01165415, + "auxiliary_loss_mlp": 0.01105414, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00041425, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.8713490435782794, + "language_loss": 0.73127246, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75398076, + "num_input_tokens_seen": 230595030, + "step": 10689, + "time_per_iteration": 3.9248056411743164 + }, + { + "auxiliary_loss_clip": 0.01151186, + "auxiliary_loss_mlp": 0.01104657, + "balance_loss_clip": 1.00211716, + "balance_loss_mlp": 1.00042009, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 1.911307701986168, + "language_loss": 0.72017014, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74272859, + "num_input_tokens_seen": 230615135, + "step": 10690, + "time_per_iteration": 2.5453991889953613 + }, + { + "auxiliary_loss_clip": 0.01133959, + "auxiliary_loss_mlp": 0.01104169, + "balance_loss_clip": 1.00204134, + "balance_loss_mlp": 1.00050449, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.7911846743437836, + "language_loss": 0.77808994, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.80047119, + "num_input_tokens_seen": 230631965, + "step": 10691, + "time_per_iteration": 2.567007541656494 + }, + { + "auxiliary_loss_clip": 0.01131501, + "auxiliary_loss_mlp": 0.01105668, + "balance_loss_clip": 1.00190461, + "balance_loss_mlp": 1.0005734, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 1.6242284083368368, + "language_loss": 0.7421627, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76453447, + "num_input_tokens_seen": 230649565, + "step": 10692, + "time_per_iteration": 2.589069366455078 + }, + { + "auxiliary_loss_clip": 0.01148698, + "auxiliary_loss_mlp": 0.01104945, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.00070822, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 1.887549446006859, + "language_loss": 0.61876857, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.64130503, + "num_input_tokens_seen": 230669265, + "step": 10693, + "time_per_iteration": 2.568535089492798 + }, + { + "auxiliary_loss_clip": 0.01115615, + "auxiliary_loss_mlp": 0.01105867, + "balance_loss_clip": 1.00175226, + "balance_loss_mlp": 1.00058103, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 2.0726082207781444, + "language_loss": 0.59634304, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61855787, + "num_input_tokens_seen": 230690575, + "step": 10694, + "time_per_iteration": 2.795386552810669 + }, + { + "auxiliary_loss_clip": 0.01136617, + "auxiliary_loss_mlp": 0.01105421, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.0006119, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.4969501634499387, + "language_loss": 0.7996757, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82209611, + "num_input_tokens_seen": 230709420, + "step": 10695, + "time_per_iteration": 2.641646146774292 + }, + { + "auxiliary_loss_clip": 0.01165263, + "auxiliary_loss_mlp": 0.01105336, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00052702, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 1.7330291812532275, + "language_loss": 0.7381528, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.76085877, + "num_input_tokens_seen": 230729350, + "step": 10696, + "time_per_iteration": 2.5628249645233154 + }, + { + "auxiliary_loss_clip": 0.01118682, + "auxiliary_loss_mlp": 0.01104589, + "balance_loss_clip": 1.00190806, + "balance_loss_mlp": 1.00044787, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 1.6099554471315973, + "language_loss": 0.75514084, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77737355, + "num_input_tokens_seen": 230749220, + "step": 10697, + "time_per_iteration": 2.6571881771087646 + }, + { + "auxiliary_loss_clip": 0.01134317, + "auxiliary_loss_mlp": 0.01105103, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.0004847, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.5026371224130197, + "language_loss": 0.6654188, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68781304, + "num_input_tokens_seen": 230770245, + "step": 10698, + "time_per_iteration": 5.492242097854614 + }, + { + "auxiliary_loss_clip": 0.01159841, + "auxiliary_loss_mlp": 0.01080888, + "balance_loss_clip": 1.00103295, + "balance_loss_mlp": 1.00001645, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8382915739316414, + "language_loss": 0.63376814, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65617537, + "num_input_tokens_seen": 230837030, + "step": 10699, + "time_per_iteration": 3.075852870941162 + }, + { + "auxiliary_loss_clip": 0.01149723, + "auxiliary_loss_mlp": 0.01103795, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00041628, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.9113465144225046, + "language_loss": 0.69296581, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71550095, + "num_input_tokens_seen": 230856845, + "step": 10700, + "time_per_iteration": 2.5823099613189697 + }, + { + "auxiliary_loss_clip": 0.01165304, + "auxiliary_loss_mlp": 0.01104778, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00044572, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 2.3622085521151717, + "language_loss": 0.73041022, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.753111, + "num_input_tokens_seen": 230878785, + "step": 10701, + "time_per_iteration": 2.591493844985962 + }, + { + "auxiliary_loss_clip": 0.01148364, + "auxiliary_loss_mlp": 0.01106477, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.0004288, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 2.9597265630576697, + "language_loss": 0.82232708, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84487545, + "num_input_tokens_seen": 230895445, + "step": 10702, + "time_per_iteration": 2.573680877685547 + }, + { + "auxiliary_loss_clip": 0.01133495, + "auxiliary_loss_mlp": 0.01104485, + "balance_loss_clip": 1.0018903, + "balance_loss_mlp": 1.00072527, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 2.2328823479581312, + "language_loss": 0.74864948, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.77102929, + "num_input_tokens_seen": 230911375, + "step": 10703, + "time_per_iteration": 2.614607572555542 + }, + { + "auxiliary_loss_clip": 0.01081484, + "auxiliary_loss_mlp": 0.01080182, + "balance_loss_clip": 1.001086, + "balance_loss_mlp": 1.00007343, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6528744080657002, + "language_loss": 0.5463016, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56791818, + "num_input_tokens_seen": 230975990, + "step": 10704, + "time_per_iteration": 3.34084415435791 + }, + { + "auxiliary_loss_clip": 0.01115381, + "auxiliary_loss_mlp": 0.01104814, + "balance_loss_clip": 1.00168836, + "balance_loss_mlp": 1.0004822, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.6203436837432936, + "language_loss": 0.77006471, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79226661, + "num_input_tokens_seen": 230997110, + "step": 10705, + "time_per_iteration": 2.6721205711364746 + }, + { + "auxiliary_loss_clip": 0.01117565, + "auxiliary_loss_mlp": 0.01104223, + "balance_loss_clip": 1.00174046, + "balance_loss_mlp": 1.00065374, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.7083087882673977, + "language_loss": 0.79189122, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81410909, + "num_input_tokens_seen": 231015590, + "step": 10706, + "time_per_iteration": 2.640684127807617 + }, + { + "auxiliary_loss_clip": 0.01119223, + "auxiliary_loss_mlp": 0.01104479, + "balance_loss_clip": 1.00184143, + "balance_loss_mlp": 1.00043368, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 2.303230877570738, + "language_loss": 0.80324984, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82548678, + "num_input_tokens_seen": 231033800, + "step": 10707, + "time_per_iteration": 2.6285500526428223 + }, + { + "auxiliary_loss_clip": 0.01148523, + "auxiliary_loss_mlp": 0.0110443, + "balance_loss_clip": 1.00187337, + "balance_loss_mlp": 1.00038385, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.7637868149074845, + "language_loss": 0.8558591, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.87838864, + "num_input_tokens_seen": 231053160, + "step": 10708, + "time_per_iteration": 2.6081185340881348 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01106605, + "balance_loss_clip": 1.00188696, + "balance_loss_mlp": 1.00065184, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.4473220827123576, + "language_loss": 0.65610254, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67819166, + "num_input_tokens_seen": 231069470, + "step": 10709, + "time_per_iteration": 2.6726529598236084 + }, + { + "auxiliary_loss_clip": 0.01165122, + "auxiliary_loss_mlp": 0.0110544, + "balance_loss_clip": 1.00191176, + "balance_loss_mlp": 1.00053596, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 2.5110930236028137, + "language_loss": 0.80472726, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.82743287, + "num_input_tokens_seen": 231088205, + "step": 10710, + "time_per_iteration": 2.551007032394409 + }, + { + "auxiliary_loss_clip": 0.011507, + "auxiliary_loss_mlp": 0.01105087, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00037313, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1.7671891951899081, + "language_loss": 0.65936571, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68192357, + "num_input_tokens_seen": 231107850, + "step": 10711, + "time_per_iteration": 2.6805996894836426 + }, + { + "auxiliary_loss_clip": 0.01118892, + "auxiliary_loss_mlp": 0.01104779, + "balance_loss_clip": 1.00194204, + "balance_loss_mlp": 1.00044656, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 6.977298206167082, + "language_loss": 0.7881645, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.8104012, + "num_input_tokens_seen": 231127200, + "step": 10712, + "time_per_iteration": 2.7275314331054688 + }, + { + "auxiliary_loss_clip": 0.01150649, + "auxiliary_loss_mlp": 0.01106166, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00059402, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.637004255610196, + "language_loss": 0.8277092, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.85027742, + "num_input_tokens_seen": 231146360, + "step": 10713, + "time_per_iteration": 2.585777997970581 + }, + { + "auxiliary_loss_clip": 0.01165108, + "auxiliary_loss_mlp": 0.01104756, + "balance_loss_clip": 1.0020299, + "balance_loss_mlp": 1.00051963, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.5013460557990665, + "language_loss": 0.78466618, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80736482, + "num_input_tokens_seen": 231168350, + "step": 10714, + "time_per_iteration": 2.558241128921509 + }, + { + "auxiliary_loss_clip": 0.01118335, + "auxiliary_loss_mlp": 0.01104484, + "balance_loss_clip": 1.00195682, + "balance_loss_mlp": 1.00053334, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.397841280751012, + "language_loss": 0.81351531, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83574349, + "num_input_tokens_seen": 231188385, + "step": 10715, + "time_per_iteration": 2.6929543018341064 + }, + { + "auxiliary_loss_clip": 0.01133995, + "auxiliary_loss_mlp": 0.01104711, + "balance_loss_clip": 1.0018965, + "balance_loss_mlp": 1.00047445, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 2.0323937755312507, + "language_loss": 0.82098389, + "learning_rate": 1.186728333672332e-06, + "loss": 0.84337091, + "num_input_tokens_seen": 231209880, + "step": 10716, + "time_per_iteration": 2.673245906829834 + }, + { + "auxiliary_loss_clip": 0.01114574, + "auxiliary_loss_mlp": 0.01105419, + "balance_loss_clip": 1.0016948, + "balance_loss_mlp": 1.00041902, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 1.7608543133886438, + "language_loss": 0.7825247, + "learning_rate": 1.186372540666424e-06, + "loss": 0.80472463, + "num_input_tokens_seen": 231230765, + "step": 10717, + "time_per_iteration": 2.671949625015259 + }, + { + "auxiliary_loss_clip": 0.01165047, + "auxiliary_loss_mlp": 0.01104186, + "balance_loss_clip": 1.0019691, + "balance_loss_mlp": 1.00061679, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.7880065993553869, + "language_loss": 0.67978287, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70247525, + "num_input_tokens_seen": 231252350, + "step": 10718, + "time_per_iteration": 2.5733397006988525 + }, + { + "auxiliary_loss_clip": 0.01143398, + "auxiliary_loss_mlp": 0.01080916, + "balance_loss_clip": 1.00107145, + "balance_loss_mlp": 1.00004435, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7531735275985764, + "language_loss": 0.49594507, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51818824, + "num_input_tokens_seen": 231313865, + "step": 10719, + "time_per_iteration": 3.2797701358795166 + }, + { + "auxiliary_loss_clip": 0.01165428, + "auxiliary_loss_mlp": 0.01105838, + "balance_loss_clip": 1.00202262, + "balance_loss_mlp": 1.00055265, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 2.1843512383679364, + "language_loss": 0.78053099, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80324364, + "num_input_tokens_seen": 231331710, + "step": 10720, + "time_per_iteration": 2.538104295730591 + }, + { + "auxiliary_loss_clip": 0.01132764, + "auxiliary_loss_mlp": 0.01104531, + "balance_loss_clip": 1.00184143, + "balance_loss_mlp": 1.00048506, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 3.140669397277125, + "language_loss": 0.77155346, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79392642, + "num_input_tokens_seen": 231350705, + "step": 10721, + "time_per_iteration": 2.5948729515075684 + }, + { + "auxiliary_loss_clip": 0.01120125, + "auxiliary_loss_mlp": 0.01105, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00047779, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 2.3467528900505434, + "language_loss": 0.72775489, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75000608, + "num_input_tokens_seen": 231369550, + "step": 10722, + "time_per_iteration": 2.6640303134918213 + }, + { + "auxiliary_loss_clip": 0.01165334, + "auxiliary_loss_mlp": 0.01104645, + "balance_loss_clip": 1.00201762, + "balance_loss_mlp": 1.00040889, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 1.5204074926542352, + "language_loss": 0.77678692, + "learning_rate": 1.184238431012635e-06, + "loss": 0.79948676, + "num_input_tokens_seen": 231389285, + "step": 10723, + "time_per_iteration": 4.046924591064453 + }, + { + "auxiliary_loss_clip": 0.01150829, + "auxiliary_loss_mlp": 0.01105741, + "balance_loss_clip": 1.00203574, + "balance_loss_mlp": 1.00055051, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.537447381753892, + "language_loss": 0.58440864, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60697436, + "num_input_tokens_seen": 231408820, + "step": 10724, + "time_per_iteration": 2.619270086288452 + }, + { + "auxiliary_loss_clip": 0.01148458, + "auxiliary_loss_mlp": 0.01103806, + "balance_loss_clip": 1.00202465, + "balance_loss_mlp": 1.00052285, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 3.4782598365190007, + "language_loss": 0.83956325, + "learning_rate": 1.183527308454271e-06, + "loss": 0.86208594, + "num_input_tokens_seen": 231428100, + "step": 10725, + "time_per_iteration": 2.5677738189697266 + }, + { + "auxiliary_loss_clip": 0.01133761, + "auxiliary_loss_mlp": 0.01104822, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.00048983, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 2.021981293668845, + "language_loss": 0.82085407, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84323984, + "num_input_tokens_seen": 231445810, + "step": 10726, + "time_per_iteration": 4.0168914794921875 + }, + { + "auxiliary_loss_clip": 0.01150894, + "auxiliary_loss_mlp": 0.01105596, + "balance_loss_clip": 1.00196707, + "balance_loss_mlp": 1.00050163, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 3.414482495489306, + "language_loss": 0.81341732, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83598226, + "num_input_tokens_seen": 231463570, + "step": 10727, + "time_per_iteration": 2.58716082572937 + }, + { + "auxiliary_loss_clip": 0.01149288, + "auxiliary_loss_mlp": 0.01105808, + "balance_loss_clip": 1.00199318, + "balance_loss_mlp": 1.00052261, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 1.8884352441474375, + "language_loss": 0.79491758, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81746858, + "num_input_tokens_seen": 231482155, + "step": 10728, + "time_per_iteration": 2.579873561859131 + }, + { + "auxiliary_loss_clip": 0.01056682, + "auxiliary_loss_mlp": 0.01105168, + "balance_loss_clip": 1.00169158, + "balance_loss_mlp": 1.0004549, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 2.355318476023274, + "language_loss": 0.74706721, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.7686857, + "num_input_tokens_seen": 231502465, + "step": 10729, + "time_per_iteration": 3.2041594982147217 + }, + { + "auxiliary_loss_clip": 0.01119098, + "auxiliary_loss_mlp": 0.01106096, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00052416, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.5886151281914958, + "language_loss": 0.66563207, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68788403, + "num_input_tokens_seen": 231522740, + "step": 10730, + "time_per_iteration": 2.8162219524383545 + }, + { + "auxiliary_loss_clip": 0.01084617, + "auxiliary_loss_mlp": 0.01105528, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00043285, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 2.17358723045475, + "language_loss": 0.63706982, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65897119, + "num_input_tokens_seen": 231542050, + "step": 10731, + "time_per_iteration": 2.736898899078369 + }, + { + "auxiliary_loss_clip": 0.01165185, + "auxiliary_loss_mlp": 0.01104694, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00045729, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 2.1003771753416216, + "language_loss": 0.67889357, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70159239, + "num_input_tokens_seen": 231560380, + "step": 10732, + "time_per_iteration": 2.5177078247070312 + }, + { + "auxiliary_loss_clip": 0.01149479, + "auxiliary_loss_mlp": 0.01104294, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00053453, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.6436118772415673, + "language_loss": 0.75941396, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.78195167, + "num_input_tokens_seen": 231580810, + "step": 10733, + "time_per_iteration": 2.609421730041504 + }, + { + "auxiliary_loss_clip": 0.01150819, + "auxiliary_loss_mlp": 0.0110673, + "balance_loss_clip": 1.00219369, + "balance_loss_mlp": 1.0004909, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 1.7559316077895661, + "language_loss": 0.66733766, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.68991309, + "num_input_tokens_seen": 231600585, + "step": 10734, + "time_per_iteration": 2.5984256267547607 + }, + { + "auxiliary_loss_clip": 0.0116513, + "auxiliary_loss_mlp": 0.0110439, + "balance_loss_clip": 1.00206029, + "balance_loss_mlp": 1.00053477, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 1.879977897880028, + "language_loss": 0.74047738, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.76317263, + "num_input_tokens_seen": 231618765, + "step": 10735, + "time_per_iteration": 3.865623950958252 + }, + { + "auxiliary_loss_clip": 0.01085986, + "auxiliary_loss_mlp": 0.00747296, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00051403, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.8142479297906473, + "language_loss": 0.75026023, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.76859307, + "num_input_tokens_seen": 231638525, + "step": 10736, + "time_per_iteration": 4.183608770370483 + }, + { + "auxiliary_loss_clip": 0.01149096, + "auxiliary_loss_mlp": 0.01106743, + "balance_loss_clip": 1.00205791, + "balance_loss_mlp": 1.00050378, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 1.8777531370896006, + "language_loss": 0.70802677, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.73058516, + "num_input_tokens_seen": 231656785, + "step": 10737, + "time_per_iteration": 2.5978708267211914 + }, + { + "auxiliary_loss_clip": 0.01145279, + "auxiliary_loss_mlp": 0.01080405, + "balance_loss_clip": 1.00094509, + "balance_loss_mlp": 0.99991477, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7793101527283756, + "language_loss": 0.58455193, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60680878, + "num_input_tokens_seen": 231719075, + "step": 10738, + "time_per_iteration": 3.2112855911254883 + }, + { + "auxiliary_loss_clip": 0.0111711, + "auxiliary_loss_mlp": 0.01104734, + "balance_loss_clip": 1.00174928, + "balance_loss_mlp": 1.00040197, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.9710404414689209, + "language_loss": 0.74668318, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76890159, + "num_input_tokens_seen": 231737810, + "step": 10739, + "time_per_iteration": 2.662118434906006 + }, + { + "auxiliary_loss_clip": 0.01131478, + "auxiliary_loss_mlp": 0.00747463, + "balance_loss_clip": 1.00185788, + "balance_loss_mlp": 1.00060964, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.9866067669477132, + "language_loss": 0.7168138, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.73560327, + "num_input_tokens_seen": 231756140, + "step": 10740, + "time_per_iteration": 2.6253459453582764 + }, + { + "auxiliary_loss_clip": 0.01130652, + "auxiliary_loss_mlp": 0.01080899, + "balance_loss_clip": 1.00094604, + "balance_loss_mlp": 1.00002754, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6645919246414888, + "language_loss": 0.55258298, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57469845, + "num_input_tokens_seen": 231823665, + "step": 10741, + "time_per_iteration": 3.2151994705200195 + }, + { + "auxiliary_loss_clip": 0.01165208, + "auxiliary_loss_mlp": 0.01104592, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00054622, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 1.5631303941649293, + "language_loss": 0.80248475, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.8251828, + "num_input_tokens_seen": 231844500, + "step": 10742, + "time_per_iteration": 2.542574167251587 + }, + { + "auxiliary_loss_clip": 0.01133089, + "auxiliary_loss_mlp": 0.01104145, + "balance_loss_clip": 1.00187409, + "balance_loss_mlp": 1.00038564, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.6447156853830864, + "language_loss": 0.8163861, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.83875847, + "num_input_tokens_seen": 231864510, + "step": 10743, + "time_per_iteration": 2.642594337463379 + }, + { + "auxiliary_loss_clip": 0.0113342, + "auxiliary_loss_mlp": 0.01104619, + "balance_loss_clip": 1.00191498, + "balance_loss_mlp": 1.00047755, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 2.243628431238934, + "language_loss": 0.72041738, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74279773, + "num_input_tokens_seen": 231881555, + "step": 10744, + "time_per_iteration": 2.6329331398010254 + }, + { + "auxiliary_loss_clip": 0.01165347, + "auxiliary_loss_mlp": 0.01104824, + "balance_loss_clip": 1.00205851, + "balance_loss_mlp": 1.00039637, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 2.049905788346346, + "language_loss": 0.66271591, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68541765, + "num_input_tokens_seen": 231905945, + "step": 10745, + "time_per_iteration": 2.736609935760498 + }, + { + "auxiliary_loss_clip": 0.0115062, + "auxiliary_loss_mlp": 0.01105723, + "balance_loss_clip": 1.00204515, + "balance_loss_mlp": 1.00053263, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.223409759887879, + "language_loss": 0.73369539, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75625885, + "num_input_tokens_seen": 231922535, + "step": 10746, + "time_per_iteration": 2.5554492473602295 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01105162, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00063908, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 2.4406844148758786, + "language_loss": 0.66495609, + "learning_rate": 1.175713157660413e-06, + "loss": 0.68749493, + "num_input_tokens_seen": 231944800, + "step": 10747, + "time_per_iteration": 2.6019577980041504 + }, + { + "auxiliary_loss_clip": 0.01135672, + "auxiliary_loss_mlp": 0.0110504, + "balance_loss_clip": 1.00208974, + "balance_loss_mlp": 1.00070822, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 1.727591845314219, + "language_loss": 0.67248976, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69489682, + "num_input_tokens_seen": 231962970, + "step": 10748, + "time_per_iteration": 2.6786587238311768 + }, + { + "auxiliary_loss_clip": 0.0116544, + "auxiliary_loss_mlp": 0.0110591, + "balance_loss_clip": 1.00203156, + "balance_loss_mlp": 1.00062418, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.6523323217097365, + "language_loss": 0.76279199, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78550541, + "num_input_tokens_seen": 231981195, + "step": 10749, + "time_per_iteration": 2.6283602714538574 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.01105608, + "balance_loss_clip": 1.00185263, + "balance_loss_mlp": 1.00051332, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.4669592593423249, + "language_loss": 0.77218622, + "learning_rate": 1.17464876058473e-06, + "loss": 0.7944327, + "num_input_tokens_seen": 232001735, + "step": 10750, + "time_per_iteration": 2.723407030105591 + }, + { + "auxiliary_loss_clip": 0.01133581, + "auxiliary_loss_mlp": 0.01105174, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00046074, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.4541706095835507, + "language_loss": 0.68138385, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70377135, + "num_input_tokens_seen": 232019830, + "step": 10751, + "time_per_iteration": 2.7642464637756348 + }, + { + "auxiliary_loss_clip": 0.01135467, + "auxiliary_loss_mlp": 0.01105941, + "balance_loss_clip": 1.00197184, + "balance_loss_mlp": 1.0004642, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 1.7050444125651691, + "language_loss": 0.71159124, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73400533, + "num_input_tokens_seen": 232039625, + "step": 10752, + "time_per_iteration": 2.7093255519866943 + }, + { + "auxiliary_loss_clip": 0.01119184, + "auxiliary_loss_mlp": 0.01106663, + "balance_loss_clip": 1.00190175, + "balance_loss_mlp": 1.00061417, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.7789860336421397, + "language_loss": 0.78228652, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80454499, + "num_input_tokens_seen": 232055855, + "step": 10753, + "time_per_iteration": 2.640087604522705 + }, + { + "auxiliary_loss_clip": 0.01165262, + "auxiliary_loss_mlp": 0.01105199, + "balance_loss_clip": 1.00202298, + "balance_loss_mlp": 1.00058079, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.645363895691283, + "language_loss": 0.84910607, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87181067, + "num_input_tokens_seen": 232073475, + "step": 10754, + "time_per_iteration": 2.5824787616729736 + }, + { + "auxiliary_loss_clip": 0.01133867, + "auxiliary_loss_mlp": 0.0110531, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.0005964, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 5.030892052933881, + "language_loss": 0.59442121, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.616813, + "num_input_tokens_seen": 232091090, + "step": 10755, + "time_per_iteration": 2.654804229736328 + }, + { + "auxiliary_loss_clip": 0.01120649, + "auxiliary_loss_mlp": 0.01105249, + "balance_loss_clip": 1.00201929, + "balance_loss_mlp": 1.00063109, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 2.247250652345657, + "language_loss": 0.6812501, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70350909, + "num_input_tokens_seen": 232107320, + "step": 10756, + "time_per_iteration": 2.665304660797119 + }, + { + "auxiliary_loss_clip": 0.01103651, + "auxiliary_loss_mlp": 0.01106469, + "balance_loss_clip": 1.00183105, + "balance_loss_mlp": 1.00051594, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 3.3392780490195744, + "language_loss": 0.74771392, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76981509, + "num_input_tokens_seen": 232123930, + "step": 10757, + "time_per_iteration": 2.695028066635132 + }, + { + "auxiliary_loss_clip": 0.01101718, + "auxiliary_loss_mlp": 0.01104359, + "balance_loss_clip": 1.00188029, + "balance_loss_mlp": 1.00050426, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.4880091087533, + "language_loss": 0.74712646, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76918721, + "num_input_tokens_seen": 232142905, + "step": 10758, + "time_per_iteration": 2.6670827865600586 + }, + { + "auxiliary_loss_clip": 0.01115149, + "auxiliary_loss_mlp": 0.01106021, + "balance_loss_clip": 1.00177586, + "balance_loss_mlp": 1.00054419, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.7584460791047134, + "language_loss": 0.67780614, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.70001781, + "num_input_tokens_seen": 232162230, + "step": 10759, + "time_per_iteration": 2.6572370529174805 + }, + { + "auxiliary_loss_clip": 0.01119231, + "auxiliary_loss_mlp": 0.01107013, + "balance_loss_clip": 1.00185764, + "balance_loss_mlp": 1.00058281, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.5717398389365533, + "language_loss": 0.75373912, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77600157, + "num_input_tokens_seen": 232182700, + "step": 10760, + "time_per_iteration": 2.6827187538146973 + }, + { + "auxiliary_loss_clip": 0.01135116, + "auxiliary_loss_mlp": 0.01105223, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00041461, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.6761806358780624, + "language_loss": 0.65532458, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.677728, + "num_input_tokens_seen": 232208235, + "step": 10761, + "time_per_iteration": 4.291199207305908 + }, + { + "auxiliary_loss_clip": 0.01101222, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_clip": 1.00185895, + "balance_loss_mlp": 1.00049186, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 1.9222002671538134, + "language_loss": 0.69576442, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71784395, + "num_input_tokens_seen": 232228720, + "step": 10762, + "time_per_iteration": 2.8120083808898926 + }, + { + "auxiliary_loss_clip": 0.01165449, + "auxiliary_loss_mlp": 0.01106224, + "balance_loss_clip": 1.00196052, + "balance_loss_mlp": 1.00046194, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 2.0494557180669273, + "language_loss": 0.82667714, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.84939384, + "num_input_tokens_seen": 232244655, + "step": 10763, + "time_per_iteration": 2.541126012802124 + }, + { + "auxiliary_loss_clip": 0.01159648, + "auxiliary_loss_mlp": 0.01080846, + "balance_loss_clip": 1.00098097, + "balance_loss_mlp": 0.99997407, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7110113493983676, + "language_loss": 0.5782969, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.60070193, + "num_input_tokens_seen": 232308685, + "step": 10764, + "time_per_iteration": 4.62269139289856 + }, + { + "auxiliary_loss_clip": 0.01117113, + "auxiliary_loss_mlp": 0.01104653, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00041676, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 1.9812503577805083, + "language_loss": 0.60791492, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.63013256, + "num_input_tokens_seen": 232327520, + "step": 10765, + "time_per_iteration": 2.7726237773895264 + }, + { + "auxiliary_loss_clip": 0.01165068, + "auxiliary_loss_mlp": 0.01104866, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00053406, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.7366123954787647, + "language_loss": 0.6286248, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65132415, + "num_input_tokens_seen": 232349025, + "step": 10766, + "time_per_iteration": 2.588280200958252 + }, + { + "auxiliary_loss_clip": 0.01134156, + "auxiliary_loss_mlp": 0.01105866, + "balance_loss_clip": 1.00197601, + "balance_loss_mlp": 1.00048482, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 2.01567521095859, + "language_loss": 0.75171614, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.7741164, + "num_input_tokens_seen": 232367835, + "step": 10767, + "time_per_iteration": 2.6027748584747314 + }, + { + "auxiliary_loss_clip": 0.01152296, + "auxiliary_loss_mlp": 0.0110548, + "balance_loss_clip": 1.0020113, + "balance_loss_mlp": 1.00048041, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 1.9951494122763067, + "language_loss": 0.78021216, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.80278993, + "num_input_tokens_seen": 232385840, + "step": 10768, + "time_per_iteration": 2.525669574737549 + }, + { + "auxiliary_loss_clip": 0.01105636, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00041473, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.7664555288283468, + "language_loss": 0.71856713, + "learning_rate": 1.167914135250663e-06, + "loss": 0.74067003, + "num_input_tokens_seen": 232406205, + "step": 10769, + "time_per_iteration": 2.689606189727783 + }, + { + "auxiliary_loss_clip": 0.01165312, + "auxiliary_loss_mlp": 0.01104666, + "balance_loss_clip": 1.00209999, + "balance_loss_mlp": 1.00061989, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.7706936279106664, + "language_loss": 0.7221576, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74485737, + "num_input_tokens_seen": 232424995, + "step": 10770, + "time_per_iteration": 2.606605052947998 + }, + { + "auxiliary_loss_clip": 0.01103796, + "auxiliary_loss_mlp": 0.01106042, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00047076, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 5.468059769103712, + "language_loss": 0.72891796, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75101632, + "num_input_tokens_seen": 232445870, + "step": 10771, + "time_per_iteration": 2.7534403800964355 + }, + { + "auxiliary_loss_clip": 0.01119123, + "auxiliary_loss_mlp": 0.01105138, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00052023, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 2.2567024149752286, + "language_loss": 0.74011016, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.76235282, + "num_input_tokens_seen": 232464285, + "step": 10772, + "time_per_iteration": 2.646963596343994 + }, + { + "auxiliary_loss_clip": 0.01133409, + "auxiliary_loss_mlp": 0.0110338, + "balance_loss_clip": 1.00179136, + "balance_loss_mlp": 1.00047815, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.5499322408144762, + "language_loss": 0.8303076, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85267556, + "num_input_tokens_seen": 232485815, + "step": 10773, + "time_per_iteration": 4.053814888000488 + }, + { + "auxiliary_loss_clip": 0.01147977, + "auxiliary_loss_mlp": 0.00747289, + "balance_loss_clip": 1.00192809, + "balance_loss_mlp": 1.00063515, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.5344954653151013, + "language_loss": 0.78322995, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80218261, + "num_input_tokens_seen": 232504875, + "step": 10774, + "time_per_iteration": 3.9793004989624023 + }, + { + "auxiliary_loss_clip": 0.01148708, + "auxiliary_loss_mlp": 0.01105721, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00062656, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.3410515745566367, + "language_loss": 0.68416548, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.7067098, + "num_input_tokens_seen": 232521945, + "step": 10775, + "time_per_iteration": 2.5470921993255615 + }, + { + "auxiliary_loss_clip": 0.01118692, + "auxiliary_loss_mlp": 0.01105478, + "balance_loss_clip": 1.00188065, + "balance_loss_mlp": 1.00057423, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 3.5174346842358353, + "language_loss": 0.65501666, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.67725837, + "num_input_tokens_seen": 232541500, + "step": 10776, + "time_per_iteration": 2.6367318630218506 + }, + { + "auxiliary_loss_clip": 0.01133908, + "auxiliary_loss_mlp": 0.0110563, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00053477, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.642584795430439, + "language_loss": 0.78868163, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.811077, + "num_input_tokens_seen": 232559720, + "step": 10777, + "time_per_iteration": 2.6172287464141846 + }, + { + "auxiliary_loss_clip": 0.01148644, + "auxiliary_loss_mlp": 0.01105212, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.00049829, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 3.899477374765823, + "language_loss": 0.7367968, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75933534, + "num_input_tokens_seen": 232579370, + "step": 10778, + "time_per_iteration": 2.6747000217437744 + }, + { + "auxiliary_loss_clip": 0.01148451, + "auxiliary_loss_mlp": 0.01104833, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00050139, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.4419288651551136, + "language_loss": 0.78002077, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80255359, + "num_input_tokens_seen": 232600495, + "step": 10779, + "time_per_iteration": 2.602895975112915 + }, + { + "auxiliary_loss_clip": 0.01143219, + "auxiliary_loss_mlp": 0.01080828, + "balance_loss_clip": 1.00114965, + "balance_loss_mlp": 0.99995631, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7285319316513847, + "language_loss": 0.59412193, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61636245, + "num_input_tokens_seen": 232663165, + "step": 10780, + "time_per_iteration": 3.167736291885376 + }, + { + "auxiliary_loss_clip": 0.01042879, + "auxiliary_loss_mlp": 0.01104251, + "balance_loss_clip": 1.00167882, + "balance_loss_mlp": 1.00049102, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 1.812810742174214, + "language_loss": 0.79284465, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81431597, + "num_input_tokens_seen": 232683385, + "step": 10781, + "time_per_iteration": 2.900830030441284 + }, + { + "auxiliary_loss_clip": 0.01165354, + "auxiliary_loss_mlp": 0.01105754, + "balance_loss_clip": 1.00207996, + "balance_loss_mlp": 1.00046825, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 2.0384993225576404, + "language_loss": 0.78362644, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.80633748, + "num_input_tokens_seen": 232699095, + "step": 10782, + "time_per_iteration": 2.5669028759002686 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.00747423, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00057459, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 2.2312976791489385, + "language_loss": 0.64421916, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.66318059, + "num_input_tokens_seen": 232717920, + "step": 10783, + "time_per_iteration": 2.617493152618408 + }, + { + "auxiliary_loss_clip": 0.01165432, + "auxiliary_loss_mlp": 0.01105522, + "balance_loss_clip": 1.00201976, + "balance_loss_mlp": 1.00052238, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 2.0153583808411386, + "language_loss": 0.88581276, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90852225, + "num_input_tokens_seen": 232737605, + "step": 10784, + "time_per_iteration": 2.56766676902771 + }, + { + "auxiliary_loss_clip": 0.01133887, + "auxiliary_loss_mlp": 0.01104364, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.00060427, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 2.3938840156629713, + "language_loss": 0.73226291, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75464541, + "num_input_tokens_seen": 232755110, + "step": 10785, + "time_per_iteration": 2.6157877445220947 + }, + { + "auxiliary_loss_clip": 0.01117167, + "auxiliary_loss_mlp": 0.01104399, + "balance_loss_clip": 1.00178289, + "balance_loss_mlp": 1.0004487, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.4712025413005314, + "language_loss": 0.69505441, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71727002, + "num_input_tokens_seen": 232779040, + "step": 10786, + "time_per_iteration": 2.7631824016571045 + }, + { + "auxiliary_loss_clip": 0.01117428, + "auxiliary_loss_mlp": 0.01105025, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00050223, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 1.9195228476840784, + "language_loss": 0.71160007, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73382461, + "num_input_tokens_seen": 232800515, + "step": 10787, + "time_per_iteration": 2.7085440158843994 + }, + { + "auxiliary_loss_clip": 0.01165335, + "auxiliary_loss_mlp": 0.01106085, + "balance_loss_clip": 1.00195873, + "balance_loss_mlp": 1.00051332, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 2.5110757261801546, + "language_loss": 0.84237325, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86508751, + "num_input_tokens_seen": 232818450, + "step": 10788, + "time_per_iteration": 2.4955086708068848 + }, + { + "auxiliary_loss_clip": 0.01117831, + "auxiliary_loss_mlp": 0.01105313, + "balance_loss_clip": 1.00201261, + "balance_loss_mlp": 1.00059915, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 2.5345211304667403, + "language_loss": 0.77489549, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79712689, + "num_input_tokens_seen": 232834785, + "step": 10789, + "time_per_iteration": 2.6119065284729004 + }, + { + "auxiliary_loss_clip": 0.01150513, + "auxiliary_loss_mlp": 0.01104459, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00050831, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.649198366656869, + "language_loss": 0.75749689, + "learning_rate": 1.160483857897479e-06, + "loss": 0.78004658, + "num_input_tokens_seen": 232856050, + "step": 10790, + "time_per_iteration": 2.7096445560455322 + }, + { + "auxiliary_loss_clip": 0.01165383, + "auxiliary_loss_mlp": 0.0110545, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.0005455, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.35500636273396, + "language_loss": 0.6008364, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62354469, + "num_input_tokens_seen": 232873945, + "step": 10791, + "time_per_iteration": 2.5032877922058105 + }, + { + "auxiliary_loss_clip": 0.01114961, + "auxiliary_loss_mlp": 0.01105676, + "balance_loss_clip": 1.00166261, + "balance_loss_mlp": 1.00058103, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.7093546619328754, + "language_loss": 0.85972625, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88193262, + "num_input_tokens_seen": 232892160, + "step": 10792, + "time_per_iteration": 2.6764965057373047 + }, + { + "auxiliary_loss_clip": 0.01133805, + "auxiliary_loss_mlp": 0.01105953, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.0005722, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.076124743791756, + "language_loss": 0.78395808, + "learning_rate": 1.159423532850735e-06, + "loss": 0.8063556, + "num_input_tokens_seen": 232911725, + "step": 10793, + "time_per_iteration": 2.590758800506592 + }, + { + "auxiliary_loss_clip": 0.01116596, + "auxiliary_loss_mlp": 0.01105215, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00040603, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 1.9225244722832733, + "language_loss": 0.74478829, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.7670064, + "num_input_tokens_seen": 232929085, + "step": 10794, + "time_per_iteration": 2.682131290435791 + }, + { + "auxiliary_loss_clip": 0.01148687, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.00177574, + "balance_loss_mlp": 1.00061178, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.8191868561108588, + "language_loss": 0.70165193, + "learning_rate": 1.158716808837621e-06, + "loss": 0.72061384, + "num_input_tokens_seen": 232949455, + "step": 10795, + "time_per_iteration": 2.6086015701293945 + }, + { + "auxiliary_loss_clip": 0.01133953, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00059164, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.9311288847710317, + "language_loss": 0.53892154, + "learning_rate": 1.158363494676679e-06, + "loss": 0.5613246, + "num_input_tokens_seen": 232969445, + "step": 10796, + "time_per_iteration": 2.6624464988708496 + }, + { + "auxiliary_loss_clip": 0.01148584, + "auxiliary_loss_mlp": 0.01105923, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00044703, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 5.135310527477437, + "language_loss": 0.77809107, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.80063617, + "num_input_tokens_seen": 232988900, + "step": 10797, + "time_per_iteration": 2.589228868484497 + }, + { + "auxiliary_loss_clip": 0.01100468, + "auxiliary_loss_mlp": 0.0110468, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00053859, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.1357957583009406, + "language_loss": 0.70926511, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.73131657, + "num_input_tokens_seen": 233005060, + "step": 10798, + "time_per_iteration": 2.65989089012146 + }, + { + "auxiliary_loss_clip": 0.01103422, + "auxiliary_loss_mlp": 0.01104166, + "balance_loss_clip": 1.00186837, + "balance_loss_mlp": 1.00050139, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.647806243744138, + "language_loss": 0.76898772, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79106367, + "num_input_tokens_seen": 233023375, + "step": 10799, + "time_per_iteration": 4.099463224411011 + }, + { + "auxiliary_loss_clip": 0.01148963, + "auxiliary_loss_mlp": 0.01106027, + "balance_loss_clip": 1.00178778, + "balance_loss_mlp": 1.00055075, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 2.1625612468616153, + "language_loss": 0.71976489, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.74231482, + "num_input_tokens_seen": 233043130, + "step": 10800, + "time_per_iteration": 2.5975358486175537 + }, + { + "auxiliary_loss_clip": 0.01145055, + "auxiliary_loss_mlp": 0.01080933, + "balance_loss_clip": 1.00098395, + "balance_loss_mlp": 1.00006127, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7910683877537503, + "language_loss": 0.60263598, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62489581, + "num_input_tokens_seen": 233110560, + "step": 10801, + "time_per_iteration": 4.627725124359131 + }, + { + "auxiliary_loss_clip": 0.01150688, + "auxiliary_loss_mlp": 0.01106021, + "balance_loss_clip": 1.00220871, + "balance_loss_mlp": 1.00064015, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 1.655462318744289, + "language_loss": 0.78266692, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80523407, + "num_input_tokens_seen": 233130080, + "step": 10802, + "time_per_iteration": 2.621818780899048 + }, + { + "auxiliary_loss_clip": 0.01165218, + "auxiliary_loss_mlp": 0.01105922, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00063658, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.6181048182393538, + "language_loss": 0.74436432, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76707572, + "num_input_tokens_seen": 233150235, + "step": 10803, + "time_per_iteration": 2.580660820007324 + }, + { + "auxiliary_loss_clip": 0.01085652, + "auxiliary_loss_mlp": 0.01105664, + "balance_loss_clip": 1.00183904, + "balance_loss_mlp": 1.00047421, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 3.9193078914017314, + "language_loss": 0.69533491, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.71724802, + "num_input_tokens_seen": 233166710, + "step": 10804, + "time_per_iteration": 2.7538065910339355 + }, + { + "auxiliary_loss_clip": 0.01148633, + "auxiliary_loss_mlp": 0.01105624, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.00052941, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.6636680392785408, + "language_loss": 0.72591007, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74845266, + "num_input_tokens_seen": 233185445, + "step": 10805, + "time_per_iteration": 2.577627420425415 + }, + { + "auxiliary_loss_clip": 0.01132318, + "auxiliary_loss_mlp": 0.01106281, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00042331, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 4.866781665315095, + "language_loss": 0.65355122, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.67593718, + "num_input_tokens_seen": 233205805, + "step": 10806, + "time_per_iteration": 2.6802852153778076 + }, + { + "auxiliary_loss_clip": 0.01132277, + "auxiliary_loss_mlp": 0.00747554, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00069213, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.7004708381973423, + "language_loss": 0.78556091, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.8043592, + "num_input_tokens_seen": 233224215, + "step": 10807, + "time_per_iteration": 2.602567434310913 + }, + { + "auxiliary_loss_clip": 0.01128651, + "auxiliary_loss_mlp": 0.01080864, + "balance_loss_clip": 1.00101829, + "balance_loss_mlp": 0.99999237, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7897922512662788, + "language_loss": 0.58904463, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.61113977, + "num_input_tokens_seen": 233294440, + "step": 10808, + "time_per_iteration": 3.3233344554901123 + }, + { + "auxiliary_loss_clip": 0.01134093, + "auxiliary_loss_mlp": 0.01105777, + "balance_loss_clip": 1.00206113, + "balance_loss_mlp": 1.00039649, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.8406233413245363, + "language_loss": 0.63465428, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65705299, + "num_input_tokens_seen": 233316125, + "step": 10809, + "time_per_iteration": 2.726778030395508 + }, + { + "auxiliary_loss_clip": 0.0114855, + "auxiliary_loss_mlp": 0.00747292, + "balance_loss_clip": 1.00203776, + "balance_loss_mlp": 1.00046992, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.9273749760391452, + "language_loss": 0.81456649, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83352494, + "num_input_tokens_seen": 233336140, + "step": 10810, + "time_per_iteration": 2.6310253143310547 + }, + { + "auxiliary_loss_clip": 0.01117802, + "auxiliary_loss_mlp": 0.01104686, + "balance_loss_clip": 1.00190926, + "balance_loss_mlp": 1.00064039, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.6455321224805308, + "language_loss": 0.71682429, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.7390492, + "num_input_tokens_seen": 233356095, + "step": 10811, + "time_per_iteration": 4.02386474609375 + }, + { + "auxiliary_loss_clip": 0.0108417, + "auxiliary_loss_mlp": 0.01105112, + "balance_loss_clip": 1.00181305, + "balance_loss_mlp": 1.00049388, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 2.424654374280349, + "language_loss": 0.77661318, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.79850602, + "num_input_tokens_seen": 233376830, + "step": 10812, + "time_per_iteration": 4.170602798461914 + }, + { + "auxiliary_loss_clip": 0.01149037, + "auxiliary_loss_mlp": 0.01106574, + "balance_loss_clip": 1.001984, + "balance_loss_mlp": 1.00043011, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 7.905903141901867, + "language_loss": 0.85447758, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87703371, + "num_input_tokens_seen": 233395275, + "step": 10813, + "time_per_iteration": 2.5895557403564453 + }, + { + "auxiliary_loss_clip": 0.01119237, + "auxiliary_loss_mlp": 0.01105755, + "balance_loss_clip": 1.00196052, + "balance_loss_mlp": 1.00056434, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.6567576233784105, + "language_loss": 0.79693913, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.81918907, + "num_input_tokens_seen": 233413345, + "step": 10814, + "time_per_iteration": 2.6283583641052246 + }, + { + "auxiliary_loss_clip": 0.0110262, + "auxiliary_loss_mlp": 0.00747552, + "balance_loss_clip": 1.00180197, + "balance_loss_mlp": 1.0006777, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 3.219189207442067, + "language_loss": 0.65928495, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67778671, + "num_input_tokens_seen": 233436105, + "step": 10815, + "time_per_iteration": 2.870053291320801 + }, + { + "auxiliary_loss_clip": 0.01165573, + "auxiliary_loss_mlp": 0.01106959, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00043321, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 1.8364554867718874, + "language_loss": 0.74653924, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.76926452, + "num_input_tokens_seen": 233452320, + "step": 10816, + "time_per_iteration": 2.5131866931915283 + }, + { + "auxiliary_loss_clip": 0.01133897, + "auxiliary_loss_mlp": 0.01105092, + "balance_loss_clip": 1.00204802, + "balance_loss_mlp": 1.00037873, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.8467427438182167, + "language_loss": 0.73117125, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75356108, + "num_input_tokens_seen": 233469920, + "step": 10817, + "time_per_iteration": 2.5987355709075928 + }, + { + "auxiliary_loss_clip": 0.01119538, + "auxiliary_loss_mlp": 0.01105623, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.00043249, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.462003941265123, + "language_loss": 0.72105163, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74330324, + "num_input_tokens_seen": 233499780, + "step": 10818, + "time_per_iteration": 3.0502736568450928 + }, + { + "auxiliary_loss_clip": 0.0111836, + "auxiliary_loss_mlp": 0.01105233, + "balance_loss_clip": 1.00190198, + "balance_loss_mlp": 1.00051951, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 1.9601016608733315, + "language_loss": 0.65118682, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67342281, + "num_input_tokens_seen": 233518235, + "step": 10819, + "time_per_iteration": 2.641536235809326 + }, + { + "auxiliary_loss_clip": 0.01119174, + "auxiliary_loss_mlp": 0.01105461, + "balance_loss_clip": 1.00198352, + "balance_loss_mlp": 1.00055671, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.824184883673751, + "language_loss": 0.83638179, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85862821, + "num_input_tokens_seen": 233535215, + "step": 10820, + "time_per_iteration": 0.04440116882324219 + }, + { + "auxiliary_loss_clip": 0.01150825, + "auxiliary_loss_mlp": 0.01105793, + "balance_loss_clip": 1.00204909, + "balance_loss_mlp": 1.00041211, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.0123411350310505, + "language_loss": 0.77507758, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.79764372, + "num_input_tokens_seen": 233552775, + "step": 10821, + "time_per_iteration": 2.5791471004486084 + }, + { + "auxiliary_loss_clip": 0.01116047, + "auxiliary_loss_mlp": 0.0110401, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.00034547, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.5639963211876649, + "language_loss": 0.80139768, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82359827, + "num_input_tokens_seen": 233572080, + "step": 10822, + "time_per_iteration": 2.6560444831848145 + }, + { + "auxiliary_loss_clip": 0.01117519, + "auxiliary_loss_mlp": 0.01104289, + "balance_loss_clip": 1.00185466, + "balance_loss_mlp": 1.00043416, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.9274501676192555, + "language_loss": 0.87360215, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89582026, + "num_input_tokens_seen": 233589155, + "step": 10823, + "time_per_iteration": 2.6647846698760986 + }, + { + "auxiliary_loss_clip": 0.0116534, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_clip": 1.00200045, + "balance_loss_mlp": 1.00049758, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.7159035148591013, + "language_loss": 0.6632328, + "learning_rate": 1.148483704558183e-06, + "loss": 0.68594021, + "num_input_tokens_seen": 233608180, + "step": 10824, + "time_per_iteration": 2.6069130897521973 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01105167, + "balance_loss_clip": 1.00181723, + "balance_loss_mlp": 1.00045419, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 7.736662025117533, + "language_loss": 0.87626082, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89863801, + "num_input_tokens_seen": 233625750, + "step": 10825, + "time_per_iteration": 2.6644833087921143 + }, + { + "auxiliary_loss_clip": 0.01132908, + "auxiliary_loss_mlp": 0.01105803, + "balance_loss_clip": 1.00184405, + "balance_loss_mlp": 1.00042224, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.0744286369819673, + "language_loss": 0.73263228, + "learning_rate": 1.147778970474885e-06, + "loss": 0.75501943, + "num_input_tokens_seen": 233644235, + "step": 10826, + "time_per_iteration": 2.5945184230804443 + }, + { + "auxiliary_loss_clip": 0.01149976, + "auxiliary_loss_mlp": 0.01105274, + "balance_loss_clip": 1.00195491, + "balance_loss_mlp": 1.000561, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 2.218169395241856, + "language_loss": 0.68956292, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71211547, + "num_input_tokens_seen": 233662845, + "step": 10827, + "time_per_iteration": 2.5335378646850586 + }, + { + "auxiliary_loss_clip": 0.01133851, + "auxiliary_loss_mlp": 0.01105483, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.0004828, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 2.32271716732584, + "language_loss": 0.76823235, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.79062569, + "num_input_tokens_seen": 233681990, + "step": 10828, + "time_per_iteration": 2.6003808975219727 + }, + { + "auxiliary_loss_clip": 0.01148608, + "auxiliary_loss_mlp": 0.01105782, + "balance_loss_clip": 1.0020411, + "balance_loss_mlp": 1.00040054, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 3.165743350844448, + "language_loss": 0.89394206, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91648591, + "num_input_tokens_seen": 233698930, + "step": 10829, + "time_per_iteration": 2.5615627765655518 + }, + { + "auxiliary_loss_clip": 0.01159535, + "auxiliary_loss_mlp": 0.01080461, + "balance_loss_clip": 1.00093877, + "balance_loss_mlp": 0.99997139, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.6350156666804716, + "language_loss": 0.55397022, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57637018, + "num_input_tokens_seen": 233769825, + "step": 10830, + "time_per_iteration": 3.2504971027374268 + }, + { + "auxiliary_loss_clip": 0.01117125, + "auxiliary_loss_mlp": 0.01106912, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00048232, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 2.095378530754657, + "language_loss": 0.74638128, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.76862162, + "num_input_tokens_seen": 233787095, + "step": 10831, + "time_per_iteration": 2.636009693145752 + }, + { + "auxiliary_loss_clip": 0.01129053, + "auxiliary_loss_mlp": 0.01080817, + "balance_loss_clip": 1.00098896, + "balance_loss_mlp": 0.99994582, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6443994300733423, + "language_loss": 0.51115739, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53325605, + "num_input_tokens_seen": 233853050, + "step": 10832, + "time_per_iteration": 3.238170862197876 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01105803, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00042224, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 2.547699923892103, + "language_loss": 0.83577931, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85816658, + "num_input_tokens_seen": 233871385, + "step": 10833, + "time_per_iteration": 2.6167008876800537 + }, + { + "auxiliary_loss_clip": 0.0113222, + "auxiliary_loss_mlp": 0.01105759, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.00056839, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.0702462685047265, + "language_loss": 0.83051676, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85289651, + "num_input_tokens_seen": 233888175, + "step": 10834, + "time_per_iteration": 2.552785634994507 + }, + { + "auxiliary_loss_clip": 0.01150658, + "auxiliary_loss_mlp": 0.01106043, + "balance_loss_clip": 1.00222242, + "balance_loss_mlp": 1.0005666, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 2.8986691287545496, + "language_loss": 0.77321672, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.7957837, + "num_input_tokens_seen": 233911470, + "step": 10835, + "time_per_iteration": 2.645716428756714 + }, + { + "auxiliary_loss_clip": 0.01134077, + "auxiliary_loss_mlp": 0.01105147, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00062394, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.497810784587951, + "language_loss": 0.77321678, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79560906, + "num_input_tokens_seen": 233932135, + "step": 10836, + "time_per_iteration": 2.6452009677886963 + }, + { + "auxiliary_loss_clip": 0.01117553, + "auxiliary_loss_mlp": 0.01105168, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.00054979, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 1.9247936226489568, + "language_loss": 0.82038414, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84261137, + "num_input_tokens_seen": 233947880, + "step": 10837, + "time_per_iteration": 4.012439250946045 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01105253, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00044382, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.8907377331685304, + "language_loss": 0.58693278, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.60913646, + "num_input_tokens_seen": 233971475, + "step": 10838, + "time_per_iteration": 4.314549922943115 + }, + { + "auxiliary_loss_clip": 0.01159528, + "auxiliary_loss_mlp": 0.01080037, + "balance_loss_clip": 1.00093579, + "balance_loss_mlp": 0.99992812, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7469243691936682, + "language_loss": 0.60845476, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.63085043, + "num_input_tokens_seen": 234030690, + "step": 10839, + "time_per_iteration": 3.1411025524139404 + }, + { + "auxiliary_loss_clip": 0.01132141, + "auxiliary_loss_mlp": 0.01104437, + "balance_loss_clip": 1.0019685, + "balance_loss_mlp": 1.00048637, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.5858029563068443, + "language_loss": 0.67662048, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.69898629, + "num_input_tokens_seen": 234052470, + "step": 10840, + "time_per_iteration": 2.741112470626831 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01105499, + "balance_loss_clip": 1.0016433, + "balance_loss_mlp": 1.00049901, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.44199725052272, + "language_loss": 0.7369476, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75900614, + "num_input_tokens_seen": 234071495, + "step": 10841, + "time_per_iteration": 2.714301109313965 + }, + { + "auxiliary_loss_clip": 0.01165265, + "auxiliary_loss_mlp": 0.01106155, + "balance_loss_clip": 1.00191736, + "balance_loss_mlp": 1.00067878, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.5125120656366848, + "language_loss": 0.62518638, + "learning_rate": 1.142145760331648e-06, + "loss": 0.64790058, + "num_input_tokens_seen": 234092325, + "step": 10842, + "time_per_iteration": 2.591515302658081 + }, + { + "auxiliary_loss_clip": 0.0114437, + "auxiliary_loss_mlp": 0.01080423, + "balance_loss_clip": 1.00099683, + "balance_loss_mlp": 0.99993289, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8162063410227548, + "language_loss": 0.56137896, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58362687, + "num_input_tokens_seen": 234148005, + "step": 10843, + "time_per_iteration": 2.94052791595459 + }, + { + "auxiliary_loss_clip": 0.01148649, + "auxiliary_loss_mlp": 0.01106384, + "balance_loss_clip": 1.00184286, + "balance_loss_mlp": 1.00052643, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.984019117691899, + "language_loss": 0.82507974, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84763008, + "num_input_tokens_seen": 234164280, + "step": 10844, + "time_per_iteration": 2.5874874591827393 + }, + { + "auxiliary_loss_clip": 0.01149008, + "auxiliary_loss_mlp": 0.01105068, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00054502, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 2.4496150040530296, + "language_loss": 0.59496939, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.6175102, + "num_input_tokens_seen": 234185090, + "step": 10845, + "time_per_iteration": 2.641200542449951 + }, + { + "auxiliary_loss_clip": 0.01149775, + "auxiliary_loss_mlp": 0.01104951, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.00052392, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 1.7198866905580261, + "language_loss": 0.79585373, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81840098, + "num_input_tokens_seen": 234204050, + "step": 10846, + "time_per_iteration": 2.651207685470581 + }, + { + "auxiliary_loss_clip": 0.01145026, + "auxiliary_loss_mlp": 0.01080467, + "balance_loss_clip": 1.00095165, + "balance_loss_mlp": 0.99997705, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7009473851802791, + "language_loss": 0.60200983, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62426472, + "num_input_tokens_seen": 234269790, + "step": 10847, + "time_per_iteration": 3.237574815750122 + }, + { + "auxiliary_loss_clip": 0.01165384, + "auxiliary_loss_mlp": 0.01106467, + "balance_loss_clip": 1.00209701, + "balance_loss_mlp": 1.00070453, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.7276700352360401, + "language_loss": 0.80756986, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83028841, + "num_input_tokens_seen": 234290135, + "step": 10848, + "time_per_iteration": 2.585286855697632 + }, + { + "auxiliary_loss_clip": 0.01134873, + "auxiliary_loss_mlp": 0.01104769, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.00043654, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 1.949181601377187, + "language_loss": 0.74454635, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.76694274, + "num_input_tokens_seen": 234309535, + "step": 10849, + "time_per_iteration": 4.024526119232178 + }, + { + "auxiliary_loss_clip": 0.01102396, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_clip": 1.00181365, + "balance_loss_mlp": 1.00054288, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 2.4280333371180323, + "language_loss": 0.68181431, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70388508, + "num_input_tokens_seen": 234328755, + "step": 10850, + "time_per_iteration": 4.220803499221802 + }, + { + "auxiliary_loss_clip": 0.01131672, + "auxiliary_loss_mlp": 0.00747438, + "balance_loss_clip": 1.00184011, + "balance_loss_mlp": 1.00063443, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 1.9232004749275602, + "language_loss": 0.66594756, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68473864, + "num_input_tokens_seen": 234348655, + "step": 10851, + "time_per_iteration": 2.621918201446533 + }, + { + "auxiliary_loss_clip": 0.01132073, + "auxiliary_loss_mlp": 0.01105718, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00052798, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.723008393631815, + "language_loss": 0.74168026, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.76405823, + "num_input_tokens_seen": 234367445, + "step": 10852, + "time_per_iteration": 2.6235828399658203 + }, + { + "auxiliary_loss_clip": 0.01132035, + "auxiliary_loss_mlp": 0.0110598, + "balance_loss_clip": 1.00176787, + "balance_loss_mlp": 1.00040841, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 1.901805767547197, + "language_loss": 0.66465634, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68703651, + "num_input_tokens_seen": 234384825, + "step": 10853, + "time_per_iteration": 2.578734874725342 + }, + { + "auxiliary_loss_clip": 0.01114628, + "auxiliary_loss_mlp": 0.01080178, + "balance_loss_clip": 1.00121498, + "balance_loss_mlp": 1.0000689, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7176746570302546, + "language_loss": 0.63039827, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65234637, + "num_input_tokens_seen": 234450630, + "step": 10854, + "time_per_iteration": 3.340916633605957 + }, + { + "auxiliary_loss_clip": 0.01150408, + "auxiliary_loss_mlp": 0.01106645, + "balance_loss_clip": 1.00197554, + "balance_loss_mlp": 1.00050068, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 3.0904497339810746, + "language_loss": 0.77389306, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79646361, + "num_input_tokens_seen": 234473505, + "step": 10855, + "time_per_iteration": 2.627941846847534 + }, + { + "auxiliary_loss_clip": 0.01119075, + "auxiliary_loss_mlp": 0.01105145, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00043106, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.879399703823551, + "language_loss": 0.78863543, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81087762, + "num_input_tokens_seen": 234492485, + "step": 10856, + "time_per_iteration": 2.702989101409912 + }, + { + "auxiliary_loss_clip": 0.01165297, + "auxiliary_loss_mlp": 0.01106089, + "balance_loss_clip": 1.00202143, + "balance_loss_mlp": 1.00042176, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.9595503966764756, + "language_loss": 0.73730266, + "learning_rate": 1.136872187988815e-06, + "loss": 0.76001656, + "num_input_tokens_seen": 234512645, + "step": 10857, + "time_per_iteration": 2.605912208557129 + }, + { + "auxiliary_loss_clip": 0.01135803, + "auxiliary_loss_mlp": 0.01105113, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.00059009, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.4443967964031192, + "language_loss": 0.62801683, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.65042603, + "num_input_tokens_seen": 234529310, + "step": 10858, + "time_per_iteration": 2.584467649459839 + }, + { + "auxiliary_loss_clip": 0.01165082, + "auxiliary_loss_mlp": 0.01105123, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.00050497, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.621412406137328, + "language_loss": 0.7840184, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.8067205, + "num_input_tokens_seen": 234546685, + "step": 10859, + "time_per_iteration": 2.491086721420288 + }, + { + "auxiliary_loss_clip": 0.01148759, + "auxiliary_loss_mlp": 0.01105718, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00043273, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.5962559732639716, + "language_loss": 0.67994589, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.70249063, + "num_input_tokens_seen": 234566255, + "step": 10860, + "time_per_iteration": 2.583974838256836 + }, + { + "auxiliary_loss_clip": 0.01148644, + "auxiliary_loss_mlp": 0.01105821, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.00043976, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 1.9418026381424536, + "language_loss": 0.66653311, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68907773, + "num_input_tokens_seen": 234585405, + "step": 10861, + "time_per_iteration": 2.5429039001464844 + }, + { + "auxiliary_loss_clip": 0.01132168, + "auxiliary_loss_mlp": 0.01105199, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00048602, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 2.919619579193771, + "language_loss": 0.65434885, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67672247, + "num_input_tokens_seen": 234608095, + "step": 10862, + "time_per_iteration": 2.7270188331604004 + }, + { + "auxiliary_loss_clip": 0.01133708, + "auxiliary_loss_mlp": 0.01104533, + "balance_loss_clip": 1.00197279, + "balance_loss_mlp": 1.00058246, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.660836023251045, + "language_loss": 0.77191436, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79429674, + "num_input_tokens_seen": 234627335, + "step": 10863, + "time_per_iteration": 2.6120645999908447 + }, + { + "auxiliary_loss_clip": 0.0113541, + "auxiliary_loss_mlp": 0.01104824, + "balance_loss_clip": 1.00196075, + "balance_loss_mlp": 1.00049162, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 2.2556318810395615, + "language_loss": 0.74681842, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.76922083, + "num_input_tokens_seen": 234646540, + "step": 10864, + "time_per_iteration": 2.60520339012146 + }, + { + "auxiliary_loss_clip": 0.01148425, + "auxiliary_loss_mlp": 0.01104762, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00042963, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 3.2783967648643406, + "language_loss": 0.86439508, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88692695, + "num_input_tokens_seen": 234665470, + "step": 10865, + "time_per_iteration": 2.6150460243225098 + }, + { + "auxiliary_loss_clip": 0.01120821, + "auxiliary_loss_mlp": 0.00747476, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00068808, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 1.6846428195729102, + "language_loss": 0.81386328, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83254635, + "num_input_tokens_seen": 234683955, + "step": 10866, + "time_per_iteration": 2.6775779724121094 + }, + { + "auxiliary_loss_clip": 0.0113299, + "auxiliary_loss_mlp": 0.01104598, + "balance_loss_clip": 1.00192893, + "balance_loss_mlp": 1.00045729, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.8648593559632132, + "language_loss": 0.82324815, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84562397, + "num_input_tokens_seen": 234704595, + "step": 10867, + "time_per_iteration": 2.626504898071289 + }, + { + "auxiliary_loss_clip": 0.01134756, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_clip": 1.00195301, + "balance_loss_mlp": 1.00038314, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 2.5945854571190847, + "language_loss": 0.81246608, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.8348608, + "num_input_tokens_seen": 234724090, + "step": 10868, + "time_per_iteration": 2.618502616882324 + }, + { + "auxiliary_loss_clip": 0.01117007, + "auxiliary_loss_mlp": 0.01105579, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00048375, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 1.939692924572072, + "language_loss": 0.79718959, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81941551, + "num_input_tokens_seen": 234742560, + "step": 10869, + "time_per_iteration": 2.6273529529571533 + }, + { + "auxiliary_loss_clip": 0.01148677, + "auxiliary_loss_mlp": 0.01105401, + "balance_loss_clip": 1.00202274, + "balance_loss_mlp": 1.00059247, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 1.812962407932291, + "language_loss": 0.72095859, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.7434994, + "num_input_tokens_seen": 234762315, + "step": 10870, + "time_per_iteration": 2.5960516929626465 + }, + { + "auxiliary_loss_clip": 0.0111725, + "auxiliary_loss_mlp": 0.01105435, + "balance_loss_clip": 1.00210071, + "balance_loss_mlp": 1.0005312, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.8380631235672285, + "language_loss": 0.74831682, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.77054369, + "num_input_tokens_seen": 234781300, + "step": 10871, + "time_per_iteration": 2.698194742202759 + }, + { + "auxiliary_loss_clip": 0.01151748, + "auxiliary_loss_mlp": 0.00747259, + "balance_loss_clip": 1.00223279, + "balance_loss_mlp": 1.00064838, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.648489144810885, + "language_loss": 0.5602845, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57927454, + "num_input_tokens_seen": 234801040, + "step": 10872, + "time_per_iteration": 2.574252128601074 + }, + { + "auxiliary_loss_clip": 0.01133178, + "auxiliary_loss_mlp": 0.01105039, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00061178, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.660801171322197, + "language_loss": 0.75015163, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77253377, + "num_input_tokens_seen": 234821415, + "step": 10873, + "time_per_iteration": 2.6294851303100586 + }, + { + "auxiliary_loss_clip": 0.01148506, + "auxiliary_loss_mlp": 0.01104242, + "balance_loss_clip": 1.00201547, + "balance_loss_mlp": 1.00048208, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 3.9917498321593787, + "language_loss": 0.75636584, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77889335, + "num_input_tokens_seen": 234843795, + "step": 10874, + "time_per_iteration": 2.6417126655578613 + }, + { + "auxiliary_loss_clip": 0.01117722, + "auxiliary_loss_mlp": 0.01105382, + "balance_loss_clip": 1.00192416, + "balance_loss_mlp": 1.00066853, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 1.513257756146355, + "language_loss": 0.81696653, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83919758, + "num_input_tokens_seen": 234862350, + "step": 10875, + "time_per_iteration": 4.115001916885376 + }, + { + "auxiliary_loss_clip": 0.01165217, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.00057244, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.5643065294647167, + "language_loss": 0.69826233, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72096634, + "num_input_tokens_seen": 234881790, + "step": 10876, + "time_per_iteration": 3.983154058456421 + }, + { + "auxiliary_loss_clip": 0.01054139, + "auxiliary_loss_mlp": 0.01105106, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00067925, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 1.9025375080562952, + "language_loss": 0.79398245, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81557488, + "num_input_tokens_seen": 234897775, + "step": 10877, + "time_per_iteration": 2.744385242462158 + }, + { + "auxiliary_loss_clip": 0.01133654, + "auxiliary_loss_mlp": 0.00747309, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00059557, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 3.3377498752551675, + "language_loss": 0.79358125, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.81239086, + "num_input_tokens_seen": 234918395, + "step": 10878, + "time_per_iteration": 2.622647523880005 + }, + { + "auxiliary_loss_clip": 0.01134899, + "auxiliary_loss_mlp": 0.01104037, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00046754, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 2.001142234113635, + "language_loss": 0.83905995, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.8614493, + "num_input_tokens_seen": 234936260, + "step": 10879, + "time_per_iteration": 2.6362533569335938 + }, + { + "auxiliary_loss_clip": 0.01131284, + "auxiliary_loss_mlp": 0.01105606, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.00041544, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.846657822286621, + "language_loss": 0.71795797, + "learning_rate": 1.128800362199601e-06, + "loss": 0.74032688, + "num_input_tokens_seen": 234952110, + "step": 10880, + "time_per_iteration": 2.5612246990203857 + }, + { + "auxiliary_loss_clip": 0.01117167, + "auxiliary_loss_mlp": 0.01104125, + "balance_loss_clip": 1.00182378, + "balance_loss_mlp": 1.00046015, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 2.249050546264035, + "language_loss": 0.84366941, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86588228, + "num_input_tokens_seen": 234970810, + "step": 10881, + "time_per_iteration": 2.618206262588501 + }, + { + "auxiliary_loss_clip": 0.01117591, + "auxiliary_loss_mlp": 0.01105401, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00049663, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 2.2296593193692376, + "language_loss": 0.78024447, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.80247444, + "num_input_tokens_seen": 234989565, + "step": 10882, + "time_per_iteration": 2.625284433364868 + }, + { + "auxiliary_loss_clip": 0.01165306, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_clip": 1.00207376, + "balance_loss_mlp": 1.00049448, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 2.0009266013012645, + "language_loss": 0.82114249, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84385431, + "num_input_tokens_seen": 235007955, + "step": 10883, + "time_per_iteration": 2.541041374206543 + }, + { + "auxiliary_loss_clip": 0.01103835, + "auxiliary_loss_mlp": 0.01105779, + "balance_loss_clip": 1.00212431, + "balance_loss_mlp": 1.00049376, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.231319954193489, + "language_loss": 0.85524678, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87734288, + "num_input_tokens_seen": 235024860, + "step": 10884, + "time_per_iteration": 2.6741058826446533 + }, + { + "auxiliary_loss_clip": 0.01133875, + "auxiliary_loss_mlp": 0.01106203, + "balance_loss_clip": 1.00202107, + "balance_loss_mlp": 1.00053596, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 2.167920360396997, + "language_loss": 0.7994346, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82183546, + "num_input_tokens_seen": 235043815, + "step": 10885, + "time_per_iteration": 2.6372365951538086 + }, + { + "auxiliary_loss_clip": 0.01116798, + "auxiliary_loss_mlp": 0.01104494, + "balance_loss_clip": 1.00204301, + "balance_loss_mlp": 1.00044775, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.958636703535149, + "language_loss": 0.71604806, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.73826098, + "num_input_tokens_seen": 235062985, + "step": 10886, + "time_per_iteration": 2.740795850753784 + }, + { + "auxiliary_loss_clip": 0.01148804, + "auxiliary_loss_mlp": 0.01104823, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00039554, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 2.1408943908219102, + "language_loss": 0.77830815, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80084443, + "num_input_tokens_seen": 235081670, + "step": 10887, + "time_per_iteration": 3.916092872619629 + }, + { + "auxiliary_loss_clip": 0.0113416, + "auxiliary_loss_mlp": 0.01104266, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.0005064, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 2.6857867407541547, + "language_loss": 0.78892624, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81131053, + "num_input_tokens_seen": 235098510, + "step": 10888, + "time_per_iteration": 4.052626609802246 + }, + { + "auxiliary_loss_clip": 0.01148755, + "auxiliary_loss_mlp": 0.0110408, + "balance_loss_clip": 1.0018574, + "balance_loss_mlp": 1.00041604, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.5835391320092405, + "language_loss": 0.6670084, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68953681, + "num_input_tokens_seen": 235119990, + "step": 10889, + "time_per_iteration": 2.6969246864318848 + }, + { + "auxiliary_loss_clip": 0.01133442, + "auxiliary_loss_mlp": 0.01105243, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00043416, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.4504303227006876, + "language_loss": 0.80129737, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82368422, + "num_input_tokens_seen": 235139255, + "step": 10890, + "time_per_iteration": 2.6147615909576416 + }, + { + "auxiliary_loss_clip": 0.01148826, + "auxiliary_loss_mlp": 0.00747457, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.00068474, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 2.06705747263789, + "language_loss": 0.65172052, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67068338, + "num_input_tokens_seen": 235158455, + "step": 10891, + "time_per_iteration": 2.592250108718872 + }, + { + "auxiliary_loss_clip": 0.01150477, + "auxiliary_loss_mlp": 0.01104818, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.0004859, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.9801773069746011, + "language_loss": 0.79247212, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81502509, + "num_input_tokens_seen": 235177350, + "step": 10892, + "time_per_iteration": 2.552004098892212 + }, + { + "auxiliary_loss_clip": 0.01149252, + "auxiliary_loss_mlp": 0.0110578, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00049436, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 1.8131451469399271, + "language_loss": 0.77777791, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80032825, + "num_input_tokens_seen": 235196435, + "step": 10893, + "time_per_iteration": 2.610625982284546 + }, + { + "auxiliary_loss_clip": 0.01165374, + "auxiliary_loss_mlp": 0.011057, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.00050974, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.6418244263839743, + "language_loss": 0.70283723, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72554791, + "num_input_tokens_seen": 235215430, + "step": 10894, + "time_per_iteration": 2.5191657543182373 + }, + { + "auxiliary_loss_clip": 0.01150141, + "auxiliary_loss_mlp": 0.01106359, + "balance_loss_clip": 1.00198209, + "balance_loss_mlp": 1.00050116, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 6.413936826954505, + "language_loss": 0.62653911, + "learning_rate": 1.123545533127549e-06, + "loss": 0.64910406, + "num_input_tokens_seen": 235232015, + "step": 10895, + "time_per_iteration": 2.5468897819519043 + }, + { + "auxiliary_loss_clip": 0.01150487, + "auxiliary_loss_mlp": 0.01104439, + "balance_loss_clip": 1.00189245, + "balance_loss_mlp": 1.000489, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 1.8707621692779313, + "language_loss": 0.78617489, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.80872416, + "num_input_tokens_seen": 235248115, + "step": 10896, + "time_per_iteration": 2.527644395828247 + }, + { + "auxiliary_loss_clip": 0.01132862, + "auxiliary_loss_mlp": 0.011052, + "balance_loss_clip": 1.0017755, + "balance_loss_mlp": 1.00048685, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 4.570231318650497, + "language_loss": 0.70531452, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72769517, + "num_input_tokens_seen": 235270785, + "step": 10897, + "time_per_iteration": 2.6744027137756348 + }, + { + "auxiliary_loss_clip": 0.0116532, + "auxiliary_loss_mlp": 0.01105613, + "balance_loss_clip": 1.00201964, + "balance_loss_mlp": 1.00051773, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.6428729449560653, + "language_loss": 0.75417852, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77688789, + "num_input_tokens_seen": 235287905, + "step": 10898, + "time_per_iteration": 2.515594244003296 + }, + { + "auxiliary_loss_clip": 0.01134065, + "auxiliary_loss_mlp": 0.01104489, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00053871, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 3.118337994263002, + "language_loss": 0.73391736, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75630295, + "num_input_tokens_seen": 235305525, + "step": 10899, + "time_per_iteration": 2.6128623485565186 + }, + { + "auxiliary_loss_clip": 0.01131604, + "auxiliary_loss_mlp": 0.01104343, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00048757, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 3.430029123743832, + "language_loss": 0.56028908, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58264858, + "num_input_tokens_seen": 235324415, + "step": 10900, + "time_per_iteration": 2.59941029548645 + }, + { + "auxiliary_loss_clip": 0.01148505, + "auxiliary_loss_mlp": 0.01105996, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.00051999, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.8935510801573083, + "language_loss": 0.76784265, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79038763, + "num_input_tokens_seen": 235341595, + "step": 10901, + "time_per_iteration": 2.5490305423736572 + }, + { + "auxiliary_loss_clip": 0.01165088, + "auxiliary_loss_mlp": 0.01104759, + "balance_loss_clip": 1.00199068, + "balance_loss_mlp": 1.00052214, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.7014809802759472, + "language_loss": 0.7339412, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.7566396, + "num_input_tokens_seen": 235361700, + "step": 10902, + "time_per_iteration": 2.5729360580444336 + }, + { + "auxiliary_loss_clip": 0.01165228, + "auxiliary_loss_mlp": 0.01105214, + "balance_loss_clip": 1.00207973, + "balance_loss_mlp": 1.00050068, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 2.583053911262318, + "language_loss": 0.676615, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.69931936, + "num_input_tokens_seen": 235382065, + "step": 10903, + "time_per_iteration": 2.5410079956054688 + }, + { + "auxiliary_loss_clip": 0.01134439, + "auxiliary_loss_mlp": 0.00747563, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00063181, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.7854470901477342, + "language_loss": 0.66774619, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.68656623, + "num_input_tokens_seen": 235402130, + "step": 10904, + "time_per_iteration": 2.668303966522217 + }, + { + "auxiliary_loss_clip": 0.01150515, + "auxiliary_loss_mlp": 0.01106205, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00053799, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 2.131554894399051, + "language_loss": 0.9047606, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92732775, + "num_input_tokens_seen": 235420435, + "step": 10905, + "time_per_iteration": 2.5742011070251465 + }, + { + "auxiliary_loss_clip": 0.01149911, + "auxiliary_loss_mlp": 0.01105041, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.0004226, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 3.1412165885494745, + "language_loss": 0.75295937, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77550888, + "num_input_tokens_seen": 235439960, + "step": 10906, + "time_per_iteration": 2.646094560623169 + }, + { + "auxiliary_loss_clip": 0.01165411, + "auxiliary_loss_mlp": 0.01106088, + "balance_loss_clip": 1.00206089, + "balance_loss_mlp": 1.00070667, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.4742747755949024, + "language_loss": 0.75245374, + "learning_rate": 1.119347051825267e-06, + "loss": 0.77516866, + "num_input_tokens_seen": 235457495, + "step": 10907, + "time_per_iteration": 2.5192337036132812 + }, + { + "auxiliary_loss_clip": 0.01117002, + "auxiliary_loss_mlp": 0.01105569, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00037825, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.5019787089264256, + "language_loss": 0.72071528, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74294102, + "num_input_tokens_seen": 235479525, + "step": 10908, + "time_per_iteration": 2.7087414264678955 + }, + { + "auxiliary_loss_clip": 0.0116531, + "auxiliary_loss_mlp": 0.011058, + "balance_loss_clip": 1.0020709, + "balance_loss_mlp": 1.00051498, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.53859387750379, + "language_loss": 0.80992168, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83263278, + "num_input_tokens_seen": 235496305, + "step": 10909, + "time_per_iteration": 2.5054924488067627 + }, + { + "auxiliary_loss_clip": 0.01165396, + "auxiliary_loss_mlp": 0.011063, + "balance_loss_clip": 1.002069, + "balance_loss_mlp": 1.00044179, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 5.701567486324908, + "language_loss": 0.63612676, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.6588437, + "num_input_tokens_seen": 235512545, + "step": 10910, + "time_per_iteration": 2.53517484664917 + }, + { + "auxiliary_loss_clip": 0.01135768, + "auxiliary_loss_mlp": 0.01107006, + "balance_loss_clip": 1.00204623, + "balance_loss_mlp": 1.00057566, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 2.6699722739098917, + "language_loss": 0.75051224, + "learning_rate": 1.117948625548313e-06, + "loss": 0.77293992, + "num_input_tokens_seen": 235526045, + "step": 10911, + "time_per_iteration": 2.533038854598999 + }, + { + "auxiliary_loss_clip": 0.01165181, + "auxiliary_loss_mlp": 0.01104652, + "balance_loss_clip": 1.00201726, + "balance_loss_mlp": 1.00060642, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 1.5617469375721469, + "language_loss": 0.75324386, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77594221, + "num_input_tokens_seen": 235545285, + "step": 10912, + "time_per_iteration": 2.5365939140319824 + }, + { + "auxiliary_loss_clip": 0.0111811, + "auxiliary_loss_mlp": 0.00747492, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00063908, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.4762421306090188, + "language_loss": 0.7750873, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79374337, + "num_input_tokens_seen": 235563150, + "step": 10913, + "time_per_iteration": 4.052580118179321 + }, + { + "auxiliary_loss_clip": 0.0113327, + "auxiliary_loss_mlp": 0.01103576, + "balance_loss_clip": 1.00185251, + "balance_loss_mlp": 1.00048411, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 2.473490746012027, + "language_loss": 0.70973539, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73210382, + "num_input_tokens_seen": 235582535, + "step": 10914, + "time_per_iteration": 3.9794647693634033 + }, + { + "auxiliary_loss_clip": 0.01116683, + "auxiliary_loss_mlp": 0.01104803, + "balance_loss_clip": 1.00177205, + "balance_loss_mlp": 1.00047112, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 2.530531249541015, + "language_loss": 0.73966086, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76187575, + "num_input_tokens_seen": 235601490, + "step": 10915, + "time_per_iteration": 2.6315195560455322 + }, + { + "auxiliary_loss_clip": 0.01117741, + "auxiliary_loss_mlp": 0.01105533, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00043845, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.5752172026355928, + "language_loss": 0.79693896, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81917167, + "num_input_tokens_seen": 235619165, + "step": 10916, + "time_per_iteration": 2.6538503170013428 + }, + { + "auxiliary_loss_clip": 0.01136731, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_clip": 1.002002, + "balance_loss_mlp": 1.00057101, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 3.1093689444288164, + "language_loss": 0.76386559, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.7862848, + "num_input_tokens_seen": 235637115, + "step": 10917, + "time_per_iteration": 2.6040899753570557 + }, + { + "auxiliary_loss_clip": 0.01165156, + "auxiliary_loss_mlp": 0.00747446, + "balance_loss_clip": 1.00193989, + "balance_loss_mlp": 1.00050747, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 2.802717274440554, + "language_loss": 0.70155191, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.72067797, + "num_input_tokens_seen": 235656330, + "step": 10918, + "time_per_iteration": 2.661386251449585 + }, + { + "auxiliary_loss_clip": 0.01116932, + "auxiliary_loss_mlp": 0.01104707, + "balance_loss_clip": 1.00179791, + "balance_loss_mlp": 1.00046992, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.5999164152313383, + "language_loss": 0.76311767, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78533405, + "num_input_tokens_seen": 235674510, + "step": 10919, + "time_per_iteration": 2.6799476146698 + }, + { + "auxiliary_loss_clip": 0.01143002, + "auxiliary_loss_mlp": 0.00745767, + "balance_loss_clip": 1.0009532, + "balance_loss_mlp": 1.00035274, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7213883459264149, + "language_loss": 0.53120518, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.55009294, + "num_input_tokens_seen": 235735050, + "step": 10920, + "time_per_iteration": 3.15842866897583 + }, + { + "auxiliary_loss_clip": 0.01149983, + "auxiliary_loss_mlp": 0.01105131, + "balance_loss_clip": 1.00201321, + "balance_loss_mlp": 1.00041807, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 11.149600957416881, + "language_loss": 0.65613723, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67868841, + "num_input_tokens_seen": 235757545, + "step": 10921, + "time_per_iteration": 2.653895378112793 + }, + { + "auxiliary_loss_clip": 0.01134645, + "auxiliary_loss_mlp": 0.01104765, + "balance_loss_clip": 1.00185847, + "balance_loss_mlp": 1.00043309, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 2.15940943252086, + "language_loss": 0.81382108, + "learning_rate": 1.114105715254205e-06, + "loss": 0.8362152, + "num_input_tokens_seen": 235777265, + "step": 10922, + "time_per_iteration": 2.6093056201934814 + }, + { + "auxiliary_loss_clip": 0.01103638, + "auxiliary_loss_mlp": 0.0074752, + "balance_loss_clip": 1.00196719, + "balance_loss_mlp": 1.00059366, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 2.2755230185505066, + "language_loss": 0.71829605, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.73680758, + "num_input_tokens_seen": 235796565, + "step": 10923, + "time_per_iteration": 2.7098639011383057 + }, + { + "auxiliary_loss_clip": 0.01116582, + "auxiliary_loss_mlp": 0.0110565, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.0005548, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 2.507802209393623, + "language_loss": 0.80914569, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.83136797, + "num_input_tokens_seen": 235814805, + "step": 10924, + "time_per_iteration": 2.698676109313965 + }, + { + "auxiliary_loss_clip": 0.0115054, + "auxiliary_loss_mlp": 0.01104591, + "balance_loss_clip": 1.00215149, + "balance_loss_mlp": 1.00045013, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.9014828853694803, + "language_loss": 0.72099435, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74354565, + "num_input_tokens_seen": 235833405, + "step": 10925, + "time_per_iteration": 2.591003179550171 + }, + { + "auxiliary_loss_clip": 0.01150545, + "auxiliary_loss_mlp": 0.01106094, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.00052261, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.779978367626392, + "language_loss": 0.72496367, + "learning_rate": 1.112709300197942e-06, + "loss": 0.7475301, + "num_input_tokens_seen": 235848530, + "step": 10926, + "time_per_iteration": 5.369687080383301 + }, + { + "auxiliary_loss_clip": 0.01104122, + "auxiliary_loss_mlp": 0.01106125, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00045788, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.6636293973042307, + "language_loss": 0.72990263, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.7520051, + "num_input_tokens_seen": 235867225, + "step": 10927, + "time_per_iteration": 2.6866064071655273 + }, + { + "auxiliary_loss_clip": 0.01109358, + "auxiliary_loss_mlp": 0.01080653, + "balance_loss_clip": 1.00097501, + "balance_loss_mlp": 1.00016332, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7504278846490107, + "language_loss": 0.64416575, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66606587, + "num_input_tokens_seen": 235932925, + "step": 10928, + "time_per_iteration": 3.286299228668213 + }, + { + "auxiliary_loss_clip": 0.01150392, + "auxiliary_loss_mlp": 0.01105272, + "balance_loss_clip": 1.00192678, + "balance_loss_mlp": 1.00046349, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 1.6653051149029379, + "language_loss": 0.77780628, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80036294, + "num_input_tokens_seen": 235952680, + "step": 10929, + "time_per_iteration": 2.618196964263916 + }, + { + "auxiliary_loss_clip": 0.01118053, + "auxiliary_loss_mlp": 0.01105124, + "balance_loss_clip": 1.00179923, + "balance_loss_mlp": 1.00050545, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.7387420394619733, + "language_loss": 0.65062439, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67285609, + "num_input_tokens_seen": 235972075, + "step": 10930, + "time_per_iteration": 2.696488857269287 + }, + { + "auxiliary_loss_clip": 0.01101577, + "auxiliary_loss_mlp": 0.01104634, + "balance_loss_clip": 1.00180542, + "balance_loss_mlp": 1.00049305, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.6466760774165752, + "language_loss": 0.70857966, + "learning_rate": 1.110964538515258e-06, + "loss": 0.73064178, + "num_input_tokens_seen": 235990340, + "step": 10931, + "time_per_iteration": 2.6680171489715576 + }, + { + "auxiliary_loss_clip": 0.01100687, + "auxiliary_loss_mlp": 0.01105585, + "balance_loss_clip": 1.00173068, + "balance_loss_mlp": 1.00058556, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.085496217247107, + "language_loss": 0.68716091, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.70922363, + "num_input_tokens_seen": 236007470, + "step": 10932, + "time_per_iteration": 2.692657947540283 + }, + { + "auxiliary_loss_clip": 0.01133617, + "auxiliary_loss_mlp": 0.00747405, + "balance_loss_clip": 1.00188601, + "balance_loss_mlp": 1.00056565, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 1.7283754784645575, + "language_loss": 0.80062491, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.81943512, + "num_input_tokens_seen": 236029030, + "step": 10933, + "time_per_iteration": 2.778362274169922 + }, + { + "auxiliary_loss_clip": 0.01100689, + "auxiliary_loss_mlp": 0.01106388, + "balance_loss_clip": 1.00196505, + "balance_loss_mlp": 1.00053048, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 2.3842139060163556, + "language_loss": 0.73711765, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75918841, + "num_input_tokens_seen": 236047160, + "step": 10934, + "time_per_iteration": 2.7347424030303955 + }, + { + "auxiliary_loss_clip": 0.01150107, + "auxiliary_loss_mlp": 0.01104692, + "balance_loss_clip": 1.00189459, + "balance_loss_mlp": 1.00045538, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.680169433416749, + "language_loss": 0.76265144, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.7851994, + "num_input_tokens_seen": 236069215, + "step": 10935, + "time_per_iteration": 2.7782750129699707 + }, + { + "auxiliary_loss_clip": 0.01117246, + "auxiliary_loss_mlp": 0.0110596, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00067437, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.600235486704967, + "language_loss": 0.78387785, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.80610996, + "num_input_tokens_seen": 236088335, + "step": 10936, + "time_per_iteration": 2.6936612129211426 + }, + { + "auxiliary_loss_clip": 0.01120321, + "auxiliary_loss_mlp": 0.01104231, + "balance_loss_clip": 1.00191939, + "balance_loss_mlp": 1.00066149, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 3.4067185329048337, + "language_loss": 0.69189918, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71414477, + "num_input_tokens_seen": 236108540, + "step": 10937, + "time_per_iteration": 2.651296615600586 + }, + { + "auxiliary_loss_clip": 0.01131718, + "auxiliary_loss_mlp": 0.01105938, + "balance_loss_clip": 1.00189483, + "balance_loss_mlp": 1.0004611, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.921346206311384, + "language_loss": 0.68601865, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70839512, + "num_input_tokens_seen": 236124495, + "step": 10938, + "time_per_iteration": 2.5927975177764893 + }, + { + "auxiliary_loss_clip": 0.01134226, + "auxiliary_loss_mlp": 0.01105543, + "balance_loss_clip": 1.00200009, + "balance_loss_mlp": 1.00044787, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 1.8788438204015356, + "language_loss": 0.70963085, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73202854, + "num_input_tokens_seen": 236142550, + "step": 10939, + "time_per_iteration": 2.5871775150299072 + }, + { + "auxiliary_loss_clip": 0.01133771, + "auxiliary_loss_mlp": 0.00747486, + "balance_loss_clip": 1.00192082, + "balance_loss_mlp": 1.00072551, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 2.3814640977759045, + "language_loss": 0.77690297, + "learning_rate": 1.107826092473037e-06, + "loss": 0.79571557, + "num_input_tokens_seen": 236156620, + "step": 10940, + "time_per_iteration": 2.628964900970459 + }, + { + "auxiliary_loss_clip": 0.01103355, + "auxiliary_loss_mlp": 0.01105586, + "balance_loss_clip": 1.00187147, + "balance_loss_mlp": 1.00049114, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 1.7876918666880504, + "language_loss": 0.68470258, + "learning_rate": 1.107477545226471e-06, + "loss": 0.706792, + "num_input_tokens_seen": 236177095, + "step": 10941, + "time_per_iteration": 2.8164408206939697 + }, + { + "auxiliary_loss_clip": 0.01150498, + "auxiliary_loss_mlp": 0.00747418, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00057316, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.895375914063643, + "language_loss": 0.67825603, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.69723523, + "num_input_tokens_seen": 236194695, + "step": 10942, + "time_per_iteration": 2.600604772567749 + }, + { + "auxiliary_loss_clip": 0.01120638, + "auxiliary_loss_mlp": 0.0110716, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00053906, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 2.224144882305288, + "language_loss": 0.71685421, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73913217, + "num_input_tokens_seen": 236213885, + "step": 10943, + "time_per_iteration": 2.6207287311553955 + }, + { + "auxiliary_loss_clip": 0.01119889, + "auxiliary_loss_mlp": 0.01104657, + "balance_loss_clip": 1.00200462, + "balance_loss_mlp": 1.00051546, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.7217356125018564, + "language_loss": 0.59387183, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61611736, + "num_input_tokens_seen": 236237315, + "step": 10944, + "time_per_iteration": 2.7441110610961914 + }, + { + "auxiliary_loss_clip": 0.01148755, + "auxiliary_loss_mlp": 0.01106302, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00063467, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.6833727350811396, + "language_loss": 0.72515917, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74770975, + "num_input_tokens_seen": 236256345, + "step": 10945, + "time_per_iteration": 2.6368706226348877 + }, + { + "auxiliary_loss_clip": 0.01132203, + "auxiliary_loss_mlp": 0.01104177, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00051248, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.5698502048067502, + "language_loss": 0.70593339, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72829717, + "num_input_tokens_seen": 236281890, + "step": 10946, + "time_per_iteration": 2.857598304748535 + }, + { + "auxiliary_loss_clip": 0.0114853, + "auxiliary_loss_mlp": 0.01105086, + "balance_loss_clip": 1.00191939, + "balance_loss_mlp": 1.0004679, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.7703651355645282, + "language_loss": 0.82415068, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84668684, + "num_input_tokens_seen": 236298370, + "step": 10947, + "time_per_iteration": 2.599228858947754 + }, + { + "auxiliary_loss_clip": 0.01102703, + "auxiliary_loss_mlp": 0.00747404, + "balance_loss_clip": 1.00172067, + "balance_loss_mlp": 1.00065708, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 2.3897827971951298, + "language_loss": 0.77175105, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79025209, + "num_input_tokens_seen": 236317380, + "step": 10948, + "time_per_iteration": 2.7209761142730713 + }, + { + "auxiliary_loss_clip": 0.01148589, + "auxiliary_loss_mlp": 0.01104536, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00039518, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.8893054012192863, + "language_loss": 0.79255491, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81508619, + "num_input_tokens_seen": 236336210, + "step": 10949, + "time_per_iteration": 2.6043031215667725 + }, + { + "auxiliary_loss_clip": 0.01142929, + "auxiliary_loss_mlp": 0.01080099, + "balance_loss_clip": 1.00087547, + "balance_loss_mlp": 0.9999904, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7344736084344328, + "language_loss": 0.61853135, + "learning_rate": 1.104342144597323e-06, + "loss": 0.64076173, + "num_input_tokens_seen": 236403090, + "step": 10950, + "time_per_iteration": 4.60938286781311 + }, + { + "auxiliary_loss_clip": 0.01150222, + "auxiliary_loss_mlp": 0.01104556, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00060582, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 4.982466802556116, + "language_loss": 0.67412251, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69667029, + "num_input_tokens_seen": 236420475, + "step": 10951, + "time_per_iteration": 2.570533514022827 + }, + { + "auxiliary_loss_clip": 0.01150455, + "auxiliary_loss_mlp": 0.01104635, + "balance_loss_clip": 1.00200164, + "balance_loss_mlp": 1.00068498, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.4503760975661872, + "language_loss": 0.76827073, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.79082167, + "num_input_tokens_seen": 236441915, + "step": 10952, + "time_per_iteration": 4.267206907272339 + }, + { + "auxiliary_loss_clip": 0.01165051, + "auxiliary_loss_mlp": 0.01104573, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00052691, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.8595506012664458, + "language_loss": 0.73260742, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75530368, + "num_input_tokens_seen": 236460340, + "step": 10953, + "time_per_iteration": 2.5212690830230713 + }, + { + "auxiliary_loss_clip": 0.01117031, + "auxiliary_loss_mlp": 0.01105557, + "balance_loss_clip": 1.00182688, + "balance_loss_mlp": 1.00065279, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 2.1561254400091108, + "language_loss": 0.78105497, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80328083, + "num_input_tokens_seen": 236478280, + "step": 10954, + "time_per_iteration": 2.696458339691162 + }, + { + "auxiliary_loss_clip": 0.01135698, + "auxiliary_loss_mlp": 0.011056, + "balance_loss_clip": 1.00202763, + "balance_loss_mlp": 1.00060034, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 2.8415826228557792, + "language_loss": 0.69739223, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71980518, + "num_input_tokens_seen": 236493225, + "step": 10955, + "time_per_iteration": 2.577786684036255 + }, + { + "auxiliary_loss_clip": 0.01135203, + "auxiliary_loss_mlp": 0.01104497, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.00054657, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 1.918384204825597, + "language_loss": 0.80348122, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.82587826, + "num_input_tokens_seen": 236514420, + "step": 10956, + "time_per_iteration": 2.641594171524048 + }, + { + "auxiliary_loss_clip": 0.01148309, + "auxiliary_loss_mlp": 0.01104386, + "balance_loss_clip": 1.00191581, + "balance_loss_mlp": 1.00062644, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 2.312086683677873, + "language_loss": 0.81449997, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83702695, + "num_input_tokens_seen": 236532785, + "step": 10957, + "time_per_iteration": 2.599219560623169 + }, + { + "auxiliary_loss_clip": 0.01131275, + "auxiliary_loss_mlp": 0.01103272, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00056112, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.8497632263969788, + "language_loss": 0.75561213, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.77795762, + "num_input_tokens_seen": 236553330, + "step": 10958, + "time_per_iteration": 2.8017189502716064 + }, + { + "auxiliary_loss_clip": 0.01118668, + "auxiliary_loss_mlp": 0.01104565, + "balance_loss_clip": 1.00188744, + "balance_loss_mlp": 1.00061417, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.9867642617972543, + "language_loss": 0.7505371, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77276939, + "num_input_tokens_seen": 236572960, + "step": 10959, + "time_per_iteration": 2.672370195388794 + }, + { + "auxiliary_loss_clip": 0.01149856, + "auxiliary_loss_mlp": 0.01104778, + "balance_loss_clip": 1.00193191, + "balance_loss_mlp": 1.00044572, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.9711624162629728, + "language_loss": 0.64922893, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.67177534, + "num_input_tokens_seen": 236594090, + "step": 10960, + "time_per_iteration": 2.6241796016693115 + }, + { + "auxiliary_loss_clip": 0.01165339, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_clip": 1.00201797, + "balance_loss_mlp": 1.00052476, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 4.544039480720621, + "language_loss": 0.81725812, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.83997154, + "num_input_tokens_seen": 236610190, + "step": 10961, + "time_per_iteration": 2.5472700595855713 + }, + { + "auxiliary_loss_clip": 0.01119091, + "auxiliary_loss_mlp": 0.01105434, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.00043488, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 2.0637809245952896, + "language_loss": 0.73768425, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75992948, + "num_input_tokens_seen": 236631575, + "step": 10962, + "time_per_iteration": 2.69380521774292 + }, + { + "auxiliary_loss_clip": 0.01150631, + "auxiliary_loss_mlp": 0.0110558, + "balance_loss_clip": 1.00194895, + "balance_loss_mlp": 1.00058079, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 2.0949366225398665, + "language_loss": 0.80027765, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.82283974, + "num_input_tokens_seen": 236649815, + "step": 10963, + "time_per_iteration": 3.976522207260132 + }, + { + "auxiliary_loss_clip": 0.01103369, + "auxiliary_loss_mlp": 0.00747307, + "balance_loss_clip": 1.00177002, + "balance_loss_mlp": 1.00058138, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 2.574620907049296, + "language_loss": 0.78802574, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.80653244, + "num_input_tokens_seen": 236668335, + "step": 10964, + "time_per_iteration": 4.161778211593628 + }, + { + "auxiliary_loss_clip": 0.01119228, + "auxiliary_loss_mlp": 0.01105153, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00043917, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.7323940689185802, + "language_loss": 0.74060953, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76285332, + "num_input_tokens_seen": 236688945, + "step": 10965, + "time_per_iteration": 2.6733689308166504 + }, + { + "auxiliary_loss_clip": 0.01118311, + "auxiliary_loss_mlp": 0.01105356, + "balance_loss_clip": 1.00183547, + "balance_loss_mlp": 1.00054705, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 2.1206441113401264, + "language_loss": 0.73871338, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.76095003, + "num_input_tokens_seen": 236707055, + "step": 10966, + "time_per_iteration": 2.62064266204834 + }, + { + "auxiliary_loss_clip": 0.01150487, + "auxiliary_loss_mlp": 0.01105204, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00039554, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.5418030460539969, + "language_loss": 0.76971167, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79226857, + "num_input_tokens_seen": 236725900, + "step": 10967, + "time_per_iteration": 2.6382503509521484 + }, + { + "auxiliary_loss_clip": 0.01144457, + "auxiliary_loss_mlp": 0.0108006, + "balance_loss_clip": 1.00090957, + "balance_loss_mlp": 0.99995178, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6913498101638261, + "language_loss": 0.48485732, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50710249, + "num_input_tokens_seen": 236788415, + "step": 10968, + "time_per_iteration": 3.128622531890869 + }, + { + "auxiliary_loss_clip": 0.01105621, + "auxiliary_loss_mlp": 0.01105484, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00048494, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 3.158284591571703, + "language_loss": 0.79085678, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.8129679, + "num_input_tokens_seen": 236805155, + "step": 10969, + "time_per_iteration": 2.6558971405029297 + }, + { + "auxiliary_loss_clip": 0.01150287, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_clip": 1.00182629, + "balance_loss_mlp": 1.00042176, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 3.112129804482672, + "language_loss": 0.65439188, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.67693752, + "num_input_tokens_seen": 236824360, + "step": 10970, + "time_per_iteration": 2.5300252437591553 + }, + { + "auxiliary_loss_clip": 0.01150367, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00032902, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.5787586970468466, + "language_loss": 0.76464552, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78718626, + "num_input_tokens_seen": 236844640, + "step": 10971, + "time_per_iteration": 2.5900282859802246 + }, + { + "auxiliary_loss_clip": 0.0108886, + "auxiliary_loss_mlp": 0.01105627, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00053167, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.5029305396818007, + "language_loss": 0.69727999, + "learning_rate": 1.096689432978629e-06, + "loss": 0.71922481, + "num_input_tokens_seen": 236861160, + "step": 10972, + "time_per_iteration": 2.690171241760254 + }, + { + "auxiliary_loss_clip": 0.01148357, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.00042212, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 2.060889474152448, + "language_loss": 0.5580737, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.58060008, + "num_input_tokens_seen": 236880465, + "step": 10973, + "time_per_iteration": 2.632573366165161 + }, + { + "auxiliary_loss_clip": 0.01131986, + "auxiliary_loss_mlp": 0.01106325, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00046718, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 2.245829656892566, + "language_loss": 0.78771532, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.81009847, + "num_input_tokens_seen": 236897730, + "step": 10974, + "time_per_iteration": 2.585775375366211 + }, + { + "auxiliary_loss_clip": 0.01148717, + "auxiliary_loss_mlp": 0.01105011, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00048876, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.45278089996087, + "language_loss": 0.68458331, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.70712054, + "num_input_tokens_seen": 236917300, + "step": 10975, + "time_per_iteration": 2.5688772201538086 + }, + { + "auxiliary_loss_clip": 0.01151643, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_clip": 1.00204349, + "balance_loss_mlp": 1.00044751, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 2.531526138909573, + "language_loss": 0.70075512, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.72331834, + "num_input_tokens_seen": 236935590, + "step": 10976, + "time_per_iteration": 2.551781415939331 + }, + { + "auxiliary_loss_clip": 0.01131861, + "auxiliary_loss_mlp": 0.01104059, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00048959, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 2.612450136559612, + "language_loss": 0.6756494, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69800854, + "num_input_tokens_seen": 236952830, + "step": 10977, + "time_per_iteration": 2.6128299236297607 + }, + { + "auxiliary_loss_clip": 0.01117492, + "auxiliary_loss_mlp": 0.01105741, + "balance_loss_clip": 1.00180769, + "balance_loss_mlp": 1.00055051, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 2.2642000039015566, + "language_loss": 0.81417835, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83641064, + "num_input_tokens_seen": 236971930, + "step": 10978, + "time_per_iteration": 2.665858507156372 + }, + { + "auxiliary_loss_clip": 0.01116178, + "auxiliary_loss_mlp": 0.01105687, + "balance_loss_clip": 1.00177896, + "balance_loss_mlp": 1.00059247, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 2.037315797142522, + "language_loss": 0.66954923, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69176793, + "num_input_tokens_seen": 236989920, + "step": 10979, + "time_per_iteration": 2.643921136856079 + }, + { + "auxiliary_loss_clip": 0.01134286, + "auxiliary_loss_mlp": 0.01104674, + "balance_loss_clip": 1.00185645, + "balance_loss_mlp": 1.00043738, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.4442357415515903, + "language_loss": 0.73567867, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.75806826, + "num_input_tokens_seen": 237006570, + "step": 10980, + "time_per_iteration": 2.5819060802459717 + }, + { + "auxiliary_loss_clip": 0.0111701, + "auxiliary_loss_mlp": 0.01103979, + "balance_loss_clip": 1.00171483, + "balance_loss_mlp": 1.00050521, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.570446177947312, + "language_loss": 0.72935253, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.75156248, + "num_input_tokens_seen": 237028415, + "step": 10981, + "time_per_iteration": 2.7079169750213623 + }, + { + "auxiliary_loss_clip": 0.01105924, + "auxiliary_loss_mlp": 0.0110482, + "balance_loss_clip": 1.0019542, + "balance_loss_mlp": 1.00048828, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 2.865963086774859, + "language_loss": 0.68431532, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.70642269, + "num_input_tokens_seen": 237046595, + "step": 10982, + "time_per_iteration": 2.746262550354004 + }, + { + "auxiliary_loss_clip": 0.01150478, + "auxiliary_loss_mlp": 0.0110538, + "balance_loss_clip": 1.00192535, + "balance_loss_mlp": 1.00047588, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 2.0760232065861546, + "language_loss": 0.69721186, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71977049, + "num_input_tokens_seen": 237066150, + "step": 10983, + "time_per_iteration": 2.5837182998657227 + }, + { + "auxiliary_loss_clip": 0.01150413, + "auxiliary_loss_mlp": 0.01104581, + "balance_loss_clip": 1.00200319, + "balance_loss_mlp": 1.00043952, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.8903711108808403, + "language_loss": 0.70627224, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72882223, + "num_input_tokens_seen": 237087060, + "step": 10984, + "time_per_iteration": 2.6737141609191895 + }, + { + "auxiliary_loss_clip": 0.01116255, + "auxiliary_loss_mlp": 0.01104434, + "balance_loss_clip": 1.001724, + "balance_loss_mlp": 1.00067401, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.5796807354412448, + "language_loss": 0.83861196, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.8608188, + "num_input_tokens_seen": 237103825, + "step": 10985, + "time_per_iteration": 2.6962568759918213 + }, + { + "auxiliary_loss_clip": 0.01148479, + "auxiliary_loss_mlp": 0.01105208, + "balance_loss_clip": 1.00192809, + "balance_loss_mlp": 1.00049496, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.1117598622694094, + "language_loss": 0.73765063, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76018751, + "num_input_tokens_seen": 237121740, + "step": 10986, + "time_per_iteration": 2.5992963314056396 + }, + { + "auxiliary_loss_clip": 0.01149816, + "auxiliary_loss_mlp": 0.01103785, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00050187, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 1.9060952860620743, + "language_loss": 0.79425973, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.81679577, + "num_input_tokens_seen": 237139565, + "step": 10987, + "time_per_iteration": 2.528895378112793 + }, + { + "auxiliary_loss_clip": 0.01128943, + "auxiliary_loss_mlp": 0.01079368, + "balance_loss_clip": 1.00116539, + "balance_loss_mlp": 1.00002253, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8268034696726219, + "language_loss": 0.5412119, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56329501, + "num_input_tokens_seen": 237201055, + "step": 10988, + "time_per_iteration": 3.247868299484253 + }, + { + "auxiliary_loss_clip": 0.01084844, + "auxiliary_loss_mlp": 0.01103798, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00060987, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.5774853092527437, + "language_loss": 0.7758171, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.7977035, + "num_input_tokens_seen": 237221805, + "step": 10989, + "time_per_iteration": 5.630164861679077 + }, + { + "auxiliary_loss_clip": 0.01133033, + "auxiliary_loss_mlp": 0.0110456, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00060964, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 2.411731412769791, + "language_loss": 0.76877916, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.7911551, + "num_input_tokens_seen": 237238270, + "step": 10990, + "time_per_iteration": 2.5508553981781006 + }, + { + "auxiliary_loss_clip": 0.01165088, + "auxiliary_loss_mlp": 0.0110462, + "balance_loss_clip": 1.00191343, + "balance_loss_mlp": 1.00038338, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 2.388070394668142, + "language_loss": 0.60724133, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62993842, + "num_input_tokens_seen": 237255400, + "step": 10991, + "time_per_iteration": 2.494947910308838 + }, + { + "auxiliary_loss_clip": 0.011341, + "auxiliary_loss_mlp": 0.01105, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00057268, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.978165306141712, + "language_loss": 0.6827473, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70513833, + "num_input_tokens_seen": 237273105, + "step": 10992, + "time_per_iteration": 2.5834028720855713 + }, + { + "auxiliary_loss_clip": 0.0114849, + "auxiliary_loss_mlp": 0.01105315, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.0004108, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 1.9616614071813085, + "language_loss": 0.87809378, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.90063179, + "num_input_tokens_seen": 237292650, + "step": 10993, + "time_per_iteration": 2.5784153938293457 + }, + { + "auxiliary_loss_clip": 0.01150536, + "auxiliary_loss_mlp": 0.01106269, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00041139, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.6301429120260382, + "language_loss": 0.66949534, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69206333, + "num_input_tokens_seen": 237312865, + "step": 10994, + "time_per_iteration": 2.59916615486145 + }, + { + "auxiliary_loss_clip": 0.01118758, + "auxiliary_loss_mlp": 0.01104735, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00059342, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 2.255981002701036, + "language_loss": 0.7663976, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.78863251, + "num_input_tokens_seen": 237331210, + "step": 10995, + "time_per_iteration": 2.628326177597046 + }, + { + "auxiliary_loss_clip": 0.01132862, + "auxiliary_loss_mlp": 0.0110406, + "balance_loss_clip": 1.00184774, + "balance_loss_mlp": 1.00058639, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 10.375569761624517, + "language_loss": 0.74519038, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76755959, + "num_input_tokens_seen": 237349455, + "step": 10996, + "time_per_iteration": 2.6121466159820557 + }, + { + "auxiliary_loss_clip": 0.0116515, + "auxiliary_loss_mlp": 0.01104838, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.0005064, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.6307469086425819, + "language_loss": 0.68709403, + "learning_rate": 1.088013301487126e-06, + "loss": 0.70979393, + "num_input_tokens_seen": 237367100, + "step": 10997, + "time_per_iteration": 2.5405147075653076 + }, + { + "auxiliary_loss_clip": 0.01133472, + "auxiliary_loss_mlp": 0.01105026, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00050318, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 1.9994382507177892, + "language_loss": 0.68501157, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.70739651, + "num_input_tokens_seen": 237384840, + "step": 10998, + "time_per_iteration": 2.636029005050659 + }, + { + "auxiliary_loss_clip": 0.01144972, + "auxiliary_loss_mlp": 0.01079799, + "balance_loss_clip": 1.00105834, + "balance_loss_mlp": 1.00007141, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6498499032698037, + "language_loss": 0.51128483, + "learning_rate": 1.087320141976297e-06, + "loss": 0.5335325, + "num_input_tokens_seen": 237443355, + "step": 10999, + "time_per_iteration": 3.1257143020629883 + }, + { + "auxiliary_loss_clip": 0.01165286, + "auxiliary_loss_mlp": 0.00747402, + "balance_loss_clip": 1.00193393, + "balance_loss_mlp": 1.00042212, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 3.343606195685976, + "language_loss": 0.70953798, + "learning_rate": 1.086973614127679e-06, + "loss": 0.72866488, + "num_input_tokens_seen": 237459205, + "step": 11000, + "time_per_iteration": 2.571320056915283 + }, + { + "auxiliary_loss_clip": 0.01134467, + "auxiliary_loss_mlp": 0.01103769, + "balance_loss_clip": 1.00200224, + "balance_loss_mlp": 1.00048614, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.7588454029260323, + "language_loss": 0.65303493, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67541724, + "num_input_tokens_seen": 237483580, + "step": 11001, + "time_per_iteration": 4.109676361083984 + }, + { + "auxiliary_loss_clip": 0.01164965, + "auxiliary_loss_mlp": 0.01104345, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00039411, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.745916614750692, + "language_loss": 0.73116612, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75385922, + "num_input_tokens_seen": 237502860, + "step": 11002, + "time_per_iteration": 3.971923828125 + }, + { + "auxiliary_loss_clip": 0.01150299, + "auxiliary_loss_mlp": 0.0110465, + "balance_loss_clip": 1.0020889, + "balance_loss_mlp": 1.00050926, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 1.9280477965134286, + "language_loss": 0.79061055, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.81316006, + "num_input_tokens_seen": 237521030, + "step": 11003, + "time_per_iteration": 2.553196668624878 + }, + { + "auxiliary_loss_clip": 0.01148091, + "auxiliary_loss_mlp": 0.01105624, + "balance_loss_clip": 1.00200021, + "balance_loss_mlp": 1.00043404, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 2.1600800511055795, + "language_loss": 0.69129544, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.71383262, + "num_input_tokens_seen": 237539585, + "step": 11004, + "time_per_iteration": 2.5416412353515625 + }, + { + "auxiliary_loss_clip": 0.01148446, + "auxiliary_loss_mlp": 0.01105758, + "balance_loss_clip": 1.00186396, + "balance_loss_mlp": 1.00056779, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.163195588200807, + "language_loss": 0.70041883, + "learning_rate": 1.085241494478132e-06, + "loss": 0.72296083, + "num_input_tokens_seen": 237557655, + "step": 11005, + "time_per_iteration": 2.553382396697998 + }, + { + "auxiliary_loss_clip": 0.01134369, + "auxiliary_loss_mlp": 0.01103484, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00058234, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.6956771908326693, + "language_loss": 0.78632241, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80870092, + "num_input_tokens_seen": 237577000, + "step": 11006, + "time_per_iteration": 2.626079797744751 + }, + { + "auxiliary_loss_clip": 0.01148409, + "auxiliary_loss_mlp": 0.01105336, + "balance_loss_clip": 1.00188553, + "balance_loss_mlp": 1.00062311, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.5599419834873585, + "language_loss": 0.76058912, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78312659, + "num_input_tokens_seen": 237597960, + "step": 11007, + "time_per_iteration": 2.5854454040527344 + }, + { + "auxiliary_loss_clip": 0.01148134, + "auxiliary_loss_mlp": 0.0110456, + "balance_loss_clip": 1.00184608, + "balance_loss_mlp": 1.00041842, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.50798022425527, + "language_loss": 0.78341877, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80594575, + "num_input_tokens_seen": 237616385, + "step": 11008, + "time_per_iteration": 2.54232120513916 + }, + { + "auxiliary_loss_clip": 0.01165306, + "auxiliary_loss_mlp": 0.01106106, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00053406, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 1.679561575095976, + "language_loss": 0.81722075, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83993489, + "num_input_tokens_seen": 237634930, + "step": 11009, + "time_per_iteration": 2.5274698734283447 + }, + { + "auxiliary_loss_clip": 0.01131641, + "auxiliary_loss_mlp": 0.01080621, + "balance_loss_clip": 1.00216842, + "balance_loss_mlp": 1.00013089, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9811903020381241, + "language_loss": 0.67397881, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69610143, + "num_input_tokens_seen": 237693175, + "step": 11010, + "time_per_iteration": 3.1359660625457764 + }, + { + "auxiliary_loss_clip": 0.01148441, + "auxiliary_loss_mlp": 0.01105337, + "balance_loss_clip": 1.00188553, + "balance_loss_mlp": 1.00043237, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.4964473563640657, + "language_loss": 0.71163654, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73417431, + "num_input_tokens_seen": 237713160, + "step": 11011, + "time_per_iteration": 2.673800468444824 + }, + { + "auxiliary_loss_clip": 0.01149022, + "auxiliary_loss_mlp": 0.01104027, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.00055313, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.538657503487495, + "language_loss": 0.72469717, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74722767, + "num_input_tokens_seen": 237733600, + "step": 11012, + "time_per_iteration": 2.5799331665039062 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.01104139, + "balance_loss_clip": 1.0020535, + "balance_loss_mlp": 1.00066555, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.841828682095892, + "language_loss": 0.7933082, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81585187, + "num_input_tokens_seen": 237752135, + "step": 11013, + "time_per_iteration": 2.573723793029785 + }, + { + "auxiliary_loss_clip": 0.01133746, + "auxiliary_loss_mlp": 0.0110423, + "balance_loss_clip": 1.00197577, + "balance_loss_mlp": 1.00046992, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 2.2455806697062255, + "language_loss": 0.70058912, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72296888, + "num_input_tokens_seen": 237770735, + "step": 11014, + "time_per_iteration": 2.5844051837921143 + }, + { + "auxiliary_loss_clip": 0.01131733, + "auxiliary_loss_mlp": 0.00747195, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.0003531, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 2.339278429579953, + "language_loss": 0.76988798, + "learning_rate": 1.081779858400137e-06, + "loss": 0.78867728, + "num_input_tokens_seen": 237789005, + "step": 11015, + "time_per_iteration": 2.578254461288452 + }, + { + "auxiliary_loss_clip": 0.01150202, + "auxiliary_loss_mlp": 0.00747204, + "balance_loss_clip": 1.00183046, + "balance_loss_mlp": 1.00035393, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 2.8035759539208223, + "language_loss": 0.8233428, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.84231687, + "num_input_tokens_seen": 237807740, + "step": 11016, + "time_per_iteration": 2.550117015838623 + }, + { + "auxiliary_loss_clip": 0.01148775, + "auxiliary_loss_mlp": 0.01105166, + "balance_loss_clip": 1.00187433, + "balance_loss_mlp": 1.00045228, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 5.373364993252219, + "language_loss": 0.69836825, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.72090769, + "num_input_tokens_seen": 237826340, + "step": 11017, + "time_per_iteration": 2.545165538787842 + }, + { + "auxiliary_loss_clip": 0.01133804, + "auxiliary_loss_mlp": 0.01104217, + "balance_loss_clip": 1.00196433, + "balance_loss_mlp": 1.00055289, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.6847908179902646, + "language_loss": 0.77668804, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79906821, + "num_input_tokens_seen": 237848305, + "step": 11018, + "time_per_iteration": 2.8466291427612305 + }, + { + "auxiliary_loss_clip": 0.01133492, + "auxiliary_loss_mlp": 0.01103285, + "balance_loss_clip": 1.00178754, + "balance_loss_mlp": 1.00057399, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 2.0152690319473234, + "language_loss": 0.83224475, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85461247, + "num_input_tokens_seen": 237867020, + "step": 11019, + "time_per_iteration": 2.5863919258117676 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.0074729, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00042868, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.62876386090111, + "language_loss": 0.71827912, + "learning_rate": 1.080050345253328e-06, + "loss": 0.737252, + "num_input_tokens_seen": 237886710, + "step": 11020, + "time_per_iteration": 2.6047167778015137 + }, + { + "auxiliary_loss_clip": 0.01134195, + "auxiliary_loss_mlp": 0.01104868, + "balance_loss_clip": 1.00202346, + "balance_loss_mlp": 1.00034523, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 2.15034875157477, + "language_loss": 0.72024435, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74263495, + "num_input_tokens_seen": 237904795, + "step": 11021, + "time_per_iteration": 2.5999059677124023 + }, + { + "auxiliary_loss_clip": 0.01133956, + "auxiliary_loss_mlp": 0.0110593, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00045395, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 2.0904249929115974, + "language_loss": 0.83320892, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85560775, + "num_input_tokens_seen": 237921320, + "step": 11022, + "time_per_iteration": 2.5718557834625244 + }, + { + "auxiliary_loss_clip": 0.01133125, + "auxiliary_loss_mlp": 0.01105868, + "balance_loss_clip": 1.00170982, + "balance_loss_mlp": 1.00039184, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 3.1939813553731735, + "language_loss": 0.72086596, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.74325585, + "num_input_tokens_seen": 237933525, + "step": 11023, + "time_per_iteration": 2.6061525344848633 + }, + { + "auxiliary_loss_clip": 0.01119831, + "auxiliary_loss_mlp": 0.01104348, + "balance_loss_clip": 1.0019685, + "balance_loss_mlp": 1.00039768, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 2.2678098721370197, + "language_loss": 0.75262803, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.7748698, + "num_input_tokens_seen": 237953395, + "step": 11024, + "time_per_iteration": 2.674193859100342 + }, + { + "auxiliary_loss_clip": 0.01117057, + "auxiliary_loss_mlp": 0.01105739, + "balance_loss_clip": 1.00186872, + "balance_loss_mlp": 1.00035787, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.1108316536211196, + "language_loss": 0.69642711, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71865505, + "num_input_tokens_seen": 237971445, + "step": 11025, + "time_per_iteration": 2.6246118545532227 + }, + { + "auxiliary_loss_clip": 0.0116515, + "auxiliary_loss_mlp": 0.01105183, + "balance_loss_clip": 1.00208735, + "balance_loss_mlp": 1.00056517, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.5487772888921396, + "language_loss": 0.78611732, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.80882061, + "num_input_tokens_seen": 237989965, + "step": 11026, + "time_per_iteration": 4.033273935317993 + }, + { + "auxiliary_loss_clip": 0.01149855, + "auxiliary_loss_mlp": 0.01104619, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00057292, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.8234264029217702, + "language_loss": 0.76097995, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78352469, + "num_input_tokens_seen": 238006820, + "step": 11027, + "time_per_iteration": 2.5517003536224365 + }, + { + "auxiliary_loss_clip": 0.01133801, + "auxiliary_loss_mlp": 0.01104988, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00046551, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.589200067068186, + "language_loss": 0.69929826, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72168618, + "num_input_tokens_seen": 238022560, + "step": 11028, + "time_per_iteration": 2.5708508491516113 + }, + { + "auxiliary_loss_clip": 0.01148295, + "auxiliary_loss_mlp": 0.01104156, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00058675, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 2.2364213297662485, + "language_loss": 0.79577422, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.8182987, + "num_input_tokens_seen": 238041895, + "step": 11029, + "time_per_iteration": 2.550393581390381 + }, + { + "auxiliary_loss_clip": 0.01165056, + "auxiliary_loss_mlp": 0.01105375, + "balance_loss_clip": 1.00189304, + "balance_loss_mlp": 1.0004704, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.289955824139137, + "language_loss": 0.76225817, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78496253, + "num_input_tokens_seen": 238060445, + "step": 11030, + "time_per_iteration": 2.4783806800842285 + }, + { + "auxiliary_loss_clip": 0.01149799, + "auxiliary_loss_mlp": 0.01105667, + "balance_loss_clip": 1.0020479, + "balance_loss_mlp": 1.00038147, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 3.401540031142708, + "language_loss": 0.75279242, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.77534711, + "num_input_tokens_seen": 238077080, + "step": 11031, + "time_per_iteration": 2.546583414077759 + }, + { + "auxiliary_loss_clip": 0.011484, + "auxiliary_loss_mlp": 0.01105238, + "balance_loss_clip": 1.00183356, + "balance_loss_mlp": 1.00052428, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 2.635483270144667, + "language_loss": 0.74541497, + "learning_rate": 1.075903075048228e-06, + "loss": 0.76795137, + "num_input_tokens_seen": 238091045, + "step": 11032, + "time_per_iteration": 2.4946706295013428 + }, + { + "auxiliary_loss_clip": 0.01103509, + "auxiliary_loss_mlp": 0.01103867, + "balance_loss_clip": 1.00168514, + "balance_loss_mlp": 1.00039327, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 2.0941685463262423, + "language_loss": 0.80667365, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82874739, + "num_input_tokens_seen": 238110220, + "step": 11033, + "time_per_iteration": 2.687607526779175 + }, + { + "auxiliary_loss_clip": 0.01134007, + "auxiliary_loss_mlp": 0.01104906, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.00047839, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 1.6649529642827063, + "language_loss": 0.80090642, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82329559, + "num_input_tokens_seen": 238130400, + "step": 11034, + "time_per_iteration": 2.6375017166137695 + }, + { + "auxiliary_loss_clip": 0.0114841, + "auxiliary_loss_mlp": 0.01104061, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00049138, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.8584684818659691, + "language_loss": 0.75675857, + "learning_rate": 1.074867045054166e-06, + "loss": 0.77928329, + "num_input_tokens_seen": 238148165, + "step": 11035, + "time_per_iteration": 2.58717679977417 + }, + { + "auxiliary_loss_clip": 0.01117043, + "auxiliary_loss_mlp": 0.01104787, + "balance_loss_clip": 1.00173032, + "balance_loss_mlp": 1.00035954, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 1.943482085318922, + "language_loss": 0.83170915, + "learning_rate": 1.074521771867622e-06, + "loss": 0.85392749, + "num_input_tokens_seen": 238166360, + "step": 11036, + "time_per_iteration": 2.628502368927002 + }, + { + "auxiliary_loss_clip": 0.01159383, + "auxiliary_loss_mlp": 0.01079664, + "balance_loss_clip": 1.00104821, + "balance_loss_mlp": 0.9999367, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7740820181597494, + "language_loss": 0.52313977, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54553026, + "num_input_tokens_seen": 238227630, + "step": 11037, + "time_per_iteration": 3.103278398513794 + }, + { + "auxiliary_loss_clip": 0.01103254, + "auxiliary_loss_mlp": 0.01105474, + "balance_loss_clip": 1.0019511, + "balance_loss_mlp": 1.00056934, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 2.3562679080785816, + "language_loss": 0.78676939, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.80885667, + "num_input_tokens_seen": 238248435, + "step": 11038, + "time_per_iteration": 4.199910402297974 + }, + { + "auxiliary_loss_clip": 0.01116755, + "auxiliary_loss_mlp": 0.01105812, + "balance_loss_clip": 1.00180829, + "balance_loss_mlp": 1.00062227, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 2.7409150231899777, + "language_loss": 0.64025909, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66248477, + "num_input_tokens_seen": 238268755, + "step": 11039, + "time_per_iteration": 2.774599552154541 + }, + { + "auxiliary_loss_clip": 0.01117971, + "auxiliary_loss_mlp": 0.01104931, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00040865, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 2.1048531195889595, + "language_loss": 0.64111179, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.66334081, + "num_input_tokens_seen": 238290120, + "step": 11040, + "time_per_iteration": 4.120734930038452 + }, + { + "auxiliary_loss_clip": 0.01116534, + "auxiliary_loss_mlp": 0.01104182, + "balance_loss_clip": 1.00174022, + "balance_loss_mlp": 1.00032651, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 1.9319618210472616, + "language_loss": 0.72114861, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.74335575, + "num_input_tokens_seen": 238309290, + "step": 11041, + "time_per_iteration": 2.640085220336914 + }, + { + "auxiliary_loss_clip": 0.01150118, + "auxiliary_loss_mlp": 0.01105176, + "balance_loss_clip": 1.00195587, + "balance_loss_mlp": 1.00065386, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 2.3290161789705963, + "language_loss": 0.6144706, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.63702357, + "num_input_tokens_seen": 238327280, + "step": 11042, + "time_per_iteration": 2.6230459213256836 + }, + { + "auxiliary_loss_clip": 0.01148263, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_clip": 1.00178504, + "balance_loss_mlp": 1.00036681, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 2.6239567397838868, + "language_loss": 0.68108892, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70362806, + "num_input_tokens_seen": 238346330, + "step": 11043, + "time_per_iteration": 2.6408684253692627 + }, + { + "auxiliary_loss_clip": 0.01148697, + "auxiliary_loss_mlp": 0.01102423, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00047517, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.513918926648717, + "language_loss": 0.83571935, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.85823047, + "num_input_tokens_seen": 238364650, + "step": 11044, + "time_per_iteration": 2.5898923873901367 + }, + { + "auxiliary_loss_clip": 0.01117216, + "auxiliary_loss_mlp": 0.0110537, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00046611, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 2.1061782306312757, + "language_loss": 0.69430637, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71653223, + "num_input_tokens_seen": 238381630, + "step": 11045, + "time_per_iteration": 2.6780483722686768 + }, + { + "auxiliary_loss_clip": 0.01148332, + "auxiliary_loss_mlp": 0.0110564, + "balance_loss_clip": 1.00191784, + "balance_loss_mlp": 1.00054526, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.6188741429043112, + "language_loss": 0.64324152, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.6657812, + "num_input_tokens_seen": 238402595, + "step": 11046, + "time_per_iteration": 2.5804691314697266 + }, + { + "auxiliary_loss_clip": 0.01118791, + "auxiliary_loss_mlp": 0.0110455, + "balance_loss_clip": 1.00191402, + "balance_loss_mlp": 1.00059915, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.4998045116830823, + "language_loss": 0.71354824, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73578167, + "num_input_tokens_seen": 238426860, + "step": 11047, + "time_per_iteration": 2.787677049636841 + }, + { + "auxiliary_loss_clip": 0.01086268, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00050044, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.8198011390628068, + "language_loss": 0.77257216, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79449087, + "num_input_tokens_seen": 238443990, + "step": 11048, + "time_per_iteration": 2.6810131072998047 + }, + { + "auxiliary_loss_clip": 0.01113294, + "auxiliary_loss_mlp": 0.01080206, + "balance_loss_clip": 1.00104189, + "balance_loss_mlp": 1.00009751, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.7517836673261256, + "language_loss": 0.55003369, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57196879, + "num_input_tokens_seen": 238503045, + "step": 11049, + "time_per_iteration": 3.212857723236084 + }, + { + "auxiliary_loss_clip": 0.01148534, + "auxiliary_loss_mlp": 0.01104817, + "balance_loss_clip": 1.00204492, + "balance_loss_mlp": 1.00048494, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.585343171104042, + "language_loss": 0.63969624, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66222978, + "num_input_tokens_seen": 238527320, + "step": 11050, + "time_per_iteration": 2.64977765083313 + }, + { + "auxiliary_loss_clip": 0.01165138, + "auxiliary_loss_mlp": 0.01104845, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.00051296, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 1.9597474080090458, + "language_loss": 0.78947067, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.81217051, + "num_input_tokens_seen": 238546030, + "step": 11051, + "time_per_iteration": 2.526823043823242 + }, + { + "auxiliary_loss_clip": 0.01131543, + "auxiliary_loss_mlp": 0.01105213, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00049996, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 1.8377183875270198, + "language_loss": 0.85577977, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87814736, + "num_input_tokens_seen": 238564175, + "step": 11052, + "time_per_iteration": 2.5901591777801514 + }, + { + "auxiliary_loss_clip": 0.01102367, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_clip": 1.00175107, + "balance_loss_mlp": 1.00061154, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.1041750965439325, + "language_loss": 0.74619853, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.76828599, + "num_input_tokens_seen": 238581010, + "step": 11053, + "time_per_iteration": 2.6589908599853516 + }, + { + "auxiliary_loss_clip": 0.01133981, + "auxiliary_loss_mlp": 0.01104466, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00042057, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.7001367965028125, + "language_loss": 0.79481345, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81719792, + "num_input_tokens_seen": 238601365, + "step": 11054, + "time_per_iteration": 2.6400396823883057 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01104503, + "balance_loss_clip": 1.00170803, + "balance_loss_mlp": 1.00045681, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.5641348436997997, + "language_loss": 0.74135506, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76339924, + "num_input_tokens_seen": 238619850, + "step": 11055, + "time_per_iteration": 2.6407644748687744 + }, + { + "auxiliary_loss_clip": 0.01117561, + "auxiliary_loss_mlp": 0.01105298, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.0005846, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 1.635085846463373, + "language_loss": 0.7279973, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.7502259, + "num_input_tokens_seen": 238637635, + "step": 11056, + "time_per_iteration": 2.642962694168091 + }, + { + "auxiliary_loss_clip": 0.01100743, + "auxiliary_loss_mlp": 0.01104901, + "balance_loss_clip": 1.00172698, + "balance_loss_mlp": 1.00037849, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 2.0907247963132614, + "language_loss": 0.695701, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.7177574, + "num_input_tokens_seen": 238656200, + "step": 11057, + "time_per_iteration": 2.711577892303467 + }, + { + "auxiliary_loss_clip": 0.01148617, + "auxiliary_loss_mlp": 0.01105348, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00044346, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 2.3669220657811256, + "language_loss": 0.80500007, + "learning_rate": 1.066934663776291e-06, + "loss": 0.82753974, + "num_input_tokens_seen": 238675005, + "step": 11058, + "time_per_iteration": 2.59207820892334 + }, + { + "auxiliary_loss_clip": 0.01111495, + "auxiliary_loss_mlp": 0.0108015, + "balance_loss_clip": 1.0009594, + "balance_loss_mlp": 1.00004184, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.79843020102455, + "language_loss": 0.62626904, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64818549, + "num_input_tokens_seen": 238731425, + "step": 11059, + "time_per_iteration": 3.175233840942383 + }, + { + "auxiliary_loss_clip": 0.01147793, + "auxiliary_loss_mlp": 0.0110406, + "balance_loss_clip": 1.00179696, + "balance_loss_mlp": 1.00049067, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.6102100889314557, + "language_loss": 0.78742599, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.80994451, + "num_input_tokens_seen": 238752020, + "step": 11060, + "time_per_iteration": 2.591865062713623 + }, + { + "auxiliary_loss_clip": 0.01117314, + "auxiliary_loss_mlp": 0.01104112, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00054264, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 2.052740569478437, + "language_loss": 0.79018897, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.8124032, + "num_input_tokens_seen": 238769665, + "step": 11061, + "time_per_iteration": 2.6429340839385986 + }, + { + "auxiliary_loss_clip": 0.01132031, + "auxiliary_loss_mlp": 0.01103953, + "balance_loss_clip": 1.00192201, + "balance_loss_mlp": 1.00038385, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 3.955011913645394, + "language_loss": 0.56648052, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.58884037, + "num_input_tokens_seen": 238782180, + "step": 11062, + "time_per_iteration": 2.5868449211120605 + }, + { + "auxiliary_loss_clip": 0.01150205, + "auxiliary_loss_mlp": 0.0110567, + "balance_loss_clip": 1.00190699, + "balance_loss_mlp": 1.00038457, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.922475624519617, + "language_loss": 0.75963289, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78219163, + "num_input_tokens_seen": 238800315, + "step": 11063, + "time_per_iteration": 4.004112482070923 + }, + { + "auxiliary_loss_clip": 0.01084745, + "auxiliary_loss_mlp": 0.01104444, + "balance_loss_clip": 1.00169826, + "balance_loss_mlp": 1.00058842, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.3261483484028707, + "language_loss": 0.70659709, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72848898, + "num_input_tokens_seen": 238822250, + "step": 11064, + "time_per_iteration": 2.8144071102142334 + }, + { + "auxiliary_loss_clip": 0.01159477, + "auxiliary_loss_mlp": 0.01079649, + "balance_loss_clip": 1.00102925, + "balance_loss_mlp": 0.99992192, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8441998524666174, + "language_loss": 0.63056815, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65295941, + "num_input_tokens_seen": 238877190, + "step": 11065, + "time_per_iteration": 3.0459563732147217 + }, + { + "auxiliary_loss_clip": 0.01150199, + "auxiliary_loss_mlp": 0.0110537, + "balance_loss_clip": 1.00198007, + "balance_loss_mlp": 1.00046563, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.8721409178469979, + "language_loss": 0.62307441, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64563012, + "num_input_tokens_seen": 238896010, + "step": 11066, + "time_per_iteration": 2.5751702785491943 + }, + { + "auxiliary_loss_clip": 0.01118783, + "auxiliary_loss_mlp": 0.01105818, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00043654, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.7288354344530477, + "language_loss": 0.70113009, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72337615, + "num_input_tokens_seen": 238918990, + "step": 11067, + "time_per_iteration": 2.743269681930542 + }, + { + "auxiliary_loss_clip": 0.01129661, + "auxiliary_loss_mlp": 0.01080157, + "balance_loss_clip": 1.00108182, + "balance_loss_mlp": 1.00004852, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9028758019630732, + "language_loss": 0.72085547, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74295366, + "num_input_tokens_seen": 238975735, + "step": 11068, + "time_per_iteration": 3.1127102375030518 + }, + { + "auxiliary_loss_clip": 0.01112412, + "auxiliary_loss_mlp": 0.01079654, + "balance_loss_clip": 1.00125098, + "balance_loss_mlp": 0.99992675, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7101004234394047, + "language_loss": 0.57817078, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.60009146, + "num_input_tokens_seen": 239042360, + "step": 11069, + "time_per_iteration": 3.30775785446167 + }, + { + "auxiliary_loss_clip": 0.0114626, + "auxiliary_loss_mlp": 0.01079696, + "balance_loss_clip": 1.00135231, + "balance_loss_mlp": 0.99996895, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7787231986409275, + "language_loss": 0.63570642, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65796602, + "num_input_tokens_seen": 239109410, + "step": 11070, + "time_per_iteration": 3.152155876159668 + }, + { + "auxiliary_loss_clip": 0.01165135, + "auxiliary_loss_mlp": 0.01104365, + "balance_loss_clip": 1.00191534, + "balance_loss_mlp": 1.00041437, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.7514994073043146, + "language_loss": 0.58695793, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60965288, + "num_input_tokens_seen": 239135345, + "step": 11071, + "time_per_iteration": 2.677105188369751 + }, + { + "auxiliary_loss_clip": 0.01149533, + "auxiliary_loss_mlp": 0.01104985, + "balance_loss_clip": 1.00203371, + "balance_loss_mlp": 1.00046241, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 2.0488398626568327, + "language_loss": 0.72896612, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75151134, + "num_input_tokens_seen": 239154340, + "step": 11072, + "time_per_iteration": 2.597975492477417 + }, + { + "auxiliary_loss_clip": 0.01148137, + "auxiliary_loss_mlp": 0.01104377, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.00042665, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 2.803769850747832, + "language_loss": 0.7080791, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.73060417, + "num_input_tokens_seen": 239177815, + "step": 11073, + "time_per_iteration": 2.7210893630981445 + }, + { + "auxiliary_loss_clip": 0.01115955, + "auxiliary_loss_mlp": 0.01104453, + "balance_loss_clip": 1.00176108, + "balance_loss_mlp": 1.00050211, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 2.7681458224404487, + "language_loss": 0.55620843, + "learning_rate": 1.061427515134354e-06, + "loss": 0.57841247, + "num_input_tokens_seen": 239195735, + "step": 11074, + "time_per_iteration": 2.607083559036255 + }, + { + "auxiliary_loss_clip": 0.01164963, + "auxiliary_loss_mlp": 0.00747241, + "balance_loss_clip": 1.00200856, + "balance_loss_mlp": 1.00039864, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 3.9872458671784035, + "language_loss": 0.72423267, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74335468, + "num_input_tokens_seen": 239217535, + "step": 11075, + "time_per_iteration": 2.6532351970672607 + }, + { + "auxiliary_loss_clip": 0.0114843, + "auxiliary_loss_mlp": 0.01103687, + "balance_loss_clip": 1.00184047, + "balance_loss_mlp": 1.00049984, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.724009872101617, + "language_loss": 0.66195655, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.68447769, + "num_input_tokens_seen": 239241975, + "step": 11076, + "time_per_iteration": 5.5301477909088135 + }, + { + "auxiliary_loss_clip": 0.01134925, + "auxiliary_loss_mlp": 0.01104846, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00051403, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.7029166066155397, + "language_loss": 0.75022525, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77262294, + "num_input_tokens_seen": 239262025, + "step": 11077, + "time_per_iteration": 2.670610189437866 + }, + { + "auxiliary_loss_clip": 0.0113166, + "auxiliary_loss_mlp": 0.01104103, + "balance_loss_clip": 1.00185454, + "balance_loss_mlp": 1.0005337, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.4891287219280662, + "language_loss": 0.6667766, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68913424, + "num_input_tokens_seen": 239282775, + "step": 11078, + "time_per_iteration": 2.6349780559539795 + }, + { + "auxiliary_loss_clip": 0.01165201, + "auxiliary_loss_mlp": 0.01105956, + "balance_loss_clip": 1.00197518, + "balance_loss_mlp": 1.00057483, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 3.9125959395747367, + "language_loss": 0.69288659, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71559811, + "num_input_tokens_seen": 239299775, + "step": 11079, + "time_per_iteration": 2.523904323577881 + }, + { + "auxiliary_loss_clip": 0.01133033, + "auxiliary_loss_mlp": 0.01104522, + "balance_loss_clip": 1.00178862, + "balance_loss_mlp": 1.00038064, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.6789154877718238, + "language_loss": 0.80228943, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82466495, + "num_input_tokens_seen": 239319660, + "step": 11080, + "time_per_iteration": 2.6302716732025146 + }, + { + "auxiliary_loss_clip": 0.01116197, + "auxiliary_loss_mlp": 0.01103192, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00048161, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.75599823950579, + "language_loss": 0.78287828, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80507219, + "num_input_tokens_seen": 239339215, + "step": 11081, + "time_per_iteration": 2.6463873386383057 + }, + { + "auxiliary_loss_clip": 0.01116526, + "auxiliary_loss_mlp": 0.01105074, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.00045609, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.8653411163448477, + "language_loss": 0.79526895, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.81748497, + "num_input_tokens_seen": 239358545, + "step": 11082, + "time_per_iteration": 2.675414800643921 + }, + { + "auxiliary_loss_clip": 0.0111728, + "auxiliary_loss_mlp": 0.0110393, + "balance_loss_clip": 1.00193214, + "balance_loss_mlp": 1.00074244, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.4145196852652968, + "language_loss": 0.84042126, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86263335, + "num_input_tokens_seen": 239376665, + "step": 11083, + "time_per_iteration": 2.6370904445648193 + }, + { + "auxiliary_loss_clip": 0.01115291, + "auxiliary_loss_mlp": 0.01105846, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.0005604, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.41219942721401, + "language_loss": 0.85168576, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87389708, + "num_input_tokens_seen": 239394345, + "step": 11084, + "time_per_iteration": 2.6327664852142334 + }, + { + "auxiliary_loss_clip": 0.01133647, + "auxiliary_loss_mlp": 0.01104445, + "balance_loss_clip": 1.00183499, + "balance_loss_mlp": 1.00039911, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.6471414235346353, + "language_loss": 0.72863019, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75101113, + "num_input_tokens_seen": 239410605, + "step": 11085, + "time_per_iteration": 2.626358985900879 + }, + { + "auxiliary_loss_clip": 0.01133034, + "auxiliary_loss_mlp": 0.01105279, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00046968, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 2.0085393737048505, + "language_loss": 0.80465221, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82703531, + "num_input_tokens_seen": 239427155, + "step": 11086, + "time_per_iteration": 2.6487131118774414 + }, + { + "auxiliary_loss_clip": 0.01131567, + "auxiliary_loss_mlp": 0.01105223, + "balance_loss_clip": 1.00178659, + "balance_loss_mlp": 1.00050962, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 2.2199796979272786, + "language_loss": 0.74959385, + "learning_rate": 1.056959663258702e-06, + "loss": 0.77196181, + "num_input_tokens_seen": 239445510, + "step": 11087, + "time_per_iteration": 2.620633602142334 + }, + { + "auxiliary_loss_clip": 0.01148384, + "auxiliary_loss_mlp": 0.01104705, + "balance_loss_clip": 1.00198054, + "balance_loss_mlp": 1.00046873, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 2.166139565472408, + "language_loss": 0.64892149, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.6714524, + "num_input_tokens_seen": 239464805, + "step": 11088, + "time_per_iteration": 2.5739872455596924 + }, + { + "auxiliary_loss_clip": 0.01150369, + "auxiliary_loss_mlp": 0.01104893, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.00046575, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 2.4262845073402204, + "language_loss": 0.64007121, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66262388, + "num_input_tokens_seen": 239483890, + "step": 11089, + "time_per_iteration": 2.607722520828247 + }, + { + "auxiliary_loss_clip": 0.01164967, + "auxiliary_loss_mlp": 0.01104422, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.00037658, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 4.568029513623084, + "language_loss": 0.80862659, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.83132052, + "num_input_tokens_seen": 239500080, + "step": 11090, + "time_per_iteration": 2.4821715354919434 + }, + { + "auxiliary_loss_clip": 0.01133549, + "auxiliary_loss_mlp": 0.01105161, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00044727, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 1.9989035820899872, + "language_loss": 0.77902699, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.80141401, + "num_input_tokens_seen": 239517335, + "step": 11091, + "time_per_iteration": 2.5915396213531494 + }, + { + "auxiliary_loss_clip": 0.01164933, + "auxiliary_loss_mlp": 0.01104047, + "balance_loss_clip": 1.00196826, + "balance_loss_mlp": 1.00057364, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 1.737684345253757, + "language_loss": 0.79316694, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.8158567, + "num_input_tokens_seen": 239536240, + "step": 11092, + "time_per_iteration": 2.501417875289917 + }, + { + "auxiliary_loss_clip": 0.01115231, + "auxiliary_loss_mlp": 0.0107963, + "balance_loss_clip": 1.0009656, + "balance_loss_mlp": 0.99990332, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7524897627079326, + "language_loss": 0.57692528, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59887385, + "num_input_tokens_seen": 239598000, + "step": 11093, + "time_per_iteration": 3.219196319580078 + }, + { + "auxiliary_loss_clip": 0.01165028, + "auxiliary_loss_mlp": 0.01104078, + "balance_loss_clip": 1.00202727, + "balance_loss_mlp": 1.00041318, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 1.586464913959232, + "language_loss": 0.76367128, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78636235, + "num_input_tokens_seen": 239617650, + "step": 11094, + "time_per_iteration": 2.564537525177002 + }, + { + "auxiliary_loss_clip": 0.01164978, + "auxiliary_loss_mlp": 0.01104665, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00042868, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 1.7671905343285694, + "language_loss": 0.73176378, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75446022, + "num_input_tokens_seen": 239639825, + "step": 11095, + "time_per_iteration": 2.6338460445404053 + }, + { + "auxiliary_loss_clip": 0.01150397, + "auxiliary_loss_mlp": 0.01105054, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00062633, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 2.054263903206832, + "language_loss": 0.73136765, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75392222, + "num_input_tokens_seen": 239656300, + "step": 11096, + "time_per_iteration": 2.529388904571533 + }, + { + "auxiliary_loss_clip": 0.01117272, + "auxiliary_loss_mlp": 0.01103414, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.0005126, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 1.7905420547316249, + "language_loss": 0.6449874, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66719425, + "num_input_tokens_seen": 239676655, + "step": 11097, + "time_per_iteration": 2.6794278621673584 + }, + { + "auxiliary_loss_clip": 0.01148201, + "auxiliary_loss_mlp": 0.01104361, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.0005064, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 2.2482005789266095, + "language_loss": 0.75581968, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77834535, + "num_input_tokens_seen": 239695430, + "step": 11098, + "time_per_iteration": 2.5539112091064453 + }, + { + "auxiliary_loss_clip": 0.01165236, + "auxiliary_loss_mlp": 0.01105444, + "balance_loss_clip": 1.00205052, + "balance_loss_mlp": 1.00063479, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.9181094004128334, + "language_loss": 0.74281389, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76552063, + "num_input_tokens_seen": 239717070, + "step": 11099, + "time_per_iteration": 2.6231625080108643 + }, + { + "auxiliary_loss_clip": 0.0114978, + "auxiliary_loss_mlp": 0.01104626, + "balance_loss_clip": 1.00192463, + "balance_loss_mlp": 1.00058055, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 2.406759997338168, + "language_loss": 0.77954876, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80209279, + "num_input_tokens_seen": 239737105, + "step": 11100, + "time_per_iteration": 2.5968899726867676 + }, + { + "auxiliary_loss_clip": 0.01164885, + "auxiliary_loss_mlp": 0.01103995, + "balance_loss_clip": 1.00194299, + "balance_loss_mlp": 1.00061631, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 3.0934490813949527, + "language_loss": 0.60252422, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62521303, + "num_input_tokens_seen": 239757835, + "step": 11101, + "time_per_iteration": 2.603572130203247 + }, + { + "auxiliary_loss_clip": 0.01133831, + "auxiliary_loss_mlp": 0.01105899, + "balance_loss_clip": 1.00184333, + "balance_loss_mlp": 1.00061321, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.962123773328304, + "language_loss": 0.71276152, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.7351588, + "num_input_tokens_seen": 239775425, + "step": 11102, + "time_per_iteration": 4.093135595321655 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.01104805, + "balance_loss_clip": 1.00180948, + "balance_loss_mlp": 1.00037789, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.7719438072316238, + "language_loss": 0.84579337, + "learning_rate": 1.051469068021034e-06, + "loss": 0.86834586, + "num_input_tokens_seen": 239794605, + "step": 11103, + "time_per_iteration": 2.5761606693267822 + }, + { + "auxiliary_loss_clip": 0.01133545, + "auxiliary_loss_mlp": 0.01104379, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00042844, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 2.1419445823356305, + "language_loss": 0.78121412, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.8035934, + "num_input_tokens_seen": 239812135, + "step": 11104, + "time_per_iteration": 2.6273326873779297 + }, + { + "auxiliary_loss_clip": 0.01102124, + "auxiliary_loss_mlp": 0.01104751, + "balance_loss_clip": 1.00172687, + "balance_loss_mlp": 1.0004189, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.7329120451480455, + "language_loss": 0.57986403, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60193276, + "num_input_tokens_seen": 239835845, + "step": 11105, + "time_per_iteration": 2.8723294734954834 + }, + { + "auxiliary_loss_clip": 0.01148556, + "auxiliary_loss_mlp": 0.01106132, + "balance_loss_clip": 1.00192702, + "balance_loss_mlp": 1.00056076, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.7221092697874152, + "language_loss": 0.72984099, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75238788, + "num_input_tokens_seen": 239853820, + "step": 11106, + "time_per_iteration": 2.602548837661743 + }, + { + "auxiliary_loss_clip": 0.01165089, + "auxiliary_loss_mlp": 0.01105077, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.0005548, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.6281042422625693, + "language_loss": 0.76724875, + "learning_rate": 1.0500978558659e-06, + "loss": 0.78995037, + "num_input_tokens_seen": 239873365, + "step": 11107, + "time_per_iteration": 2.5628609657287598 + }, + { + "auxiliary_loss_clip": 0.01134429, + "auxiliary_loss_mlp": 0.01103328, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00033069, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.64730464364678, + "language_loss": 0.90335298, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92573059, + "num_input_tokens_seen": 239891215, + "step": 11108, + "time_per_iteration": 2.6453118324279785 + }, + { + "auxiliary_loss_clip": 0.0111791, + "auxiliary_loss_mlp": 0.01103163, + "balance_loss_clip": 1.00175583, + "balance_loss_mlp": 1.00035727, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.5487475356987157, + "language_loss": 0.82666808, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84887886, + "num_input_tokens_seen": 239913490, + "step": 11109, + "time_per_iteration": 2.795340061187744 + }, + { + "auxiliary_loss_clip": 0.01131844, + "auxiliary_loss_mlp": 0.01105247, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00043809, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 3.0541911070422443, + "language_loss": 0.69640505, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71877599, + "num_input_tokens_seen": 239931565, + "step": 11110, + "time_per_iteration": 2.6391563415527344 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01104566, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00061512, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.7883956208106069, + "language_loss": 0.73636413, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75873071, + "num_input_tokens_seen": 239952395, + "step": 11111, + "time_per_iteration": 2.7196383476257324 + }, + { + "auxiliary_loss_clip": 0.01164975, + "auxiliary_loss_mlp": 0.01104015, + "balance_loss_clip": 1.00191975, + "balance_loss_mlp": 1.00044608, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 2.3122889706679532, + "language_loss": 0.65095818, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.67364812, + "num_input_tokens_seen": 239968910, + "step": 11112, + "time_per_iteration": 2.625823497772217 + }, + { + "auxiliary_loss_clip": 0.01133814, + "auxiliary_loss_mlp": 0.01104483, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00034201, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 2.795336588793974, + "language_loss": 0.63376796, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65615094, + "num_input_tokens_seen": 239987680, + "step": 11113, + "time_per_iteration": 2.6250686645507812 + }, + { + "auxiliary_loss_clip": 0.01100054, + "auxiliary_loss_mlp": 0.01103471, + "balance_loss_clip": 1.0018096, + "balance_loss_mlp": 1.00047433, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.8580091533058454, + "language_loss": 0.65926307, + "learning_rate": 1.047699621879422e-06, + "loss": 0.68129832, + "num_input_tokens_seen": 240005790, + "step": 11114, + "time_per_iteration": 5.512645244598389 + }, + { + "auxiliary_loss_clip": 0.01150371, + "auxiliary_loss_mlp": 0.01105239, + "balance_loss_clip": 1.00192118, + "balance_loss_mlp": 1.00052595, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.842379499187933, + "language_loss": 0.78396571, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80652177, + "num_input_tokens_seen": 240025895, + "step": 11115, + "time_per_iteration": 2.6232025623321533 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.00747332, + "balance_loss_clip": 1.00164545, + "balance_loss_mlp": 1.00048733, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 2.087285771785852, + "language_loss": 0.79579437, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81428689, + "num_input_tokens_seen": 240044880, + "step": 11116, + "time_per_iteration": 2.7365334033966064 + }, + { + "auxiliary_loss_clip": 0.01116967, + "auxiliary_loss_mlp": 0.01105175, + "balance_loss_clip": 1.00181365, + "balance_loss_mlp": 1.00055707, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 1.7362102855586476, + "language_loss": 0.7933684, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.8155899, + "num_input_tokens_seen": 240065785, + "step": 11117, + "time_per_iteration": 2.717135429382324 + }, + { + "auxiliary_loss_clip": 0.0109963, + "auxiliary_loss_mlp": 0.01105513, + "balance_loss_clip": 1.00173914, + "balance_loss_mlp": 1.00060892, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 1.7210865715715984, + "language_loss": 0.66063297, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.6826843, + "num_input_tokens_seen": 240085130, + "step": 11118, + "time_per_iteration": 2.7174971103668213 + }, + { + "auxiliary_loss_clip": 0.01133526, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00041461, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 1.4613068268033889, + "language_loss": 0.68660653, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.70899022, + "num_input_tokens_seen": 240105495, + "step": 11119, + "time_per_iteration": 2.6245076656341553 + }, + { + "auxiliary_loss_clip": 0.0113361, + "auxiliary_loss_mlp": 0.0110464, + "balance_loss_clip": 1.00193191, + "balance_loss_mlp": 1.00049865, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 3.6093175693573407, + "language_loss": 0.67499411, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.69737661, + "num_input_tokens_seen": 240125455, + "step": 11120, + "time_per_iteration": 2.697932481765747 + }, + { + "auxiliary_loss_clip": 0.01118387, + "auxiliary_loss_mlp": 0.01105226, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.00051212, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.7014606894119666, + "language_loss": 0.72462255, + "learning_rate": 1.045303157347638e-06, + "loss": 0.74685872, + "num_input_tokens_seen": 240143870, + "step": 11121, + "time_per_iteration": 2.6840667724609375 + }, + { + "auxiliary_loss_clip": 0.0113374, + "auxiliary_loss_mlp": 0.01105602, + "balance_loss_clip": 1.00176692, + "balance_loss_mlp": 1.00050724, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 2.872205944898184, + "language_loss": 0.7021507, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72454411, + "num_input_tokens_seen": 240161020, + "step": 11122, + "time_per_iteration": 2.624030590057373 + }, + { + "auxiliary_loss_clip": 0.01087145, + "auxiliary_loss_mlp": 0.00747287, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00047421, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 2.0026957593939567, + "language_loss": 0.71234787, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73069215, + "num_input_tokens_seen": 240179820, + "step": 11123, + "time_per_iteration": 2.769164562225342 + }, + { + "auxiliary_loss_clip": 0.0113327, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_clip": 1.00195873, + "balance_loss_mlp": 1.0005585, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 1.7527215462457402, + "language_loss": 0.79150492, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81390083, + "num_input_tokens_seen": 240200130, + "step": 11124, + "time_per_iteration": 2.638146162033081 + }, + { + "auxiliary_loss_clip": 0.01132041, + "auxiliary_loss_mlp": 0.01105715, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00061977, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 2.9197552443327095, + "language_loss": 0.74603224, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76840973, + "num_input_tokens_seen": 240217945, + "step": 11125, + "time_per_iteration": 2.657221794128418 + }, + { + "auxiliary_loss_clip": 0.01121521, + "auxiliary_loss_mlp": 0.01104976, + "balance_loss_clip": 1.00217807, + "balance_loss_mlp": 1.00064421, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 1.9242753239662804, + "language_loss": 0.66563314, + "learning_rate": 1.043592482774116e-06, + "loss": 0.6878981, + "num_input_tokens_seen": 240237220, + "step": 11126, + "time_per_iteration": 2.6940603256225586 + }, + { + "auxiliary_loss_clip": 0.01150482, + "auxiliary_loss_mlp": 0.01105456, + "balance_loss_clip": 1.00185716, + "balance_loss_mlp": 1.0005517, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 5.269921682291859, + "language_loss": 0.70998526, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73254466, + "num_input_tokens_seen": 240256000, + "step": 11127, + "time_per_iteration": 2.592914342880249 + }, + { + "auxiliary_loss_clip": 0.01131607, + "auxiliary_loss_mlp": 0.01106494, + "balance_loss_clip": 1.00182164, + "balance_loss_mlp": 1.00054097, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 4.1010160560611135, + "language_loss": 0.79958922, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82197016, + "num_input_tokens_seen": 240275845, + "step": 11128, + "time_per_iteration": 2.611156702041626 + }, + { + "auxiliary_loss_clip": 0.01165183, + "auxiliary_loss_mlp": 0.01105147, + "balance_loss_clip": 1.00198686, + "balance_loss_mlp": 1.00052881, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 1.7801011745252793, + "language_loss": 0.80766344, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83036679, + "num_input_tokens_seen": 240294095, + "step": 11129, + "time_per_iteration": 2.5430796146392822 + }, + { + "auxiliary_loss_clip": 0.01150403, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_clip": 1.0019542, + "balance_loss_mlp": 1.00063872, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.8626539277397538, + "language_loss": 0.70302927, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72556871, + "num_input_tokens_seen": 240313460, + "step": 11130, + "time_per_iteration": 2.6374857425689697 + }, + { + "auxiliary_loss_clip": 0.01134598, + "auxiliary_loss_mlp": 0.0110379, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00050735, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 2.017111490615119, + "language_loss": 0.7015686, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72395253, + "num_input_tokens_seen": 240333540, + "step": 11131, + "time_per_iteration": 2.685455322265625 + }, + { + "auxiliary_loss_clip": 0.01148653, + "auxiliary_loss_mlp": 0.01105262, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00045311, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.805249144824789, + "language_loss": 0.65848845, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.68102765, + "num_input_tokens_seen": 240350085, + "step": 11132, + "time_per_iteration": 2.521066665649414 + }, + { + "auxiliary_loss_clip": 0.01150474, + "auxiliary_loss_mlp": 0.0110506, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.0005374, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.622171904943292, + "language_loss": 0.7465167, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76907206, + "num_input_tokens_seen": 240370015, + "step": 11133, + "time_per_iteration": 2.5890417098999023 + }, + { + "auxiliary_loss_clip": 0.01148119, + "auxiliary_loss_mlp": 0.01105694, + "balance_loss_clip": 1.00198519, + "balance_loss_mlp": 1.00059891, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 2.0060208593730042, + "language_loss": 0.65709472, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.67963284, + "num_input_tokens_seen": 240390770, + "step": 11134, + "time_per_iteration": 2.6107916831970215 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01106627, + "balance_loss_clip": 1.00217915, + "balance_loss_mlp": 1.0005784, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 3.6879658872686862, + "language_loss": 0.76878011, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79135221, + "num_input_tokens_seen": 240409590, + "step": 11135, + "time_per_iteration": 2.631021499633789 + }, + { + "auxiliary_loss_clip": 0.01150346, + "auxiliary_loss_mlp": 0.01104695, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00045848, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.8298890961165657, + "language_loss": 0.74034464, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76289505, + "num_input_tokens_seen": 240428180, + "step": 11136, + "time_per_iteration": 2.5450165271759033 + }, + { + "auxiliary_loss_clip": 0.01149188, + "auxiliary_loss_mlp": 0.01105825, + "balance_loss_clip": 1.00200748, + "balance_loss_mlp": 1.00063467, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 2.418686180035989, + "language_loss": 0.62176025, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.64431036, + "num_input_tokens_seen": 240447815, + "step": 11137, + "time_per_iteration": 2.604487657546997 + }, + { + "auxiliary_loss_clip": 0.01165111, + "auxiliary_loss_mlp": 0.01104771, + "balance_loss_clip": 1.00204337, + "balance_loss_mlp": 1.0005343, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 1.8028375130387992, + "language_loss": 0.65768433, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.68038309, + "num_input_tokens_seen": 240468635, + "step": 11138, + "time_per_iteration": 3.9502735137939453 + }, + { + "auxiliary_loss_clip": 0.01119533, + "auxiliary_loss_mlp": 0.01103006, + "balance_loss_clip": 1.00175917, + "balance_loss_mlp": 1.00048625, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.6503147172180206, + "language_loss": 0.72480631, + "learning_rate": 1.039148976175053e-06, + "loss": 0.74703169, + "num_input_tokens_seen": 240488550, + "step": 11139, + "time_per_iteration": 4.033538341522217 + }, + { + "auxiliary_loss_clip": 0.01118306, + "auxiliary_loss_mlp": 0.01103946, + "balance_loss_clip": 1.00174832, + "balance_loss_mlp": 1.00056791, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 2.033685180762885, + "language_loss": 0.70682061, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72904313, + "num_input_tokens_seen": 240508330, + "step": 11140, + "time_per_iteration": 2.672309637069702 + }, + { + "auxiliary_loss_clip": 0.01149889, + "auxiliary_loss_mlp": 0.01105037, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00041938, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 1.8893069772935527, + "language_loss": 0.75471389, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.77726316, + "num_input_tokens_seen": 240528470, + "step": 11141, + "time_per_iteration": 2.6066436767578125 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.01105927, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.00045109, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 4.217160934692206, + "language_loss": 0.82066166, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84320498, + "num_input_tokens_seen": 240547815, + "step": 11142, + "time_per_iteration": 2.5854554176330566 + }, + { + "auxiliary_loss_clip": 0.01101514, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 1.00044227, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.6712396813829469, + "language_loss": 0.69619507, + "learning_rate": 1.037782980862959e-06, + "loss": 0.71825027, + "num_input_tokens_seen": 240567765, + "step": 11143, + "time_per_iteration": 2.7174839973449707 + }, + { + "auxiliary_loss_clip": 0.01117226, + "auxiliary_loss_mlp": 0.00747258, + "balance_loss_clip": 1.00179994, + "balance_loss_mlp": 1.00038171, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.5977385368365145, + "language_loss": 0.70110726, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71975207, + "num_input_tokens_seen": 240590750, + "step": 11144, + "time_per_iteration": 2.7275660037994385 + }, + { + "auxiliary_loss_clip": 0.01133495, + "auxiliary_loss_mlp": 0.01105278, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00046921, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.6654372459497218, + "language_loss": 0.74120981, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76359749, + "num_input_tokens_seen": 240608875, + "step": 11145, + "time_per_iteration": 2.6384127140045166 + }, + { + "auxiliary_loss_clip": 0.01132835, + "auxiliary_loss_mlp": 0.01104754, + "balance_loss_clip": 1.00177765, + "balance_loss_mlp": 1.000422, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.5377376770920717, + "language_loss": 0.70810306, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.73047894, + "num_input_tokens_seen": 240628565, + "step": 11146, + "time_per_iteration": 2.6551320552825928 + }, + { + "auxiliary_loss_clip": 0.01164978, + "auxiliary_loss_mlp": 0.00747265, + "balance_loss_clip": 1.00201881, + "balance_loss_mlp": 1.00039971, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 2.023834470887654, + "language_loss": 0.78128582, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80040824, + "num_input_tokens_seen": 240646325, + "step": 11147, + "time_per_iteration": 2.523308515548706 + }, + { + "auxiliary_loss_clip": 0.0114976, + "auxiliary_loss_mlp": 0.00747272, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00051045, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 2.7995037546718375, + "language_loss": 0.70054525, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.71951556, + "num_input_tokens_seen": 240666145, + "step": 11148, + "time_per_iteration": 2.5744779109954834 + }, + { + "auxiliary_loss_clip": 0.01134702, + "auxiliary_loss_mlp": 0.0110504, + "balance_loss_clip": 1.0019387, + "balance_loss_mlp": 1.00051761, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 5.310513834261252, + "language_loss": 0.69878805, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72118545, + "num_input_tokens_seen": 240685570, + "step": 11149, + "time_per_iteration": 2.62233829498291 + }, + { + "auxiliary_loss_clip": 0.01133936, + "auxiliary_loss_mlp": 0.01104422, + "balance_loss_clip": 1.00180852, + "balance_loss_mlp": 1.00056672, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 1.762662495351451, + "language_loss": 0.73163664, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75402021, + "num_input_tokens_seen": 240706945, + "step": 11150, + "time_per_iteration": 2.6640994548797607 + }, + { + "auxiliary_loss_clip": 0.01148236, + "auxiliary_loss_mlp": 0.01104243, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00048304, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 2.0735335129691648, + "language_loss": 0.78199053, + "learning_rate": 1.035052742460671e-06, + "loss": 0.8045153, + "num_input_tokens_seen": 240727990, + "step": 11151, + "time_per_iteration": 4.055109262466431 + }, + { + "auxiliary_loss_clip": 0.01099471, + "auxiliary_loss_mlp": 0.01080128, + "balance_loss_clip": 1.00093603, + "balance_loss_mlp": 1.00001895, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.7902929006496288, + "language_loss": 0.55456287, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57635885, + "num_input_tokens_seen": 240790380, + "step": 11152, + "time_per_iteration": 4.672762870788574 + }, + { + "auxiliary_loss_clip": 0.01131515, + "auxiliary_loss_mlp": 0.01104463, + "balance_loss_clip": 1.00175226, + "balance_loss_mlp": 1.00051248, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.9373122846679864, + "language_loss": 0.80648065, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.82884037, + "num_input_tokens_seen": 240811545, + "step": 11153, + "time_per_iteration": 2.6425700187683105 + }, + { + "auxiliary_loss_clip": 0.01115492, + "auxiliary_loss_mlp": 0.00747264, + "balance_loss_clip": 1.00178838, + "balance_loss_mlp": 1.00036466, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.6383104580419088, + "language_loss": 0.76298761, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78161514, + "num_input_tokens_seen": 240831380, + "step": 11154, + "time_per_iteration": 2.680614471435547 + }, + { + "auxiliary_loss_clip": 0.01135613, + "auxiliary_loss_mlp": 0.01105571, + "balance_loss_clip": 1.0019778, + "balance_loss_mlp": 1.00057161, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.4923053071665837, + "language_loss": 0.76310205, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78551388, + "num_input_tokens_seen": 240851855, + "step": 11155, + "time_per_iteration": 2.6771135330200195 + }, + { + "auxiliary_loss_clip": 0.0116539, + "auxiliary_loss_mlp": 0.01105027, + "balance_loss_clip": 1.00216866, + "balance_loss_mlp": 1.00069475, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 1.9139168246364724, + "language_loss": 0.81885016, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84155434, + "num_input_tokens_seen": 240869980, + "step": 11156, + "time_per_iteration": 2.577183246612549 + }, + { + "auxiliary_loss_clip": 0.01165, + "auxiliary_loss_mlp": 0.01104619, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00047755, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 1.8697524176284595, + "language_loss": 0.74624026, + "learning_rate": 1.033006600114165e-06, + "loss": 0.7689364, + "num_input_tokens_seen": 240888680, + "step": 11157, + "time_per_iteration": 2.5931293964385986 + }, + { + "auxiliary_loss_clip": 0.01150418, + "auxiliary_loss_mlp": 0.0110579, + "balance_loss_clip": 1.0019691, + "balance_loss_mlp": 1.00050449, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.6588350063291417, + "language_loss": 0.74212331, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76468539, + "num_input_tokens_seen": 240909050, + "step": 11158, + "time_per_iteration": 2.592589855194092 + }, + { + "auxiliary_loss_clip": 0.01165145, + "auxiliary_loss_mlp": 0.01105003, + "balance_loss_clip": 1.00196779, + "balance_loss_mlp": 1.00057626, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 2.0284892717951495, + "language_loss": 0.81360149, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.836303, + "num_input_tokens_seen": 240930035, + "step": 11159, + "time_per_iteration": 2.5844204425811768 + }, + { + "auxiliary_loss_clip": 0.01133887, + "auxiliary_loss_mlp": 0.01105011, + "balance_loss_clip": 1.00179696, + "balance_loss_mlp": 1.00058353, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.7024742856470167, + "language_loss": 0.77092493, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79331392, + "num_input_tokens_seen": 240948895, + "step": 11160, + "time_per_iteration": 2.5918917655944824 + }, + { + "auxiliary_loss_clip": 0.01131394, + "auxiliary_loss_mlp": 0.0110388, + "balance_loss_clip": 1.00176287, + "balance_loss_mlp": 1.0004065, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 1.8870529696776692, + "language_loss": 0.73551303, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.75786579, + "num_input_tokens_seen": 240967770, + "step": 11161, + "time_per_iteration": 2.662715435028076 + }, + { + "auxiliary_loss_clip": 0.01134006, + "auxiliary_loss_mlp": 0.01105389, + "balance_loss_clip": 1.00198591, + "balance_loss_mlp": 1.00057971, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 3.5333764798681053, + "language_loss": 0.68789476, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.7102887, + "num_input_tokens_seen": 240988985, + "step": 11162, + "time_per_iteration": 2.6696455478668213 + }, + { + "auxiliary_loss_clip": 0.01135625, + "auxiliary_loss_mlp": 0.0110443, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.00066996, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 2.1579641856312297, + "language_loss": 0.70201153, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72441202, + "num_input_tokens_seen": 241005455, + "step": 11163, + "time_per_iteration": 2.6371755599975586 + }, + { + "auxiliary_loss_clip": 0.01165043, + "auxiliary_loss_mlp": 0.01103949, + "balance_loss_clip": 1.00204825, + "balance_loss_mlp": 1.00057077, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.9712937554673404, + "language_loss": 0.75655222, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.77924216, + "num_input_tokens_seen": 241026175, + "step": 11164, + "time_per_iteration": 2.5694851875305176 + }, + { + "auxiliary_loss_clip": 0.01164984, + "auxiliary_loss_mlp": 0.01104699, + "balance_loss_clip": 1.00197542, + "balance_loss_mlp": 1.00055742, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 2.658069408062235, + "language_loss": 0.65213239, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67482913, + "num_input_tokens_seen": 241044040, + "step": 11165, + "time_per_iteration": 2.54791522026062 + }, + { + "auxiliary_loss_clip": 0.01165005, + "auxiliary_loss_mlp": 0.01104041, + "balance_loss_clip": 1.00197601, + "balance_loss_mlp": 1.00047207, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 1.9896822612299723, + "language_loss": 0.71472389, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.7374143, + "num_input_tokens_seen": 241063615, + "step": 11166, + "time_per_iteration": 2.673703908920288 + }, + { + "auxiliary_loss_clip": 0.01164962, + "auxiliary_loss_mlp": 0.01103969, + "balance_loss_clip": 1.00202394, + "balance_loss_mlp": 1.00049543, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 1.9097489810691528, + "language_loss": 0.76776743, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79045677, + "num_input_tokens_seen": 241082520, + "step": 11167, + "time_per_iteration": 2.6241846084594727 + }, + { + "auxiliary_loss_clip": 0.01150231, + "auxiliary_loss_mlp": 0.01105109, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.00049114, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 1.932059523287746, + "language_loss": 0.68584102, + "learning_rate": 1.029258769662629e-06, + "loss": 0.70839441, + "num_input_tokens_seen": 241103505, + "step": 11168, + "time_per_iteration": 2.684311628341675 + }, + { + "auxiliary_loss_clip": 0.01119032, + "auxiliary_loss_mlp": 0.01105844, + "balance_loss_clip": 1.00193167, + "balance_loss_mlp": 1.00065339, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 3.6901372500976457, + "language_loss": 0.72948074, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75172949, + "num_input_tokens_seen": 241122885, + "step": 11169, + "time_per_iteration": 2.6856529712677 + }, + { + "auxiliary_loss_clip": 0.01149878, + "auxiliary_loss_mlp": 0.0110568, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00049019, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 3.7912239869963718, + "language_loss": 0.76129234, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78384793, + "num_input_tokens_seen": 241140865, + "step": 11170, + "time_per_iteration": 2.5186259746551514 + }, + { + "auxiliary_loss_clip": 0.01131813, + "auxiliary_loss_mlp": 0.01105429, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00042927, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 2.7802907770668295, + "language_loss": 0.74119902, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.76357138, + "num_input_tokens_seen": 241158225, + "step": 11171, + "time_per_iteration": 2.575568675994873 + }, + { + "auxiliary_loss_clip": 0.01118954, + "auxiliary_loss_mlp": 0.01105007, + "balance_loss_clip": 1.00182974, + "balance_loss_mlp": 1.00057936, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 1.598793994729525, + "language_loss": 0.86261153, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.8848511, + "num_input_tokens_seen": 241175215, + "step": 11172, + "time_per_iteration": 2.613992214202881 + }, + { + "auxiliary_loss_clip": 0.01150256, + "auxiliary_loss_mlp": 0.0110463, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00048912, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.5493731018316852, + "language_loss": 0.63456935, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.6571182, + "num_input_tokens_seen": 241195250, + "step": 11173, + "time_per_iteration": 2.589165210723877 + }, + { + "auxiliary_loss_clip": 0.01150013, + "auxiliary_loss_mlp": 0.01106617, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00056815, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 21.531006122220564, + "language_loss": 0.71397614, + "learning_rate": 1.02721637475002e-06, + "loss": 0.7365424, + "num_input_tokens_seen": 241210720, + "step": 11174, + "time_per_iteration": 2.53104305267334 + }, + { + "auxiliary_loss_clip": 0.01118766, + "auxiliary_loss_mlp": 0.01104262, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00050247, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 1.9512555972686132, + "language_loss": 0.69124871, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.71347904, + "num_input_tokens_seen": 241227395, + "step": 11175, + "time_per_iteration": 2.6581478118896484 + }, + { + "auxiliary_loss_clip": 0.01132982, + "auxiliary_loss_mlp": 0.0110362, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00062358, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 2.442054959919384, + "language_loss": 0.74155545, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.76392144, + "num_input_tokens_seen": 241246355, + "step": 11176, + "time_per_iteration": 5.332199811935425 + }, + { + "auxiliary_loss_clip": 0.01131518, + "auxiliary_loss_mlp": 0.01104542, + "balance_loss_clip": 1.00173485, + "balance_loss_mlp": 1.00049639, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 2.0679821218313346, + "language_loss": 0.72884071, + "learning_rate": 1.026195675108182e-06, + "loss": 0.75120133, + "num_input_tokens_seen": 241264180, + "step": 11177, + "time_per_iteration": 2.619908332824707 + }, + { + "auxiliary_loss_clip": 0.01165092, + "auxiliary_loss_mlp": 0.01104505, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.00064969, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.6112848755831144, + "language_loss": 0.76657856, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78927451, + "num_input_tokens_seen": 241282245, + "step": 11178, + "time_per_iteration": 2.5778157711029053 + }, + { + "auxiliary_loss_clip": 0.01148487, + "auxiliary_loss_mlp": 0.01104251, + "balance_loss_clip": 1.00202477, + "balance_loss_mlp": 1.0004915, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.568737165285035, + "language_loss": 0.70055521, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72308266, + "num_input_tokens_seen": 241300745, + "step": 11179, + "time_per_iteration": 2.5754995346069336 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01105339, + "balance_loss_clip": 1.00174713, + "balance_loss_mlp": 1.00052989, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.4834181693947042, + "language_loss": 0.74131906, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.76337314, + "num_input_tokens_seen": 241319320, + "step": 11180, + "time_per_iteration": 2.708671808242798 + }, + { + "auxiliary_loss_clip": 0.01135439, + "auxiliary_loss_mlp": 0.01104349, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00039876, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.4924318446122091, + "language_loss": 0.75216794, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77456582, + "num_input_tokens_seen": 241342225, + "step": 11181, + "time_per_iteration": 2.6718876361846924 + }, + { + "auxiliary_loss_clip": 0.01133501, + "auxiliary_loss_mlp": 0.01103995, + "balance_loss_clip": 1.00177586, + "balance_loss_mlp": 1.00033104, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 1.990070067099416, + "language_loss": 0.74483144, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76720643, + "num_input_tokens_seen": 241358240, + "step": 11182, + "time_per_iteration": 2.5698342323303223 + }, + { + "auxiliary_loss_clip": 0.01148478, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.0004909, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 1.7029183152452814, + "language_loss": 0.69970793, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.72222662, + "num_input_tokens_seen": 241378420, + "step": 11183, + "time_per_iteration": 2.5943219661712646 + }, + { + "auxiliary_loss_clip": 0.01101949, + "auxiliary_loss_mlp": 0.01104783, + "balance_loss_clip": 1.00165021, + "balance_loss_mlp": 1.00054622, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.4869372451115086, + "language_loss": 0.77768171, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79974908, + "num_input_tokens_seen": 241397185, + "step": 11184, + "time_per_iteration": 2.6738858222961426 + }, + { + "auxiliary_loss_clip": 0.01133744, + "auxiliary_loss_mlp": 0.00747548, + "balance_loss_clip": 1.00203323, + "balance_loss_mlp": 1.00043654, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 1.9354137120418549, + "language_loss": 0.66168618, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.68049914, + "num_input_tokens_seen": 241415785, + "step": 11185, + "time_per_iteration": 2.604435443878174 + }, + { + "auxiliary_loss_clip": 0.01116724, + "auxiliary_loss_mlp": 0.0110449, + "balance_loss_clip": 1.0017736, + "balance_loss_mlp": 1.00044394, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.6431281567964022, + "language_loss": 0.80702347, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82923561, + "num_input_tokens_seen": 241437390, + "step": 11186, + "time_per_iteration": 2.7003133296966553 + }, + { + "auxiliary_loss_clip": 0.01148149, + "auxiliary_loss_mlp": 0.01103444, + "balance_loss_clip": 1.00200033, + "balance_loss_mlp": 1.00063753, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.572492526824129, + "language_loss": 0.80165136, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82416725, + "num_input_tokens_seen": 241458085, + "step": 11187, + "time_per_iteration": 2.6419014930725098 + }, + { + "auxiliary_loss_clip": 0.01104098, + "auxiliary_loss_mlp": 0.01105949, + "balance_loss_clip": 1.00194967, + "balance_loss_mlp": 1.00056767, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 2.2158991255927427, + "language_loss": 0.70650572, + "learning_rate": 1.022455955762965e-06, + "loss": 0.72860622, + "num_input_tokens_seen": 241476880, + "step": 11188, + "time_per_iteration": 2.724592447280884 + }, + { + "auxiliary_loss_clip": 0.01069811, + "auxiliary_loss_mlp": 0.0110452, + "balance_loss_clip": 1.00162554, + "balance_loss_mlp": 1.00056911, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.7974433996431143, + "language_loss": 0.75880313, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.78054643, + "num_input_tokens_seen": 241496535, + "step": 11189, + "time_per_iteration": 4.230287313461304 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01105979, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00050306, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.0645861116181474, + "language_loss": 0.74967778, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.77238816, + "num_input_tokens_seen": 241513465, + "step": 11190, + "time_per_iteration": 2.495739698410034 + }, + { + "auxiliary_loss_clip": 0.01087708, + "auxiliary_loss_mlp": 0.01103791, + "balance_loss_clip": 1.00193584, + "balance_loss_mlp": 1.00050831, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 1.9151007601073915, + "language_loss": 0.77267373, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79458874, + "num_input_tokens_seen": 241534125, + "step": 11191, + "time_per_iteration": 2.7203588485717773 + }, + { + "auxiliary_loss_clip": 0.01164879, + "auxiliary_loss_mlp": 0.01104066, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.00040185, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 1.8770257921148872, + "language_loss": 0.86037511, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88306451, + "num_input_tokens_seen": 241556340, + "step": 11192, + "time_per_iteration": 2.6039063930511475 + }, + { + "auxiliary_loss_clip": 0.01150533, + "auxiliary_loss_mlp": 0.01105514, + "balance_loss_clip": 1.00202608, + "balance_loss_mlp": 1.00060952, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 1.8829041993104676, + "language_loss": 0.7628178, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78537822, + "num_input_tokens_seen": 241575185, + "step": 11193, + "time_per_iteration": 2.5703394412994385 + }, + { + "auxiliary_loss_clip": 0.01116675, + "auxiliary_loss_mlp": 0.01104743, + "balance_loss_clip": 1.00176764, + "balance_loss_mlp": 1.00069761, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.9701886131869306, + "language_loss": 0.79105949, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.81327367, + "num_input_tokens_seen": 241592970, + "step": 11194, + "time_per_iteration": 2.6327457427978516 + }, + { + "auxiliary_loss_clip": 0.01148533, + "auxiliary_loss_mlp": 0.01104722, + "balance_loss_clip": 1.00192714, + "balance_loss_mlp": 1.00058079, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 2.3430206094941517, + "language_loss": 0.90181744, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.92434996, + "num_input_tokens_seen": 241610245, + "step": 11195, + "time_per_iteration": 2.5482819080352783 + }, + { + "auxiliary_loss_clip": 0.01149757, + "auxiliary_loss_mlp": 0.01104891, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00065422, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 1.798357118358962, + "language_loss": 0.72756749, + "learning_rate": 1.019738976106662e-06, + "loss": 0.75011396, + "num_input_tokens_seen": 241630350, + "step": 11196, + "time_per_iteration": 2.6212475299835205 + }, + { + "auxiliary_loss_clip": 0.01079879, + "auxiliary_loss_mlp": 0.01078925, + "balance_loss_clip": 1.00094938, + "balance_loss_mlp": 0.99996072, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7885670668040481, + "language_loss": 0.56519091, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.586779, + "num_input_tokens_seen": 241692380, + "step": 11197, + "time_per_iteration": 3.180731773376465 + }, + { + "auxiliary_loss_clip": 0.01132944, + "auxiliary_loss_mlp": 0.01104002, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00033784, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.1897203985684723, + "language_loss": 0.75212121, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77449071, + "num_input_tokens_seen": 241710430, + "step": 11198, + "time_per_iteration": 2.6316921710968018 + }, + { + "auxiliary_loss_clip": 0.01149918, + "auxiliary_loss_mlp": 0.01105006, + "balance_loss_clip": 1.00187075, + "balance_loss_mlp": 1.00038838, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 1.9392687701355567, + "language_loss": 0.81063062, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83317989, + "num_input_tokens_seen": 241724775, + "step": 11199, + "time_per_iteration": 2.718085527420044 + }, + { + "auxiliary_loss_clip": 0.01100353, + "auxiliary_loss_mlp": 0.0110448, + "balance_loss_clip": 1.00169969, + "balance_loss_mlp": 1.00052989, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 2.0943983277322897, + "language_loss": 0.7162717, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73832005, + "num_input_tokens_seen": 241744440, + "step": 11200, + "time_per_iteration": 2.8451058864593506 + }, + { + "auxiliary_loss_clip": 0.01165104, + "auxiliary_loss_mlp": 0.01105234, + "balance_loss_clip": 1.00201237, + "balance_loss_mlp": 1.00061631, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.9255786717961274, + "language_loss": 0.64623666, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66894007, + "num_input_tokens_seen": 241771705, + "step": 11201, + "time_per_iteration": 2.9199984073638916 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01105506, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00060213, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 2.0222159597338827, + "language_loss": 0.62833816, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65071505, + "num_input_tokens_seen": 241790830, + "step": 11202, + "time_per_iteration": 2.624006986618042 + }, + { + "auxiliary_loss_clip": 0.01165166, + "auxiliary_loss_mlp": 0.01104482, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00043607, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 1.7209643995068247, + "language_loss": 0.74265289, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76534933, + "num_input_tokens_seen": 241808165, + "step": 11203, + "time_per_iteration": 2.5283477306365967 + }, + { + "auxiliary_loss_clip": 0.01132994, + "auxiliary_loss_mlp": 0.01106363, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00041008, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.9788686452864663, + "language_loss": 0.67891026, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.70130378, + "num_input_tokens_seen": 241826925, + "step": 11204, + "time_per_iteration": 2.591035842895508 + }, + { + "auxiliary_loss_clip": 0.01148669, + "auxiliary_loss_mlp": 0.0110542, + "balance_loss_clip": 1.00202823, + "balance_loss_mlp": 1.0005163, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.7459581631328824, + "language_loss": 0.74214721, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76468813, + "num_input_tokens_seen": 241845525, + "step": 11205, + "time_per_iteration": 2.5581552982330322 + }, + { + "auxiliary_loss_clip": 0.01164743, + "auxiliary_loss_mlp": 0.01103562, + "balance_loss_clip": 1.00186396, + "balance_loss_mlp": 1.00046957, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.8513164682098513, + "language_loss": 0.71363592, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73631895, + "num_input_tokens_seen": 241866815, + "step": 11206, + "time_per_iteration": 2.5838871002197266 + }, + { + "auxiliary_loss_clip": 0.01117013, + "auxiliary_loss_mlp": 0.00747436, + "balance_loss_clip": 1.00195301, + "balance_loss_mlp": 1.00053847, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 2.5927959341416753, + "language_loss": 0.67435372, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69299817, + "num_input_tokens_seen": 241887050, + "step": 11207, + "time_per_iteration": 2.671891450881958 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01104173, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.00050879, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 3.0347999183744125, + "language_loss": 0.74491107, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.76697403, + "num_input_tokens_seen": 241904280, + "step": 11208, + "time_per_iteration": 2.7288224697113037 + }, + { + "auxiliary_loss_clip": 0.01150068, + "auxiliary_loss_mlp": 0.01104793, + "balance_loss_clip": 1.00186694, + "balance_loss_mlp": 1.00055671, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 1.874680822345781, + "language_loss": 0.75481653, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.77736509, + "num_input_tokens_seen": 241919190, + "step": 11209, + "time_per_iteration": 2.572479486465454 + }, + { + "auxiliary_loss_clip": 0.01114351, + "auxiliary_loss_mlp": 0.01103187, + "balance_loss_clip": 1.00178909, + "balance_loss_mlp": 1.0004766, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 2.6026208677525062, + "language_loss": 0.66236562, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68454099, + "num_input_tokens_seen": 241940525, + "step": 11210, + "time_per_iteration": 2.661062479019165 + }, + { + "auxiliary_loss_clip": 0.0116488, + "auxiliary_loss_mlp": 0.01102837, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.00041246, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.239881605964328, + "language_loss": 0.79977918, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82245636, + "num_input_tokens_seen": 241959290, + "step": 11211, + "time_per_iteration": 2.5379724502563477 + }, + { + "auxiliary_loss_clip": 0.0110043, + "auxiliary_loss_mlp": 0.01103347, + "balance_loss_clip": 1.00177038, + "balance_loss_mlp": 1.00035048, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.5161276423165926, + "language_loss": 0.76380867, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78584647, + "num_input_tokens_seen": 241980715, + "step": 11212, + "time_per_iteration": 2.7593257427215576 + }, + { + "auxiliary_loss_clip": 0.01116497, + "auxiliary_loss_mlp": 0.00747309, + "balance_loss_clip": 1.00171685, + "balance_loss_mlp": 1.00040042, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 6.7293878412023735, + "language_loss": 0.78019929, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.7988373, + "num_input_tokens_seen": 241999985, + "step": 11213, + "time_per_iteration": 4.039759397506714 + }, + { + "auxiliary_loss_clip": 0.01101268, + "auxiliary_loss_mlp": 0.01104753, + "balance_loss_clip": 1.00199795, + "balance_loss_mlp": 1.00051594, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 3.0222352855678576, + "language_loss": 0.67783844, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.6998986, + "num_input_tokens_seen": 242018990, + "step": 11214, + "time_per_iteration": 4.03049111366272 + }, + { + "auxiliary_loss_clip": 0.01165192, + "auxiliary_loss_mlp": 0.00747424, + "balance_loss_clip": 1.0020262, + "balance_loss_mlp": 1.00047553, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.778005346860784, + "language_loss": 0.7287069, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74783301, + "num_input_tokens_seen": 242039340, + "step": 11215, + "time_per_iteration": 2.6754636764526367 + }, + { + "auxiliary_loss_clip": 0.01150133, + "auxiliary_loss_mlp": 0.0074726, + "balance_loss_clip": 1.00186336, + "balance_loss_mlp": 1.00035715, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 2.119473511980164, + "language_loss": 0.67067134, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.68964529, + "num_input_tokens_seen": 242062215, + "step": 11216, + "time_per_iteration": 2.6814467906951904 + }, + { + "auxiliary_loss_clip": 0.01159329, + "auxiliary_loss_mlp": 0.0107925, + "balance_loss_clip": 1.00096583, + "balance_loss_mlp": 0.99990445, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6795447310859735, + "language_loss": 0.56269073, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58507645, + "num_input_tokens_seen": 242131130, + "step": 11217, + "time_per_iteration": 3.1991732120513916 + }, + { + "auxiliary_loss_clip": 0.01148298, + "auxiliary_loss_mlp": 0.0110426, + "balance_loss_clip": 1.00194347, + "balance_loss_mlp": 1.00040472, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 1.8319210043889838, + "language_loss": 0.74254608, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76507163, + "num_input_tokens_seen": 242149720, + "step": 11218, + "time_per_iteration": 2.580268383026123 + }, + { + "auxiliary_loss_clip": 0.01114709, + "auxiliary_loss_mlp": 0.01104141, + "balance_loss_clip": 1.0017581, + "balance_loss_mlp": 1.00066721, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.5321508131498962, + "language_loss": 0.65968716, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68187559, + "num_input_tokens_seen": 242168875, + "step": 11219, + "time_per_iteration": 2.6670045852661133 + }, + { + "auxiliary_loss_clip": 0.01118664, + "auxiliary_loss_mlp": 0.01105267, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00055408, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.8119476237813212, + "language_loss": 0.74591589, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.76815522, + "num_input_tokens_seen": 242188465, + "step": 11220, + "time_per_iteration": 2.655881881713867 + }, + { + "auxiliary_loss_clip": 0.01148676, + "auxiliary_loss_mlp": 0.01104556, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00050998, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 2.0735168434286524, + "language_loss": 0.70106053, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72359282, + "num_input_tokens_seen": 242208675, + "step": 11221, + "time_per_iteration": 2.6064963340759277 + }, + { + "auxiliary_loss_clip": 0.01118244, + "auxiliary_loss_mlp": 0.01104887, + "balance_loss_clip": 1.00176692, + "balance_loss_mlp": 1.00045943, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 1.846188131806698, + "language_loss": 0.58264935, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60488057, + "num_input_tokens_seen": 242227440, + "step": 11222, + "time_per_iteration": 2.6513068675994873 + }, + { + "auxiliary_loss_clip": 0.01148333, + "auxiliary_loss_mlp": 0.01105035, + "balance_loss_clip": 1.00185525, + "balance_loss_mlp": 1.00060821, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 1.6255549121563526, + "language_loss": 0.76919138, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.7917251, + "num_input_tokens_seen": 242245240, + "step": 11223, + "time_per_iteration": 2.5895280838012695 + }, + { + "auxiliary_loss_clip": 0.01150315, + "auxiliary_loss_mlp": 0.01104851, + "balance_loss_clip": 1.00199914, + "balance_loss_mlp": 1.00051844, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.7207610029969185, + "language_loss": 0.7530508, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77560246, + "num_input_tokens_seen": 242263435, + "step": 11224, + "time_per_iteration": 2.5585901737213135 + }, + { + "auxiliary_loss_clip": 0.01101498, + "auxiliary_loss_mlp": 0.01103534, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.00044215, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 3.032001328216774, + "language_loss": 0.63471693, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65676725, + "num_input_tokens_seen": 242282765, + "step": 11225, + "time_per_iteration": 2.6749954223632812 + }, + { + "auxiliary_loss_clip": 0.01164808, + "auxiliary_loss_mlp": 0.00747176, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00043166, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.891645515046502, + "language_loss": 0.6405381, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.65965796, + "num_input_tokens_seen": 242298980, + "step": 11226, + "time_per_iteration": 2.517630100250244 + }, + { + "auxiliary_loss_clip": 0.01150332, + "auxiliary_loss_mlp": 0.01104552, + "balance_loss_clip": 1.00201309, + "balance_loss_mlp": 1.00050592, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.2670596511340007, + "language_loss": 0.7172519, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73980069, + "num_input_tokens_seen": 242315420, + "step": 11227, + "time_per_iteration": 5.331405162811279 + }, + { + "auxiliary_loss_clip": 0.01132906, + "auxiliary_loss_mlp": 0.01104056, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00039208, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 3.2617799065121926, + "language_loss": 0.71156365, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73393321, + "num_input_tokens_seen": 242332805, + "step": 11228, + "time_per_iteration": 2.5740201473236084 + }, + { + "auxiliary_loss_clip": 0.01128698, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_clip": 1.00107408, + "balance_loss_mlp": 0.9999817, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7503869087795748, + "language_loss": 0.53262663, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55470687, + "num_input_tokens_seen": 242396160, + "step": 11229, + "time_per_iteration": 3.2187674045562744 + }, + { + "auxiliary_loss_clip": 0.01150032, + "auxiliary_loss_mlp": 0.01103213, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00050211, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.822194189118131, + "language_loss": 0.80403852, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82657099, + "num_input_tokens_seen": 242414660, + "step": 11230, + "time_per_iteration": 2.602311849594116 + }, + { + "auxiliary_loss_clip": 0.01133556, + "auxiliary_loss_mlp": 0.01103532, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00043988, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.4818982652703363, + "language_loss": 0.6577996, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.68017042, + "num_input_tokens_seen": 242434225, + "step": 11231, + "time_per_iteration": 2.58785080909729 + }, + { + "auxiliary_loss_clip": 0.01098963, + "auxiliary_loss_mlp": 0.01105357, + "balance_loss_clip": 1.00175905, + "balance_loss_mlp": 1.00054789, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 1.7799395866467218, + "language_loss": 0.66129935, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68334258, + "num_input_tokens_seen": 242454355, + "step": 11232, + "time_per_iteration": 2.7215263843536377 + }, + { + "auxiliary_loss_clip": 0.01121532, + "auxiliary_loss_mlp": 0.01103251, + "balance_loss_clip": 1.00210738, + "balance_loss_mlp": 1.00044465, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 2.1245282315846232, + "language_loss": 0.72158849, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74383634, + "num_input_tokens_seen": 242474935, + "step": 11233, + "time_per_iteration": 2.6690032482147217 + }, + { + "auxiliary_loss_clip": 0.01150255, + "auxiliary_loss_mlp": 0.0110452, + "balance_loss_clip": 1.00187826, + "balance_loss_mlp": 1.00047421, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 2.1111604585550814, + "language_loss": 0.77044284, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79299057, + "num_input_tokens_seen": 242495530, + "step": 11234, + "time_per_iteration": 2.6142334938049316 + }, + { + "auxiliary_loss_clip": 0.01165076, + "auxiliary_loss_mlp": 0.01104824, + "balance_loss_clip": 1.00207913, + "balance_loss_mlp": 1.00058782, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.8334972286120705, + "language_loss": 0.75254112, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77524006, + "num_input_tokens_seen": 242514550, + "step": 11235, + "time_per_iteration": 2.564669370651245 + }, + { + "auxiliary_loss_clip": 0.01144389, + "auxiliary_loss_mlp": 0.0107932, + "balance_loss_clip": 1.00090599, + "balance_loss_mlp": 0.99997455, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7814005942674793, + "language_loss": 0.51387411, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53611124, + "num_input_tokens_seen": 242569200, + "step": 11236, + "time_per_iteration": 3.0920236110687256 + }, + { + "auxiliary_loss_clip": 0.0111797, + "auxiliary_loss_mlp": 0.01103461, + "balance_loss_clip": 1.00181806, + "balance_loss_mlp": 1.00036883, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 2.4761991006932096, + "language_loss": 0.75702864, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77924293, + "num_input_tokens_seen": 242586950, + "step": 11237, + "time_per_iteration": 2.653545618057251 + }, + { + "auxiliary_loss_clip": 0.01133461, + "auxiliary_loss_mlp": 0.01104443, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00068331, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 2.508112923395331, + "language_loss": 0.77102864, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79340768, + "num_input_tokens_seen": 242607380, + "step": 11238, + "time_per_iteration": 2.68123722076416 + }, + { + "auxiliary_loss_clip": 0.0115037, + "auxiliary_loss_mlp": 0.01105103, + "balance_loss_clip": 1.00202024, + "balance_loss_mlp": 1.00058031, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.589107148511669, + "language_loss": 0.66290557, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.68546033, + "num_input_tokens_seen": 242628025, + "step": 11239, + "time_per_iteration": 2.611217975616455 + }, + { + "auxiliary_loss_clip": 0.01131334, + "auxiliary_loss_mlp": 0.01102667, + "balance_loss_clip": 1.00186741, + "balance_loss_mlp": 1.00043344, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 1.9471875149579074, + "language_loss": 0.82962644, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85196638, + "num_input_tokens_seen": 242643825, + "step": 11240, + "time_per_iteration": 2.6028385162353516 + }, + { + "auxiliary_loss_clip": 0.0111844, + "auxiliary_loss_mlp": 0.01106496, + "balance_loss_clip": 1.00184, + "balance_loss_mlp": 1.00035167, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 2.052605910075908, + "language_loss": 0.74320883, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76545823, + "num_input_tokens_seen": 242661820, + "step": 11241, + "time_per_iteration": 2.663602828979492 + }, + { + "auxiliary_loss_clip": 0.01101793, + "auxiliary_loss_mlp": 0.01104687, + "balance_loss_clip": 1.0019083, + "balance_loss_mlp": 1.0005455, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 3.997343097766272, + "language_loss": 0.79972333, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82178813, + "num_input_tokens_seen": 242679890, + "step": 11242, + "time_per_iteration": 2.6433496475219727 + }, + { + "auxiliary_loss_clip": 0.01149987, + "auxiliary_loss_mlp": 0.01103879, + "balance_loss_clip": 1.00181353, + "balance_loss_mlp": 1.00040519, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 2.1571802511352822, + "language_loss": 0.72339398, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74593264, + "num_input_tokens_seen": 242699495, + "step": 11243, + "time_per_iteration": 2.607393980026245 + }, + { + "auxiliary_loss_clip": 0.01148496, + "auxiliary_loss_mlp": 0.01103052, + "balance_loss_clip": 1.00198114, + "balance_loss_mlp": 1.00081825, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.683297975927506, + "language_loss": 0.72799963, + "learning_rate": 1.003487287162221e-06, + "loss": 0.7505151, + "num_input_tokens_seen": 242719500, + "step": 11244, + "time_per_iteration": 2.5757241249084473 + }, + { + "auxiliary_loss_clip": 0.01165261, + "auxiliary_loss_mlp": 0.01104782, + "balance_loss_clip": 1.00210226, + "balance_loss_mlp": 1.00054562, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 2.2777693735083644, + "language_loss": 0.85728812, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87998855, + "num_input_tokens_seen": 242738325, + "step": 11245, + "time_per_iteration": 2.587578535079956 + }, + { + "auxiliary_loss_clip": 0.01165334, + "auxiliary_loss_mlp": 0.00747509, + "balance_loss_clip": 1.00207973, + "balance_loss_mlp": 1.00052273, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 1.8988124623129417, + "language_loss": 0.74303967, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.76216805, + "num_input_tokens_seen": 242756620, + "step": 11246, + "time_per_iteration": 2.6796162128448486 + }, + { + "auxiliary_loss_clip": 0.01148388, + "auxiliary_loss_mlp": 0.0110389, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.0004164, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 1.8887634551549852, + "language_loss": 0.88160872, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90413153, + "num_input_tokens_seen": 242774505, + "step": 11247, + "time_per_iteration": 2.6098856925964355 + }, + { + "auxiliary_loss_clip": 0.01128822, + "auxiliary_loss_mlp": 0.01079287, + "balance_loss_clip": 1.00102639, + "balance_loss_mlp": 0.99994147, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8219377771342234, + "language_loss": 0.54029435, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56237543, + "num_input_tokens_seen": 242828645, + "step": 11248, + "time_per_iteration": 3.189460277557373 + }, + { + "auxiliary_loss_clip": 0.01088082, + "auxiliary_loss_mlp": 0.01103056, + "balance_loss_clip": 1.00177252, + "balance_loss_mlp": 1.00053573, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 5.787198963023664, + "language_loss": 0.73418939, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75610077, + "num_input_tokens_seen": 242850100, + "step": 11249, + "time_per_iteration": 2.7523131370544434 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.01105269, + "balance_loss_clip": 1.0018425, + "balance_loss_mlp": 1.00055552, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.0612708557316832, + "language_loss": 0.74005836, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.76259506, + "num_input_tokens_seen": 242867775, + "step": 11250, + "time_per_iteration": 2.5397770404815674 + }, + { + "auxiliary_loss_clip": 0.01165195, + "auxiliary_loss_mlp": 0.01104398, + "balance_loss_clip": 1.0020597, + "balance_loss_mlp": 1.00044775, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 2.712449566935304, + "language_loss": 0.75383186, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77652776, + "num_input_tokens_seen": 242886865, + "step": 11251, + "time_per_iteration": 3.9845962524414062 + }, + { + "auxiliary_loss_clip": 0.01121282, + "auxiliary_loss_mlp": 0.01104024, + "balance_loss_clip": 1.00205469, + "balance_loss_mlp": 1.00045526, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 2.294113315167846, + "language_loss": 0.7028569, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72510993, + "num_input_tokens_seen": 242906705, + "step": 11252, + "time_per_iteration": 4.050464630126953 + }, + { + "auxiliary_loss_clip": 0.01102349, + "auxiliary_loss_mlp": 0.01103976, + "balance_loss_clip": 1.00192511, + "balance_loss_mlp": 1.00050223, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.7106575482408914, + "language_loss": 0.66524804, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.68731123, + "num_input_tokens_seen": 242925215, + "step": 11253, + "time_per_iteration": 2.7359459400177 + }, + { + "auxiliary_loss_clip": 0.01119068, + "auxiliary_loss_mlp": 0.00747404, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00050294, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 2.03211945656317, + "language_loss": 0.77268004, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79134476, + "num_input_tokens_seen": 242944750, + "step": 11254, + "time_per_iteration": 2.6303634643554688 + }, + { + "auxiliary_loss_clip": 0.0114832, + "auxiliary_loss_mlp": 0.01104079, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00050998, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.672867112683004, + "language_loss": 0.71747732, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74000132, + "num_input_tokens_seen": 242963860, + "step": 11255, + "time_per_iteration": 2.5698630809783936 + }, + { + "auxiliary_loss_clip": 0.01101791, + "auxiliary_loss_mlp": 0.00747168, + "balance_loss_clip": 1.00165737, + "balance_loss_mlp": 1.00034487, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.0524082239512764, + "language_loss": 0.7546249, + "learning_rate": 9.994379131600828e-07, + "loss": 0.7731145, + "num_input_tokens_seen": 242983050, + "step": 11256, + "time_per_iteration": 2.715675115585327 + }, + { + "auxiliary_loss_clip": 0.01148296, + "auxiliary_loss_mlp": 0.01104697, + "balance_loss_clip": 1.00185072, + "balance_loss_mlp": 1.00065088, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.948279188569631, + "language_loss": 0.65256238, + "learning_rate": 9.991007116408965e-07, + "loss": 0.67509234, + "num_input_tokens_seen": 243001125, + "step": 11257, + "time_per_iteration": 2.5300605297088623 + }, + { + "auxiliary_loss_clip": 0.01101443, + "auxiliary_loss_mlp": 0.01103142, + "balance_loss_clip": 1.00168753, + "balance_loss_mlp": 1.00043106, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.8557245420856459, + "language_loss": 0.7550329, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77707863, + "num_input_tokens_seen": 243021865, + "step": 11258, + "time_per_iteration": 2.7323825359344482 + }, + { + "auxiliary_loss_clip": 0.0113149, + "auxiliary_loss_mlp": 0.01103664, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00047648, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.5643323701729275, + "language_loss": 0.66936219, + "learning_rate": 9.984264224779127e-07, + "loss": 0.69171375, + "num_input_tokens_seen": 243042970, + "step": 11259, + "time_per_iteration": 2.732909917831421 + }, + { + "auxiliary_loss_clip": 0.01132916, + "auxiliary_loss_mlp": 0.01103627, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00053453, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.387676380236865, + "language_loss": 0.85555136, + "learning_rate": 9.980893348596839e-07, + "loss": 0.87791681, + "num_input_tokens_seen": 243058470, + "step": 11260, + "time_per_iteration": 2.6097664833068848 + }, + { + "auxiliary_loss_clip": 0.01133824, + "auxiliary_loss_mlp": 0.01105023, + "balance_loss_clip": 1.00182092, + "balance_loss_mlp": 1.0006907, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 2.0313591419672896, + "language_loss": 0.77754974, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79993814, + "num_input_tokens_seen": 243076630, + "step": 11261, + "time_per_iteration": 2.5889956951141357 + }, + { + "auxiliary_loss_clip": 0.01132228, + "auxiliary_loss_mlp": 0.01103756, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.00056815, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 3.302213873238381, + "language_loss": 0.8802458, + "learning_rate": 9.97415273613666e-07, + "loss": 0.90260565, + "num_input_tokens_seen": 243092260, + "step": 11262, + "time_per_iteration": 2.613942861557007 + }, + { + "auxiliary_loss_clip": 0.01131719, + "auxiliary_loss_mlp": 0.0110483, + "balance_loss_clip": 1.00183761, + "balance_loss_mlp": 1.00049782, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 2.7481366023681244, + "language_loss": 0.74452555, + "learning_rate": 9.97078300011439e-07, + "loss": 0.766891, + "num_input_tokens_seen": 243109405, + "step": 11263, + "time_per_iteration": 2.585878610610962 + }, + { + "auxiliary_loss_clip": 0.01150595, + "auxiliary_loss_mlp": 0.01105367, + "balance_loss_clip": 1.00203514, + "balance_loss_mlp": 1.00046301, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 3.1990029755578706, + "language_loss": 0.6802938, + "learning_rate": 9.967413644401016e-07, + "loss": 0.70285338, + "num_input_tokens_seen": 243128135, + "step": 11264, + "time_per_iteration": 4.012386798858643 + }, + { + "auxiliary_loss_clip": 0.01131492, + "auxiliary_loss_mlp": 0.01104371, + "balance_loss_clip": 1.00185096, + "balance_loss_mlp": 1.00051558, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 2.5748839957438774, + "language_loss": 0.73390937, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75626796, + "num_input_tokens_seen": 243146785, + "step": 11265, + "time_per_iteration": 2.5791780948638916 + }, + { + "auxiliary_loss_clip": 0.01118322, + "auxiliary_loss_mlp": 0.01104239, + "balance_loss_clip": 1.00176525, + "balance_loss_mlp": 1.00038433, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.9265388086093764, + "language_loss": 0.61971414, + "learning_rate": 9.96067607441207e-07, + "loss": 0.64193976, + "num_input_tokens_seen": 243165275, + "step": 11266, + "time_per_iteration": 4.07446551322937 + }, + { + "auxiliary_loss_clip": 0.01116698, + "auxiliary_loss_mlp": 0.0110439, + "balance_loss_clip": 1.00187814, + "balance_loss_mlp": 1.00053501, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 1.886582175309766, + "language_loss": 0.70674634, + "learning_rate": 9.957307860391976e-07, + "loss": 0.72895724, + "num_input_tokens_seen": 243182845, + "step": 11267, + "time_per_iteration": 2.640941619873047 + }, + { + "auxiliary_loss_clip": 0.01164989, + "auxiliary_loss_mlp": 0.0110416, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00049615, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 1.9543027644531752, + "language_loss": 0.70914602, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73183757, + "num_input_tokens_seen": 243201475, + "step": 11268, + "time_per_iteration": 2.585063934326172 + }, + { + "auxiliary_loss_clip": 0.01135165, + "auxiliary_loss_mlp": 0.01104896, + "balance_loss_clip": 1.00206053, + "balance_loss_mlp": 1.00046885, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.636383591922073, + "language_loss": 0.76633322, + "learning_rate": 9.950572574939194e-07, + "loss": 0.78873384, + "num_input_tokens_seen": 243221850, + "step": 11269, + "time_per_iteration": 2.6225087642669678 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01105295, + "balance_loss_clip": 1.00175309, + "balance_loss_mlp": 1.00058174, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 2.0765321955180207, + "language_loss": 0.74574149, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76795626, + "num_input_tokens_seen": 243239855, + "step": 11270, + "time_per_iteration": 2.6178982257843018 + }, + { + "auxiliary_loss_clip": 0.01100893, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_clip": 1.00186574, + "balance_loss_mlp": 1.00051701, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.5787057396156394, + "language_loss": 0.72981071, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75186247, + "num_input_tokens_seen": 243260085, + "step": 11271, + "time_per_iteration": 2.701417922973633 + }, + { + "auxiliary_loss_clip": 0.01165078, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_clip": 1.00197315, + "balance_loss_mlp": 1.00068879, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.6484126237511612, + "language_loss": 0.68011367, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70281851, + "num_input_tokens_seen": 243280065, + "step": 11272, + "time_per_iteration": 2.583137035369873 + }, + { + "auxiliary_loss_clip": 0.01148352, + "auxiliary_loss_mlp": 0.01105588, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00049269, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 1.8812945622509658, + "language_loss": 0.73918366, + "learning_rate": 9.937106577958481e-07, + "loss": 0.76172304, + "num_input_tokens_seen": 243297775, + "step": 11273, + "time_per_iteration": 2.6081044673919678 + }, + { + "auxiliary_loss_clip": 0.01150569, + "auxiliary_loss_mlp": 0.01104206, + "balance_loss_clip": 1.00200236, + "balance_loss_mlp": 1.00063694, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 2.5454596131045495, + "language_loss": 0.70208788, + "learning_rate": 9.933741032359015e-07, + "loss": 0.7246356, + "num_input_tokens_seen": 243315760, + "step": 11274, + "time_per_iteration": 2.585942506790161 + }, + { + "auxiliary_loss_clip": 0.01165033, + "auxiliary_loss_mlp": 0.01104785, + "balance_loss_clip": 1.00191283, + "balance_loss_mlp": 1.00054872, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.5524893243882136, + "language_loss": 0.65747201, + "learning_rate": 9.930375868473093e-07, + "loss": 0.68017018, + "num_input_tokens_seen": 243335715, + "step": 11275, + "time_per_iteration": 2.53641939163208 + }, + { + "auxiliary_loss_clip": 0.01150522, + "auxiliary_loss_mlp": 0.01104731, + "balance_loss_clip": 1.00203907, + "balance_loss_mlp": 1.000494, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 1.931723401678297, + "language_loss": 0.72530431, + "learning_rate": 9.927011086428335e-07, + "loss": 0.7478568, + "num_input_tokens_seen": 243356935, + "step": 11276, + "time_per_iteration": 2.6045329570770264 + }, + { + "auxiliary_loss_clip": 0.01135251, + "auxiliary_loss_mlp": 0.00747392, + "balance_loss_clip": 1.00196242, + "balance_loss_mlp": 1.00038862, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.9856070550854332, + "language_loss": 0.76931709, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78814358, + "num_input_tokens_seen": 243375625, + "step": 11277, + "time_per_iteration": 2.592135190963745 + }, + { + "auxiliary_loss_clip": 0.0113366, + "auxiliary_loss_mlp": 0.01105246, + "balance_loss_clip": 1.00195146, + "balance_loss_mlp": 1.00053215, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 2.8109109422512057, + "language_loss": 0.84481293, + "learning_rate": 9.920282668372627e-07, + "loss": 0.86720204, + "num_input_tokens_seen": 243390195, + "step": 11278, + "time_per_iteration": 2.584200859069824 + }, + { + "auxiliary_loss_clip": 0.01131659, + "auxiliary_loss_mlp": 0.00747306, + "balance_loss_clip": 1.00179851, + "balance_loss_mlp": 1.00054121, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.6509449657034208, + "language_loss": 0.70230222, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72109187, + "num_input_tokens_seen": 243411690, + "step": 11279, + "time_per_iteration": 2.642406940460205 + }, + { + "auxiliary_loss_clip": 0.01150484, + "auxiliary_loss_mlp": 0.01103821, + "balance_loss_clip": 1.00198483, + "balance_loss_mlp": 1.00044274, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 1.7405648410273622, + "language_loss": 0.73714119, + "learning_rate": 9.913555779212485e-07, + "loss": 0.75968421, + "num_input_tokens_seen": 243430280, + "step": 11280, + "time_per_iteration": 2.601466417312622 + }, + { + "auxiliary_loss_clip": 0.01150532, + "auxiliary_loss_mlp": 0.01105944, + "balance_loss_clip": 1.00190341, + "balance_loss_mlp": 1.00046802, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.7304979678556665, + "language_loss": 0.70157468, + "learning_rate": 9.910192908287104e-07, + "loss": 0.72413945, + "num_input_tokens_seen": 243448690, + "step": 11281, + "time_per_iteration": 2.5476603507995605 + }, + { + "auxiliary_loss_clip": 0.01164949, + "auxiliary_loss_mlp": 0.01105348, + "balance_loss_clip": 1.00201178, + "balance_loss_mlp": 1.00044394, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.8594310210837237, + "language_loss": 0.63664186, + "learning_rate": 9.906830419968217e-07, + "loss": 0.65934479, + "num_input_tokens_seen": 243470695, + "step": 11282, + "time_per_iteration": 2.605731964111328 + }, + { + "auxiliary_loss_clip": 0.01120672, + "auxiliary_loss_mlp": 0.01105732, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00054145, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.5203806221017342, + "language_loss": 0.74619114, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76845515, + "num_input_tokens_seen": 243493345, + "step": 11283, + "time_per_iteration": 2.7102370262145996 + }, + { + "auxiliary_loss_clip": 0.01149865, + "auxiliary_loss_mlp": 0.01103561, + "balance_loss_clip": 1.00198984, + "balance_loss_mlp": 1.00037384, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.537746151968791, + "language_loss": 0.56741375, + "learning_rate": 9.900106591659948e-07, + "loss": 0.589948, + "num_input_tokens_seen": 243515670, + "step": 11284, + "time_per_iteration": 2.735431432723999 + }, + { + "auxiliary_loss_clip": 0.01131575, + "auxiliary_loss_mlp": 0.01104037, + "balance_loss_clip": 1.00180411, + "balance_loss_mlp": 1.0005635, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 2.3545603320917965, + "language_loss": 0.75124443, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77360058, + "num_input_tokens_seen": 243533625, + "step": 11285, + "time_per_iteration": 2.575075387954712 + }, + { + "auxiliary_loss_clip": 0.0116481, + "auxiliary_loss_mlp": 0.01103387, + "balance_loss_clip": 1.00198638, + "balance_loss_mlp": 1.00048578, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.5681950307719263, + "language_loss": 0.6631707, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68585265, + "num_input_tokens_seen": 243553040, + "step": 11286, + "time_per_iteration": 2.5529422760009766 + }, + { + "auxiliary_loss_clip": 0.01131681, + "auxiliary_loss_mlp": 0.01104218, + "balance_loss_clip": 1.0017252, + "balance_loss_mlp": 1.0004586, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.057002895352956, + "language_loss": 0.52734166, + "learning_rate": 9.890023721933447e-07, + "loss": 0.54970062, + "num_input_tokens_seen": 243572590, + "step": 11287, + "time_per_iteration": 2.66839861869812 + }, + { + "auxiliary_loss_clip": 0.01119371, + "auxiliary_loss_mlp": 0.01104113, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00063944, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 1.589629370318373, + "language_loss": 0.77188158, + "learning_rate": 9.886663531930655e-07, + "loss": 0.79411644, + "num_input_tokens_seen": 243594140, + "step": 11288, + "time_per_iteration": 2.6972815990448 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01105567, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.00056744, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 3.768971725719573, + "language_loss": 0.73328382, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75582349, + "num_input_tokens_seen": 243615170, + "step": 11289, + "time_per_iteration": 4.05685567855835 + }, + { + "auxiliary_loss_clip": 0.01165059, + "auxiliary_loss_mlp": 0.01104327, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00056767, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.831847232663603, + "language_loss": 0.80079556, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82348943, + "num_input_tokens_seen": 243635675, + "step": 11290, + "time_per_iteration": 3.941906213760376 + }, + { + "auxiliary_loss_clip": 0.01148755, + "auxiliary_loss_mlp": 0.01103602, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00050986, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 2.3583138991458963, + "language_loss": 0.74984825, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77237183, + "num_input_tokens_seen": 243654950, + "step": 11291, + "time_per_iteration": 2.5818750858306885 + }, + { + "auxiliary_loss_clip": 0.01132268, + "auxiliary_loss_mlp": 0.00747349, + "balance_loss_clip": 1.00180638, + "balance_loss_mlp": 1.00052094, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.7626949999791675, + "language_loss": 0.75593936, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77473557, + "num_input_tokens_seen": 243674970, + "step": 11292, + "time_per_iteration": 2.676482677459717 + }, + { + "auxiliary_loss_clip": 0.01117035, + "auxiliary_loss_mlp": 0.01104977, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.00045466, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 2.3866020922236704, + "language_loss": 0.84353518, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86575532, + "num_input_tokens_seen": 243693440, + "step": 11293, + "time_per_iteration": 2.6954891681671143 + }, + { + "auxiliary_loss_clip": 0.01165447, + "auxiliary_loss_mlp": 0.01106551, + "balance_loss_clip": 1.00209689, + "balance_loss_mlp": 1.00059772, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.1902890466449776, + "language_loss": 0.79297769, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81569761, + "num_input_tokens_seen": 243710055, + "step": 11294, + "time_per_iteration": 2.495828866958618 + }, + { + "auxiliary_loss_clip": 0.01131708, + "auxiliary_loss_mlp": 0.01103957, + "balance_loss_clip": 1.00183463, + "balance_loss_mlp": 1.00048339, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.6971315567246334, + "language_loss": 0.78677475, + "learning_rate": 9.86315294700924e-07, + "loss": 0.80913138, + "num_input_tokens_seen": 243728635, + "step": 11295, + "time_per_iteration": 2.6088528633117676 + }, + { + "auxiliary_loss_clip": 0.01133261, + "auxiliary_loss_mlp": 0.01103182, + "balance_loss_clip": 1.00182879, + "balance_loss_mlp": 1.00056672, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 2.3944124043139157, + "language_loss": 0.71311033, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73547477, + "num_input_tokens_seen": 243748330, + "step": 11296, + "time_per_iteration": 2.6012773513793945 + }, + { + "auxiliary_loss_clip": 0.01148391, + "auxiliary_loss_mlp": 0.01104544, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00049829, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.6508650717435764, + "language_loss": 0.70807719, + "learning_rate": 9.856439094633949e-07, + "loss": 0.73060656, + "num_input_tokens_seen": 243769380, + "step": 11297, + "time_per_iteration": 2.573854684829712 + }, + { + "auxiliary_loss_clip": 0.01118364, + "auxiliary_loss_mlp": 0.01104876, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.0004487, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 3.2964702141002316, + "language_loss": 0.66659278, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68882519, + "num_input_tokens_seen": 243785510, + "step": 11298, + "time_per_iteration": 2.62601637840271 + }, + { + "auxiliary_loss_clip": 0.01150445, + "auxiliary_loss_mlp": 0.0110399, + "balance_loss_clip": 1.00198615, + "balance_loss_mlp": 1.00042081, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.888124682884577, + "language_loss": 0.72025311, + "learning_rate": 9.84972678083801e-07, + "loss": 0.74279743, + "num_input_tokens_seen": 243805545, + "step": 11299, + "time_per_iteration": 2.5718746185302734 + }, + { + "auxiliary_loss_clip": 0.0116511, + "auxiliary_loss_mlp": 0.01104423, + "balance_loss_clip": 1.00200415, + "balance_loss_mlp": 1.0006634, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 4.9608139336931725, + "language_loss": 0.77265573, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79535103, + "num_input_tokens_seen": 243825185, + "step": 11300, + "time_per_iteration": 2.6034815311431885 + }, + { + "auxiliary_loss_clip": 0.01147988, + "auxiliary_loss_mlp": 0.01104093, + "balance_loss_clip": 1.0020256, + "balance_loss_mlp": 1.00052381, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 2.8800732406775573, + "language_loss": 0.63020289, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65272367, + "num_input_tokens_seen": 243841600, + "step": 11301, + "time_per_iteration": 2.5348033905029297 + }, + { + "auxiliary_loss_clip": 0.01147839, + "auxiliary_loss_mlp": 0.01104147, + "balance_loss_clip": 1.00190735, + "balance_loss_mlp": 1.0004828, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.7589428420342432, + "language_loss": 0.82934058, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85186046, + "num_input_tokens_seen": 243862250, + "step": 11302, + "time_per_iteration": 2.607481002807617 + }, + { + "auxiliary_loss_clip": 0.01149846, + "auxiliary_loss_mlp": 0.01104372, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00051689, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 2.0520213045202653, + "language_loss": 0.6968708, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71941298, + "num_input_tokens_seen": 243880560, + "step": 11303, + "time_per_iteration": 5.3116960525512695 + }, + { + "auxiliary_loss_clip": 0.01116946, + "auxiliary_loss_mlp": 0.01105763, + "balance_loss_clip": 1.00191748, + "balance_loss_mlp": 1.00047779, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 1.8589871830766884, + "language_loss": 0.7001158, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72234297, + "num_input_tokens_seen": 243900635, + "step": 11304, + "time_per_iteration": 2.6462318897247314 + }, + { + "auxiliary_loss_clip": 0.01150472, + "auxiliary_loss_mlp": 0.01105635, + "balance_loss_clip": 1.0020591, + "balance_loss_mlp": 1.00063539, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 1.8644706535081292, + "language_loss": 0.72475708, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74731815, + "num_input_tokens_seen": 243920160, + "step": 11305, + "time_per_iteration": 2.5700020790100098 + }, + { + "auxiliary_loss_clip": 0.01131556, + "auxiliary_loss_mlp": 0.01104702, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00037003, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 2.248537144151789, + "language_loss": 0.66319919, + "learning_rate": 9.826245813561882e-07, + "loss": 0.68556184, + "num_input_tokens_seen": 243939015, + "step": 11306, + "time_per_iteration": 2.634931802749634 + }, + { + "auxiliary_loss_clip": 0.0113143, + "auxiliary_loss_mlp": 0.0110369, + "balance_loss_clip": 1.0017556, + "balance_loss_mlp": 1.00040734, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.8751031826060687, + "language_loss": 0.79814744, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82049859, + "num_input_tokens_seen": 243958470, + "step": 11307, + "time_per_iteration": 2.591456413269043 + }, + { + "auxiliary_loss_clip": 0.01133383, + "auxiliary_loss_mlp": 0.01104669, + "balance_loss_clip": 1.00193775, + "balance_loss_mlp": 1.00043249, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 2.337578865943461, + "language_loss": 0.88814425, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91052473, + "num_input_tokens_seen": 243975450, + "step": 11308, + "time_per_iteration": 2.6131670475006104 + }, + { + "auxiliary_loss_clip": 0.01118748, + "auxiliary_loss_mlp": 0.01104368, + "balance_loss_clip": 1.00174594, + "balance_loss_mlp": 1.0005132, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 1.8269815368075206, + "language_loss": 0.71091062, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73314178, + "num_input_tokens_seen": 243994355, + "step": 11309, + "time_per_iteration": 2.637911558151245 + }, + { + "auxiliary_loss_clip": 0.01116254, + "auxiliary_loss_mlp": 0.01103675, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.0005827, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 2.106956106045983, + "language_loss": 0.84407651, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86627579, + "num_input_tokens_seen": 244011620, + "step": 11310, + "time_per_iteration": 2.6441359519958496 + }, + { + "auxiliary_loss_clip": 0.0113387, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_clip": 1.00194395, + "balance_loss_mlp": 1.00044739, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 4.263364523294037, + "language_loss": 0.82939649, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85177058, + "num_input_tokens_seen": 244029925, + "step": 11311, + "time_per_iteration": 2.6090497970581055 + }, + { + "auxiliary_loss_clip": 0.0110674, + "auxiliary_loss_mlp": 0.01105209, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00039971, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 1.8522126064757576, + "language_loss": 0.76128936, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78340888, + "num_input_tokens_seen": 244051225, + "step": 11312, + "time_per_iteration": 2.7160205841064453 + }, + { + "auxiliary_loss_clip": 0.01159288, + "auxiliary_loss_mlp": 0.01079434, + "balance_loss_clip": 1.00092673, + "balance_loss_mlp": 1.00008869, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6788183067138539, + "language_loss": 0.57242268, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59480989, + "num_input_tokens_seen": 244115930, + "step": 11313, + "time_per_iteration": 3.2065165042877197 + }, + { + "auxiliary_loss_clip": 0.01148571, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00039577, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 1.7154561493221683, + "language_loss": 0.68647069, + "learning_rate": 9.799433572314754e-07, + "loss": 0.70899504, + "num_input_tokens_seen": 244137320, + "step": 11314, + "time_per_iteration": 2.6304702758789062 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01103075, + "balance_loss_clip": 1.00193167, + "balance_loss_mlp": 1.00036407, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.8164953142069138, + "language_loss": 0.81449163, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83702582, + "num_input_tokens_seen": 244152755, + "step": 11315, + "time_per_iteration": 2.5248005390167236 + }, + { + "auxiliary_loss_clip": 0.01101533, + "auxiliary_loss_mlp": 0.01103976, + "balance_loss_clip": 1.00170255, + "balance_loss_mlp": 1.00031161, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.6942553708104635, + "language_loss": 0.69839239, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72044748, + "num_input_tokens_seen": 244171480, + "step": 11316, + "time_per_iteration": 2.7027390003204346 + }, + { + "auxiliary_loss_clip": 0.0114822, + "auxiliary_loss_mlp": 0.01104476, + "balance_loss_clip": 1.00193202, + "balance_loss_mlp": 1.00062084, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.4361303379555257, + "language_loss": 0.66325068, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68577766, + "num_input_tokens_seen": 244187920, + "step": 11317, + "time_per_iteration": 2.52189302444458 + }, + { + "auxiliary_loss_clip": 0.01148382, + "auxiliary_loss_mlp": 0.01105085, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.0007534, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.6597052257228755, + "language_loss": 0.74786484, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77039957, + "num_input_tokens_seen": 244209565, + "step": 11318, + "time_per_iteration": 2.5836334228515625 + }, + { + "auxiliary_loss_clip": 0.01133641, + "auxiliary_loss_mlp": 0.01102244, + "balance_loss_clip": 1.00191391, + "balance_loss_mlp": 1.00048733, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 2.5252982928618657, + "language_loss": 0.68151557, + "learning_rate": 9.782688488616143e-07, + "loss": 0.70387447, + "num_input_tokens_seen": 244228015, + "step": 11319, + "time_per_iteration": 2.5748631954193115 + }, + { + "auxiliary_loss_clip": 0.0109972, + "auxiliary_loss_mlp": 0.00747266, + "balance_loss_clip": 1.00161147, + "balance_loss_mlp": 1.00044572, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.7687333759040675, + "language_loss": 0.77122355, + "learning_rate": 9.779340633692945e-07, + "loss": 0.78969336, + "num_input_tokens_seen": 244245615, + "step": 11320, + "time_per_iteration": 2.6684513092041016 + }, + { + "auxiliary_loss_clip": 0.01131439, + "auxiliary_loss_mlp": 0.01104064, + "balance_loss_clip": 1.00185621, + "balance_loss_mlp": 1.00049543, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 2.0004835113033663, + "language_loss": 0.75001299, + "learning_rate": 9.77599316633817e-07, + "loss": 0.77236807, + "num_input_tokens_seen": 244263625, + "step": 11321, + "time_per_iteration": 2.641115188598633 + }, + { + "auxiliary_loss_clip": 0.01131625, + "auxiliary_loss_mlp": 0.01104501, + "balance_loss_clip": 1.00193453, + "balance_loss_mlp": 1.00074148, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 4.862019713062391, + "language_loss": 0.73153317, + "learning_rate": 9.772646086678758e-07, + "loss": 0.75389445, + "num_input_tokens_seen": 244282745, + "step": 11322, + "time_per_iteration": 2.57952618598938 + }, + { + "auxiliary_loss_clip": 0.01100123, + "auxiliary_loss_mlp": 0.00747323, + "balance_loss_clip": 1.00166178, + "balance_loss_mlp": 1.00045681, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.659347487747524, + "language_loss": 0.78192604, + "learning_rate": 9.769299394841638e-07, + "loss": 0.8004005, + "num_input_tokens_seen": 244303770, + "step": 11323, + "time_per_iteration": 2.724639654159546 + }, + { + "auxiliary_loss_clip": 0.01111042, + "auxiliary_loss_mlp": 0.01079399, + "balance_loss_clip": 1.00064015, + "balance_loss_mlp": 1.00005341, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 1.281340753736281, + "language_loss": 0.57093072, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59283519, + "num_input_tokens_seen": 244355910, + "step": 11324, + "time_per_iteration": 2.9883925914764404 + }, + { + "auxiliary_loss_clip": 0.01133083, + "auxiliary_loss_mlp": 0.01105109, + "balance_loss_clip": 1.00190687, + "balance_loss_mlp": 1.00068212, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 1.850389846729049, + "language_loss": 0.68093592, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70331788, + "num_input_tokens_seen": 244376610, + "step": 11325, + "time_per_iteration": 2.684723138809204 + }, + { + "auxiliary_loss_clip": 0.01150612, + "auxiliary_loss_mlp": 0.01104614, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.00056791, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 2.3983233870339467, + "language_loss": 0.7027331, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72528535, + "num_input_tokens_seen": 244393000, + "step": 11326, + "time_per_iteration": 2.570063352584839 + }, + { + "auxiliary_loss_clip": 0.01165037, + "auxiliary_loss_mlp": 0.01104128, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00046396, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.839299827273775, + "language_loss": 0.72862887, + "learning_rate": 9.75591650825392e-07, + "loss": 0.75132054, + "num_input_tokens_seen": 244409515, + "step": 11327, + "time_per_iteration": 4.0352606773376465 + }, + { + "auxiliary_loss_clip": 0.01148289, + "auxiliary_loss_mlp": 0.0110422, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00045991, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 1.9920718318767237, + "language_loss": 0.77226794, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79479307, + "num_input_tokens_seen": 244427165, + "step": 11328, + "time_per_iteration": 3.971313238143921 + }, + { + "auxiliary_loss_clip": 0.01165073, + "auxiliary_loss_mlp": 0.01104328, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00056863, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 2.035841787821588, + "language_loss": 0.64567506, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66836905, + "num_input_tokens_seen": 244445705, + "step": 11329, + "time_per_iteration": 2.5069589614868164 + }, + { + "auxiliary_loss_clip": 0.01083982, + "auxiliary_loss_mlp": 0.00747234, + "balance_loss_clip": 1.00164223, + "balance_loss_mlp": 1.00043988, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 1.9801015316219566, + "language_loss": 0.79382998, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81214213, + "num_input_tokens_seen": 244460415, + "step": 11330, + "time_per_iteration": 2.694089412689209 + }, + { + "auxiliary_loss_clip": 0.01148321, + "auxiliary_loss_mlp": 0.01104954, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00052667, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 2.260720212413778, + "language_loss": 0.63841796, + "learning_rate": 9.742539836972665e-07, + "loss": 0.66095066, + "num_input_tokens_seen": 244480555, + "step": 11331, + "time_per_iteration": 2.5763626098632812 + }, + { + "auxiliary_loss_clip": 0.01101716, + "auxiliary_loss_mlp": 0.01104213, + "balance_loss_clip": 1.00168037, + "balance_loss_mlp": 1.0005486, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.6783402072156353, + "language_loss": 0.72636282, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74842221, + "num_input_tokens_seen": 244498540, + "step": 11332, + "time_per_iteration": 2.6494085788726807 + }, + { + "auxiliary_loss_clip": 0.01148425, + "auxiliary_loss_mlp": 0.01104624, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00067389, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 2.023369257092837, + "language_loss": 0.74445677, + "learning_rate": 9.735853834608326e-07, + "loss": 0.76698732, + "num_input_tokens_seen": 244517015, + "step": 11333, + "time_per_iteration": 2.57077956199646 + }, + { + "auxiliary_loss_clip": 0.01148436, + "auxiliary_loss_mlp": 0.0110559, + "balance_loss_clip": 1.00194132, + "balance_loss_mlp": 1.00049496, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.4995717603871848, + "language_loss": 0.72269022, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74523044, + "num_input_tokens_seen": 244537450, + "step": 11334, + "time_per_iteration": 2.5867509841918945 + }, + { + "auxiliary_loss_clip": 0.01148716, + "auxiliary_loss_mlp": 0.01104094, + "balance_loss_clip": 1.0020752, + "balance_loss_mlp": 1.00052452, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 2.386148044758692, + "language_loss": 0.85757786, + "learning_rate": 9.729169389113791e-07, + "loss": 0.88010597, + "num_input_tokens_seen": 244555640, + "step": 11335, + "time_per_iteration": 2.557612180709839 + }, + { + "auxiliary_loss_clip": 0.01150035, + "auxiliary_loss_mlp": 0.01103233, + "balance_loss_clip": 1.00188434, + "balance_loss_mlp": 1.0005219, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 1.6098655429322073, + "language_loss": 0.82075977, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84329247, + "num_input_tokens_seen": 244574005, + "step": 11336, + "time_per_iteration": 2.5851805210113525 + }, + { + "auxiliary_loss_clip": 0.01116861, + "auxiliary_loss_mlp": 0.01103239, + "balance_loss_clip": 1.00156391, + "balance_loss_mlp": 1.00071883, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 2.4160419753075892, + "language_loss": 0.8141492, + "learning_rate": 9.72248650150294e-07, + "loss": 0.8363502, + "num_input_tokens_seen": 244591395, + "step": 11337, + "time_per_iteration": 2.6106722354888916 + }, + { + "auxiliary_loss_clip": 0.01104331, + "auxiliary_loss_mlp": 0.01103749, + "balance_loss_clip": 1.00174761, + "balance_loss_mlp": 1.00056124, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.628812808041036, + "language_loss": 0.72491634, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74699712, + "num_input_tokens_seen": 244610400, + "step": 11338, + "time_per_iteration": 2.6657614707946777 + }, + { + "auxiliary_loss_clip": 0.0110375, + "auxiliary_loss_mlp": 0.01104527, + "balance_loss_clip": 1.00178003, + "balance_loss_mlp": 1.00057673, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4351117783636, + "language_loss": 0.77245784, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79454064, + "num_input_tokens_seen": 244630400, + "step": 11339, + "time_per_iteration": 2.73514461517334 + }, + { + "auxiliary_loss_clip": 0.01118397, + "auxiliary_loss_mlp": 0.01104587, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00063634, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 2.0253529589311845, + "language_loss": 0.71029639, + "learning_rate": 9.712465093335901e-07, + "loss": 0.73252624, + "num_input_tokens_seen": 244649155, + "step": 11340, + "time_per_iteration": 4.122373580932617 + }, + { + "auxiliary_loss_clip": 0.01131775, + "auxiliary_loss_mlp": 0.01105153, + "balance_loss_clip": 1.00186455, + "balance_loss_mlp": 1.00053525, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.152856785737619, + "language_loss": 0.83162636, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85399568, + "num_input_tokens_seen": 244665470, + "step": 11341, + "time_per_iteration": 4.060233116149902 + }, + { + "auxiliary_loss_clip": 0.01115239, + "auxiliary_loss_mlp": 0.01104579, + "balance_loss_clip": 1.00177169, + "balance_loss_mlp": 1.00062859, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.6333096659151591, + "language_loss": 0.68281209, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70501029, + "num_input_tokens_seen": 244684390, + "step": 11342, + "time_per_iteration": 2.6326723098754883 + }, + { + "auxiliary_loss_clip": 0.01100552, + "auxiliary_loss_mlp": 0.01103618, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00052607, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 2.130625706645791, + "language_loss": 0.74690855, + "learning_rate": 9.702447196107963e-07, + "loss": 0.76895022, + "num_input_tokens_seen": 244703370, + "step": 11343, + "time_per_iteration": 2.6743290424346924 + }, + { + "auxiliary_loss_clip": 0.01117092, + "auxiliary_loss_mlp": 0.01105772, + "balance_loss_clip": 1.00193393, + "balance_loss_mlp": 1.00048661, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 1.6763583977140157, + "language_loss": 0.7980541, + "learning_rate": 9.699108677831639e-07, + "loss": 0.8202827, + "num_input_tokens_seen": 244723325, + "step": 11344, + "time_per_iteration": 2.742581367492676 + }, + { + "auxiliary_loss_clip": 0.01116274, + "auxiliary_loss_mlp": 0.01104245, + "balance_loss_clip": 1.00164914, + "balance_loss_mlp": 1.0004853, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.1297569127164815, + "language_loss": 0.66625524, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68846047, + "num_input_tokens_seen": 244745650, + "step": 11345, + "time_per_iteration": 2.729328155517578 + }, + { + "auxiliary_loss_clip": 0.01132227, + "auxiliary_loss_mlp": 0.01105624, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.0006243, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.392417499180314, + "language_loss": 0.65120387, + "learning_rate": 9.692432813238054e-07, + "loss": 0.67358238, + "num_input_tokens_seen": 244760270, + "step": 11346, + "time_per_iteration": 2.559670925140381 + }, + { + "auxiliary_loss_clip": 0.01088227, + "auxiliary_loss_mlp": 0.00747336, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00046873, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.597676403189461, + "language_loss": 0.78588802, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80424362, + "num_input_tokens_seen": 244779565, + "step": 11347, + "time_per_iteration": 2.7425615787506104 + }, + { + "auxiliary_loss_clip": 0.01144294, + "auxiliary_loss_mlp": 0.01078962, + "balance_loss_clip": 1.0008781, + "balance_loss_mlp": 0.99999821, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7702316015869723, + "language_loss": 0.52495426, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54718685, + "num_input_tokens_seen": 244838480, + "step": 11348, + "time_per_iteration": 3.14587140083313 + }, + { + "auxiliary_loss_clip": 0.01164918, + "auxiliary_loss_mlp": 0.01104088, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00051928, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.7250336540178643, + "language_loss": 0.79782903, + "learning_rate": 9.682421948143873e-07, + "loss": 0.82051909, + "num_input_tokens_seen": 244855265, + "step": 11349, + "time_per_iteration": 2.5355422496795654 + }, + { + "auxiliary_loss_clip": 0.01148437, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00039411, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 2.5226045787097244, + "language_loss": 0.74046719, + "learning_rate": 9.67908577543096e-07, + "loss": 0.76302075, + "num_input_tokens_seen": 244875555, + "step": 11350, + "time_per_iteration": 2.68703031539917 + }, + { + "auxiliary_loss_clip": 0.01164936, + "auxiliary_loss_mlp": 0.01104095, + "balance_loss_clip": 1.00198305, + "balance_loss_mlp": 1.00052619, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.6079784084677118, + "language_loss": 0.79513359, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81782389, + "num_input_tokens_seen": 244895270, + "step": 11351, + "time_per_iteration": 2.579040050506592 + }, + { + "auxiliary_loss_clip": 0.01148145, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.00185788, + "balance_loss_mlp": 1.0005486, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 2.1364773335997747, + "language_loss": 0.73358124, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75610197, + "num_input_tokens_seen": 244914535, + "step": 11352, + "time_per_iteration": 2.5484166145324707 + }, + { + "auxiliary_loss_clip": 0.01102115, + "auxiliary_loss_mlp": 0.01105933, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00045633, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.495010846389298, + "language_loss": 0.80044091, + "learning_rate": 9.669079606018814e-07, + "loss": 0.82252145, + "num_input_tokens_seen": 244936095, + "step": 11353, + "time_per_iteration": 2.7286927700042725 + }, + { + "auxiliary_loss_clip": 0.01149698, + "auxiliary_loss_mlp": 0.0110421, + "balance_loss_clip": 1.00185192, + "balance_loss_mlp": 1.00054574, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.8837521634459855, + "language_loss": 0.7822628, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80480182, + "num_input_tokens_seen": 244955290, + "step": 11354, + "time_per_iteration": 2.5520355701446533 + }, + { + "auxiliary_loss_clip": 0.01081732, + "auxiliary_loss_mlp": 0.01103489, + "balance_loss_clip": 1.0016216, + "balance_loss_mlp": 1.00049233, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.300696684127706, + "language_loss": 0.61948335, + "learning_rate": 9.662410784947599e-07, + "loss": 0.64133561, + "num_input_tokens_seen": 244972935, + "step": 11355, + "time_per_iteration": 2.7145473957061768 + }, + { + "auxiliary_loss_clip": 0.01103594, + "auxiliary_loss_mlp": 0.01103822, + "balance_loss_clip": 1.00189483, + "balance_loss_mlp": 1.00053859, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 2.368043266286649, + "language_loss": 0.8243373, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84641147, + "num_input_tokens_seen": 244989440, + "step": 11356, + "time_per_iteration": 2.6559789180755615 + }, + { + "auxiliary_loss_clip": 0.0113312, + "auxiliary_loss_mlp": 0.01104668, + "balance_loss_clip": 1.00190091, + "balance_loss_mlp": 1.00043154, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 1.809715666937033, + "language_loss": 0.78279501, + "learning_rate": 9.655743531886052e-07, + "loss": 0.80517292, + "num_input_tokens_seen": 245007830, + "step": 11357, + "time_per_iteration": 2.6038177013397217 + }, + { + "auxiliary_loss_clip": 0.01130282, + "auxiliary_loss_mlp": 0.01079294, + "balance_loss_clip": 1.00091767, + "balance_loss_mlp": 0.99994814, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8222932533406947, + "language_loss": 0.59666574, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61876154, + "num_input_tokens_seen": 245070720, + "step": 11358, + "time_per_iteration": 3.2168619632720947 + }, + { + "auxiliary_loss_clip": 0.01120808, + "auxiliary_loss_mlp": 0.01106962, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00062716, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 2.03675015279844, + "language_loss": 0.78507924, + "learning_rate": 9.64907784784544e-07, + "loss": 0.80735695, + "num_input_tokens_seen": 245089070, + "step": 11359, + "time_per_iteration": 2.6450910568237305 + }, + { + "auxiliary_loss_clip": 0.01148252, + "auxiliary_loss_mlp": 0.01104255, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00049579, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 1.9607377944507054, + "language_loss": 0.81570673, + "learning_rate": 9.645745594523958e-07, + "loss": 0.8382318, + "num_input_tokens_seen": 245106500, + "step": 11360, + "time_per_iteration": 2.5734317302703857 + }, + { + "auxiliary_loss_clip": 0.01150248, + "auxiliary_loss_mlp": 0.01104957, + "balance_loss_clip": 1.00200951, + "balance_loss_mlp": 1.00062525, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 2.071625060576778, + "language_loss": 0.75569087, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77824289, + "num_input_tokens_seen": 245125260, + "step": 11361, + "time_per_iteration": 2.5742502212524414 + }, + { + "auxiliary_loss_clip": 0.01127477, + "auxiliary_loss_mlp": 0.0107933, + "balance_loss_clip": 1.00205958, + "balance_loss_mlp": 0.99998438, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.903058815743248, + "language_loss": 0.59635508, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61842322, + "num_input_tokens_seen": 245188730, + "step": 11362, + "time_per_iteration": 3.246567726135254 + }, + { + "auxiliary_loss_clip": 0.01133934, + "auxiliary_loss_mlp": 0.01104221, + "balance_loss_clip": 1.00173736, + "balance_loss_mlp": 1.00055671, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.2950147800956504, + "language_loss": 0.75100631, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77338779, + "num_input_tokens_seen": 245205065, + "step": 11363, + "time_per_iteration": 2.5815951824188232 + }, + { + "auxiliary_loss_clip": 0.01131613, + "auxiliary_loss_mlp": 0.01104706, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.0006597, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.933053560623575, + "language_loss": 0.89538562, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91774881, + "num_input_tokens_seen": 245224265, + "step": 11364, + "time_per_iteration": 4.07455849647522 + }, + { + "auxiliary_loss_clip": 0.01133635, + "auxiliary_loss_mlp": 0.01103846, + "balance_loss_clip": 1.00191104, + "balance_loss_mlp": 1.00056291, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 1.9757692921234564, + "language_loss": 0.88017941, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90255427, + "num_input_tokens_seen": 245243360, + "step": 11365, + "time_per_iteration": 4.0177600383758545 + }, + { + "auxiliary_loss_clip": 0.01117296, + "auxiliary_loss_mlp": 0.01105463, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00055909, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.392983965721114, + "language_loss": 0.81345505, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83568269, + "num_input_tokens_seen": 245256350, + "step": 11366, + "time_per_iteration": 2.6528217792510986 + }, + { + "auxiliary_loss_clip": 0.0113175, + "auxiliary_loss_mlp": 0.01104437, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.00048673, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.5458922765717782, + "language_loss": 0.76716614, + "learning_rate": 9.622430822110062e-07, + "loss": 0.78952795, + "num_input_tokens_seen": 245277575, + "step": 11367, + "time_per_iteration": 2.6475119590759277 + }, + { + "auxiliary_loss_clip": 0.0113158, + "auxiliary_loss_mlp": 0.0110549, + "balance_loss_clip": 1.00180352, + "balance_loss_mlp": 1.00068116, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.5182360881873225, + "language_loss": 0.68855464, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71092534, + "num_input_tokens_seen": 245296615, + "step": 11368, + "time_per_iteration": 2.584779739379883 + }, + { + "auxiliary_loss_clip": 0.01118734, + "auxiliary_loss_mlp": 0.01104513, + "balance_loss_clip": 1.0017848, + "balance_loss_mlp": 1.00056267, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 1.92310445110442, + "language_loss": 0.73412156, + "learning_rate": 9.615772998335261e-07, + "loss": 0.75635397, + "num_input_tokens_seen": 245316275, + "step": 11369, + "time_per_iteration": 2.67160701751709 + }, + { + "auxiliary_loss_clip": 0.01148649, + "auxiliary_loss_mlp": 0.01105479, + "balance_loss_clip": 1.0018872, + "balance_loss_mlp": 1.00057495, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 2.6937104403278234, + "language_loss": 0.78766465, + "learning_rate": 9.612444677041138e-07, + "loss": 0.81020594, + "num_input_tokens_seen": 245334595, + "step": 11370, + "time_per_iteration": 2.543236017227173 + }, + { + "auxiliary_loss_clip": 0.01144737, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_clip": 1.00087643, + "balance_loss_mlp": 0.99997932, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7482151245886909, + "language_loss": 0.59848094, + "learning_rate": 9.609116749644162e-07, + "loss": 0.62071776, + "num_input_tokens_seen": 245389750, + "step": 11371, + "time_per_iteration": 3.0185811519622803 + }, + { + "auxiliary_loss_clip": 0.01131863, + "auxiliary_loss_mlp": 0.01103319, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00051308, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.6214676785057618, + "language_loss": 0.63654304, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65889484, + "num_input_tokens_seen": 245407530, + "step": 11372, + "time_per_iteration": 2.588782787322998 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.0110501, + "balance_loss_clip": 1.00202012, + "balance_loss_mlp": 1.00048721, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 8.449497398321787, + "language_loss": 0.71661031, + "learning_rate": 9.602462077046375e-07, + "loss": 0.7391445, + "num_input_tokens_seen": 245427000, + "step": 11373, + "time_per_iteration": 2.555483341217041 + }, + { + "auxiliary_loss_clip": 0.01113735, + "auxiliary_loss_mlp": 0.01078995, + "balance_loss_clip": 1.00107133, + "balance_loss_mlp": 1.0000304, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.2555647870202198, + "language_loss": 0.56708431, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58901167, + "num_input_tokens_seen": 245491620, + "step": 11374, + "time_per_iteration": 3.3477914333343506 + }, + { + "auxiliary_loss_clip": 0.01148704, + "auxiliary_loss_mlp": 0.01105507, + "balance_loss_clip": 1.00204253, + "balance_loss_mlp": 1.00041258, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 2.9363107905441685, + "language_loss": 0.73700917, + "learning_rate": 9.595808981551312e-07, + "loss": 0.75955123, + "num_input_tokens_seen": 245511285, + "step": 11375, + "time_per_iteration": 2.5688953399658203 + }, + { + "auxiliary_loss_clip": 0.01133518, + "auxiliary_loss_mlp": 0.01103372, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00056553, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.8517301282374374, + "language_loss": 0.70831901, + "learning_rate": 9.592483025532651e-07, + "loss": 0.73068792, + "num_input_tokens_seen": 245532910, + "step": 11376, + "time_per_iteration": 2.6465678215026855 + }, + { + "auxiliary_loss_clip": 0.01165114, + "auxiliary_loss_mlp": 0.01104783, + "balance_loss_clip": 1.00191116, + "balance_loss_mlp": 1.00064206, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 1.8215244162674482, + "language_loss": 0.74684566, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76954466, + "num_input_tokens_seen": 245550540, + "step": 11377, + "time_per_iteration": 3.979538917541504 + }, + { + "auxiliary_loss_clip": 0.01129915, + "auxiliary_loss_mlp": 0.0107889, + "balance_loss_clip": 1.00089264, + "balance_loss_mlp": 0.99992543, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7417307862184734, + "language_loss": 0.56925344, + "learning_rate": 9.585832297583707e-07, + "loss": 0.5913415, + "num_input_tokens_seen": 245619570, + "step": 11378, + "time_per_iteration": 3.2433786392211914 + }, + { + "auxiliary_loss_clip": 0.01165034, + "auxiliary_loss_mlp": 0.01105696, + "balance_loss_clip": 1.00188363, + "balance_loss_mlp": 1.00069642, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.6540938751809995, + "language_loss": 0.78201199, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80471933, + "num_input_tokens_seen": 245637980, + "step": 11379, + "time_per_iteration": 3.945239782333374 + }, + { + "auxiliary_loss_clip": 0.0116493, + "auxiliary_loss_mlp": 0.01102677, + "balance_loss_clip": 1.00200987, + "balance_loss_mlp": 1.00044322, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 2.3560106806055945, + "language_loss": 0.69005883, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71273494, + "num_input_tokens_seen": 245655690, + "step": 11380, + "time_per_iteration": 2.5636181831359863 + }, + { + "auxiliary_loss_clip": 0.01131356, + "auxiliary_loss_mlp": 0.01104563, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.00051701, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 2.129460397805446, + "language_loss": 0.78400326, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80636251, + "num_input_tokens_seen": 245671525, + "step": 11381, + "time_per_iteration": 2.5772323608398438 + }, + { + "auxiliary_loss_clip": 0.01142355, + "auxiliary_loss_mlp": 0.01078983, + "balance_loss_clip": 1.00100064, + "balance_loss_mlp": 1.00001907, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8820850470485162, + "language_loss": 0.67201102, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69422436, + "num_input_tokens_seen": 245724115, + "step": 11382, + "time_per_iteration": 2.973050832748413 + }, + { + "auxiliary_loss_clip": 0.01143462, + "auxiliary_loss_mlp": 0.01079356, + "balance_loss_clip": 1.00095415, + "balance_loss_mlp": 1.00001049, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8300168563062721, + "language_loss": 0.5815118, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60374004, + "num_input_tokens_seen": 245789245, + "step": 11383, + "time_per_iteration": 3.1458582878112793 + }, + { + "auxiliary_loss_clip": 0.01100036, + "auxiliary_loss_mlp": 0.01103428, + "balance_loss_clip": 1.00163412, + "balance_loss_mlp": 1.00052619, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 1.7087957367081739, + "language_loss": 0.7991178, + "learning_rate": 9.565889595521517e-07, + "loss": 0.82115245, + "num_input_tokens_seen": 245812420, + "step": 11384, + "time_per_iteration": 2.763584613800049 + }, + { + "auxiliary_loss_clip": 0.01148318, + "auxiliary_loss_mlp": 0.01105119, + "balance_loss_clip": 1.00180089, + "balance_loss_mlp": 1.00059605, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 2.090830887238352, + "language_loss": 0.76864296, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79117727, + "num_input_tokens_seen": 245829135, + "step": 11385, + "time_per_iteration": 2.572145700454712 + }, + { + "auxiliary_loss_clip": 0.01117642, + "auxiliary_loss_mlp": 0.01105616, + "balance_loss_clip": 1.00181043, + "balance_loss_mlp": 1.00061643, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 1.8065508750031722, + "language_loss": 0.83924568, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86147821, + "num_input_tokens_seen": 245847140, + "step": 11386, + "time_per_iteration": 2.7560737133026123 + }, + { + "auxiliary_loss_clip": 0.01148015, + "auxiliary_loss_mlp": 0.01104776, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00073028, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.3303309382189084, + "language_loss": 0.83349836, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85602629, + "num_input_tokens_seen": 245862855, + "step": 11387, + "time_per_iteration": 2.6110455989837646 + }, + { + "auxiliary_loss_clip": 0.01150504, + "auxiliary_loss_mlp": 0.01104746, + "balance_loss_clip": 1.00195837, + "balance_loss_mlp": 1.00050986, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.56360821355678, + "language_loss": 0.72189045, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74444294, + "num_input_tokens_seen": 245885415, + "step": 11388, + "time_per_iteration": 2.6893389225006104 + }, + { + "auxiliary_loss_clip": 0.01148718, + "auxiliary_loss_mlp": 0.01103357, + "balance_loss_clip": 1.00201571, + "balance_loss_mlp": 1.00045586, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 2.0620938126567405, + "language_loss": 0.62183952, + "learning_rate": 9.549281556699469e-07, + "loss": 0.6443603, + "num_input_tokens_seen": 245906285, + "step": 11389, + "time_per_iteration": 2.729027271270752 + }, + { + "auxiliary_loss_clip": 0.01125767, + "auxiliary_loss_mlp": 0.01078971, + "balance_loss_clip": 1.00115335, + "balance_loss_mlp": 1.00000668, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7246706610247718, + "language_loss": 0.55984002, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58188736, + "num_input_tokens_seen": 245967620, + "step": 11390, + "time_per_iteration": 3.2758700847625732 + }, + { + "auxiliary_loss_clip": 0.01117075, + "auxiliary_loss_mlp": 0.0074722, + "balance_loss_clip": 1.00193179, + "balance_loss_mlp": 1.00031018, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 2.3066126056468037, + "language_loss": 0.87756574, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89620876, + "num_input_tokens_seen": 245985075, + "step": 11391, + "time_per_iteration": 2.63140606880188 + }, + { + "auxiliary_loss_clip": 0.01104226, + "auxiliary_loss_mlp": 0.01104856, + "balance_loss_clip": 1.00178277, + "balance_loss_mlp": 1.00071502, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.6175089320264437, + "language_loss": 0.79100895, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81309974, + "num_input_tokens_seen": 246003560, + "step": 11392, + "time_per_iteration": 2.703289270401001 + }, + { + "auxiliary_loss_clip": 0.01132879, + "auxiliary_loss_mlp": 0.01103006, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00039101, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 2.190562337015067, + "language_loss": 0.70502907, + "learning_rate": 9.536002258147104e-07, + "loss": 0.72738791, + "num_input_tokens_seen": 246019600, + "step": 11393, + "time_per_iteration": 2.57724928855896 + }, + { + "auxiliary_loss_clip": 0.01101853, + "auxiliary_loss_mlp": 0.01104902, + "balance_loss_clip": 1.00170362, + "balance_loss_mlp": 1.00056982, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.57345381765901, + "language_loss": 0.64669597, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66876352, + "num_input_tokens_seen": 246038920, + "step": 11394, + "time_per_iteration": 2.708415985107422 + }, + { + "auxiliary_loss_clip": 0.01133958, + "auxiliary_loss_mlp": 0.00747416, + "balance_loss_clip": 1.00204194, + "balance_loss_mlp": 1.00049925, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 5.198006279976018, + "language_loss": 0.80785435, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82666814, + "num_input_tokens_seen": 246060490, + "step": 11395, + "time_per_iteration": 2.664069652557373 + }, + { + "auxiliary_loss_clip": 0.01119015, + "auxiliary_loss_mlp": 0.0110407, + "balance_loss_clip": 1.00196004, + "balance_loss_mlp": 1.00050092, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.7156525020486264, + "language_loss": 0.72870398, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75093484, + "num_input_tokens_seen": 246081465, + "step": 11396, + "time_per_iteration": 2.6567091941833496 + }, + { + "auxiliary_loss_clip": 0.01117756, + "auxiliary_loss_mlp": 0.0110455, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00050449, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 2.4896118110627845, + "language_loss": 0.79296917, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81519222, + "num_input_tokens_seen": 246096110, + "step": 11397, + "time_per_iteration": 2.6278629302978516 + }, + { + "auxiliary_loss_clip": 0.01068811, + "auxiliary_loss_mlp": 0.01103808, + "balance_loss_clip": 1.00172842, + "balance_loss_mlp": 1.00052488, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 1.9077731317885236, + "language_loss": 0.71498692, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73671317, + "num_input_tokens_seen": 246114785, + "step": 11398, + "time_per_iteration": 2.757766008377075 + }, + { + "auxiliary_loss_clip": 0.01100357, + "auxiliary_loss_mlp": 0.01103794, + "balance_loss_clip": 1.00168037, + "balance_loss_mlp": 1.00051069, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.8018455236476136, + "language_loss": 0.70856094, + "learning_rate": 9.516095216709996e-07, + "loss": 0.7306025, + "num_input_tokens_seen": 246136375, + "step": 11399, + "time_per_iteration": 2.7723023891448975 + }, + { + "auxiliary_loss_clip": 0.01150607, + "auxiliary_loss_mlp": 0.01104324, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00065994, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.5612447323896534, + "language_loss": 0.70281339, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72536272, + "num_input_tokens_seen": 246155090, + "step": 11400, + "time_per_iteration": 2.569063425064087 + }, + { + "auxiliary_loss_clip": 0.01101108, + "auxiliary_loss_mlp": 0.01106402, + "balance_loss_clip": 1.00173318, + "balance_loss_mlp": 1.00054359, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 2.1311211982383886, + "language_loss": 0.78399122, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80606627, + "num_input_tokens_seen": 246172645, + "step": 11401, + "time_per_iteration": 2.646583080291748 + }, + { + "auxiliary_loss_clip": 0.01164936, + "auxiliary_loss_mlp": 0.01103836, + "balance_loss_clip": 1.00195312, + "balance_loss_mlp": 1.00055337, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 1.7319312506855504, + "language_loss": 0.75892723, + "learning_rate": 9.50614706122786e-07, + "loss": 0.7816149, + "num_input_tokens_seen": 246189055, + "step": 11402, + "time_per_iteration": 3.9099769592285156 + }, + { + "auxiliary_loss_clip": 0.011507, + "auxiliary_loss_mlp": 0.01105087, + "balance_loss_clip": 1.0019834, + "balance_loss_mlp": 1.00065923, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 1.7891656587786748, + "language_loss": 0.72887647, + "learning_rate": 9.502831805088742e-07, + "loss": 0.75143433, + "num_input_tokens_seen": 246207990, + "step": 11403, + "time_per_iteration": 4.013492822647095 + }, + { + "auxiliary_loss_clip": 0.01165021, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_clip": 1.00202227, + "balance_loss_mlp": 1.00050259, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.2108206513247195, + "language_loss": 0.81318319, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83587229, + "num_input_tokens_seen": 246221595, + "step": 11404, + "time_per_iteration": 2.4888274669647217 + }, + { + "auxiliary_loss_clip": 0.01133857, + "auxiliary_loss_mlp": 0.01105377, + "balance_loss_clip": 1.00205719, + "balance_loss_mlp": 1.0006634, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.4502112001276872, + "language_loss": 0.77874291, + "learning_rate": 9.496202487097222e-07, + "loss": 0.80113518, + "num_input_tokens_seen": 246242970, + "step": 11405, + "time_per_iteration": 2.636683940887451 + }, + { + "auxiliary_loss_clip": 0.01144674, + "auxiliary_loss_mlp": 0.01079228, + "balance_loss_clip": 1.00086474, + "balance_loss_mlp": 0.99988264, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7910766408635449, + "language_loss": 0.61025155, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63249052, + "num_input_tokens_seen": 246300405, + "step": 11406, + "time_per_iteration": 3.172136068344116 + }, + { + "auxiliary_loss_clip": 0.01116023, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00058794, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.9854104067683123, + "language_loss": 0.77296424, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79517561, + "num_input_tokens_seen": 246318780, + "step": 11407, + "time_per_iteration": 2.61989688873291 + }, + { + "auxiliary_loss_clip": 0.01131509, + "auxiliary_loss_mlp": 0.01105281, + "balance_loss_clip": 1.00178099, + "balance_loss_mlp": 1.00075841, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 8.24809582903417, + "language_loss": 0.71068275, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73305058, + "num_input_tokens_seen": 246339405, + "step": 11408, + "time_per_iteration": 2.609651565551758 + }, + { + "auxiliary_loss_clip": 0.01148339, + "auxiliary_loss_mlp": 0.01104692, + "balance_loss_clip": 1.00184238, + "balance_loss_mlp": 1.00045562, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 2.998716640713083, + "language_loss": 0.69803387, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72056419, + "num_input_tokens_seen": 246357055, + "step": 11409, + "time_per_iteration": 2.5569827556610107 + }, + { + "auxiliary_loss_clip": 0.01101246, + "auxiliary_loss_mlp": 0.01102244, + "balance_loss_clip": 1.00174916, + "balance_loss_mlp": 1.00048745, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.8602088684957852, + "language_loss": 0.78284913, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80488402, + "num_input_tokens_seen": 246374050, + "step": 11410, + "time_per_iteration": 2.648777723312378 + }, + { + "auxiliary_loss_clip": 0.01147989, + "auxiliary_loss_mlp": 0.01106347, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.00058496, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 1.7975209941151846, + "language_loss": 0.71669424, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73923767, + "num_input_tokens_seen": 246392910, + "step": 11411, + "time_per_iteration": 2.6048641204833984 + }, + { + "auxiliary_loss_clip": 0.01102102, + "auxiliary_loss_mlp": 0.01104297, + "balance_loss_clip": 1.00170791, + "balance_loss_mlp": 1.00053751, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 3.455350668794401, + "language_loss": 0.70357972, + "learning_rate": 9.473012427332654e-07, + "loss": 0.72564369, + "num_input_tokens_seen": 246411540, + "step": 11412, + "time_per_iteration": 2.676635265350342 + }, + { + "auxiliary_loss_clip": 0.01164853, + "auxiliary_loss_mlp": 0.01103727, + "balance_loss_clip": 1.00186634, + "balance_loss_mlp": 1.00053978, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 2.96773835727311, + "language_loss": 0.71825171, + "learning_rate": 9.469701157384919e-07, + "loss": 0.74093747, + "num_input_tokens_seen": 246423295, + "step": 11413, + "time_per_iteration": 2.467349052429199 + }, + { + "auxiliary_loss_clip": 0.01148674, + "auxiliary_loss_mlp": 0.01104418, + "balance_loss_clip": 1.00181544, + "balance_loss_mlp": 1.00075388, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.663232121305445, + "language_loss": 0.73413116, + "learning_rate": 9.466390286747164e-07, + "loss": 0.75666207, + "num_input_tokens_seen": 246441045, + "step": 11414, + "time_per_iteration": 3.9667327404022217 + }, + { + "auxiliary_loss_clip": 0.01133986, + "auxiliary_loss_mlp": 0.01105325, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.0005157, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.7694102156466154, + "language_loss": 0.86644614, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88883924, + "num_input_tokens_seen": 246456905, + "step": 11415, + "time_per_iteration": 2.57292103767395 + }, + { + "auxiliary_loss_clip": 0.0114837, + "auxiliary_loss_mlp": 0.01105674, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00057888, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.813323018431588, + "language_loss": 0.67030686, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69284737, + "num_input_tokens_seen": 246477545, + "step": 11416, + "time_per_iteration": 3.9891467094421387 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01103439, + "balance_loss_clip": 1.00186133, + "balance_loss_mlp": 1.00053763, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.3826077438805664, + "language_loss": 0.7592901, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78166831, + "num_input_tokens_seen": 246496705, + "step": 11417, + "time_per_iteration": 2.586909055709839 + }, + { + "auxiliary_loss_clip": 0.0113342, + "auxiliary_loss_mlp": 0.01105215, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00059724, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 1.791590077752692, + "language_loss": 0.77279371, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79518008, + "num_input_tokens_seen": 246514860, + "step": 11418, + "time_per_iteration": 2.5817031860351562 + }, + { + "auxiliary_loss_clip": 0.01099977, + "auxiliary_loss_mlp": 0.0110495, + "balance_loss_clip": 1.00167799, + "balance_loss_mlp": 1.00052249, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 1.8205829624950962, + "language_loss": 0.76274347, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78479278, + "num_input_tokens_seen": 246536145, + "step": 11419, + "time_per_iteration": 2.724346876144409 + }, + { + "auxiliary_loss_clip": 0.01165059, + "auxiliary_loss_mlp": 0.01104025, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00074172, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.8406244784903047, + "language_loss": 0.71390259, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73659348, + "num_input_tokens_seen": 246553265, + "step": 11420, + "time_per_iteration": 2.5182905197143555 + }, + { + "auxiliary_loss_clip": 0.01115888, + "auxiliary_loss_mlp": 0.01104345, + "balance_loss_clip": 1.00169539, + "balance_loss_mlp": 1.00039411, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.3027392011891747, + "language_loss": 0.74599445, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76819676, + "num_input_tokens_seen": 246575130, + "step": 11421, + "time_per_iteration": 2.7472236156463623 + }, + { + "auxiliary_loss_clip": 0.01147971, + "auxiliary_loss_mlp": 0.01103983, + "balance_loss_clip": 1.00170445, + "balance_loss_mlp": 1.00060463, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 2.116888114490351, + "language_loss": 0.77220428, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79472381, + "num_input_tokens_seen": 246593095, + "step": 11422, + "time_per_iteration": 2.549346446990967 + }, + { + "auxiliary_loss_clip": 0.01149352, + "auxiliary_loss_mlp": 0.01104313, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00074363, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 1.7833687434676777, + "language_loss": 0.77357882, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79611546, + "num_input_tokens_seen": 246612165, + "step": 11423, + "time_per_iteration": 2.5980384349823 + }, + { + "auxiliary_loss_clip": 0.0111606, + "auxiliary_loss_mlp": 0.0110497, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.00063801, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 3.5281173369859644, + "language_loss": 0.72857022, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75078052, + "num_input_tokens_seen": 246632065, + "step": 11424, + "time_per_iteration": 2.6534781455993652 + }, + { + "auxiliary_loss_clip": 0.01131549, + "auxiliary_loss_mlp": 0.01103556, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.00046325, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.7540602344279506, + "language_loss": 0.65254062, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67489165, + "num_input_tokens_seen": 246651245, + "step": 11425, + "time_per_iteration": 2.647991895675659 + }, + { + "auxiliary_loss_clip": 0.01114889, + "auxiliary_loss_mlp": 0.01103053, + "balance_loss_clip": 1.00172555, + "balance_loss_mlp": 1.00043797, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.3993408621563685, + "language_loss": 0.71770096, + "learning_rate": 9.426691030957657e-07, + "loss": 0.73988032, + "num_input_tokens_seen": 246672225, + "step": 11426, + "time_per_iteration": 2.647395133972168 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.01104534, + "balance_loss_clip": 1.00168514, + "balance_loss_mlp": 1.00058317, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.1739341339213487, + "language_loss": 0.84958732, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87165391, + "num_input_tokens_seen": 246688385, + "step": 11427, + "time_per_iteration": 2.6838879585266113 + }, + { + "auxiliary_loss_clip": 0.011477, + "auxiliary_loss_mlp": 0.01103592, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00040507, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.826695160592464, + "language_loss": 0.76040328, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78291631, + "num_input_tokens_seen": 246710730, + "step": 11428, + "time_per_iteration": 2.5995380878448486 + }, + { + "auxiliary_loss_clip": 0.01116836, + "auxiliary_loss_mlp": 0.01104925, + "balance_loss_clip": 1.00171411, + "balance_loss_mlp": 1.00049746, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 1.8776563577238392, + "language_loss": 0.72919178, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75140941, + "num_input_tokens_seen": 246730350, + "step": 11429, + "time_per_iteration": 2.638512372970581 + }, + { + "auxiliary_loss_clip": 0.01133493, + "auxiliary_loss_mlp": 0.01104719, + "balance_loss_clip": 1.00180709, + "balance_loss_mlp": 1.00038707, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 1.9399915126167913, + "language_loss": 0.82997447, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85235655, + "num_input_tokens_seen": 246751700, + "step": 11430, + "time_per_iteration": 2.6357040405273438 + }, + { + "auxiliary_loss_clip": 0.01150319, + "auxiliary_loss_mlp": 0.01103104, + "balance_loss_clip": 1.0019325, + "balance_loss_mlp": 1.00048816, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 1.864207868993551, + "language_loss": 0.70493758, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72747183, + "num_input_tokens_seen": 246769860, + "step": 11431, + "time_per_iteration": 2.590345859527588 + }, + { + "auxiliary_loss_clip": 0.01133627, + "auxiliary_loss_mlp": 0.00747434, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00046086, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.9327273572513985, + "language_loss": 0.8005091, + "learning_rate": 9.406863040327355e-07, + "loss": 0.81931973, + "num_input_tokens_seen": 246789905, + "step": 11432, + "time_per_iteration": 2.6445438861846924 + }, + { + "auxiliary_loss_clip": 0.01132875, + "auxiliary_loss_mlp": 0.01103064, + "balance_loss_clip": 1.00187647, + "balance_loss_mlp": 1.00044894, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.7456957714941457, + "language_loss": 0.6775021, + "learning_rate": 9.403559780416295e-07, + "loss": 0.69986153, + "num_input_tokens_seen": 246808815, + "step": 11433, + "time_per_iteration": 2.681532144546509 + }, + { + "auxiliary_loss_clip": 0.01148477, + "auxiliary_loss_mlp": 0.01103874, + "balance_loss_clip": 1.00205386, + "balance_loss_mlp": 1.00068688, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 3.743362719401502, + "language_loss": 0.72964466, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75216818, + "num_input_tokens_seen": 246829775, + "step": 11434, + "time_per_iteration": 2.6807351112365723 + }, + { + "auxiliary_loss_clip": 0.011174, + "auxiliary_loss_mlp": 0.01104822, + "balance_loss_clip": 1.00193, + "balance_loss_mlp": 1.00058532, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.5892132567119734, + "language_loss": 0.80729169, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82951391, + "num_input_tokens_seen": 246848045, + "step": 11435, + "time_per_iteration": 2.661980628967285 + }, + { + "auxiliary_loss_clip": 0.01164966, + "auxiliary_loss_mlp": 0.01104514, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00056303, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 2.55183871301014, + "language_loss": 0.80927581, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83197057, + "num_input_tokens_seen": 246866095, + "step": 11436, + "time_per_iteration": 2.4998743534088135 + }, + { + "auxiliary_loss_clip": 0.01118662, + "auxiliary_loss_mlp": 0.011021, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00062919, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 1.7479637853674663, + "language_loss": 0.81729412, + "learning_rate": 9.390350760205183e-07, + "loss": 0.83950174, + "num_input_tokens_seen": 246883975, + "step": 11437, + "time_per_iteration": 2.701880693435669 + }, + { + "auxiliary_loss_clip": 0.01134112, + "auxiliary_loss_mlp": 0.0110562, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00071573, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 5.801301560775134, + "language_loss": 0.78313994, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80553728, + "num_input_tokens_seen": 246901560, + "step": 11438, + "time_per_iteration": 2.608489513397217 + }, + { + "auxiliary_loss_clip": 0.01164819, + "auxiliary_loss_mlp": 0.01103259, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00064313, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.5833994678965184, + "language_loss": 0.72258329, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74526405, + "num_input_tokens_seen": 246922655, + "step": 11439, + "time_per_iteration": 2.5747475624084473 + }, + { + "auxiliary_loss_clip": 0.01148174, + "auxiliary_loss_mlp": 0.01103351, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00054443, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 2.0392465909432937, + "language_loss": 0.75473499, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77725017, + "num_input_tokens_seen": 246940100, + "step": 11440, + "time_per_iteration": 4.011185169219971 + }, + { + "auxiliary_loss_clip": 0.01117229, + "auxiliary_loss_mlp": 0.01103243, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00062752, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.6265745585670461, + "language_loss": 0.7189461, + "learning_rate": 9.377148177097167e-07, + "loss": 0.7411508, + "num_input_tokens_seen": 246958545, + "step": 11441, + "time_per_iteration": 4.0240702629089355 + }, + { + "auxiliary_loss_clip": 0.01119094, + "auxiliary_loss_mlp": 0.01105026, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00059903, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.6935733389137952, + "language_loss": 0.66605186, + "learning_rate": 9.373848538056317e-07, + "loss": 0.6882931, + "num_input_tokens_seen": 246974805, + "step": 11442, + "time_per_iteration": 2.65515398979187 + }, + { + "auxiliary_loss_clip": 0.01148549, + "auxiliary_loss_mlp": 0.01103998, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00052428, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 2.515678737021584, + "language_loss": 0.69271415, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71523964, + "num_input_tokens_seen": 246992505, + "step": 11443, + "time_per_iteration": 2.5757148265838623 + }, + { + "auxiliary_loss_clip": 0.01135186, + "auxiliary_loss_mlp": 0.01104614, + "balance_loss_clip": 1.00195074, + "balance_loss_mlp": 1.00066316, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.5938405030879725, + "language_loss": 0.76458883, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78698677, + "num_input_tokens_seen": 247013370, + "step": 11444, + "time_per_iteration": 2.639116048812866 + }, + { + "auxiliary_loss_clip": 0.01164788, + "auxiliary_loss_mlp": 0.011038, + "balance_loss_clip": 1.00186646, + "balance_loss_mlp": 1.00051749, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 2.1071515550715865, + "language_loss": 0.76715386, + "learning_rate": 9.363952039102536e-07, + "loss": 0.7898398, + "num_input_tokens_seen": 247029855, + "step": 11445, + "time_per_iteration": 2.5093319416046143 + }, + { + "auxiliary_loss_clip": 0.01144656, + "auxiliary_loss_mlp": 0.01078664, + "balance_loss_clip": 1.00088787, + "balance_loss_mlp": 1.00008106, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8165688026865355, + "language_loss": 0.58386463, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60609782, + "num_input_tokens_seen": 247085030, + "step": 11446, + "time_per_iteration": 3.1915457248687744 + }, + { + "auxiliary_loss_clip": 0.01150455, + "auxiliary_loss_mlp": 0.01104931, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00040817, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.636809859346514, + "language_loss": 0.76291865, + "learning_rate": 9.357356389524886e-07, + "loss": 0.78547251, + "num_input_tokens_seen": 247104840, + "step": 11447, + "time_per_iteration": 2.623847246170044 + }, + { + "auxiliary_loss_clip": 0.01133594, + "auxiliary_loss_mlp": 0.01104068, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00049877, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 1.8327423113540564, + "language_loss": 0.72716999, + "learning_rate": 9.354059170028705e-07, + "loss": 0.74954665, + "num_input_tokens_seen": 247121905, + "step": 11448, + "time_per_iteration": 2.5929715633392334 + }, + { + "auxiliary_loss_clip": 0.0115013, + "auxiliary_loss_mlp": 0.01104294, + "balance_loss_clip": 1.00187075, + "balance_loss_mlp": 1.00062943, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.6224834393842509, + "language_loss": 0.74679583, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76934004, + "num_input_tokens_seen": 247142375, + "step": 11449, + "time_per_iteration": 2.5921499729156494 + }, + { + "auxiliary_loss_clip": 0.011649, + "auxiliary_loss_mlp": 0.01103419, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00051761, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.7420439189855925, + "language_loss": 0.7003938, + "learning_rate": 9.34746594224679e-07, + "loss": 0.723077, + "num_input_tokens_seen": 247161095, + "step": 11450, + "time_per_iteration": 2.5411131381988525 + }, + { + "auxiliary_loss_clip": 0.01117796, + "auxiliary_loss_mlp": 0.01105466, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00046635, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 1.790416135895056, + "language_loss": 0.75655329, + "learning_rate": 9.344169934211068e-07, + "loss": 0.77878588, + "num_input_tokens_seen": 247178565, + "step": 11451, + "time_per_iteration": 2.6235251426696777 + }, + { + "auxiliary_loss_clip": 0.01148292, + "auxiliary_loss_mlp": 0.01104532, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00039101, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.5179196078507797, + "language_loss": 0.6950196, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71754777, + "num_input_tokens_seen": 247202345, + "step": 11452, + "time_per_iteration": 4.023314476013184 + }, + { + "auxiliary_loss_clip": 0.01164919, + "auxiliary_loss_mlp": 0.01105146, + "balance_loss_clip": 1.00196421, + "balance_loss_mlp": 1.0006237, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 1.7096730411095515, + "language_loss": 0.71953285, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74223351, + "num_input_tokens_seen": 247219240, + "step": 11453, + "time_per_iteration": 2.4970407485961914 + }, + { + "auxiliary_loss_clip": 0.01144303, + "auxiliary_loss_mlp": 0.00745431, + "balance_loss_clip": 1.00091088, + "balance_loss_mlp": 1.0001626, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7815964995333969, + "language_loss": 0.50688845, + "learning_rate": 9.334284335024644e-07, + "loss": 0.5257858, + "num_input_tokens_seen": 247272010, + "step": 11454, + "time_per_iteration": 4.369191646575928 + }, + { + "auxiliary_loss_clip": 0.01147605, + "auxiliary_loss_mlp": 0.01102929, + "balance_loss_clip": 1.00189519, + "balance_loss_mlp": 1.00059962, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 3.1846796145309106, + "language_loss": 0.75581181, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77831721, + "num_input_tokens_seen": 247290630, + "step": 11455, + "time_per_iteration": 2.544179916381836 + }, + { + "auxiliary_loss_clip": 0.01133505, + "auxiliary_loss_mlp": 0.01104773, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00063229, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.2464122662043304, + "language_loss": 0.72707999, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74946284, + "num_input_tokens_seen": 247304800, + "step": 11456, + "time_per_iteration": 2.540127992630005 + }, + { + "auxiliary_loss_clip": 0.01133324, + "auxiliary_loss_mlp": 0.01102786, + "balance_loss_clip": 1.00183117, + "balance_loss_mlp": 1.00074339, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 2.1586110995372514, + "language_loss": 0.80908155, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83144265, + "num_input_tokens_seen": 247323450, + "step": 11457, + "time_per_iteration": 2.5864875316619873 + }, + { + "auxiliary_loss_clip": 0.01147861, + "auxiliary_loss_mlp": 0.00747296, + "balance_loss_clip": 1.00191259, + "balance_loss_mlp": 1.00032067, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5552452704954758, + "language_loss": 0.76360655, + "learning_rate": 9.321109198922301e-07, + "loss": 0.78255808, + "num_input_tokens_seen": 247343845, + "step": 11458, + "time_per_iteration": 2.5672802925109863 + }, + { + "auxiliary_loss_clip": 0.01165005, + "auxiliary_loss_mlp": 0.01103309, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.00050259, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 3.7405922988194837, + "language_loss": 0.68231022, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70499337, + "num_input_tokens_seen": 247356650, + "step": 11459, + "time_per_iteration": 2.4779467582702637 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.01103135, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00051928, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.560099704470052, + "language_loss": 0.68589997, + "learning_rate": 9.314524060039221e-07, + "loss": 0.70813024, + "num_input_tokens_seen": 247377340, + "step": 11460, + "time_per_iteration": 2.659585952758789 + }, + { + "auxiliary_loss_clip": 0.01117328, + "auxiliary_loss_mlp": 0.01104621, + "balance_loss_clip": 1.00162411, + "balance_loss_mlp": 1.00057495, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.780567448705048, + "language_loss": 0.7664519, + "learning_rate": 9.311232098326731e-07, + "loss": 0.78867137, + "num_input_tokens_seen": 247395805, + "step": 11461, + "time_per_iteration": 2.6569559574127197 + }, + { + "auxiliary_loss_clip": 0.01133418, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.0005703, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.7203084618625917, + "language_loss": 0.6945045, + "learning_rate": 9.307940541933401e-07, + "loss": 0.7168715, + "num_input_tokens_seen": 247413165, + "step": 11462, + "time_per_iteration": 2.5973384380340576 + }, + { + "auxiliary_loss_clip": 0.01148346, + "auxiliary_loss_mlp": 0.0110367, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00048232, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.4927344561452631, + "language_loss": 0.8714605, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89398068, + "num_input_tokens_seen": 247433140, + "step": 11463, + "time_per_iteration": 2.5962138175964355 + }, + { + "auxiliary_loss_clip": 0.01100006, + "auxiliary_loss_mlp": 0.01101895, + "balance_loss_clip": 1.00168443, + "balance_loss_mlp": 1.00051928, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.5302766714734508, + "language_loss": 0.68340194, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70542091, + "num_input_tokens_seen": 247451265, + "step": 11464, + "time_per_iteration": 2.6566219329833984 + }, + { + "auxiliary_loss_clip": 0.0114834, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00065136, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.7504995673313286, + "language_loss": 0.65069693, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67322063, + "num_input_tokens_seen": 247471645, + "step": 11465, + "time_per_iteration": 2.659381151199341 + }, + { + "auxiliary_loss_clip": 0.01148515, + "auxiliary_loss_mlp": 0.01104243, + "balance_loss_clip": 1.00185263, + "balance_loss_mlp": 1.00057817, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.4281614925711268, + "language_loss": 0.72613132, + "learning_rate": 9.294778372047649e-07, + "loss": 0.7486589, + "num_input_tokens_seen": 247491170, + "step": 11466, + "time_per_iteration": 2.578303337097168 + }, + { + "auxiliary_loss_clip": 0.0116496, + "auxiliary_loss_mlp": 0.01102937, + "balance_loss_clip": 1.00190639, + "balance_loss_mlp": 1.00041664, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.7951138742550745, + "language_loss": 0.71744263, + "learning_rate": 9.291488844121995e-07, + "loss": 0.74012154, + "num_input_tokens_seen": 247509005, + "step": 11467, + "time_per_iteration": 2.5165882110595703 + }, + { + "auxiliary_loss_clip": 0.01132277, + "auxiliary_loss_mlp": 0.01103944, + "balance_loss_clip": 1.00177705, + "balance_loss_mlp": 1.00066102, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 2.2792152165049995, + "language_loss": 0.80905557, + "learning_rate": 9.288199722264156e-07, + "loss": 0.8314178, + "num_input_tokens_seen": 247527050, + "step": 11468, + "time_per_iteration": 2.5729753971099854 + }, + { + "auxiliary_loss_clip": 0.01165196, + "auxiliary_loss_mlp": 0.01104628, + "balance_loss_clip": 1.00201535, + "balance_loss_mlp": 1.00058174, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.6017125213706822, + "language_loss": 0.66061795, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68331617, + "num_input_tokens_seen": 247547765, + "step": 11469, + "time_per_iteration": 2.614156484603882 + }, + { + "auxiliary_loss_clip": 0.01143034, + "auxiliary_loss_mlp": 0.01079155, + "balance_loss_clip": 1.00094366, + "balance_loss_mlp": 1.00019038, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.8262243225581052, + "language_loss": 0.55200374, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57422554, + "num_input_tokens_seen": 247603515, + "step": 11470, + "time_per_iteration": 3.0104174613952637 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01102608, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00056493, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 3.8137731498948306, + "language_loss": 0.78370368, + "learning_rate": 9.278334794344715e-07, + "loss": 0.80621386, + "num_input_tokens_seen": 247622110, + "step": 11471, + "time_per_iteration": 2.5305867195129395 + }, + { + "auxiliary_loss_clip": 0.01135348, + "auxiliary_loss_mlp": 0.0110363, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00053811, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 2.2853007209165894, + "language_loss": 0.78516078, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80755055, + "num_input_tokens_seen": 247641905, + "step": 11472, + "time_per_iteration": 2.6053550243377686 + }, + { + "auxiliary_loss_clip": 0.01134031, + "auxiliary_loss_mlp": 0.01103505, + "balance_loss_clip": 1.00191271, + "balance_loss_mlp": 1.00050831, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.5827123955119322, + "language_loss": 0.76523131, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78760672, + "num_input_tokens_seen": 247660945, + "step": 11473, + "time_per_iteration": 2.5968172550201416 + }, + { + "auxiliary_loss_clip": 0.01118657, + "auxiliary_loss_mlp": 0.0110417, + "balance_loss_clip": 1.0017761, + "balance_loss_mlp": 1.00050569, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 2.1283603112342857, + "language_loss": 0.75822461, + "learning_rate": 9.268473525524751e-07, + "loss": 0.78045285, + "num_input_tokens_seen": 247678395, + "step": 11474, + "time_per_iteration": 2.618915557861328 + }, + { + "auxiliary_loss_clip": 0.01085382, + "auxiliary_loss_mlp": 0.0110309, + "balance_loss_clip": 1.00173688, + "balance_loss_mlp": 1.00037932, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.686805689647094, + "language_loss": 0.74374211, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76562685, + "num_input_tokens_seen": 247698380, + "step": 11475, + "time_per_iteration": 2.7536442279815674 + }, + { + "auxiliary_loss_clip": 0.01120229, + "auxiliary_loss_mlp": 0.01104259, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00049901, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.1142423325124344, + "language_loss": 0.89076656, + "learning_rate": 9.261901380806491e-07, + "loss": 0.91301143, + "num_input_tokens_seen": 247716370, + "step": 11476, + "time_per_iteration": 2.669419050216675 + }, + { + "auxiliary_loss_clip": 0.01164884, + "auxiliary_loss_mlp": 0.01103177, + "balance_loss_clip": 1.001912, + "balance_loss_mlp": 1.00046611, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.3622271505383077, + "language_loss": 0.70501822, + "learning_rate": 9.258615919169724e-07, + "loss": 0.7276988, + "num_input_tokens_seen": 247737335, + "step": 11477, + "time_per_iteration": 2.5633187294006348 + }, + { + "auxiliary_loss_clip": 0.01148678, + "auxiliary_loss_mlp": 0.01104083, + "balance_loss_clip": 1.00202608, + "balance_loss_mlp": 1.0006094, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.096166409403158, + "language_loss": 0.6810751, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70360267, + "num_input_tokens_seen": 247756680, + "step": 11478, + "time_per_iteration": 4.040339708328247 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01103773, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.00058579, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 3.6811971186195147, + "language_loss": 0.76510239, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78762424, + "num_input_tokens_seen": 247774265, + "step": 11479, + "time_per_iteration": 3.99487042427063 + }, + { + "auxiliary_loss_clip": 0.01150223, + "auxiliary_loss_mlp": 0.0110407, + "balance_loss_clip": 1.00192177, + "balance_loss_mlp": 1.00040531, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 1.5483655589730263, + "language_loss": 0.78477651, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80731946, + "num_input_tokens_seen": 247792395, + "step": 11480, + "time_per_iteration": 2.5448555946350098 + }, + { + "auxiliary_loss_clip": 0.01121124, + "auxiliary_loss_mlp": 0.0110308, + "balance_loss_clip": 1.00206387, + "balance_loss_mlp": 1.00046468, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.564363999883176, + "language_loss": 0.75477719, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77701914, + "num_input_tokens_seen": 247811985, + "step": 11481, + "time_per_iteration": 2.7048864364624023 + }, + { + "auxiliary_loss_clip": 0.01116653, + "auxiliary_loss_mlp": 0.01104057, + "balance_loss_clip": 1.0017277, + "balance_loss_mlp": 1.00048757, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.9253119882879677, + "language_loss": 0.69397461, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71618164, + "num_input_tokens_seen": 247831880, + "step": 11482, + "time_per_iteration": 2.6866579055786133 + }, + { + "auxiliary_loss_clip": 0.01164997, + "auxiliary_loss_mlp": 0.01103262, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.0004555, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 1.4815371727998103, + "language_loss": 0.8249318, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84761435, + "num_input_tokens_seen": 247851170, + "step": 11483, + "time_per_iteration": 2.527776002883911 + }, + { + "auxiliary_loss_clip": 0.01165111, + "auxiliary_loss_mlp": 0.01104565, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.00051951, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 2.2208885422791793, + "language_loss": 0.65252542, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67522216, + "num_input_tokens_seen": 247868950, + "step": 11484, + "time_per_iteration": 2.5598270893096924 + }, + { + "auxiliary_loss_clip": 0.01133577, + "auxiliary_loss_mlp": 0.01103774, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00049114, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.6658921690935167, + "language_loss": 0.7326808, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75505435, + "num_input_tokens_seen": 247889805, + "step": 11485, + "time_per_iteration": 2.7087981700897217 + }, + { + "auxiliary_loss_clip": 0.01148672, + "auxiliary_loss_mlp": 0.00747369, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00043762, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.6904973415816287, + "language_loss": 0.84555566, + "learning_rate": 9.22906510853017e-07, + "loss": 0.86451602, + "num_input_tokens_seen": 247908585, + "step": 11486, + "time_per_iteration": 2.584365129470825 + }, + { + "auxiliary_loss_clip": 0.01085475, + "auxiliary_loss_mlp": 0.01104121, + "balance_loss_clip": 1.00176907, + "balance_loss_mlp": 1.00055194, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.638495305676873, + "language_loss": 0.72627097, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74816692, + "num_input_tokens_seen": 247928480, + "step": 11487, + "time_per_iteration": 2.844078779220581 + }, + { + "auxiliary_loss_clip": 0.01128913, + "auxiliary_loss_mlp": 0.01079046, + "balance_loss_clip": 1.00091016, + "balance_loss_mlp": 1.00008154, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.9190883751504529, + "language_loss": 0.66648471, + "learning_rate": 9.222502751310759e-07, + "loss": 0.6885643, + "num_input_tokens_seen": 247988855, + "step": 11488, + "time_per_iteration": 3.2319438457489014 + }, + { + "auxiliary_loss_clip": 0.01131634, + "auxiliary_loss_mlp": 0.01105523, + "balance_loss_clip": 1.00174892, + "balance_loss_mlp": 1.00061929, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 1.7332590973830202, + "language_loss": 0.7436077, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76597929, + "num_input_tokens_seen": 248007685, + "step": 11489, + "time_per_iteration": 2.598119020462036 + }, + { + "auxiliary_loss_clip": 0.01150498, + "auxiliary_loss_mlp": 0.01105321, + "balance_loss_clip": 1.002002, + "balance_loss_mlp": 1.00051177, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 2.009449146282416, + "language_loss": 0.6210776, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64363581, + "num_input_tokens_seen": 248025145, + "step": 11490, + "time_per_iteration": 3.947784423828125 + }, + { + "auxiliary_loss_clip": 0.01131336, + "auxiliary_loss_mlp": 0.01103372, + "balance_loss_clip": 1.00177979, + "balance_loss_mlp": 1.00047088, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.668757094719572, + "language_loss": 0.72757536, + "learning_rate": 9.212662280920937e-07, + "loss": 0.74992239, + "num_input_tokens_seen": 248043750, + "step": 11491, + "time_per_iteration": 2.620492696762085 + }, + { + "auxiliary_loss_clip": 0.01131536, + "auxiliary_loss_mlp": 0.00747389, + "balance_loss_clip": 1.00184238, + "balance_loss_mlp": 1.00041056, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.9341953154356144, + "language_loss": 0.70595646, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72474569, + "num_input_tokens_seen": 248065765, + "step": 11492, + "time_per_iteration": 4.050936698913574 + }, + { + "auxiliary_loss_clip": 0.01099873, + "auxiliary_loss_mlp": 0.01104161, + "balance_loss_clip": 1.00165808, + "balance_loss_mlp": 1.00059175, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.6780969963732686, + "language_loss": 0.74484092, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76688123, + "num_input_tokens_seen": 248083810, + "step": 11493, + "time_per_iteration": 2.6622307300567627 + }, + { + "auxiliary_loss_clip": 0.0116498, + "auxiliary_loss_mlp": 0.01104006, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00053287, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.970835248789316, + "language_loss": 0.74770665, + "learning_rate": 9.20282549204336e-07, + "loss": 0.77039653, + "num_input_tokens_seen": 248103185, + "step": 11494, + "time_per_iteration": 2.5051016807556152 + }, + { + "auxiliary_loss_clip": 0.01133324, + "auxiliary_loss_mlp": 0.01103802, + "balance_loss_clip": 1.00174153, + "balance_loss_mlp": 1.00051856, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.532769347847281, + "language_loss": 0.68309534, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70546663, + "num_input_tokens_seen": 248125665, + "step": 11495, + "time_per_iteration": 2.6799309253692627 + }, + { + "auxiliary_loss_clip": 0.01132917, + "auxiliary_loss_mlp": 0.01104059, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.00049019, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.8466138857322612, + "language_loss": 0.74058366, + "learning_rate": 9.196269679734119e-07, + "loss": 0.7629534, + "num_input_tokens_seen": 248142545, + "step": 11496, + "time_per_iteration": 2.565983533859253 + }, + { + "auxiliary_loss_clip": 0.01117899, + "auxiliary_loss_mlp": 0.01103266, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00055516, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 2.5896997540373343, + "language_loss": 0.80020922, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82242095, + "num_input_tokens_seen": 248160225, + "step": 11497, + "time_per_iteration": 2.624082326889038 + }, + { + "auxiliary_loss_clip": 0.01117249, + "auxiliary_loss_mlp": 0.01106302, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00073075, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.6568303636128203, + "language_loss": 0.80708885, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82932431, + "num_input_tokens_seen": 248180430, + "step": 11498, + "time_per_iteration": 2.7519371509552 + }, + { + "auxiliary_loss_clip": 0.01148053, + "auxiliary_loss_mlp": 0.01103643, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.00064588, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.4758823187310177, + "language_loss": 0.8582738, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88079071, + "num_input_tokens_seen": 248202365, + "step": 11499, + "time_per_iteration": 2.6736369132995605 + }, + { + "auxiliary_loss_clip": 0.01134125, + "auxiliary_loss_mlp": 0.00747229, + "balance_loss_clip": 1.00189948, + "balance_loss_mlp": 1.00042748, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.669345031045693, + "language_loss": 0.75684965, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77566314, + "num_input_tokens_seen": 248221750, + "step": 11500, + "time_per_iteration": 2.6369571685791016 + }, + { + "auxiliary_loss_clip": 0.01088687, + "auxiliary_loss_mlp": 0.01105093, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00057006, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.9143242124314008, + "language_loss": 0.77461636, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79655415, + "num_input_tokens_seen": 248239535, + "step": 11501, + "time_per_iteration": 2.7208497524261475 + }, + { + "auxiliary_loss_clip": 0.01150601, + "auxiliary_loss_mlp": 0.01104159, + "balance_loss_clip": 1.00200617, + "balance_loss_mlp": 1.00058961, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.7642818891329004, + "language_loss": 0.73029017, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75283778, + "num_input_tokens_seen": 248259055, + "step": 11502, + "time_per_iteration": 2.604537010192871 + }, + { + "auxiliary_loss_clip": 0.01066029, + "auxiliary_loss_mlp": 0.01104381, + "balance_loss_clip": 1.00154817, + "balance_loss_mlp": 1.00052547, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 2.022196140916521, + "language_loss": 0.73490554, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75660962, + "num_input_tokens_seen": 248276765, + "step": 11503, + "time_per_iteration": 2.7574641704559326 + }, + { + "auxiliary_loss_clip": 0.01149926, + "auxiliary_loss_mlp": 0.01103571, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00047898, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 1.8282671882579389, + "language_loss": 0.7702477, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79278266, + "num_input_tokens_seen": 248295310, + "step": 11504, + "time_per_iteration": 2.587130546569824 + }, + { + "auxiliary_loss_clip": 0.01118945, + "auxiliary_loss_mlp": 0.01104373, + "balance_loss_clip": 1.00192988, + "balance_loss_mlp": 1.00051796, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 2.143399248465463, + "language_loss": 0.73782909, + "learning_rate": 9.166788817780499e-07, + "loss": 0.76006222, + "num_input_tokens_seen": 248315230, + "step": 11505, + "time_per_iteration": 2.6600120067596436 + }, + { + "auxiliary_loss_clip": 0.01087837, + "auxiliary_loss_mlp": 0.00747157, + "balance_loss_clip": 1.00168574, + "balance_loss_mlp": 1.00035942, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 2.268527838025351, + "language_loss": 0.87623113, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89458108, + "num_input_tokens_seen": 248332980, + "step": 11506, + "time_per_iteration": 2.73594331741333 + }, + { + "auxiliary_loss_clip": 0.0113163, + "auxiliary_loss_mlp": 0.01103748, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00036991, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 1.8961088669642814, + "language_loss": 0.7000801, + "learning_rate": 9.160242030697856e-07, + "loss": 0.72243392, + "num_input_tokens_seen": 248352865, + "step": 11507, + "time_per_iteration": 2.7150893211364746 + }, + { + "auxiliary_loss_clip": 0.01133791, + "auxiliary_loss_mlp": 0.0110362, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00052786, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 1.8680754230909906, + "language_loss": 0.76859516, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79096925, + "num_input_tokens_seen": 248371125, + "step": 11508, + "time_per_iteration": 2.6247470378875732 + }, + { + "auxiliary_loss_clip": 0.01150144, + "auxiliary_loss_mlp": 0.01102523, + "balance_loss_clip": 1.00204182, + "balance_loss_mlp": 1.0004797, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 2.4083415873975893, + "language_loss": 0.75009602, + "learning_rate": 9.153696887794027e-07, + "loss": 0.7726227, + "num_input_tokens_seen": 248390455, + "step": 11509, + "time_per_iteration": 2.6312668323516846 + }, + { + "auxiliary_loss_clip": 0.01100099, + "auxiliary_loss_mlp": 0.01103781, + "balance_loss_clip": 1.00187469, + "balance_loss_mlp": 1.00049758, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.5919573110815812, + "language_loss": 0.64126647, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66330522, + "num_input_tokens_seen": 248411305, + "step": 11510, + "time_per_iteration": 2.7330245971679688 + }, + { + "auxiliary_loss_clip": 0.01116359, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_clip": 1.0017556, + "balance_loss_mlp": 1.00050974, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 1.7897593640200655, + "language_loss": 0.75358343, + "learning_rate": 9.147153390061788e-07, + "loss": 0.7757954, + "num_input_tokens_seen": 248430190, + "step": 11511, + "time_per_iteration": 2.6334164142608643 + }, + { + "auxiliary_loss_clip": 0.0111892, + "auxiliary_loss_mlp": 0.01103732, + "balance_loss_clip": 1.00191772, + "balance_loss_mlp": 1.00054431, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 1.7423721261938738, + "language_loss": 0.62341619, + "learning_rate": 9.143882258445184e-07, + "loss": 0.6456427, + "num_input_tokens_seen": 248450830, + "step": 11512, + "time_per_iteration": 2.7184255123138428 + }, + { + "auxiliary_loss_clip": 0.01114371, + "auxiliary_loss_mlp": 0.01104356, + "balance_loss_clip": 1.00172687, + "balance_loss_mlp": 1.00059605, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.8387399557226334, + "language_loss": 0.8269822, + "learning_rate": 9.140611538493666e-07, + "loss": 0.84916949, + "num_input_tokens_seen": 248468585, + "step": 11513, + "time_per_iteration": 2.644944429397583 + }, + { + "auxiliary_loss_clip": 0.01083672, + "auxiliary_loss_mlp": 0.01103486, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00067949, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.961972289764544, + "language_loss": 0.78333473, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80520636, + "num_input_tokens_seen": 248490535, + "step": 11514, + "time_per_iteration": 2.7318685054779053 + }, + { + "auxiliary_loss_clip": 0.01101648, + "auxiliary_loss_mlp": 0.01104124, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00055504, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 2.4181345792723774, + "language_loss": 0.74695301, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76901072, + "num_input_tokens_seen": 248508575, + "step": 11515, + "time_per_iteration": 4.021791696548462 + }, + { + "auxiliary_loss_clip": 0.01102264, + "auxiliary_loss_mlp": 0.01103067, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00054693, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 1.9858872648417056, + "language_loss": 0.5371049, + "learning_rate": 9.130801849869694e-07, + "loss": 0.55915821, + "num_input_tokens_seen": 248527025, + "step": 11516, + "time_per_iteration": 4.1484375 + }, + { + "auxiliary_loss_clip": 0.01150129, + "auxiliary_loss_mlp": 0.01103251, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00054061, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.846428406372483, + "language_loss": 0.73226726, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75480103, + "num_input_tokens_seen": 248544275, + "step": 11517, + "time_per_iteration": 2.545222759246826 + }, + { + "auxiliary_loss_clip": 0.01164946, + "auxiliary_loss_mlp": 0.01105109, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00058603, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.6731329945892393, + "language_loss": 0.76170498, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78440559, + "num_input_tokens_seen": 248561870, + "step": 11518, + "time_per_iteration": 2.5035924911499023 + }, + { + "auxiliary_loss_clip": 0.01148468, + "auxiliary_loss_mlp": 0.01104111, + "balance_loss_clip": 1.0019598, + "balance_loss_mlp": 1.00044668, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.4393506262203453, + "language_loss": 0.64442992, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66695571, + "num_input_tokens_seen": 248588190, + "step": 11519, + "time_per_iteration": 2.687206983566284 + }, + { + "auxiliary_loss_clip": 0.01134067, + "auxiliary_loss_mlp": 0.01104016, + "balance_loss_clip": 1.00191748, + "balance_loss_mlp": 1.00054216, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.7882471523996457, + "language_loss": 0.62254667, + "learning_rate": 9.117728035871212e-07, + "loss": 0.6449275, + "num_input_tokens_seen": 248606460, + "step": 11520, + "time_per_iteration": 2.7106127738952637 + }, + { + "auxiliary_loss_clip": 0.01119068, + "auxiliary_loss_mlp": 0.01105478, + "balance_loss_clip": 1.00180066, + "balance_loss_mlp": 1.00066948, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 1.9573540070892526, + "language_loss": 0.77472734, + "learning_rate": 9.114460613703887e-07, + "loss": 0.79697287, + "num_input_tokens_seen": 248623715, + "step": 11521, + "time_per_iteration": 2.631415367126465 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01104726, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00058484, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.9212648133327574, + "language_loss": 0.819736, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84228528, + "num_input_tokens_seen": 248640575, + "step": 11522, + "time_per_iteration": 2.7197959423065186 + }, + { + "auxiliary_loss_clip": 0.01148658, + "auxiliary_loss_mlp": 0.01103271, + "balance_loss_clip": 1.00194907, + "balance_loss_mlp": 1.0005604, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.5247362010069632, + "language_loss": 0.76895463, + "learning_rate": 9.107927007835361e-07, + "loss": 0.79147398, + "num_input_tokens_seen": 248663535, + "step": 11523, + "time_per_iteration": 2.6385624408721924 + }, + { + "auxiliary_loss_clip": 0.01118433, + "auxiliary_loss_mlp": 0.01102354, + "balance_loss_clip": 1.00179219, + "balance_loss_mlp": 1.00050128, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 1.8252681026157422, + "language_loss": 0.67930949, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70151734, + "num_input_tokens_seen": 248681125, + "step": 11524, + "time_per_iteration": 2.6248908042907715 + }, + { + "auxiliary_loss_clip": 0.01116885, + "auxiliary_loss_mlp": 0.01105398, + "balance_loss_clip": 1.00179875, + "balance_loss_mlp": 1.000494, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.9460328823226811, + "language_loss": 0.64232516, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66454804, + "num_input_tokens_seen": 248700555, + "step": 11525, + "time_per_iteration": 2.716583013534546 + }, + { + "auxiliary_loss_clip": 0.0110154, + "auxiliary_loss_mlp": 0.01103477, + "balance_loss_clip": 1.00186038, + "balance_loss_mlp": 1.00067043, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 3.018209676285821, + "language_loss": 0.70970893, + "learning_rate": 9.098129697055907e-07, + "loss": 0.73175913, + "num_input_tokens_seen": 248716095, + "step": 11526, + "time_per_iteration": 2.747446298599243 + }, + { + "auxiliary_loss_clip": 0.01133677, + "auxiliary_loss_mlp": 0.01103424, + "balance_loss_clip": 1.00185573, + "balance_loss_mlp": 1.00052226, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.6437896969374943, + "language_loss": 0.76439011, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78676111, + "num_input_tokens_seen": 248735330, + "step": 11527, + "time_per_iteration": 4.080204725265503 + }, + { + "auxiliary_loss_clip": 0.01133288, + "auxiliary_loss_mlp": 0.01103118, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00040722, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.5263734622395628, + "language_loss": 0.79301226, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81537628, + "num_input_tokens_seen": 248754530, + "step": 11528, + "time_per_iteration": 2.634241819381714 + }, + { + "auxiliary_loss_clip": 0.01147973, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_clip": 1.00185859, + "balance_loss_mlp": 1.00057149, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.3879197806956485, + "language_loss": 0.75803542, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78053838, + "num_input_tokens_seen": 248775825, + "step": 11529, + "time_per_iteration": 2.6178228855133057 + }, + { + "auxiliary_loss_clip": 0.01164863, + "auxiliary_loss_mlp": 0.00747231, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00046206, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.8067408492909278, + "language_loss": 0.72525257, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74437356, + "num_input_tokens_seen": 248796180, + "step": 11530, + "time_per_iteration": 3.976248264312744 + }, + { + "auxiliary_loss_clip": 0.01132067, + "auxiliary_loss_mlp": 0.01105178, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00055957, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 1.7767026635504704, + "language_loss": 0.78123802, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80361044, + "num_input_tokens_seen": 248814735, + "step": 11531, + "time_per_iteration": 2.5952396392822266 + }, + { + "auxiliary_loss_clip": 0.01148369, + "auxiliary_loss_mlp": 0.01102887, + "balance_loss_clip": 1.00186169, + "balance_loss_mlp": 1.00046277, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 2.0482934055462962, + "language_loss": 0.69193625, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71444881, + "num_input_tokens_seen": 248839140, + "step": 11532, + "time_per_iteration": 2.6450860500335693 + }, + { + "auxiliary_loss_clip": 0.01133464, + "auxiliary_loss_mlp": 0.01105331, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.0005219, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.96910923050461, + "language_loss": 0.66965055, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69203854, + "num_input_tokens_seen": 248858300, + "step": 11533, + "time_per_iteration": 2.5856645107269287 + }, + { + "auxiliary_loss_clip": 0.01131581, + "auxiliary_loss_mlp": 0.01104678, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00063252, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 2.469498052321309, + "language_loss": 0.58672929, + "learning_rate": 9.072021733655007e-07, + "loss": 0.60909188, + "num_input_tokens_seen": 248876310, + "step": 11534, + "time_per_iteration": 2.5940182209014893 + }, + { + "auxiliary_loss_clip": 0.01118673, + "auxiliary_loss_mlp": 0.01103875, + "balance_loss_clip": 1.00169516, + "balance_loss_mlp": 1.00030613, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 2.0250475323259907, + "language_loss": 0.71059, + "learning_rate": 9.068760101685971e-07, + "loss": 0.7328155, + "num_input_tokens_seen": 248895650, + "step": 11535, + "time_per_iteration": 2.633786916732788 + }, + { + "auxiliary_loss_clip": 0.01128247, + "auxiliary_loss_mlp": 0.01078791, + "balance_loss_clip": 1.00083756, + "balance_loss_mlp": 1.00020814, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7102716675487223, + "language_loss": 0.59075356, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61282396, + "num_input_tokens_seen": 248963920, + "step": 11536, + "time_per_iteration": 3.268576145172119 + }, + { + "auxiliary_loss_clip": 0.01148665, + "auxiliary_loss_mlp": 0.00747239, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00044143, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.5454779132381204, + "language_loss": 0.72671032, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74566936, + "num_input_tokens_seen": 248983380, + "step": 11537, + "time_per_iteration": 2.5796828269958496 + }, + { + "auxiliary_loss_clip": 0.01144824, + "auxiliary_loss_mlp": 0.00745649, + "balance_loss_clip": 1.00091183, + "balance_loss_mlp": 1.00033879, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7491712400851606, + "language_loss": 0.55554712, + "learning_rate": 9.058977693355767e-07, + "loss": 0.5744518, + "num_input_tokens_seen": 249044680, + "step": 11538, + "time_per_iteration": 3.1293084621429443 + }, + { + "auxiliary_loss_clip": 0.01147855, + "auxiliary_loss_mlp": 0.01102563, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00051951, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.47839840572277, + "language_loss": 0.77760708, + "learning_rate": 9.055717720183505e-07, + "loss": 0.80011129, + "num_input_tokens_seen": 249061060, + "step": 11539, + "time_per_iteration": 2.6026089191436768 + }, + { + "auxiliary_loss_clip": 0.01131076, + "auxiliary_loss_mlp": 0.01102845, + "balance_loss_clip": 1.0018369, + "balance_loss_mlp": 1.00042057, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.788597583217613, + "language_loss": 0.64006281, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66240191, + "num_input_tokens_seen": 249081430, + "step": 11540, + "time_per_iteration": 2.6612462997436523 + }, + { + "auxiliary_loss_clip": 0.01118575, + "auxiliary_loss_mlp": 0.01103078, + "balance_loss_clip": 1.00202918, + "balance_loss_mlp": 1.00055838, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.5642523072787957, + "language_loss": 0.86739337, + "learning_rate": 9.049199018987437e-07, + "loss": 0.88960993, + "num_input_tokens_seen": 249103020, + "step": 11541, + "time_per_iteration": 2.7234628200531006 + }, + { + "auxiliary_loss_clip": 0.01165156, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00037456, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 1.7531112197752525, + "language_loss": 0.84281373, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86193949, + "num_input_tokens_seen": 249120810, + "step": 11542, + "time_per_iteration": 2.5421395301818848 + }, + { + "auxiliary_loss_clip": 0.01148296, + "auxiliary_loss_mlp": 0.01104725, + "balance_loss_clip": 1.00193071, + "balance_loss_mlp": 1.00048804, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.8436437679697, + "language_loss": 0.75235206, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77488232, + "num_input_tokens_seen": 249138050, + "step": 11543, + "time_per_iteration": 2.5712661743164062 + }, + { + "auxiliary_loss_clip": 0.01133063, + "auxiliary_loss_mlp": 0.01103732, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00063944, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.9086951843958075, + "language_loss": 0.76084852, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78321648, + "num_input_tokens_seen": 249155570, + "step": 11544, + "time_per_iteration": 2.595675468444824 + }, + { + "auxiliary_loss_clip": 0.01102121, + "auxiliary_loss_mlp": 0.01104423, + "balance_loss_clip": 1.00165212, + "balance_loss_mlp": 1.00056815, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 2.0149475640351957, + "language_loss": 0.70699829, + "learning_rate": 9.036166600649388e-07, + "loss": 0.72906375, + "num_input_tokens_seen": 249172960, + "step": 11545, + "time_per_iteration": 2.6587250232696533 + }, + { + "auxiliary_loss_clip": 0.01148175, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00052571, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.7166360466246824, + "language_loss": 0.79360139, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81611168, + "num_input_tokens_seen": 249192450, + "step": 11546, + "time_per_iteration": 2.559659481048584 + }, + { + "auxiliary_loss_clip": 0.01118364, + "auxiliary_loss_mlp": 0.01104145, + "balance_loss_clip": 1.00180554, + "balance_loss_mlp": 1.00048101, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 1.4240018318725807, + "language_loss": 0.78660047, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80882561, + "num_input_tokens_seen": 249214320, + "step": 11547, + "time_per_iteration": 2.725060224533081 + }, + { + "auxiliary_loss_clip": 0.0113161, + "auxiliary_loss_mlp": 0.00747342, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00044954, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.9378051083326655, + "language_loss": 0.80625987, + "learning_rate": 9.026396651834834e-07, + "loss": 0.8250494, + "num_input_tokens_seen": 249230925, + "step": 11548, + "time_per_iteration": 2.61801815032959 + }, + { + "auxiliary_loss_clip": 0.01159187, + "auxiliary_loss_mlp": 0.00745537, + "balance_loss_clip": 1.00085628, + "balance_loss_mlp": 1.00020587, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6945766088962348, + "language_loss": 0.53749096, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55653816, + "num_input_tokens_seen": 249293975, + "step": 11549, + "time_per_iteration": 3.086249589920044 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01103258, + "balance_loss_clip": 1.0019381, + "balance_loss_mlp": 1.00064301, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.4209298227225833, + "language_loss": 0.73852628, + "learning_rate": 9.01988543302e-07, + "loss": 0.76106238, + "num_input_tokens_seen": 249315285, + "step": 11550, + "time_per_iteration": 2.6746809482574463 + }, + { + "auxiliary_loss_clip": 0.01131959, + "auxiliary_loss_mlp": 0.01104374, + "balance_loss_clip": 1.0018549, + "balance_loss_mlp": 1.0005188, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 2.451499241466056, + "language_loss": 0.7431733, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76553661, + "num_input_tokens_seen": 249333505, + "step": 11551, + "time_per_iteration": 2.6046273708343506 + }, + { + "auxiliary_loss_clip": 0.0116494, + "auxiliary_loss_mlp": 0.01104049, + "balance_loss_clip": 1.00190973, + "balance_loss_mlp": 1.00057495, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.5595444878518787, + "language_loss": 0.84450161, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86719149, + "num_input_tokens_seen": 249354180, + "step": 11552, + "time_per_iteration": 2.6218488216400146 + }, + { + "auxiliary_loss_clip": 0.01164979, + "auxiliary_loss_mlp": 0.01103465, + "balance_loss_clip": 1.00197589, + "balance_loss_mlp": 1.00065899, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 1.6234398344131733, + "language_loss": 0.67596287, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69864732, + "num_input_tokens_seen": 249377035, + "step": 11553, + "time_per_iteration": 3.9948999881744385 + }, + { + "auxiliary_loss_clip": 0.01132949, + "auxiliary_loss_mlp": 0.01105014, + "balance_loss_clip": 1.00193191, + "balance_loss_mlp": 1.00049174, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.543349224489629, + "language_loss": 0.79513919, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81751883, + "num_input_tokens_seen": 249396155, + "step": 11554, + "time_per_iteration": 3.989922523498535 + }, + { + "auxiliary_loss_clip": 0.01148397, + "auxiliary_loss_mlp": 0.01103877, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00049877, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 2.02496210865872, + "language_loss": 0.72634894, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74887162, + "num_input_tokens_seen": 249414555, + "step": 11555, + "time_per_iteration": 2.5495588779449463 + }, + { + "auxiliary_loss_clip": 0.01116471, + "auxiliary_loss_mlp": 0.01103612, + "balance_loss_clip": 1.0016917, + "balance_loss_mlp": 1.00052011, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.6799141174077843, + "language_loss": 0.78061736, + "learning_rate": 9.000361773333705e-07, + "loss": 0.80281818, + "num_input_tokens_seen": 249433570, + "step": 11556, + "time_per_iteration": 2.6604840755462646 + }, + { + "auxiliary_loss_clip": 0.01084769, + "auxiliary_loss_mlp": 0.01104316, + "balance_loss_clip": 1.00161994, + "balance_loss_mlp": 1.00074744, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.1510486182402353, + "language_loss": 0.60462248, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62651336, + "num_input_tokens_seen": 249453735, + "step": 11557, + "time_per_iteration": 2.7660200595855713 + }, + { + "auxiliary_loss_clip": 0.01119902, + "auxiliary_loss_mlp": 0.01103208, + "balance_loss_clip": 1.00174654, + "balance_loss_mlp": 1.00059223, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.814913381793526, + "language_loss": 0.85641038, + "learning_rate": 8.993857222314752e-07, + "loss": 0.87864149, + "num_input_tokens_seen": 249470805, + "step": 11558, + "time_per_iteration": 2.6159188747406006 + }, + { + "auxiliary_loss_clip": 0.01150303, + "auxiliary_loss_mlp": 0.01104097, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00052786, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.7737712412036728, + "language_loss": 0.70478487, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72732884, + "num_input_tokens_seen": 249491150, + "step": 11559, + "time_per_iteration": 2.615694284439087 + }, + { + "auxiliary_loss_clip": 0.01114894, + "auxiliary_loss_mlp": 0.01104074, + "balance_loss_clip": 1.00178003, + "balance_loss_mlp": 1.00060034, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.7027452905061118, + "language_loss": 0.7875362, + "learning_rate": 8.987354340711921e-07, + "loss": 0.80972588, + "num_input_tokens_seen": 249511560, + "step": 11560, + "time_per_iteration": 2.6650311946868896 + }, + { + "auxiliary_loss_clip": 0.01131348, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_clip": 1.00179446, + "balance_loss_mlp": 1.00056338, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 1.7125420503258315, + "language_loss": 0.76603687, + "learning_rate": 8.9841035262498e-07, + "loss": 0.78838116, + "num_input_tokens_seen": 249531910, + "step": 11561, + "time_per_iteration": 2.6287782192230225 + }, + { + "auxiliary_loss_clip": 0.01164738, + "auxiliary_loss_mlp": 0.01103258, + "balance_loss_clip": 1.00189412, + "balance_loss_mlp": 1.00054693, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 3.186892852030687, + "language_loss": 0.78646421, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80914414, + "num_input_tokens_seen": 249550300, + "step": 11562, + "time_per_iteration": 2.514157295227051 + }, + { + "auxiliary_loss_clip": 0.01149704, + "auxiliary_loss_mlp": 0.0110479, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00045848, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 2.861326219448494, + "language_loss": 0.69554138, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71808636, + "num_input_tokens_seen": 249567740, + "step": 11563, + "time_per_iteration": 2.5716347694396973 + }, + { + "auxiliary_loss_clip": 0.01150727, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_clip": 1.0021286, + "balance_loss_mlp": 1.00045502, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.2162937221772427, + "language_loss": 0.7326377, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75517666, + "num_input_tokens_seen": 249582700, + "step": 11564, + "time_per_iteration": 2.543848991394043 + }, + { + "auxiliary_loss_clip": 0.01118176, + "auxiliary_loss_mlp": 0.01106104, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00053179, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.850451474881052, + "language_loss": 0.7188549, + "learning_rate": 8.971104446872785e-07, + "loss": 0.74109769, + "num_input_tokens_seen": 249602920, + "step": 11565, + "time_per_iteration": 4.134927034378052 + }, + { + "auxiliary_loss_clip": 0.01129788, + "auxiliary_loss_mlp": 0.01078259, + "balance_loss_clip": 1.00089824, + "balance_loss_mlp": 1.00005805, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.965092937649798, + "language_loss": 0.58496559, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60704613, + "num_input_tokens_seen": 249660400, + "step": 11566, + "time_per_iteration": 3.031259775161743 + }, + { + "auxiliary_loss_clip": 0.01116371, + "auxiliary_loss_mlp": 0.01104336, + "balance_loss_clip": 1.00175977, + "balance_loss_mlp": 1.00048089, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 4.395091739150487, + "language_loss": 0.74463749, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76684457, + "num_input_tokens_seen": 249679335, + "step": 11567, + "time_per_iteration": 2.706892728805542 + }, + { + "auxiliary_loss_clip": 0.01133485, + "auxiliary_loss_mlp": 0.01103657, + "balance_loss_clip": 1.00186789, + "balance_loss_mlp": 1.00056493, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.2897573704702638, + "language_loss": 0.76912868, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79150009, + "num_input_tokens_seen": 249701805, + "step": 11568, + "time_per_iteration": 4.029050588607788 + }, + { + "auxiliary_loss_clip": 0.01150755, + "auxiliary_loss_mlp": 0.01103423, + "balance_loss_clip": 1.00211501, + "balance_loss_mlp": 1.00052142, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 1.610576771363691, + "language_loss": 0.72021967, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74276149, + "num_input_tokens_seen": 249720550, + "step": 11569, + "time_per_iteration": 2.578324556350708 + }, + { + "auxiliary_loss_clip": 0.01133205, + "auxiliary_loss_mlp": 0.01102774, + "balance_loss_clip": 1.0019505, + "balance_loss_mlp": 1.00044501, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 1.5344526210043459, + "language_loss": 0.76915896, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79151881, + "num_input_tokens_seen": 249740325, + "step": 11570, + "time_per_iteration": 2.622746706008911 + }, + { + "auxiliary_loss_clip": 0.01148506, + "auxiliary_loss_mlp": 0.01103578, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00039029, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 1.8938542785250605, + "language_loss": 0.7476207, + "learning_rate": 8.95161837677493e-07, + "loss": 0.77014148, + "num_input_tokens_seen": 249760570, + "step": 11571, + "time_per_iteration": 2.709653377532959 + }, + { + "auxiliary_loss_clip": 0.01150015, + "auxiliary_loss_mlp": 0.01103186, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00047517, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 1.9150651064241289, + "language_loss": 0.74623191, + "learning_rate": 8.948372164052118e-07, + "loss": 0.7687639, + "num_input_tokens_seen": 249778290, + "step": 11572, + "time_per_iteration": 2.6160552501678467 + }, + { + "auxiliary_loss_clip": 0.01131501, + "auxiliary_loss_mlp": 0.01103022, + "balance_loss_clip": 1.001688, + "balance_loss_mlp": 1.00059688, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.6555301899446466, + "language_loss": 0.70046711, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72281229, + "num_input_tokens_seen": 249800925, + "step": 11573, + "time_per_iteration": 2.730752944946289 + }, + { + "auxiliary_loss_clip": 0.01131906, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_clip": 1.00200272, + "balance_loss_mlp": 1.00060272, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.7574376225441046, + "language_loss": 0.74362218, + "learning_rate": 8.941880995966095e-07, + "loss": 0.76600015, + "num_input_tokens_seen": 249820500, + "step": 11574, + "time_per_iteration": 2.607121467590332 + }, + { + "auxiliary_loss_clip": 0.01118789, + "auxiliary_loss_mlp": 0.01103703, + "balance_loss_clip": 1.00178957, + "balance_loss_mlp": 1.00051594, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 2.650128399566054, + "language_loss": 0.74757439, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76979929, + "num_input_tokens_seen": 249839845, + "step": 11575, + "time_per_iteration": 2.6721301078796387 + }, + { + "auxiliary_loss_clip": 0.0114854, + "auxiliary_loss_mlp": 0.01103737, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00045371, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 1.9459168746708706, + "language_loss": 0.78875428, + "learning_rate": 8.935391505179966e-07, + "loss": 0.81127709, + "num_input_tokens_seen": 249857400, + "step": 11576, + "time_per_iteration": 2.5490365028381348 + }, + { + "auxiliary_loss_clip": 0.01097684, + "auxiliary_loss_mlp": 0.0110402, + "balance_loss_clip": 1.00164819, + "balance_loss_mlp": 1.00045061, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.3029689054777247, + "language_loss": 0.56372666, + "learning_rate": 8.932147389081985e-07, + "loss": 0.58574367, + "num_input_tokens_seen": 249871645, + "step": 11577, + "time_per_iteration": 2.639443874359131 + }, + { + "auxiliary_loss_clip": 0.01072693, + "auxiliary_loss_mlp": 0.01102656, + "balance_loss_clip": 1.00172472, + "balance_loss_mlp": 1.000422, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.410192482935782, + "language_loss": 0.76640475, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78815824, + "num_input_tokens_seen": 249894215, + "step": 11578, + "time_per_iteration": 3.0010993480682373 + }, + { + "auxiliary_loss_clip": 0.01116833, + "auxiliary_loss_mlp": 0.01104329, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00056887, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 2.2260318312052556, + "language_loss": 0.79794931, + "learning_rate": 8.925660416091254e-07, + "loss": 0.82016098, + "num_input_tokens_seen": 249912850, + "step": 11579, + "time_per_iteration": 3.3564400672912598 + }, + { + "auxiliary_loss_clip": 0.01116814, + "auxiliary_loss_mlp": 0.01102312, + "balance_loss_clip": 1.00167143, + "balance_loss_mlp": 1.0003643, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 4.2536711200818, + "language_loss": 0.72578412, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74797535, + "num_input_tokens_seen": 249932650, + "step": 11580, + "time_per_iteration": 2.6571710109710693 + }, + { + "auxiliary_loss_clip": 0.01132975, + "auxiliary_loss_mlp": 0.01104542, + "balance_loss_clip": 1.00183833, + "balance_loss_mlp": 1.0005914, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 1.911673839889183, + "language_loss": 0.66263849, + "learning_rate": 8.919175122860787e-07, + "loss": 0.68501371, + "num_input_tokens_seen": 249951205, + "step": 11581, + "time_per_iteration": 2.6299290657043457 + }, + { + "auxiliary_loss_clip": 0.01164818, + "auxiliary_loss_mlp": 0.01102952, + "balance_loss_clip": 1.00183761, + "balance_loss_mlp": 1.00043201, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 1.9418383919113116, + "language_loss": 0.76795292, + "learning_rate": 8.915933106463056e-07, + "loss": 0.79063058, + "num_input_tokens_seen": 249967045, + "step": 11582, + "time_per_iteration": 2.5078892707824707 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.00047994, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 1.9569901438628954, + "language_loss": 0.69873786, + "learning_rate": 8.91269151037425e-07, + "loss": 0.7211048, + "num_input_tokens_seen": 249984565, + "step": 11583, + "time_per_iteration": 2.5790116786956787 + }, + { + "auxiliary_loss_clip": 0.0111727, + "auxiliary_loss_mlp": 0.01104184, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00061512, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 1.6719449672710285, + "language_loss": 0.82277673, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84499121, + "num_input_tokens_seen": 250004235, + "step": 11584, + "time_per_iteration": 2.679811477661133 + }, + { + "auxiliary_loss_clip": 0.01088858, + "auxiliary_loss_mlp": 0.01105004, + "balance_loss_clip": 1.0017643, + "balance_loss_mlp": 1.00057697, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.336874938525979, + "language_loss": 0.79553914, + "learning_rate": 8.906209579615107e-07, + "loss": 0.8174777, + "num_input_tokens_seen": 250017645, + "step": 11585, + "time_per_iteration": 2.7903695106506348 + }, + { + "auxiliary_loss_clip": 0.01164823, + "auxiliary_loss_mlp": 0.01103623, + "balance_loss_clip": 1.00192297, + "balance_loss_mlp": 1.00053108, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.4702619163084492, + "language_loss": 0.77679569, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79948008, + "num_input_tokens_seen": 250037640, + "step": 11586, + "time_per_iteration": 2.598914384841919 + }, + { + "auxiliary_loss_clip": 0.01149509, + "auxiliary_loss_mlp": 0.01102115, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00064445, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.6774760178822465, + "language_loss": 0.78598607, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80850232, + "num_input_tokens_seen": 250056490, + "step": 11587, + "time_per_iteration": 2.636857271194458 + }, + { + "auxiliary_loss_clip": 0.0113116, + "auxiliary_loss_mlp": 0.01103366, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00046444, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 2.4978985719935656, + "language_loss": 0.72772443, + "learning_rate": 8.896489838865857e-07, + "loss": 0.75006968, + "num_input_tokens_seen": 250074285, + "step": 11588, + "time_per_iteration": 2.7048087120056152 + }, + { + "auxiliary_loss_clip": 0.01131641, + "auxiliary_loss_mlp": 0.01102188, + "balance_loss_clip": 1.00173306, + "balance_loss_mlp": 1.00052667, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.7345990800254045, + "language_loss": 0.75035983, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77269816, + "num_input_tokens_seen": 250093350, + "step": 11589, + "time_per_iteration": 2.646207094192505 + }, + { + "auxiliary_loss_clip": 0.01133049, + "auxiliary_loss_mlp": 0.01103764, + "balance_loss_clip": 1.00181818, + "balance_loss_mlp": 1.00057673, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 2.9613016953139635, + "language_loss": 0.6359123, + "learning_rate": 8.890012116726012e-07, + "loss": 0.65828037, + "num_input_tokens_seen": 250114170, + "step": 11590, + "time_per_iteration": 4.044647455215454 + }, + { + "auxiliary_loss_clip": 0.01095283, + "auxiliary_loss_mlp": 0.01079065, + "balance_loss_clip": 1.00126028, + "balance_loss_mlp": 1.00010061, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7474694120163954, + "language_loss": 0.61246622, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63420975, + "num_input_tokens_seen": 250178250, + "step": 11591, + "time_per_iteration": 3.3344016075134277 + }, + { + "auxiliary_loss_clip": 0.0108321, + "auxiliary_loss_mlp": 0.00747306, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.00050187, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 1.6292303077602428, + "language_loss": 0.6903311, + "learning_rate": 8.883536079753582e-07, + "loss": 0.70863622, + "num_input_tokens_seen": 250198420, + "step": 11592, + "time_per_iteration": 4.474552631378174 + }, + { + "auxiliary_loss_clip": 0.01118443, + "auxiliary_loss_mlp": 0.01103442, + "balance_loss_clip": 1.00201511, + "balance_loss_mlp": 1.00054085, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.4678627819550363, + "language_loss": 0.62438333, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64660221, + "num_input_tokens_seen": 250220650, + "step": 11593, + "time_per_iteration": 2.746411085128784 + }, + { + "auxiliary_loss_clip": 0.01130972, + "auxiliary_loss_mlp": 0.01101801, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00033045, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 1.5511376523601115, + "language_loss": 0.54543805, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56776583, + "num_input_tokens_seen": 250241750, + "step": 11594, + "time_per_iteration": 2.667330741882324 + }, + { + "auxiliary_loss_clip": 0.01148174, + "auxiliary_loss_mlp": 0.0110367, + "balance_loss_clip": 1.00182319, + "balance_loss_mlp": 1.00048232, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 2.9279838016658166, + "language_loss": 0.7703234, + "learning_rate": 8.87382518613248e-07, + "loss": 0.79284179, + "num_input_tokens_seen": 250259445, + "step": 11595, + "time_per_iteration": 2.5549707412719727 + }, + { + "auxiliary_loss_clip": 0.01131527, + "auxiliary_loss_mlp": 0.00747423, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00054145, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.1374764734251044, + "language_loss": 0.71297055, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73176003, + "num_input_tokens_seen": 250275640, + "step": 11596, + "time_per_iteration": 2.5977959632873535 + }, + { + "auxiliary_loss_clip": 0.01165028, + "auxiliary_loss_mlp": 0.011035, + "balance_loss_clip": 1.00211096, + "balance_loss_mlp": 1.00050354, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.7046782387427772, + "language_loss": 0.76397777, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78666306, + "num_input_tokens_seen": 250296435, + "step": 11597, + "time_per_iteration": 2.5470619201660156 + }, + { + "auxiliary_loss_clip": 0.01150206, + "auxiliary_loss_mlp": 0.01103903, + "balance_loss_clip": 1.00184882, + "balance_loss_mlp": 1.00052512, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.8922004706798763, + "language_loss": 0.74754608, + "learning_rate": 8.864118089662267e-07, + "loss": 0.77008712, + "num_input_tokens_seen": 250314035, + "step": 11598, + "time_per_iteration": 2.5451912879943848 + }, + { + "auxiliary_loss_clip": 0.01133631, + "auxiliary_loss_mlp": 0.0110371, + "balance_loss_clip": 1.00184286, + "balance_loss_mlp": 1.00052249, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 1.708323404271452, + "language_loss": 0.89803737, + "learning_rate": 8.860883235222791e-07, + "loss": 0.92041081, + "num_input_tokens_seen": 250332995, + "step": 11599, + "time_per_iteration": 2.6962239742279053 + }, + { + "auxiliary_loss_clip": 0.01149607, + "auxiliary_loss_mlp": 0.01104667, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00062144, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.065004273144211, + "language_loss": 0.69844872, + "learning_rate": 8.85764880317974e-07, + "loss": 0.72099143, + "num_input_tokens_seen": 250352120, + "step": 11600, + "time_per_iteration": 2.5698907375335693 + }, + { + "auxiliary_loss_clip": 0.01117235, + "auxiliary_loss_mlp": 0.01104728, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00049174, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 1.7733951418331297, + "language_loss": 0.76836097, + "learning_rate": 8.854414793655771e-07, + "loss": 0.79058063, + "num_input_tokens_seen": 250371705, + "step": 11601, + "time_per_iteration": 2.7227160930633545 + }, + { + "auxiliary_loss_clip": 0.01147916, + "auxiliary_loss_mlp": 0.00747245, + "balance_loss_clip": 1.00179148, + "balance_loss_mlp": 1.0004673, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.7674463689802704, + "language_loss": 0.72404909, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74300063, + "num_input_tokens_seen": 250390485, + "step": 11602, + "time_per_iteration": 2.5415728092193604 + }, + { + "auxiliary_loss_clip": 0.01134285, + "auxiliary_loss_mlp": 0.00747225, + "balance_loss_clip": 1.0019511, + "balance_loss_mlp": 1.00058126, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 2.7488146990744577, + "language_loss": 0.76241308, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78122818, + "num_input_tokens_seen": 250407020, + "step": 11603, + "time_per_iteration": 4.018279552459717 + }, + { + "auxiliary_loss_clip": 0.01099573, + "auxiliary_loss_mlp": 0.01103043, + "balance_loss_clip": 1.00171769, + "balance_loss_mlp": 1.00042737, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 2.702998378933323, + "language_loss": 0.61968285, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64170897, + "num_input_tokens_seen": 250425880, + "step": 11604, + "time_per_iteration": 2.6756834983825684 + }, + { + "auxiliary_loss_clip": 0.01150445, + "auxiliary_loss_mlp": 0.01104709, + "balance_loss_clip": 1.00206065, + "balance_loss_mlp": 1.00056779, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.157453324502186, + "language_loss": 0.81323856, + "learning_rate": 8.841482983203057e-07, + "loss": 0.8357901, + "num_input_tokens_seen": 250442925, + "step": 11605, + "time_per_iteration": 3.9905903339385986 + }, + { + "auxiliary_loss_clip": 0.01147964, + "auxiliary_loss_mlp": 0.01102914, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00048971, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.8097303935156976, + "language_loss": 0.70302439, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72553319, + "num_input_tokens_seen": 250461220, + "step": 11606, + "time_per_iteration": 2.5920753479003906 + }, + { + "auxiliary_loss_clip": 0.01131689, + "auxiliary_loss_mlp": 0.01103925, + "balance_loss_clip": 1.00184691, + "balance_loss_mlp": 1.00054693, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 1.884543961210304, + "language_loss": 0.82166469, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84402084, + "num_input_tokens_seen": 250480975, + "step": 11607, + "time_per_iteration": 2.6520254611968994 + }, + { + "auxiliary_loss_clip": 0.01131657, + "auxiliary_loss_mlp": 0.0110359, + "balance_loss_clip": 1.00175798, + "balance_loss_mlp": 1.00049829, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 2.2710370019301247, + "language_loss": 0.79015559, + "learning_rate": 8.831788567821265e-07, + "loss": 0.81250811, + "num_input_tokens_seen": 250497980, + "step": 11608, + "time_per_iteration": 2.5812008380889893 + }, + { + "auxiliary_loss_clip": 0.0113443, + "auxiliary_loss_mlp": 0.0110292, + "balance_loss_clip": 1.001724, + "balance_loss_mlp": 1.00049531, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 2.0509739000645446, + "language_loss": 0.89980584, + "learning_rate": 8.828557942863357e-07, + "loss": 0.92217934, + "num_input_tokens_seen": 250511910, + "step": 11609, + "time_per_iteration": 2.5697412490844727 + }, + { + "auxiliary_loss_clip": 0.0111701, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_clip": 1.00164223, + "balance_loss_mlp": 1.00046062, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 1.694553854910159, + "language_loss": 0.63768983, + "learning_rate": 8.82532774152765e-07, + "loss": 0.65989733, + "num_input_tokens_seen": 250531090, + "step": 11610, + "time_per_iteration": 2.650297164916992 + }, + { + "auxiliary_loss_clip": 0.01115848, + "auxiliary_loss_mlp": 0.0110177, + "balance_loss_clip": 1.00165319, + "balance_loss_mlp": 1.00053704, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.8514376184538865, + "language_loss": 0.84927547, + "learning_rate": 8.822097963936643e-07, + "loss": 0.87145168, + "num_input_tokens_seen": 250551565, + "step": 11611, + "time_per_iteration": 2.762986183166504 + }, + { + "auxiliary_loss_clip": 0.01147845, + "auxiliary_loss_mlp": 0.01103682, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00049472, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 1.926641532785966, + "language_loss": 0.70336759, + "learning_rate": 8.818868610212793e-07, + "loss": 0.72588289, + "num_input_tokens_seen": 250569625, + "step": 11612, + "time_per_iteration": 2.545452356338501 + }, + { + "auxiliary_loss_clip": 0.01150407, + "auxiliary_loss_mlp": 0.01103472, + "balance_loss_clip": 1.00199509, + "balance_loss_mlp": 1.0005703, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.8754888518137034, + "language_loss": 0.8135097, + "learning_rate": 8.815639680478573e-07, + "loss": 0.83604848, + "num_input_tokens_seen": 250586960, + "step": 11613, + "time_per_iteration": 2.5364341735839844 + }, + { + "auxiliary_loss_clip": 0.01149758, + "auxiliary_loss_mlp": 0.01102487, + "balance_loss_clip": 1.00194275, + "balance_loss_mlp": 1.00053895, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 1.9723746830345936, + "language_loss": 0.75377131, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77629375, + "num_input_tokens_seen": 250605080, + "step": 11614, + "time_per_iteration": 2.593736171722412 + }, + { + "auxiliary_loss_clip": 0.01068509, + "auxiliary_loss_mlp": 0.01103261, + "balance_loss_clip": 1.00149274, + "balance_loss_mlp": 1.00045443, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.551333240597364, + "language_loss": 0.7704649, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79218256, + "num_input_tokens_seen": 250623965, + "step": 11615, + "time_per_iteration": 2.770897626876831 + }, + { + "auxiliary_loss_clip": 0.01131496, + "auxiliary_loss_mlp": 0.01102666, + "balance_loss_clip": 1.00167799, + "balance_loss_mlp": 1.00052774, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 2.4063303166090027, + "language_loss": 0.72800809, + "learning_rate": 8.80595543643797e-07, + "loss": 0.75034976, + "num_input_tokens_seen": 250640675, + "step": 11616, + "time_per_iteration": 2.5817813873291016 + }, + { + "auxiliary_loss_clip": 0.01164739, + "auxiliary_loss_mlp": 0.01103146, + "balance_loss_clip": 1.00195479, + "balance_loss_mlp": 1.00062585, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 2.1336311282575524, + "language_loss": 0.84272027, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86539912, + "num_input_tokens_seen": 250660295, + "step": 11617, + "time_per_iteration": 2.608337640762329 + }, + { + "auxiliary_loss_clip": 0.01119532, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00053406, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.4790681581498557, + "language_loss": 0.59447354, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61670232, + "num_input_tokens_seen": 250678155, + "step": 11618, + "time_per_iteration": 2.6629984378814697 + }, + { + "auxiliary_loss_clip": 0.01133066, + "auxiliary_loss_mlp": 0.01103654, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00065732, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 2.994787943936449, + "language_loss": 0.82791704, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85028422, + "num_input_tokens_seen": 250697230, + "step": 11619, + "time_per_iteration": 2.7921855449676514 + }, + { + "auxiliary_loss_clip": 0.01148007, + "auxiliary_loss_mlp": 0.01102586, + "balance_loss_clip": 1.00186491, + "balance_loss_mlp": 1.00044775, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 2.477555831026279, + "language_loss": 0.67341995, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69592595, + "num_input_tokens_seen": 250719865, + "step": 11620, + "time_per_iteration": 2.7270026206970215 + }, + { + "auxiliary_loss_clip": 0.01086818, + "auxiliary_loss_mlp": 0.01103526, + "balance_loss_clip": 1.00171661, + "balance_loss_mlp": 1.00043416, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 2.2132670956618394, + "language_loss": 0.72464204, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74654549, + "num_input_tokens_seen": 250736565, + "step": 11621, + "time_per_iteration": 2.75146484375 + }, + { + "auxiliary_loss_clip": 0.0110352, + "auxiliary_loss_mlp": 0.01104269, + "balance_loss_clip": 1.00186479, + "balance_loss_mlp": 1.00060487, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.8412967845232702, + "language_loss": 0.68577468, + "learning_rate": 8.7865984126011e-07, + "loss": 0.7078526, + "num_input_tokens_seen": 250757235, + "step": 11622, + "time_per_iteration": 2.72786808013916 + }, + { + "auxiliary_loss_clip": 0.01083258, + "auxiliary_loss_mlp": 0.01101634, + "balance_loss_clip": 1.00163507, + "balance_loss_mlp": 1.00035369, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 2.658109401921648, + "language_loss": 0.62952787, + "learning_rate": 8.783373729494721e-07, + "loss": 0.65137678, + "num_input_tokens_seen": 250775585, + "step": 11623, + "time_per_iteration": 2.748687744140625 + }, + { + "auxiliary_loss_clip": 0.01165204, + "auxiliary_loss_mlp": 0.01103748, + "balance_loss_clip": 1.00201368, + "balance_loss_mlp": 1.00036955, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 2.2165286217339473, + "language_loss": 0.60685182, + "learning_rate": 8.780149471723932e-07, + "loss": 0.6295414, + "num_input_tokens_seen": 250795725, + "step": 11624, + "time_per_iteration": 2.7063751220703125 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01104511, + "balance_loss_clip": 1.0019182, + "balance_loss_mlp": 1.0007515, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.6698491205772779, + "language_loss": 0.78383374, + "learning_rate": 8.776925639411017e-07, + "loss": 0.8063646, + "num_input_tokens_seen": 250814555, + "step": 11625, + "time_per_iteration": 2.6140666007995605 + }, + { + "auxiliary_loss_clip": 0.01116734, + "auxiliary_loss_mlp": 0.01102211, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00064492, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 2.1086126953179964, + "language_loss": 0.66357112, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68576056, + "num_input_tokens_seen": 250833105, + "step": 11626, + "time_per_iteration": 2.649637460708618 + }, + { + "auxiliary_loss_clip": 0.01133423, + "auxiliary_loss_mlp": 0.00747367, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.00048161, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 3.5680959151171194, + "language_loss": 0.70359004, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72239798, + "num_input_tokens_seen": 250852570, + "step": 11627, + "time_per_iteration": 2.721522569656372 + }, + { + "auxiliary_loss_clip": 0.01164852, + "auxiliary_loss_mlp": 0.01101731, + "balance_loss_clip": 1.00204754, + "balance_loss_mlp": 1.00045097, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 2.5552919404525376, + "language_loss": 0.62715924, + "learning_rate": 8.767256696441768e-07, + "loss": 0.6498251, + "num_input_tokens_seen": 250870500, + "step": 11628, + "time_per_iteration": 4.2145795822143555 + }, + { + "auxiliary_loss_clip": 0.01150438, + "auxiliary_loss_mlp": 0.01103903, + "balance_loss_clip": 1.00194609, + "balance_loss_mlp": 1.00042963, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.744170392072995, + "language_loss": 0.68358386, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70612729, + "num_input_tokens_seen": 250892745, + "step": 11629, + "time_per_iteration": 4.147163391113281 + }, + { + "auxiliary_loss_clip": 0.01164926, + "auxiliary_loss_mlp": 0.01103036, + "balance_loss_clip": 1.0020138, + "balance_loss_mlp": 1.00070667, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.8532174036408116, + "language_loss": 0.72202748, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74470705, + "num_input_tokens_seen": 250910225, + "step": 11630, + "time_per_iteration": 2.5273501873016357 + }, + { + "auxiliary_loss_clip": 0.01164905, + "auxiliary_loss_mlp": 0.01103186, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.0006665, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 2.070279564089775, + "language_loss": 0.744542, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76722288, + "num_input_tokens_seen": 250929715, + "step": 11631, + "time_per_iteration": 2.54659104347229 + }, + { + "auxiliary_loss_clip": 0.01147693, + "auxiliary_loss_mlp": 0.01104036, + "balance_loss_clip": 1.00186789, + "balance_loss_mlp": 1.00056195, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.1063824831424536, + "language_loss": 0.89311725, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91563451, + "num_input_tokens_seen": 250944230, + "step": 11632, + "time_per_iteration": 2.5464513301849365 + }, + { + "auxiliary_loss_clip": 0.0113166, + "auxiliary_loss_mlp": 0.01104052, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.0005784, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.8754807676322462, + "language_loss": 0.80353725, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82589436, + "num_input_tokens_seen": 250961865, + "step": 11633, + "time_per_iteration": 2.6018762588500977 + }, + { + "auxiliary_loss_clip": 0.01165021, + "auxiliary_loss_mlp": 0.0110435, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.0004952, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 1.9981317275184205, + "language_loss": 0.6706723, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69336593, + "num_input_tokens_seen": 250982025, + "step": 11634, + "time_per_iteration": 2.565861701965332 + }, + { + "auxiliary_loss_clip": 0.01114211, + "auxiliary_loss_mlp": 0.01078515, + "balance_loss_clip": 1.00082505, + "balance_loss_mlp": 0.99993271, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.6910021034268247, + "language_loss": 0.53174347, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55367076, + "num_input_tokens_seen": 251046900, + "step": 11635, + "time_per_iteration": 3.350391387939453 + }, + { + "auxiliary_loss_clip": 0.01131749, + "auxiliary_loss_mlp": 0.01103193, + "balance_loss_clip": 1.0017432, + "balance_loss_mlp": 1.00038695, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.8123224319510374, + "language_loss": 0.82248938, + "learning_rate": 8.741491599138726e-07, + "loss": 0.8448388, + "num_input_tokens_seen": 251065050, + "step": 11636, + "time_per_iteration": 2.605696678161621 + }, + { + "auxiliary_loss_clip": 0.01164937, + "auxiliary_loss_mlp": 0.01103486, + "balance_loss_clip": 1.00187051, + "balance_loss_mlp": 1.00039339, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 2.191576052439576, + "language_loss": 0.82975984, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85244405, + "num_input_tokens_seen": 251083355, + "step": 11637, + "time_per_iteration": 2.569896936416626 + }, + { + "auxiliary_loss_clip": 0.01100283, + "auxiliary_loss_mlp": 0.01103934, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00055528, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 3.8857662014907013, + "language_loss": 0.68021792, + "learning_rate": 8.735054591608704e-07, + "loss": 0.70226008, + "num_input_tokens_seen": 251096420, + "step": 11638, + "time_per_iteration": 2.6812522411346436 + }, + { + "auxiliary_loss_clip": 0.01148365, + "auxiliary_loss_mlp": 0.01103933, + "balance_loss_clip": 1.00184596, + "balance_loss_mlp": 1.00055456, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 1.7351810899528393, + "language_loss": 0.77950418, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80202717, + "num_input_tokens_seen": 251115410, + "step": 11639, + "time_per_iteration": 2.672956943511963 + }, + { + "auxiliary_loss_clip": 0.01133984, + "auxiliary_loss_mlp": 0.01103685, + "balance_loss_clip": 1.0019412, + "balance_loss_mlp": 1.00059295, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.249578026832873, + "language_loss": 0.82434636, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84672308, + "num_input_tokens_seen": 251133530, + "step": 11640, + "time_per_iteration": 4.276147127151489 + }, + { + "auxiliary_loss_clip": 0.01115768, + "auxiliary_loss_mlp": 0.01102794, + "balance_loss_clip": 1.00168908, + "balance_loss_mlp": 1.00055981, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 2.7328260113839535, + "language_loss": 0.75237226, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77455795, + "num_input_tokens_seen": 251153985, + "step": 11641, + "time_per_iteration": 2.7541470527648926 + }, + { + "auxiliary_loss_clip": 0.01133526, + "auxiliary_loss_mlp": 0.01103355, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00035858, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 1.947860194149672, + "language_loss": 0.78360653, + "learning_rate": 8.722185703539022e-07, + "loss": 0.80597532, + "num_input_tokens_seen": 251173225, + "step": 11642, + "time_per_iteration": 2.661468029022217 + }, + { + "auxiliary_loss_clip": 0.01150257, + "auxiliary_loss_mlp": 0.01104448, + "balance_loss_clip": 1.0019573, + "balance_loss_mlp": 1.00049734, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 2.178306741993864, + "language_loss": 0.75032383, + "learning_rate": 8.718969550356266e-07, + "loss": 0.77287084, + "num_input_tokens_seen": 251192485, + "step": 11643, + "time_per_iteration": 4.0034425258636475 + }, + { + "auxiliary_loss_clip": 0.01115817, + "auxiliary_loss_mlp": 0.01102997, + "balance_loss_clip": 1.0016911, + "balance_loss_mlp": 1.00038195, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.3684135780984117, + "language_loss": 0.60384798, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62603617, + "num_input_tokens_seen": 251214965, + "step": 11644, + "time_per_iteration": 2.742992877960205 + }, + { + "auxiliary_loss_clip": 0.01150039, + "auxiliary_loss_mlp": 0.01103572, + "balance_loss_clip": 1.00195122, + "balance_loss_mlp": 1.0004797, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.7191396639632404, + "language_loss": 0.81999648, + "learning_rate": 8.712538527446119e-07, + "loss": 0.84253263, + "num_input_tokens_seen": 251234500, + "step": 11645, + "time_per_iteration": 2.6270864009857178 + }, + { + "auxiliary_loss_clip": 0.01149668, + "auxiliary_loss_mlp": 0.01103143, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00062323, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 3.854009208965578, + "language_loss": 0.6820277, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70455581, + "num_input_tokens_seen": 251254360, + "step": 11646, + "time_per_iteration": 2.61323881149292 + }, + { + "auxiliary_loss_clip": 0.01149929, + "auxiliary_loss_mlp": 0.01102722, + "balance_loss_clip": 1.00198257, + "balance_loss_mlp": 1.0004878, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 3.080288069239314, + "language_loss": 0.7093569, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73188347, + "num_input_tokens_seen": 251274790, + "step": 11647, + "time_per_iteration": 2.612952947616577 + }, + { + "auxiliary_loss_clip": 0.01148325, + "auxiliary_loss_mlp": 0.0110487, + "balance_loss_clip": 1.00198936, + "balance_loss_mlp": 1.00053823, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 8.174764496026627, + "language_loss": 0.71225923, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73479122, + "num_input_tokens_seen": 251296275, + "step": 11648, + "time_per_iteration": 2.7296156883239746 + }, + { + "auxiliary_loss_clip": 0.0110193, + "auxiliary_loss_mlp": 0.01103214, + "balance_loss_clip": 1.00168347, + "balance_loss_mlp": 1.00050366, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 11.112275365919324, + "language_loss": 0.77238727, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79443872, + "num_input_tokens_seen": 251317375, + "step": 11649, + "time_per_iteration": 2.7862353324890137 + }, + { + "auxiliary_loss_clip": 0.01132891, + "auxiliary_loss_mlp": 0.01102616, + "balance_loss_clip": 1.00176191, + "balance_loss_mlp": 1.00047708, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 1.6719538070656061, + "language_loss": 0.78800046, + "learning_rate": 8.69646846268308e-07, + "loss": 0.81035548, + "num_input_tokens_seen": 251333570, + "step": 11650, + "time_per_iteration": 2.6044108867645264 + }, + { + "auxiliary_loss_clip": 0.01130887, + "auxiliary_loss_mlp": 0.0110227, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00041795, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 1.8539839495684627, + "language_loss": 0.78212792, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80445951, + "num_input_tokens_seen": 251351070, + "step": 11651, + "time_per_iteration": 2.630552053451538 + }, + { + "auxiliary_loss_clip": 0.01116765, + "auxiliary_loss_mlp": 0.01103563, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00056601, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.972676308632148, + "language_loss": 0.69222915, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71443242, + "num_input_tokens_seen": 251370005, + "step": 11652, + "time_per_iteration": 2.661924123764038 + }, + { + "auxiliary_loss_clip": 0.0114964, + "auxiliary_loss_mlp": 0.01103372, + "balance_loss_clip": 1.00190032, + "balance_loss_mlp": 1.00047064, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.620707109137788, + "language_loss": 0.74697971, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76950979, + "num_input_tokens_seen": 251391210, + "step": 11653, + "time_per_iteration": 2.6136839389801025 + }, + { + "auxiliary_loss_clip": 0.01133958, + "auxiliary_loss_mlp": 0.01103566, + "balance_loss_clip": 1.00200856, + "balance_loss_mlp": 1.00037885, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 2.445231133191376, + "language_loss": 0.70659393, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72896916, + "num_input_tokens_seen": 251411505, + "step": 11654, + "time_per_iteration": 2.621764659881592 + }, + { + "auxiliary_loss_clip": 0.01100092, + "auxiliary_loss_mlp": 0.01104177, + "balance_loss_clip": 1.00171292, + "balance_loss_mlp": 1.00041759, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 7.439157161651911, + "language_loss": 0.72604024, + "learning_rate": 8.680409113695453e-07, + "loss": 0.74808294, + "num_input_tokens_seen": 251428975, + "step": 11655, + "time_per_iteration": 2.6715903282165527 + }, + { + "auxiliary_loss_clip": 0.01148545, + "auxiliary_loss_mlp": 0.0110469, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00054836, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 2.075154912684562, + "language_loss": 0.70464802, + "learning_rate": 8.677198531130889e-07, + "loss": 0.7271803, + "num_input_tokens_seen": 251446940, + "step": 11656, + "time_per_iteration": 2.594040632247925 + }, + { + "auxiliary_loss_clip": 0.01101374, + "auxiliary_loss_mlp": 0.01102505, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00046229, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.4635671444299612, + "language_loss": 0.77627838, + "learning_rate": 8.673988377928092e-07, + "loss": 0.79831719, + "num_input_tokens_seen": 251466205, + "step": 11657, + "time_per_iteration": 2.7449543476104736 + }, + { + "auxiliary_loss_clip": 0.01165101, + "auxiliary_loss_mlp": 0.01104258, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00049853, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 1.8557985686777627, + "language_loss": 0.77600908, + "learning_rate": 8.670778654208797e-07, + "loss": 0.79870272, + "num_input_tokens_seen": 251484820, + "step": 11658, + "time_per_iteration": 2.5382168292999268 + }, + { + "auxiliary_loss_clip": 0.01133394, + "auxiliary_loss_mlp": 0.01102461, + "balance_loss_clip": 1.00188291, + "balance_loss_mlp": 1.00051367, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 1.735063247960449, + "language_loss": 0.82653815, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84889674, + "num_input_tokens_seen": 251502670, + "step": 11659, + "time_per_iteration": 2.6036489009857178 + }, + { + "auxiliary_loss_clip": 0.01118971, + "auxiliary_loss_mlp": 0.01103118, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00050235, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 1.9276836939693296, + "language_loss": 0.69576144, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71798241, + "num_input_tokens_seen": 251521630, + "step": 11660, + "time_per_iteration": 2.6281826496124268 + }, + { + "auxiliary_loss_clip": 0.01165029, + "auxiliary_loss_mlp": 0.01104581, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00043988, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 2.355233402823887, + "language_loss": 0.81193548, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83463156, + "num_input_tokens_seen": 251540105, + "step": 11661, + "time_per_iteration": 2.5438523292541504 + }, + { + "auxiliary_loss_clip": 0.01148152, + "auxiliary_loss_mlp": 0.01103137, + "balance_loss_clip": 1.00182188, + "balance_loss_mlp": 1.00052166, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 1.7971747474963342, + "language_loss": 0.78943884, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81195176, + "num_input_tokens_seen": 251560530, + "step": 11662, + "time_per_iteration": 2.673306465148926 + }, + { + "auxiliary_loss_clip": 0.01150043, + "auxiliary_loss_mlp": 0.0110384, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.00046182, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 3.190137854268252, + "language_loss": 0.8354544, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85799325, + "num_input_tokens_seen": 251577930, + "step": 11663, + "time_per_iteration": 2.5641896724700928 + }, + { + "auxiliary_loss_clip": 0.01141873, + "auxiliary_loss_mlp": 0.01077877, + "balance_loss_clip": 1.00065804, + "balance_loss_mlp": 1.0000571, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8685406294634347, + "language_loss": 0.53763419, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55983174, + "num_input_tokens_seen": 251638820, + "step": 11664, + "time_per_iteration": 3.1208653450012207 + }, + { + "auxiliary_loss_clip": 0.01133667, + "auxiliary_loss_mlp": 0.01103334, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00052834, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 2.580585182689016, + "language_loss": 0.78901541, + "learning_rate": 8.64832262393344e-07, + "loss": 0.81138539, + "num_input_tokens_seen": 251658070, + "step": 11665, + "time_per_iteration": 4.08051609992981 + }, + { + "auxiliary_loss_clip": 0.01149809, + "auxiliary_loss_mlp": 0.01102209, + "balance_loss_clip": 1.00187254, + "balance_loss_mlp": 1.00045204, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.156316147140639, + "language_loss": 0.76901639, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79153657, + "num_input_tokens_seen": 251671575, + "step": 11666, + "time_per_iteration": 2.514970302581787 + }, + { + "auxiliary_loss_clip": 0.01150072, + "auxiliary_loss_mlp": 0.01103726, + "balance_loss_clip": 1.00198555, + "balance_loss_mlp": 1.00053871, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 1.9783050698845246, + "language_loss": 0.81036192, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83289993, + "num_input_tokens_seen": 251689350, + "step": 11667, + "time_per_iteration": 4.0569539070129395 + }, + { + "auxiliary_loss_clip": 0.01116254, + "auxiliary_loss_mlp": 0.01104344, + "balance_loss_clip": 1.001683, + "balance_loss_mlp": 1.00067949, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.3186528930386827, + "language_loss": 0.65452683, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67673284, + "num_input_tokens_seen": 251704635, + "step": 11668, + "time_per_iteration": 2.7984044551849365 + }, + { + "auxiliary_loss_clip": 0.01133258, + "auxiliary_loss_mlp": 0.01103075, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00045979, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 2.2181300580834407, + "language_loss": 0.76694137, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78930467, + "num_input_tokens_seen": 251723035, + "step": 11669, + "time_per_iteration": 2.742117404937744 + }, + { + "auxiliary_loss_clip": 0.01128079, + "auxiliary_loss_mlp": 0.01077771, + "balance_loss_clip": 1.00096035, + "balance_loss_mlp": 0.99995154, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6932184500840746, + "language_loss": 0.54496419, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56702268, + "num_input_tokens_seen": 251791630, + "step": 11670, + "time_per_iteration": 3.370664119720459 + }, + { + "auxiliary_loss_clip": 0.01131591, + "auxiliary_loss_mlp": 0.01102724, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00048959, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.8997841513169638, + "language_loss": 0.81641364, + "learning_rate": 8.629091384213218e-07, + "loss": 0.8387568, + "num_input_tokens_seen": 251809840, + "step": 11671, + "time_per_iteration": 2.708545684814453 + }, + { + "auxiliary_loss_clip": 0.01150462, + "auxiliary_loss_mlp": 0.01104449, + "balance_loss_clip": 1.00200438, + "balance_loss_mlp": 1.00049829, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 2.141411674176966, + "language_loss": 0.74895823, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77150732, + "num_input_tokens_seen": 251827550, + "step": 11672, + "time_per_iteration": 2.6645190715789795 + }, + { + "auxiliary_loss_clip": 0.01150379, + "auxiliary_loss_mlp": 0.01102647, + "balance_loss_clip": 1.0018965, + "balance_loss_mlp": 1.00050879, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 2.0926834733805033, + "language_loss": 0.86762613, + "learning_rate": 8.622684419164883e-07, + "loss": 0.89015639, + "num_input_tokens_seen": 251844880, + "step": 11673, + "time_per_iteration": 2.694748640060425 + }, + { + "auxiliary_loss_clip": 0.01149973, + "auxiliary_loss_mlp": 0.01101863, + "balance_loss_clip": 1.00182879, + "balance_loss_mlp": 1.00039184, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 1.9724481553555568, + "language_loss": 0.73168063, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75419903, + "num_input_tokens_seen": 251861025, + "step": 11674, + "time_per_iteration": 2.6498398780822754 + }, + { + "auxiliary_loss_clip": 0.01150486, + "auxiliary_loss_mlp": 0.00747212, + "balance_loss_clip": 1.00215602, + "balance_loss_mlp": 1.00049376, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.5852196421255862, + "language_loss": 0.72019529, + "learning_rate": 8.616279179832329e-07, + "loss": 0.73917228, + "num_input_tokens_seen": 251880175, + "step": 11675, + "time_per_iteration": 2.70175838470459 + }, + { + "auxiliary_loss_clip": 0.0111653, + "auxiliary_loss_mlp": 0.01104031, + "balance_loss_clip": 1.00189221, + "balance_loss_mlp": 1.00036621, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 1.9910803066205252, + "language_loss": 0.50708783, + "learning_rate": 8.613077207613078e-07, + "loss": 0.52929336, + "num_input_tokens_seen": 251899005, + "step": 11676, + "time_per_iteration": 2.723351240158081 + }, + { + "auxiliary_loss_clip": 0.01125322, + "auxiliary_loss_mlp": 0.00745373, + "balance_loss_clip": 1.00080562, + "balance_loss_mlp": 1.00015259, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7328502705676121, + "language_loss": 0.59228474, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61099172, + "num_input_tokens_seen": 251966790, + "step": 11677, + "time_per_iteration": 3.3293204307556152 + }, + { + "auxiliary_loss_clip": 0.01150048, + "auxiliary_loss_mlp": 0.01103315, + "balance_loss_clip": 1.00191772, + "balance_loss_mlp": 1.00041318, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 2.134531729857144, + "language_loss": 0.6224367, + "learning_rate": 8.606674558675737e-07, + "loss": 0.6449703, + "num_input_tokens_seen": 251989315, + "step": 11678, + "time_per_iteration": 4.253399133682251 + }, + { + "auxiliary_loss_clip": 0.01164861, + "auxiliary_loss_mlp": 0.01103227, + "balance_loss_clip": 1.00199687, + "balance_loss_mlp": 1.00051641, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.8427704657738575, + "language_loss": 0.79228234, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81496322, + "num_input_tokens_seen": 252006620, + "step": 11679, + "time_per_iteration": 2.646815538406372 + }, + { + "auxiliary_loss_clip": 0.0113521, + "auxiliary_loss_mlp": 0.01103008, + "balance_loss_clip": 1.00198364, + "balance_loss_mlp": 1.0004884, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.443174224447244, + "language_loss": 0.70531523, + "learning_rate": 8.600273637882567e-07, + "loss": 0.72769737, + "num_input_tokens_seen": 252024570, + "step": 11680, + "time_per_iteration": 2.707817316055298 + }, + { + "auxiliary_loss_clip": 0.01119096, + "auxiliary_loss_mlp": 0.01104723, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.0005815, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.5229102887522763, + "language_loss": 0.74788898, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77012718, + "num_input_tokens_seen": 252042775, + "step": 11681, + "time_per_iteration": 4.183990716934204 + }, + { + "auxiliary_loss_clip": 0.01135333, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 4.7081492606487725, + "language_loss": 0.76542473, + "learning_rate": 8.593874446204434e-07, + "loss": 0.78780437, + "num_input_tokens_seen": 252063690, + "step": 11682, + "time_per_iteration": 2.667679786682129 + }, + { + "auxiliary_loss_clip": 0.01114832, + "auxiliary_loss_mlp": 0.00747253, + "balance_loss_clip": 1.00167656, + "balance_loss_mlp": 1.00046444, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 2.029125739159391, + "language_loss": 0.73560447, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75422537, + "num_input_tokens_seen": 252080335, + "step": 11683, + "time_per_iteration": 2.6480841636657715 + }, + { + "auxiliary_loss_clip": 0.01116595, + "auxiliary_loss_mlp": 0.01103887, + "balance_loss_clip": 1.00176311, + "balance_loss_mlp": 1.00050831, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 2.1563007880268015, + "language_loss": 0.71381104, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73601586, + "num_input_tokens_seen": 252101075, + "step": 11684, + "time_per_iteration": 2.6728515625 + }, + { + "auxiliary_loss_clip": 0.01150264, + "auxiliary_loss_mlp": 0.01103561, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.0005641, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 2.050283689044851, + "language_loss": 0.71670532, + "learning_rate": 8.584278902901128e-07, + "loss": 0.73924351, + "num_input_tokens_seen": 252120510, + "step": 11685, + "time_per_iteration": 2.6483328342437744 + }, + { + "auxiliary_loss_clip": 0.01150223, + "auxiliary_loss_mlp": 0.01103207, + "balance_loss_clip": 1.00188792, + "balance_loss_mlp": 1.00049615, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 1.840651304191313, + "language_loss": 0.843916, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86645031, + "num_input_tokens_seen": 252137590, + "step": 11686, + "time_per_iteration": 2.553328514099121 + }, + { + "auxiliary_loss_clip": 0.01142951, + "auxiliary_loss_mlp": 0.01078569, + "balance_loss_clip": 1.00087333, + "balance_loss_mlp": 0.99998647, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.9793095250095336, + "language_loss": 0.69942063, + "learning_rate": 8.577884038256566e-07, + "loss": 0.72163582, + "num_input_tokens_seen": 252199830, + "step": 11687, + "time_per_iteration": 3.2771599292755127 + }, + { + "auxiliary_loss_clip": 0.011188, + "auxiliary_loss_mlp": 0.01102773, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00053859, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 1.9315382627999733, + "language_loss": 0.76974374, + "learning_rate": 8.574687255565329e-07, + "loss": 0.7919594, + "num_input_tokens_seen": 252217200, + "step": 11688, + "time_per_iteration": 2.646698474884033 + }, + { + "auxiliary_loss_clip": 0.01164903, + "auxiliary_loss_mlp": 0.01102646, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00060296, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.4698118584467683, + "language_loss": 0.69031084, + "learning_rate": 8.571490906123107e-07, + "loss": 0.71298629, + "num_input_tokens_seen": 252236105, + "step": 11689, + "time_per_iteration": 2.5314621925354004 + }, + { + "auxiliary_loss_clip": 0.01131374, + "auxiliary_loss_mlp": 0.01103576, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00048399, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 3.185920484134062, + "language_loss": 0.80363679, + "learning_rate": 8.568294990051086e-07, + "loss": 0.82598633, + "num_input_tokens_seen": 252253315, + "step": 11690, + "time_per_iteration": 2.574408531188965 + }, + { + "auxiliary_loss_clip": 0.01164812, + "auxiliary_loss_mlp": 0.01103304, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00059319, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 2.6977422291339193, + "language_loss": 0.75718307, + "learning_rate": 8.56509950747047e-07, + "loss": 0.77986431, + "num_input_tokens_seen": 252272765, + "step": 11691, + "time_per_iteration": 2.536351442337036 + }, + { + "auxiliary_loss_clip": 0.01132908, + "auxiliary_loss_mlp": 0.01102499, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.00040853, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 1.871696664478646, + "language_loss": 0.81810033, + "learning_rate": 8.561904458502429e-07, + "loss": 0.8404544, + "num_input_tokens_seen": 252290510, + "step": 11692, + "time_per_iteration": 2.5936853885650635 + }, + { + "auxiliary_loss_clip": 0.01131712, + "auxiliary_loss_mlp": 0.01103802, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.00051904, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.5088678713740413, + "language_loss": 0.76940984, + "learning_rate": 8.558709843268111e-07, + "loss": 0.79176497, + "num_input_tokens_seen": 252309365, + "step": 11693, + "time_per_iteration": 2.620779275894165 + }, + { + "auxiliary_loss_clip": 0.01133469, + "auxiliary_loss_mlp": 0.01103696, + "balance_loss_clip": 1.00195408, + "balance_loss_mlp": 1.00050879, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 3.7002464951894107, + "language_loss": 0.68471611, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70708776, + "num_input_tokens_seen": 252333010, + "step": 11694, + "time_per_iteration": 2.7576074600219727 + }, + { + "auxiliary_loss_clip": 0.01164852, + "auxiliary_loss_mlp": 0.01102958, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00043833, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.2553611628509884, + "language_loss": 0.7593621, + "learning_rate": 8.552321914485203e-07, + "loss": 0.78204018, + "num_input_tokens_seen": 252351330, + "step": 11695, + "time_per_iteration": 2.516498327255249 + }, + { + "auxiliary_loss_clip": 0.01133074, + "auxiliary_loss_mlp": 0.01104809, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00057292, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 2.361703021601748, + "language_loss": 0.73977256, + "learning_rate": 8.549128601178852e-07, + "loss": 0.76215136, + "num_input_tokens_seen": 252369580, + "step": 11696, + "time_per_iteration": 2.5649545192718506 + }, + { + "auxiliary_loss_clip": 0.011354, + "auxiliary_loss_mlp": 0.01103202, + "balance_loss_clip": 1.00174153, + "balance_loss_mlp": 1.00049186, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.6249094980089178, + "language_loss": 0.75339246, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77577847, + "num_input_tokens_seen": 252390525, + "step": 11697, + "time_per_iteration": 2.660496234893799 + }, + { + "auxiliary_loss_clip": 0.01102899, + "auxiliary_loss_mlp": 0.01103532, + "balance_loss_clip": 1.00187004, + "balance_loss_mlp": 1.00053549, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 2.0910763799885985, + "language_loss": 0.81055719, + "learning_rate": 8.542743277341793e-07, + "loss": 0.83262146, + "num_input_tokens_seen": 252407470, + "step": 11698, + "time_per_iteration": 2.6794369220733643 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01103897, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.00061417, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.5210455550488542, + "language_loss": 0.84993887, + "learning_rate": 8.539551267053222e-07, + "loss": 0.8723098, + "num_input_tokens_seen": 252427025, + "step": 11699, + "time_per_iteration": 2.625577688217163 + }, + { + "auxiliary_loss_clip": 0.01150306, + "auxiliary_loss_mlp": 0.01103732, + "balance_loss_clip": 1.00203979, + "balance_loss_mlp": 1.00054431, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 2.818867385964464, + "language_loss": 0.78869361, + "learning_rate": 8.53635969134601e-07, + "loss": 0.811234, + "num_input_tokens_seen": 252445410, + "step": 11700, + "time_per_iteration": 2.601991653442383 + }, + { + "auxiliary_loss_clip": 0.01148058, + "auxiliary_loss_mlp": 0.0110306, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00034952, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.7625796416090722, + "language_loss": 0.74730903, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76982021, + "num_input_tokens_seen": 252463905, + "step": 11701, + "time_per_iteration": 2.666654109954834 + }, + { + "auxiliary_loss_clip": 0.01149467, + "auxiliary_loss_mlp": 0.01103954, + "balance_loss_clip": 1.0019418, + "balance_loss_mlp": 1.00057578, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.274877339312347, + "language_loss": 0.83995402, + "learning_rate": 8.529977844159769e-07, + "loss": 0.86248821, + "num_input_tokens_seen": 252478655, + "step": 11702, + "time_per_iteration": 2.5556511878967285 + }, + { + "auxiliary_loss_clip": 0.01164916, + "auxiliary_loss_mlp": 0.01103599, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00060213, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 1.813286983860602, + "language_loss": 0.6133374, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63602257, + "num_input_tokens_seen": 252498740, + "step": 11703, + "time_per_iteration": 3.8818020820617676 + }, + { + "auxiliary_loss_clip": 0.01164805, + "auxiliary_loss_mlp": 0.01103509, + "balance_loss_clip": 1.0018574, + "balance_loss_mlp": 1.00051188, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 2.0006653557357876, + "language_loss": 0.61292976, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63561285, + "num_input_tokens_seen": 252517800, + "step": 11704, + "time_per_iteration": 2.592200517654419 + }, + { + "auxiliary_loss_clip": 0.01148355, + "auxiliary_loss_mlp": 0.01102106, + "balance_loss_clip": 1.00179291, + "balance_loss_mlp": 1.00044465, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.5816194009716862, + "language_loss": 0.70764917, + "learning_rate": 8.520408335765719e-07, + "loss": 0.73015374, + "num_input_tokens_seen": 252539620, + "step": 11705, + "time_per_iteration": 4.071848630905151 + }, + { + "auxiliary_loss_clip": 0.01148172, + "auxiliary_loss_mlp": 0.01103338, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00053155, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 9.082911511303596, + "language_loss": 0.61574602, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63826108, + "num_input_tokens_seen": 252557300, + "step": 11706, + "time_per_iteration": 2.600343942642212 + }, + { + "auxiliary_loss_clip": 0.0114944, + "auxiliary_loss_mlp": 0.01103739, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00055134, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 2.7128495109199724, + "language_loss": 0.67888719, + "learning_rate": 8.514030839837756e-07, + "loss": 0.701419, + "num_input_tokens_seen": 252576715, + "step": 11707, + "time_per_iteration": 2.573047637939453 + }, + { + "auxiliary_loss_clip": 0.01164882, + "auxiliary_loss_mlp": 0.01103186, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00047493, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.7887822425982425, + "language_loss": 0.76187027, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78455102, + "num_input_tokens_seen": 252596190, + "step": 11708, + "time_per_iteration": 2.5599093437194824 + }, + { + "auxiliary_loss_clip": 0.01135489, + "auxiliary_loss_mlp": 0.01103144, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00052857, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 3.043630537585776, + "language_loss": 0.72097313, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74335957, + "num_input_tokens_seen": 252613410, + "step": 11709, + "time_per_iteration": 2.5832457542419434 + }, + { + "auxiliary_loss_clip": 0.0114806, + "auxiliary_loss_mlp": 0.0110324, + "balance_loss_clip": 1.00179446, + "balance_loss_mlp": 1.00043368, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.8325965191652127, + "language_loss": 0.78982651, + "learning_rate": 8.504467862866267e-07, + "loss": 0.81233948, + "num_input_tokens_seen": 252629150, + "step": 11710, + "time_per_iteration": 2.556774854660034 + }, + { + "auxiliary_loss_clip": 0.01149761, + "auxiliary_loss_mlp": 0.01103814, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00053072, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.6944863802237677, + "language_loss": 0.77576256, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79829836, + "num_input_tokens_seen": 252648225, + "step": 11711, + "time_per_iteration": 2.6299524307250977 + }, + { + "auxiliary_loss_clip": 0.01116678, + "auxiliary_loss_mlp": 0.01103298, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.00049162, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 3.0640211317561743, + "language_loss": 0.7409023, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76310211, + "num_input_tokens_seen": 252665380, + "step": 11712, + "time_per_iteration": 2.629340410232544 + }, + { + "auxiliary_loss_clip": 0.01113193, + "auxiliary_loss_mlp": 0.01078159, + "balance_loss_clip": 1.0007298, + "balance_loss_mlp": 0.99995798, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8824236937868215, + "language_loss": 0.6462301, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66814363, + "num_input_tokens_seen": 252727950, + "step": 11713, + "time_per_iteration": 3.22125244140625 + }, + { + "auxiliary_loss_clip": 0.01150083, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_clip": 1.00193965, + "balance_loss_mlp": 1.00055051, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 1.8569709118225854, + "language_loss": 0.72893167, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75146419, + "num_input_tokens_seen": 252746770, + "step": 11714, + "time_per_iteration": 2.6177008152008057 + }, + { + "auxiliary_loss_clip": 0.01131422, + "auxiliary_loss_mlp": 0.00747532, + "balance_loss_clip": 1.00175166, + "balance_loss_mlp": 1.00070214, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 2.201487639509222, + "language_loss": 0.79892296, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81771255, + "num_input_tokens_seen": 252765610, + "step": 11715, + "time_per_iteration": 2.5956363677978516 + }, + { + "auxiliary_loss_clip": 0.01133801, + "auxiliary_loss_mlp": 0.01104063, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.00058901, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.446077006803004, + "language_loss": 0.71207285, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73445153, + "num_input_tokens_seen": 252781610, + "step": 11716, + "time_per_iteration": 2.553450584411621 + }, + { + "auxiliary_loss_clip": 0.01102985, + "auxiliary_loss_mlp": 0.01103757, + "balance_loss_clip": 1.00180852, + "balance_loss_mlp": 1.00037837, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 2.016360688218956, + "language_loss": 0.66488504, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68695247, + "num_input_tokens_seen": 252800600, + "step": 11717, + "time_per_iteration": 4.21605372428894 + }, + { + "auxiliary_loss_clip": 0.0116495, + "auxiliary_loss_mlp": 0.01103188, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00057268, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.590690320093307, + "language_loss": 0.74600554, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76868689, + "num_input_tokens_seen": 252822310, + "step": 11718, + "time_per_iteration": 4.095973014831543 + }, + { + "auxiliary_loss_clip": 0.01150066, + "auxiliary_loss_mlp": 0.01103199, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00058341, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 1.9141540279576819, + "language_loss": 0.79865414, + "learning_rate": 8.475802484232606e-07, + "loss": 0.82118678, + "num_input_tokens_seen": 252842355, + "step": 11719, + "time_per_iteration": 2.6302874088287354 + }, + { + "auxiliary_loss_clip": 0.01148066, + "auxiliary_loss_mlp": 0.01103472, + "balance_loss_clip": 1.00181997, + "balance_loss_mlp": 1.00057077, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.9157323154957688, + "language_loss": 0.65627259, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67878795, + "num_input_tokens_seen": 252866785, + "step": 11720, + "time_per_iteration": 2.7387232780456543 + }, + { + "auxiliary_loss_clip": 0.01133067, + "auxiliary_loss_mlp": 0.01103816, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00043809, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 1.8772226622769412, + "language_loss": 0.79833436, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82070321, + "num_input_tokens_seen": 252881870, + "step": 11721, + "time_per_iteration": 2.591193914413452 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.0107863, + "balance_loss_clip": 1.00084448, + "balance_loss_mlp": 1.00004733, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7845655350598306, + "language_loss": 0.64756912, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66946942, + "num_input_tokens_seen": 252951300, + "step": 11722, + "time_per_iteration": 3.328732967376709 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01103759, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00047565, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.658858952347305, + "language_loss": 0.66111559, + "learning_rate": 8.463073672685211e-07, + "loss": 0.68350846, + "num_input_tokens_seen": 252971400, + "step": 11723, + "time_per_iteration": 2.6563284397125244 + }, + { + "auxiliary_loss_clip": 0.01118449, + "auxiliary_loss_mlp": 0.01103084, + "balance_loss_clip": 1.00187624, + "balance_loss_mlp": 1.00056398, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.8053421861218726, + "language_loss": 0.80855799, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83077335, + "num_input_tokens_seen": 252989475, + "step": 11724, + "time_per_iteration": 2.723855495452881 + }, + { + "auxiliary_loss_clip": 0.01148265, + "auxiliary_loss_mlp": 0.01104889, + "balance_loss_clip": 1.00184608, + "balance_loss_mlp": 1.00055671, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 3.705918974574413, + "language_loss": 0.73177552, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75430703, + "num_input_tokens_seen": 253007220, + "step": 11725, + "time_per_iteration": 2.619030475616455 + }, + { + "auxiliary_loss_clip": 0.01104193, + "auxiliary_loss_mlp": 0.01104311, + "balance_loss_clip": 1.00172269, + "balance_loss_mlp": 1.00045609, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.2455401593010733, + "language_loss": 0.78419328, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80627835, + "num_input_tokens_seen": 253025410, + "step": 11726, + "time_per_iteration": 2.6992275714874268 + }, + { + "auxiliary_loss_clip": 0.01133652, + "auxiliary_loss_mlp": 0.01103213, + "balance_loss_clip": 1.00176024, + "balance_loss_mlp": 1.0005976, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 1.7173703092581236, + "language_loss": 0.70596838, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72833693, + "num_input_tokens_seen": 253043305, + "step": 11727, + "time_per_iteration": 2.614750385284424 + }, + { + "auxiliary_loss_clip": 0.01164629, + "auxiliary_loss_mlp": 0.00747226, + "balance_loss_clip": 1.00185645, + "balance_loss_mlp": 1.00048685, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 2.040662712960504, + "language_loss": 0.69058406, + "learning_rate": 8.44717250248668e-07, + "loss": 0.70970261, + "num_input_tokens_seen": 253062790, + "step": 11728, + "time_per_iteration": 2.6017627716064453 + }, + { + "auxiliary_loss_clip": 0.01116674, + "auxiliary_loss_mlp": 0.00747378, + "balance_loss_clip": 1.00178885, + "balance_loss_mlp": 1.00054944, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 1.8131363979225141, + "language_loss": 0.73151541, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75015593, + "num_input_tokens_seen": 253082055, + "step": 11729, + "time_per_iteration": 2.671491861343384 + }, + { + "auxiliary_loss_clip": 0.01134958, + "auxiliary_loss_mlp": 0.01103915, + "balance_loss_clip": 1.00191319, + "balance_loss_mlp": 1.00044131, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.8595878202651581, + "language_loss": 0.78063226, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80302095, + "num_input_tokens_seen": 253102575, + "step": 11730, + "time_per_iteration": 2.6403286457061768 + }, + { + "auxiliary_loss_clip": 0.01165073, + "auxiliary_loss_mlp": 0.01104716, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00057435, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.0485229370510667, + "language_loss": 0.63114935, + "learning_rate": 8.437637056415359e-07, + "loss": 0.65384722, + "num_input_tokens_seen": 253121290, + "step": 11731, + "time_per_iteration": 2.5175418853759766 + }, + { + "auxiliary_loss_clip": 0.01103954, + "auxiliary_loss_mlp": 0.01104432, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00038576, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.42481839027467, + "language_loss": 0.74038029, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76246417, + "num_input_tokens_seen": 253139720, + "step": 11732, + "time_per_iteration": 2.670989513397217 + }, + { + "auxiliary_loss_clip": 0.01150279, + "auxiliary_loss_mlp": 0.0110297, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.0004499, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.5219844704132892, + "language_loss": 0.71192563, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73445815, + "num_input_tokens_seen": 253160250, + "step": 11733, + "time_per_iteration": 2.577897310256958 + }, + { + "auxiliary_loss_clip": 0.01116075, + "auxiliary_loss_mlp": 0.01103257, + "balance_loss_clip": 1.00164032, + "balance_loss_mlp": 1.00054622, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 2.2388811438736855, + "language_loss": 0.74092668, + "learning_rate": 8.428105556357583e-07, + "loss": 0.76311994, + "num_input_tokens_seen": 253178710, + "step": 11734, + "time_per_iteration": 2.612062454223633 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01104372, + "balance_loss_clip": 1.0017401, + "balance_loss_mlp": 1.00061178, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.0493703460156185, + "language_loss": 0.69054687, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71275592, + "num_input_tokens_seen": 253194805, + "step": 11735, + "time_per_iteration": 2.5878422260284424 + }, + { + "auxiliary_loss_clip": 0.01133234, + "auxiliary_loss_mlp": 0.01104599, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00055301, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 2.1285551802027274, + "language_loss": 0.72575855, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74813688, + "num_input_tokens_seen": 253213895, + "step": 11736, + "time_per_iteration": 2.6151320934295654 + }, + { + "auxiliary_loss_clip": 0.01134892, + "auxiliary_loss_mlp": 0.01102852, + "balance_loss_clip": 1.00194061, + "balance_loss_mlp": 1.0004276, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 2.0881517470812088, + "language_loss": 0.69054133, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71291876, + "num_input_tokens_seen": 253231620, + "step": 11737, + "time_per_iteration": 2.6918718814849854 + }, + { + "auxiliary_loss_clip": 0.01114901, + "auxiliary_loss_mlp": 0.01105259, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.00073671, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 7.169962937298716, + "language_loss": 0.67683071, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69903231, + "num_input_tokens_seen": 253249590, + "step": 11738, + "time_per_iteration": 2.62554931640625 + }, + { + "auxiliary_loss_clip": 0.01165069, + "auxiliary_loss_mlp": 0.01104953, + "balance_loss_clip": 1.00203884, + "balance_loss_mlp": 1.00052595, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.7085866875044373, + "language_loss": 0.7481094, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77080965, + "num_input_tokens_seen": 253273870, + "step": 11739, + "time_per_iteration": 2.840013027191162 + }, + { + "auxiliary_loss_clip": 0.01132876, + "auxiliary_loss_mlp": 0.00747409, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00057447, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.5778834669028117, + "language_loss": 0.71134883, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73015165, + "num_input_tokens_seen": 253293720, + "step": 11740, + "time_per_iteration": 2.6496102809906006 + }, + { + "auxiliary_loss_clip": 0.011177, + "auxiliary_loss_mlp": 0.01102342, + "balance_loss_clip": 1.00178504, + "balance_loss_mlp": 1.00048971, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.7632110476247707, + "language_loss": 0.81767374, + "learning_rate": 8.405880753434434e-07, + "loss": 0.83987415, + "num_input_tokens_seen": 253313700, + "step": 11741, + "time_per_iteration": 4.036290884017944 + }, + { + "auxiliary_loss_clip": 0.01131578, + "auxiliary_loss_mlp": 0.01103998, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00052428, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 2.083141418512194, + "language_loss": 0.77967978, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80203557, + "num_input_tokens_seen": 253332425, + "step": 11742, + "time_per_iteration": 3.9732396602630615 + }, + { + "auxiliary_loss_clip": 0.01165044, + "auxiliary_loss_mlp": 0.01103761, + "balance_loss_clip": 1.00188601, + "balance_loss_mlp": 1.00057328, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.7506552653035263, + "language_loss": 0.64181387, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66450191, + "num_input_tokens_seen": 253353620, + "step": 11743, + "time_per_iteration": 2.5823974609375 + }, + { + "auxiliary_loss_clip": 0.01135298, + "auxiliary_loss_mlp": 0.01103803, + "balance_loss_clip": 1.00188005, + "balance_loss_mlp": 1.00042486, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 1.998551297056308, + "language_loss": 0.6578964, + "learning_rate": 8.396362430240902e-07, + "loss": 0.68028736, + "num_input_tokens_seen": 253370930, + "step": 11744, + "time_per_iteration": 2.5889079570770264 + }, + { + "auxiliary_loss_clip": 0.01149965, + "auxiliary_loss_mlp": 0.01103613, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00052035, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 2.0082642427200623, + "language_loss": 0.63599122, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65852696, + "num_input_tokens_seen": 253389810, + "step": 11745, + "time_per_iteration": 2.5758538246154785 + }, + { + "auxiliary_loss_clip": 0.01120704, + "auxiliary_loss_mlp": 0.01104311, + "balance_loss_clip": 1.00186002, + "balance_loss_mlp": 1.00045538, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 2.4386583505571355, + "language_loss": 0.71981215, + "learning_rate": 8.390019081300188e-07, + "loss": 0.74206233, + "num_input_tokens_seen": 253408685, + "step": 11746, + "time_per_iteration": 2.7144553661346436 + }, + { + "auxiliary_loss_clip": 0.01083354, + "auxiliary_loss_mlp": 0.01103593, + "balance_loss_clip": 1.00167561, + "balance_loss_mlp": 1.00050044, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.9695552797989668, + "language_loss": 0.79174387, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81361336, + "num_input_tokens_seen": 253429685, + "step": 11747, + "time_per_iteration": 2.763078212738037 + }, + { + "auxiliary_loss_clip": 0.01150249, + "auxiliary_loss_mlp": 0.01102946, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00061691, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 1.9441876420439383, + "language_loss": 0.65077925, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67331123, + "num_input_tokens_seen": 253448260, + "step": 11748, + "time_per_iteration": 2.5831449031829834 + }, + { + "auxiliary_loss_clip": 0.01099505, + "auxiliary_loss_mlp": 0.01104803, + "balance_loss_clip": 1.00164533, + "balance_loss_mlp": 1.00047123, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 2.0743464326445578, + "language_loss": 0.79087585, + "learning_rate": 8.380507360077003e-07, + "loss": 0.8129189, + "num_input_tokens_seen": 253467725, + "step": 11749, + "time_per_iteration": 2.665123462677002 + }, + { + "auxiliary_loss_clip": 0.01158962, + "auxiliary_loss_mlp": 0.01078128, + "balance_loss_clip": 1.00081825, + "balance_loss_mlp": 0.99992698, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.7941496062170197, + "language_loss": 0.54086596, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56323689, + "num_input_tokens_seen": 253526940, + "step": 11750, + "time_per_iteration": 3.046473979949951 + }, + { + "auxiliary_loss_clip": 0.01133099, + "auxiliary_loss_mlp": 0.0110414, + "balance_loss_clip": 1.00194085, + "balance_loss_mlp": 1.00057089, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 2.114251726831096, + "language_loss": 0.78707325, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80944562, + "num_input_tokens_seen": 253546160, + "step": 11751, + "time_per_iteration": 2.6293632984161377 + }, + { + "auxiliary_loss_clip": 0.01114124, + "auxiliary_loss_mlp": 0.01102423, + "balance_loss_clip": 1.00169063, + "balance_loss_mlp": 1.00042772, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 1.9358971636844655, + "language_loss": 0.67696428, + "learning_rate": 8.370999604364634e-07, + "loss": 0.69912982, + "num_input_tokens_seen": 253565505, + "step": 11752, + "time_per_iteration": 2.6495721340179443 + }, + { + "auxiliary_loss_clip": 0.01087388, + "auxiliary_loss_mlp": 0.00747467, + "balance_loss_clip": 1.00172019, + "balance_loss_mlp": 1.00060332, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 2.0797403243424015, + "language_loss": 0.76433158, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78268009, + "num_input_tokens_seen": 253585125, + "step": 11753, + "time_per_iteration": 2.729262113571167 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.0074726, + "balance_loss_clip": 1.0017271, + "balance_loss_mlp": 1.00065804, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.5149993064872687, + "language_loss": 0.70902574, + "learning_rate": 8.364663305220405e-07, + "loss": 0.72767603, + "num_input_tokens_seen": 253604815, + "step": 11754, + "time_per_iteration": 2.662764549255371 + }, + { + "auxiliary_loss_clip": 0.01120471, + "auxiliary_loss_mlp": 0.01103562, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00047016, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 1.7087995279647863, + "language_loss": 0.89385897, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91609931, + "num_input_tokens_seen": 253622855, + "step": 11755, + "time_per_iteration": 4.063548564910889 + }, + { + "auxiliary_loss_clip": 0.0113139, + "auxiliary_loss_mlp": 0.00747489, + "balance_loss_clip": 1.00179482, + "balance_loss_mlp": 1.00062883, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.5719638934198719, + "language_loss": 0.79596174, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81475055, + "num_input_tokens_seen": 253642760, + "step": 11756, + "time_per_iteration": 3.9762489795684814 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01078555, + "balance_loss_clip": 1.00083447, + "balance_loss_mlp": 0.99997234, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8274788401784192, + "language_loss": 0.60384357, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62561321, + "num_input_tokens_seen": 253695685, + "step": 11757, + "time_per_iteration": 3.038968801498413 + }, + { + "auxiliary_loss_clip": 0.01119045, + "auxiliary_loss_mlp": 0.01104355, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00059497, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 1.9880591193044843, + "language_loss": 0.80322802, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82546204, + "num_input_tokens_seen": 253713305, + "step": 11758, + "time_per_iteration": 2.6343462467193604 + }, + { + "auxiliary_loss_clip": 0.01116076, + "auxiliary_loss_mlp": 0.00747402, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00062394, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 4.903965645362374, + "language_loss": 0.77285421, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79148901, + "num_input_tokens_seen": 253736100, + "step": 11759, + "time_per_iteration": 2.9105820655822754 + }, + { + "auxiliary_loss_clip": 0.01150118, + "auxiliary_loss_mlp": 0.0110336, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00036287, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 2.189452653465788, + "language_loss": 0.68061173, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70314658, + "num_input_tokens_seen": 253757350, + "step": 11760, + "time_per_iteration": 2.7128407955169678 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.0110332, + "balance_loss_clip": 1.00186574, + "balance_loss_mlp": 1.00041819, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.9679856878351063, + "language_loss": 0.7998724, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82209635, + "num_input_tokens_seen": 253772855, + "step": 11761, + "time_per_iteration": 2.613734245300293 + }, + { + "auxiliary_loss_clip": 0.01135061, + "auxiliary_loss_mlp": 0.01104179, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00051522, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.727090442168862, + "language_loss": 0.74946439, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77185678, + "num_input_tokens_seen": 253790360, + "step": 11762, + "time_per_iteration": 2.5804152488708496 + }, + { + "auxiliary_loss_clip": 0.0113352, + "auxiliary_loss_mlp": 0.01103498, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00050092, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 2.653814417895852, + "language_loss": 0.76673496, + "learning_rate": 8.336171812990724e-07, + "loss": 0.78910518, + "num_input_tokens_seen": 253810585, + "step": 11763, + "time_per_iteration": 2.627162456512451 + }, + { + "auxiliary_loss_clip": 0.01118557, + "auxiliary_loss_mlp": 0.00747317, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.00055552, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 5.219317834704044, + "language_loss": 0.78750014, + "learning_rate": 8.333008301499453e-07, + "loss": 0.8061589, + "num_input_tokens_seen": 253829080, + "step": 11764, + "time_per_iteration": 2.701349973678589 + }, + { + "auxiliary_loss_clip": 0.01101896, + "auxiliary_loss_mlp": 0.01104476, + "balance_loss_clip": 1.00161421, + "balance_loss_mlp": 1.00062037, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.6911740416313938, + "language_loss": 0.79336786, + "learning_rate": 8.32984523242167e-07, + "loss": 0.81543159, + "num_input_tokens_seen": 253846780, + "step": 11765, + "time_per_iteration": 2.6416568756103516 + }, + { + "auxiliary_loss_clip": 0.01164744, + "auxiliary_loss_mlp": 0.01102695, + "balance_loss_clip": 1.00185263, + "balance_loss_mlp": 1.0004611, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.610963318205762, + "language_loss": 0.68238246, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70505679, + "num_input_tokens_seen": 253867075, + "step": 11766, + "time_per_iteration": 2.5877668857574463 + }, + { + "auxiliary_loss_clip": 0.0113364, + "auxiliary_loss_mlp": 0.01104418, + "balance_loss_clip": 1.00189269, + "balance_loss_mlp": 1.00056338, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 3.042118971293179, + "language_loss": 0.64202207, + "learning_rate": 8.323520421986352e-07, + "loss": 0.66440266, + "num_input_tokens_seen": 253885790, + "step": 11767, + "time_per_iteration": 2.6021969318389893 + }, + { + "auxiliary_loss_clip": 0.01148244, + "auxiliary_loss_mlp": 0.01104104, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00044012, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.4792199505467107, + "language_loss": 0.52860516, + "learning_rate": 8.320358680868646e-07, + "loss": 0.55112863, + "num_input_tokens_seen": 253907070, + "step": 11768, + "time_per_iteration": 2.6152403354644775 + }, + { + "auxiliary_loss_clip": 0.01132579, + "auxiliary_loss_mlp": 0.00747436, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00061738, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 1.6559608816683424, + "language_loss": 0.75844538, + "learning_rate": 8.317197382644119e-07, + "loss": 0.77724558, + "num_input_tokens_seen": 253927290, + "step": 11769, + "time_per_iteration": 2.603940725326538 + }, + { + "auxiliary_loss_clip": 0.01130049, + "auxiliary_loss_mlp": 0.0107809, + "balance_loss_clip": 1.00082541, + "balance_loss_mlp": 0.99988824, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8556546903458168, + "language_loss": 0.61996734, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64204872, + "num_input_tokens_seen": 253983440, + "step": 11770, + "time_per_iteration": 3.0759975910186768 + }, + { + "auxiliary_loss_clip": 0.01117048, + "auxiliary_loss_mlp": 0.01104995, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00066328, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 2.136900431611418, + "language_loss": 0.76118481, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78340518, + "num_input_tokens_seen": 254003825, + "step": 11771, + "time_per_iteration": 2.693390130996704 + }, + { + "auxiliary_loss_clip": 0.01148063, + "auxiliary_loss_mlp": 0.01102427, + "balance_loss_clip": 1.00186598, + "balance_loss_mlp": 1.00038397, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.613101748255806, + "language_loss": 0.71324569, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73575056, + "num_input_tokens_seen": 254023345, + "step": 11772, + "time_per_iteration": 2.5836498737335205 + }, + { + "auxiliary_loss_clip": 0.01102364, + "auxiliary_loss_mlp": 0.01104178, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00051332, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 1.80083118101453, + "language_loss": 0.70288265, + "learning_rate": 8.30455662107496e-07, + "loss": 0.72494805, + "num_input_tokens_seen": 254041815, + "step": 11773, + "time_per_iteration": 2.7084429264068604 + }, + { + "auxiliary_loss_clip": 0.01149334, + "auxiliary_loss_mlp": 0.01103861, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00048256, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 3.703863897649987, + "language_loss": 0.69807243, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72060442, + "num_input_tokens_seen": 254062065, + "step": 11774, + "time_per_iteration": 2.573930025100708 + }, + { + "auxiliary_loss_clip": 0.01133253, + "auxiliary_loss_mlp": 0.0110273, + "balance_loss_clip": 1.00200844, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.5314556488260453, + "language_loss": 0.74337542, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76573515, + "num_input_tokens_seen": 254080605, + "step": 11775, + "time_per_iteration": 2.5867183208465576 + }, + { + "auxiliary_loss_clip": 0.01118054, + "auxiliary_loss_mlp": 0.00747419, + "balance_loss_clip": 1.00192797, + "balance_loss_mlp": 1.00066805, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.7831787305026268, + "language_loss": 0.87198186, + "learning_rate": 8.295080706148665e-07, + "loss": 0.89063668, + "num_input_tokens_seen": 254098710, + "step": 11776, + "time_per_iteration": 2.610501289367676 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00057554, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.755589875338636, + "language_loss": 0.75127852, + "learning_rate": 8.291922955383641e-07, + "loss": 0.7738148, + "num_input_tokens_seen": 254117200, + "step": 11777, + "time_per_iteration": 3.9799423217773438 + }, + { + "auxiliary_loss_clip": 0.01133771, + "auxiliary_loss_mlp": 0.01105415, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00051045, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.1149070069897586, + "language_loss": 0.82010376, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84249568, + "num_input_tokens_seen": 254132115, + "step": 11778, + "time_per_iteration": 2.571528911590576 + }, + { + "auxiliary_loss_clip": 0.01133143, + "auxiliary_loss_mlp": 0.01102239, + "balance_loss_clip": 1.00174856, + "balance_loss_mlp": 1.00048161, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.6577544310110313, + "language_loss": 0.84962988, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87198377, + "num_input_tokens_seen": 254152285, + "step": 11779, + "time_per_iteration": 2.6479990482330322 + }, + { + "auxiliary_loss_clip": 0.0113329, + "auxiliary_loss_mlp": 0.0110431, + "balance_loss_clip": 1.00175357, + "balance_loss_mlp": 1.00055027, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.0197358116166426, + "language_loss": 0.71395421, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73633021, + "num_input_tokens_seen": 254172805, + "step": 11780, + "time_per_iteration": 4.115112066268921 + }, + { + "auxiliary_loss_clip": 0.01103774, + "auxiliary_loss_mlp": 0.01103534, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00044131, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.536695313945322, + "language_loss": 0.72819555, + "learning_rate": 8.279296393235256e-07, + "loss": 0.75026864, + "num_input_tokens_seen": 254191890, + "step": 11781, + "time_per_iteration": 2.695141315460205 + }, + { + "auxiliary_loss_clip": 0.01148169, + "auxiliary_loss_mlp": 0.01103631, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00044358, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.6958572488199664, + "language_loss": 0.77708179, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79959977, + "num_input_tokens_seen": 254210150, + "step": 11782, + "time_per_iteration": 2.5615007877349854 + }, + { + "auxiliary_loss_clip": 0.01130791, + "auxiliary_loss_mlp": 0.01103065, + "balance_loss_clip": 1.00172138, + "balance_loss_mlp": 1.00044942, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 1.3996129218616942, + "language_loss": 0.69698894, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71932745, + "num_input_tokens_seen": 254233015, + "step": 11783, + "time_per_iteration": 2.674029588699341 + }, + { + "auxiliary_loss_clip": 0.01103156, + "auxiliary_loss_mlp": 0.01103136, + "balance_loss_clip": 1.00168836, + "balance_loss_mlp": 1.00042534, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 2.149556339134318, + "language_loss": 0.79055923, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81262219, + "num_input_tokens_seen": 254251345, + "step": 11784, + "time_per_iteration": 2.6772677898406982 + }, + { + "auxiliary_loss_clip": 0.01165016, + "auxiliary_loss_mlp": 0.01103858, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00047934, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 2.2760430152405884, + "language_loss": 0.7695241, + "learning_rate": 8.266676942290609e-07, + "loss": 0.7922129, + "num_input_tokens_seen": 254269905, + "step": 11785, + "time_per_iteration": 2.5581257343292236 + }, + { + "auxiliary_loss_clip": 0.01133438, + "auxiliary_loss_mlp": 0.01103684, + "balance_loss_clip": 1.00195003, + "balance_loss_mlp": 1.00059175, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 2.08158087366302, + "language_loss": 0.78050053, + "learning_rate": 8.26352319157738e-07, + "loss": 0.8028717, + "num_input_tokens_seen": 254289990, + "step": 11786, + "time_per_iteration": 2.6485767364501953 + }, + { + "auxiliary_loss_clip": 0.01164906, + "auxiliary_loss_mlp": 0.01103874, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.00040054, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.3781144275810973, + "language_loss": 0.79073536, + "learning_rate": 8.260369885912526e-07, + "loss": 0.81342316, + "num_input_tokens_seen": 254309085, + "step": 11787, + "time_per_iteration": 2.632967710494995 + }, + { + "auxiliary_loss_clip": 0.01148219, + "auxiliary_loss_mlp": 0.0110396, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00048614, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 1.819033207157159, + "language_loss": 0.76592094, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78844273, + "num_input_tokens_seen": 254327045, + "step": 11788, + "time_per_iteration": 2.550727605819702 + }, + { + "auxiliary_loss_clip": 0.01118734, + "auxiliary_loss_mlp": 0.01105819, + "balance_loss_clip": 1.00195527, + "balance_loss_mlp": 1.0005331, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 1.9256679673987485, + "language_loss": 0.6806826, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70292813, + "num_input_tokens_seen": 254344585, + "step": 11789, + "time_per_iteration": 2.6274263858795166 + }, + { + "auxiliary_loss_clip": 0.01081955, + "auxiliary_loss_mlp": 0.0110421, + "balance_loss_clip": 1.0015893, + "balance_loss_mlp": 1.00064063, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.6574918662227456, + "language_loss": 0.77449751, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79635918, + "num_input_tokens_seen": 254362470, + "step": 11790, + "time_per_iteration": 2.7855963706970215 + }, + { + "auxiliary_loss_clip": 0.01133137, + "auxiliary_loss_mlp": 0.01104933, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00041032, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 1.7703661436891482, + "language_loss": 0.71296406, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73534477, + "num_input_tokens_seen": 254383190, + "step": 11791, + "time_per_iteration": 2.6535959243774414 + }, + { + "auxiliary_loss_clip": 0.01148256, + "auxiliary_loss_mlp": 0.01103746, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00055838, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 1.6163143955880355, + "language_loss": 0.81953943, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84205949, + "num_input_tokens_seen": 254403115, + "step": 11792, + "time_per_iteration": 2.5622761249542236 + }, + { + "auxiliary_loss_clip": 0.01117663, + "auxiliary_loss_mlp": 0.01104575, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00052929, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 1.963444252159354, + "language_loss": 0.64941335, + "learning_rate": 8.241459404634232e-07, + "loss": 0.67163575, + "num_input_tokens_seen": 254421875, + "step": 11793, + "time_per_iteration": 4.135344505310059 + }, + { + "auxiliary_loss_clip": 0.01149966, + "auxiliary_loss_mlp": 0.01103242, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00043631, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 1.949488610000514, + "language_loss": 0.7028352, + "learning_rate": 8.238309217655133e-07, + "loss": 0.72536737, + "num_input_tokens_seen": 254440765, + "step": 11794, + "time_per_iteration": 3.9442763328552246 + }, + { + "auxiliary_loss_clip": 0.01133737, + "auxiliary_loss_mlp": 0.01103449, + "balance_loss_clip": 1.00199175, + "balance_loss_mlp": 1.00045192, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.7913581639726537, + "language_loss": 0.76201195, + "learning_rate": 8.23515947668052e-07, + "loss": 0.78438383, + "num_input_tokens_seen": 254459480, + "step": 11795, + "time_per_iteration": 2.604341745376587 + }, + { + "auxiliary_loss_clip": 0.01115058, + "auxiliary_loss_mlp": 0.01104032, + "balance_loss_clip": 1.0016886, + "balance_loss_mlp": 1.00055873, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.6999774344069896, + "language_loss": 0.74852812, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77071899, + "num_input_tokens_seen": 254473985, + "step": 11796, + "time_per_iteration": 2.5940492153167725 + }, + { + "auxiliary_loss_clip": 0.01150186, + "auxiliary_loss_mlp": 0.01105385, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.00057602, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.7006532021595653, + "language_loss": 0.73707628, + "learning_rate": 8.228861333222523e-07, + "loss": 0.75963199, + "num_input_tokens_seen": 254492135, + "step": 11797, + "time_per_iteration": 2.6227314472198486 + }, + { + "auxiliary_loss_clip": 0.01100402, + "auxiliary_loss_mlp": 0.0110435, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00049531, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.4875003746942717, + "language_loss": 0.7904498, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81249732, + "num_input_tokens_seen": 254512865, + "step": 11798, + "time_per_iteration": 2.727177381515503 + }, + { + "auxiliary_loss_clip": 0.01135557, + "auxiliary_loss_mlp": 0.01103774, + "balance_loss_clip": 1.00199795, + "balance_loss_mlp": 1.00058675, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.7973073664102566, + "language_loss": 0.665748, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68814135, + "num_input_tokens_seen": 254532605, + "step": 11799, + "time_per_iteration": 2.6279735565185547 + }, + { + "auxiliary_loss_clip": 0.01165045, + "auxiliary_loss_mlp": 0.01103547, + "balance_loss_clip": 1.00209367, + "balance_loss_mlp": 1.00045502, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.6094253898656359, + "language_loss": 0.81503481, + "learning_rate": 8.219417466054622e-07, + "loss": 0.83772075, + "num_input_tokens_seen": 254553780, + "step": 11800, + "time_per_iteration": 2.601945161819458 + }, + { + "auxiliary_loss_clip": 0.01131545, + "auxiliary_loss_mlp": 0.01103309, + "balance_loss_clip": 1.0017997, + "balance_loss_mlp": 1.00040793, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 2.0384740349781394, + "language_loss": 0.86351889, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88586742, + "num_input_tokens_seen": 254567510, + "step": 11801, + "time_per_iteration": 2.663971424102783 + }, + { + "auxiliary_loss_clip": 0.01164826, + "auxiliary_loss_mlp": 0.01103548, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.0005511, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 1.9151946234420867, + "language_loss": 0.76514715, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78783083, + "num_input_tokens_seen": 254585565, + "step": 11802, + "time_per_iteration": 2.546619415283203 + }, + { + "auxiliary_loss_clip": 0.01150319, + "auxiliary_loss_mlp": 0.01103816, + "balance_loss_clip": 1.00201833, + "balance_loss_mlp": 1.00081897, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 1.7778966733790837, + "language_loss": 0.81771708, + "learning_rate": 8.209977619374462e-07, + "loss": 0.84025842, + "num_input_tokens_seen": 254603465, + "step": 11803, + "time_per_iteration": 2.6787002086639404 + }, + { + "auxiliary_loss_clip": 0.01164982, + "auxiliary_loss_mlp": 0.01103995, + "balance_loss_clip": 1.00193632, + "balance_loss_mlp": 1.00042582, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.231751153272247, + "language_loss": 0.67568135, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69837117, + "num_input_tokens_seen": 254620500, + "step": 11804, + "time_per_iteration": 2.6313257217407227 + }, + { + "auxiliary_loss_clip": 0.01149992, + "auxiliary_loss_mlp": 0.01102383, + "balance_loss_clip": 1.00185645, + "balance_loss_mlp": 1.00043511, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 2.8477169933885267, + "language_loss": 0.78092903, + "learning_rate": 8.203686623449637e-07, + "loss": 0.80345279, + "num_input_tokens_seen": 254638565, + "step": 11805, + "time_per_iteration": 2.7431838512420654 + }, + { + "auxiliary_loss_clip": 0.01133304, + "auxiliary_loss_mlp": 0.0074753, + "balance_loss_clip": 1.00183713, + "balance_loss_mlp": 1.0006988, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 1.990567287613928, + "language_loss": 0.7899816, + "learning_rate": 8.200541796403667e-07, + "loss": 0.80878991, + "num_input_tokens_seen": 254657505, + "step": 11806, + "time_per_iteration": 2.6799142360687256 + }, + { + "auxiliary_loss_clip": 0.01133103, + "auxiliary_loss_mlp": 0.01103899, + "balance_loss_clip": 1.00190723, + "balance_loss_mlp": 1.00052083, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 1.974380813948785, + "language_loss": 0.56449461, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58686459, + "num_input_tokens_seen": 254674730, + "step": 11807, + "time_per_iteration": 2.626927375793457 + }, + { + "auxiliary_loss_clip": 0.01164953, + "auxiliary_loss_mlp": 0.01104772, + "balance_loss_clip": 1.00183237, + "balance_loss_mlp": 1.00063038, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 2.117889879923465, + "language_loss": 0.68260205, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70529926, + "num_input_tokens_seen": 254691665, + "step": 11808, + "time_per_iteration": 2.5107431411743164 + }, + { + "auxiliary_loss_clip": 0.01148574, + "auxiliary_loss_mlp": 0.01103994, + "balance_loss_clip": 1.00182068, + "balance_loss_mlp": 1.00042534, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 1.7925777831202108, + "language_loss": 0.71276766, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73529327, + "num_input_tokens_seen": 254711610, + "step": 11809, + "time_per_iteration": 2.596463680267334 + }, + { + "auxiliary_loss_clip": 0.01158866, + "auxiliary_loss_mlp": 0.01078141, + "balance_loss_clip": 1.00082064, + "balance_loss_mlp": 0.99993962, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7465932198790544, + "language_loss": 0.59469074, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61706078, + "num_input_tokens_seen": 254772615, + "step": 11810, + "time_per_iteration": 3.173022508621216 + }, + { + "auxiliary_loss_clip": 0.01074012, + "auxiliary_loss_mlp": 0.01103329, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00061786, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.5823571750067154, + "language_loss": 0.74398237, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76575577, + "num_input_tokens_seen": 254791375, + "step": 11811, + "time_per_iteration": 2.7817132472991943 + }, + { + "auxiliary_loss_clip": 0.0111892, + "auxiliary_loss_mlp": 0.01102248, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.00049043, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 1.8349868600483763, + "language_loss": 0.83379781, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85600948, + "num_input_tokens_seen": 254809300, + "step": 11812, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.01164994, + "auxiliary_loss_mlp": 0.01104221, + "balance_loss_clip": 1.0020107, + "balance_loss_mlp": 1.0004611, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.5985450369434002, + "language_loss": 0.70115918, + "learning_rate": 8.178540541983716e-07, + "loss": 0.72385132, + "num_input_tokens_seen": 254829325, + "step": 11813, + "time_per_iteration": 2.569655418395996 + }, + { + "auxiliary_loss_clip": 0.01164748, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00037217, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 1.7037989030367244, + "language_loss": 0.81716484, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83984315, + "num_input_tokens_seen": 254847690, + "step": 11814, + "time_per_iteration": 2.501634120941162 + }, + { + "auxiliary_loss_clip": 0.01164849, + "auxiliary_loss_mlp": 0.01103581, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.000489, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 1.8661746510084642, + "language_loss": 0.75624996, + "learning_rate": 8.172258501943301e-07, + "loss": 0.7789343, + "num_input_tokens_seen": 254865960, + "step": 11815, + "time_per_iteration": 3.981786012649536 + }, + { + "auxiliary_loss_clip": 0.01100494, + "auxiliary_loss_mlp": 0.01102504, + "balance_loss_clip": 1.00162125, + "balance_loss_mlp": 1.00046039, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 9.1202622893316, + "language_loss": 0.78659093, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80862093, + "num_input_tokens_seen": 254882815, + "step": 11816, + "time_per_iteration": 2.6998543739318848 + }, + { + "auxiliary_loss_clip": 0.01133692, + "auxiliary_loss_mlp": 0.01102362, + "balance_loss_clip": 1.00179636, + "balance_loss_mlp": 1.00060499, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 2.2871158782398298, + "language_loss": 0.86174452, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88410497, + "num_input_tokens_seen": 254898705, + "step": 11817, + "time_per_iteration": 2.622135639190674 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01102675, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00053644, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 3.0181558236643466, + "language_loss": 0.84383535, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86586571, + "num_input_tokens_seen": 254913665, + "step": 11818, + "time_per_iteration": 4.146755933761597 + }, + { + "auxiliary_loss_clip": 0.01164813, + "auxiliary_loss_mlp": 0.01103497, + "balance_loss_clip": 1.0018543, + "balance_loss_mlp": 1.00049973, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 2.8029200228573803, + "language_loss": 0.7567125, + "learning_rate": 8.159699804924709e-07, + "loss": 0.77939558, + "num_input_tokens_seen": 254932140, + "step": 11819, + "time_per_iteration": 2.5469987392425537 + }, + { + "auxiliary_loss_clip": 0.01101615, + "auxiliary_loss_mlp": 0.01104733, + "balance_loss_clip": 1.00178289, + "balance_loss_mlp": 1.00040078, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.6418481903207047, + "language_loss": 0.70761126, + "learning_rate": 8.156561252835883e-07, + "loss": 0.72967476, + "num_input_tokens_seen": 254951580, + "step": 11820, + "time_per_iteration": 2.7357702255249023 + }, + { + "auxiliary_loss_clip": 0.01148125, + "auxiliary_loss_mlp": 0.01103272, + "balance_loss_clip": 1.00193357, + "balance_loss_mlp": 1.00046635, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.9451348314820498, + "language_loss": 0.75646472, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77897871, + "num_input_tokens_seen": 254969425, + "step": 11821, + "time_per_iteration": 2.5682830810546875 + }, + { + "auxiliary_loss_clip": 0.010985, + "auxiliary_loss_mlp": 0.01078573, + "balance_loss_clip": 1.00082707, + "balance_loss_mlp": 0.99999064, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7683871682029068, + "language_loss": 0.55059528, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57236606, + "num_input_tokens_seen": 255032680, + "step": 11822, + "time_per_iteration": 3.2972397804260254 + }, + { + "auxiliary_loss_clip": 0.01147768, + "auxiliary_loss_mlp": 0.01103107, + "balance_loss_clip": 1.00183547, + "balance_loss_mlp": 1.00058675, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 2.6699589376228796, + "language_loss": 0.60553586, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62804461, + "num_input_tokens_seen": 255054400, + "step": 11823, + "time_per_iteration": 2.6209022998809814 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.0110307, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00045443, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 2.0989299860961204, + "language_loss": 0.71223485, + "learning_rate": 8.144011536714322e-07, + "loss": 0.7347461, + "num_input_tokens_seen": 255072785, + "step": 11824, + "time_per_iteration": 2.5997695922851562 + }, + { + "auxiliary_loss_clip": 0.01134913, + "auxiliary_loss_mlp": 0.00747236, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00047183, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.5349210237869793, + "language_loss": 0.72611928, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74494076, + "num_input_tokens_seen": 255091820, + "step": 11825, + "time_per_iteration": 2.6198034286499023 + }, + { + "auxiliary_loss_clip": 0.01133618, + "auxiliary_loss_mlp": 0.01103768, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00048506, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 2.108424135308073, + "language_loss": 0.79259431, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81496823, + "num_input_tokens_seen": 255111720, + "step": 11826, + "time_per_iteration": 2.6878387928009033 + }, + { + "auxiliary_loss_clip": 0.01147851, + "auxiliary_loss_mlp": 0.01103248, + "balance_loss_clip": 1.00175142, + "balance_loss_mlp": 1.0005374, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.6809511161999748, + "language_loss": 0.8304044, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85291535, + "num_input_tokens_seen": 255133495, + "step": 11827, + "time_per_iteration": 2.6143453121185303 + }, + { + "auxiliary_loss_clip": 0.01118301, + "auxiliary_loss_mlp": 0.01103877, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00059414, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 1.752960811375643, + "language_loss": 0.62494713, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64716893, + "num_input_tokens_seen": 255156880, + "step": 11828, + "time_per_iteration": 2.7015066146850586 + }, + { + "auxiliary_loss_clip": 0.01164745, + "auxiliary_loss_mlp": 0.01103247, + "balance_loss_clip": 1.00185466, + "balance_loss_mlp": 1.00053573, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 2.6918847373529755, + "language_loss": 0.72152704, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74420696, + "num_input_tokens_seen": 255178920, + "step": 11829, + "time_per_iteration": 4.108738422393799 + }, + { + "auxiliary_loss_clip": 0.01164907, + "auxiliary_loss_mlp": 0.01102863, + "balance_loss_clip": 1.00197327, + "balance_loss_mlp": 1.00043821, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.870872876787212, + "language_loss": 0.80126196, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82393968, + "num_input_tokens_seen": 255198095, + "step": 11830, + "time_per_iteration": 2.5904881954193115 + }, + { + "auxiliary_loss_clip": 0.01150296, + "auxiliary_loss_mlp": 0.01103534, + "balance_loss_clip": 1.00201046, + "balance_loss_mlp": 1.00063264, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 2.096458621245477, + "language_loss": 0.84375, + "learning_rate": 8.122066846919138e-07, + "loss": 0.8662883, + "num_input_tokens_seen": 255215860, + "step": 11831, + "time_per_iteration": 3.963836908340454 + }, + { + "auxiliary_loss_clip": 0.01135474, + "auxiliary_loss_mlp": 0.01102957, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.0004375, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.597875599323244, + "language_loss": 0.7741555, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79653978, + "num_input_tokens_seen": 255235425, + "step": 11832, + "time_per_iteration": 2.6337978839874268 + }, + { + "auxiliary_loss_clip": 0.01142242, + "auxiliary_loss_mlp": 0.01078162, + "balance_loss_clip": 1.00087619, + "balance_loss_mlp": 0.99996024, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7431323816592678, + "language_loss": 0.56623936, + "learning_rate": 8.115800987478059e-07, + "loss": 0.5884434, + "num_input_tokens_seen": 255291680, + "step": 11833, + "time_per_iteration": 3.067823886871338 + }, + { + "auxiliary_loss_clip": 0.01100365, + "auxiliary_loss_mlp": 0.01102872, + "balance_loss_clip": 1.00159776, + "balance_loss_mlp": 1.000543, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 2.320333509648756, + "language_loss": 0.70946157, + "learning_rate": 8.11266873367315e-07, + "loss": 0.73149395, + "num_input_tokens_seen": 255313880, + "step": 11834, + "time_per_iteration": 2.744389057159424 + }, + { + "auxiliary_loss_clip": 0.01164955, + "auxiliary_loss_mlp": 0.01104038, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.00046921, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 2.441063859237715, + "language_loss": 0.79474837, + "learning_rate": 8.10953693063704e-07, + "loss": 0.8174383, + "num_input_tokens_seen": 255332390, + "step": 11835, + "time_per_iteration": 2.5851809978485107 + }, + { + "auxiliary_loss_clip": 0.01147946, + "auxiliary_loss_mlp": 0.01102713, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00038385, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.5455481668347333, + "language_loss": 0.75776899, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78027552, + "num_input_tokens_seen": 255354025, + "step": 11836, + "time_per_iteration": 2.6734414100646973 + }, + { + "auxiliary_loss_clip": 0.01071077, + "auxiliary_loss_mlp": 0.0110255, + "balance_loss_clip": 1.00145447, + "balance_loss_mlp": 1.00050688, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.9415784612456437, + "language_loss": 0.70003438, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72177064, + "num_input_tokens_seen": 255371400, + "step": 11837, + "time_per_iteration": 2.8139030933380127 + }, + { + "auxiliary_loss_clip": 0.01148309, + "auxiliary_loss_mlp": 0.01104215, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00064552, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.9966402236995429, + "language_loss": 0.6199888, + "learning_rate": 8.100144227328958e-07, + "loss": 0.64251411, + "num_input_tokens_seen": 255390710, + "step": 11838, + "time_per_iteration": 2.6263372898101807 + }, + { + "auxiliary_loss_clip": 0.01148201, + "auxiliary_loss_mlp": 0.01103561, + "balance_loss_clip": 1.00187659, + "balance_loss_mlp": 1.00037313, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.511419061700788, + "language_loss": 0.67747509, + "learning_rate": 8.097014228555426e-07, + "loss": 0.69999272, + "num_input_tokens_seen": 255408790, + "step": 11839, + "time_per_iteration": 2.6061835289001465 + }, + { + "auxiliary_loss_clip": 0.01164753, + "auxiliary_loss_mlp": 0.0110393, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00064731, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 1.8716326227608002, + "language_loss": 0.83920479, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86189163, + "num_input_tokens_seen": 255426280, + "step": 11840, + "time_per_iteration": 2.5468947887420654 + }, + { + "auxiliary_loss_clip": 0.01133276, + "auxiliary_loss_mlp": 0.01103712, + "balance_loss_clip": 1.0017556, + "balance_loss_mlp": 1.00052488, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 1.8489152424999433, + "language_loss": 0.77045417, + "learning_rate": 8.090755585214277e-07, + "loss": 0.79282403, + "num_input_tokens_seen": 255442935, + "step": 11841, + "time_per_iteration": 2.580247402191162 + }, + { + "auxiliary_loss_clip": 0.01134557, + "auxiliary_loss_mlp": 0.01103662, + "balance_loss_clip": 1.00180972, + "balance_loss_mlp": 1.00056982, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.799372052932021, + "language_loss": 0.74776423, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77014643, + "num_input_tokens_seen": 255460925, + "step": 11842, + "time_per_iteration": 2.6359777450561523 + }, + { + "auxiliary_loss_clip": 0.01143718, + "auxiliary_loss_mlp": 0.01079286, + "balance_loss_clip": 1.00169778, + "balance_loss_mlp": 1.00032139, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.833717649295129, + "language_loss": 0.61623406, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63846409, + "num_input_tokens_seen": 255521360, + "step": 11843, + "time_per_iteration": 3.1320645809173584 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01102506, + "balance_loss_clip": 1.00196147, + "balance_loss_mlp": 1.00046301, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.785195596216216, + "language_loss": 0.80353862, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82621288, + "num_input_tokens_seen": 255541435, + "step": 11844, + "time_per_iteration": 2.5675384998321533 + }, + { + "auxiliary_loss_clip": 0.01103792, + "auxiliary_loss_mlp": 0.01102921, + "balance_loss_clip": 1.00174844, + "balance_loss_mlp": 1.00040054, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.4357432307709987, + "language_loss": 0.7895357, + "learning_rate": 8.078243718677873e-07, + "loss": 0.81160283, + "num_input_tokens_seen": 255558505, + "step": 11845, + "time_per_iteration": 2.6697592735290527 + }, + { + "auxiliary_loss_clip": 0.01150189, + "auxiliary_loss_mlp": 0.01102932, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00045955, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 2.008558828111982, + "language_loss": 0.7689538, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79148495, + "num_input_tokens_seen": 255577815, + "step": 11846, + "time_per_iteration": 2.642418622970581 + }, + { + "auxiliary_loss_clip": 0.01150301, + "auxiliary_loss_mlp": 0.01104155, + "balance_loss_clip": 1.00193858, + "balance_loss_mlp": 1.00049019, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.844265462363675, + "language_loss": 0.58188593, + "learning_rate": 8.071990497380421e-07, + "loss": 0.6044305, + "num_input_tokens_seen": 255595885, + "step": 11847, + "time_per_iteration": 2.5925099849700928 + }, + { + "auxiliary_loss_clip": 0.01147532, + "auxiliary_loss_mlp": 0.00747133, + "balance_loss_clip": 1.00200045, + "balance_loss_mlp": 1.00048518, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.4088631096221524, + "language_loss": 0.71560282, + "learning_rate": 8.068864565139395e-07, + "loss": 0.73454952, + "num_input_tokens_seen": 255616750, + "step": 11848, + "time_per_iteration": 2.5983760356903076 + }, + { + "auxiliary_loss_clip": 0.01144516, + "auxiliary_loss_mlp": 0.01078101, + "balance_loss_clip": 1.00081801, + "balance_loss_mlp": 0.99990016, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8710504251617877, + "language_loss": 0.63051128, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65273738, + "num_input_tokens_seen": 255677900, + "step": 11849, + "time_per_iteration": 3.1238274574279785 + }, + { + "auxiliary_loss_clip": 0.01133704, + "auxiliary_loss_mlp": 0.01103381, + "balance_loss_clip": 1.00173593, + "balance_loss_mlp": 1.00057518, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.7397716111482313, + "language_loss": 0.64179188, + "learning_rate": 8.0626140580654e-07, + "loss": 0.66416276, + "num_input_tokens_seen": 255699140, + "step": 11850, + "time_per_iteration": 2.768805980682373 + }, + { + "auxiliary_loss_clip": 0.01150198, + "auxiliary_loss_mlp": 0.01103208, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00040233, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.630705234832993, + "language_loss": 0.6971792, + "learning_rate": 8.05948948346946e-07, + "loss": 0.71971327, + "num_input_tokens_seen": 255719640, + "step": 11851, + "time_per_iteration": 2.7031235694885254 + }, + { + "auxiliary_loss_clip": 0.01149297, + "auxiliary_loss_mlp": 0.01102001, + "balance_loss_clip": 1.00193572, + "balance_loss_mlp": 1.00043464, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 2.0621885594018834, + "language_loss": 0.83354032, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85605329, + "num_input_tokens_seen": 255740450, + "step": 11852, + "time_per_iteration": 2.6100099086761475 + }, + { + "auxiliary_loss_clip": 0.01150187, + "auxiliary_loss_mlp": 0.00747482, + "balance_loss_clip": 1.00190306, + "balance_loss_mlp": 1.00060821, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.4871204492220578, + "language_loss": 0.72877109, + "learning_rate": 8.053241692752126e-07, + "loss": 0.74774784, + "num_input_tokens_seen": 255758070, + "step": 11853, + "time_per_iteration": 3.9071428775787354 + }, + { + "auxiliary_loss_clip": 0.01116305, + "auxiliary_loss_mlp": 0.01101267, + "balance_loss_clip": 1.00175273, + "balance_loss_mlp": 1.00046349, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 1.9353175820070982, + "language_loss": 0.92590141, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94807714, + "num_input_tokens_seen": 255775685, + "step": 11854, + "time_per_iteration": 2.611736536026001 + }, + { + "auxiliary_loss_clip": 0.01148268, + "auxiliary_loss_mlp": 0.01102248, + "balance_loss_clip": 1.00174141, + "balance_loss_mlp": 1.0004909, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 1.901847772291841, + "language_loss": 0.79293406, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81543922, + "num_input_tokens_seen": 255794750, + "step": 11855, + "time_per_iteration": 2.536709785461426 + }, + { + "auxiliary_loss_clip": 0.01102341, + "auxiliary_loss_mlp": 0.01103755, + "balance_loss_clip": 1.00184608, + "balance_loss_mlp": 1.00037718, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.640758074754376, + "language_loss": 0.72841579, + "learning_rate": 8.043873404639192e-07, + "loss": 0.75047672, + "num_input_tokens_seen": 255813325, + "step": 11856, + "time_per_iteration": 4.031708002090454 + }, + { + "auxiliary_loss_clip": 0.01148617, + "auxiliary_loss_mlp": 0.01103395, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.00058913, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.766120309250239, + "language_loss": 0.70186657, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72438669, + "num_input_tokens_seen": 255832470, + "step": 11857, + "time_per_iteration": 2.593008279800415 + }, + { + "auxiliary_loss_clip": 0.01149946, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00049853, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.2969989945658273, + "language_loss": 0.84908259, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87160647, + "num_input_tokens_seen": 255849740, + "step": 11858, + "time_per_iteration": 2.572049617767334 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01103646, + "balance_loss_clip": 1.00197792, + "balance_loss_mlp": 1.0004586, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.747204150668056, + "language_loss": 0.80546117, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82814795, + "num_input_tokens_seen": 255866975, + "step": 11859, + "time_per_iteration": 2.55295991897583 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.01102751, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00042176, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.2723161682937254, + "language_loss": 0.6890167, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71135664, + "num_input_tokens_seen": 255892915, + "step": 11860, + "time_per_iteration": 2.9523086547851562 + }, + { + "auxiliary_loss_clip": 0.0114799, + "auxiliary_loss_mlp": 0.01103588, + "balance_loss_clip": 1.00179076, + "balance_loss_mlp": 1.00044835, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 2.2278011609159387, + "language_loss": 0.64098501, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66350079, + "num_input_tokens_seen": 255911480, + "step": 11861, + "time_per_iteration": 2.6646339893341064 + }, + { + "auxiliary_loss_clip": 0.01133028, + "auxiliary_loss_mlp": 0.01104699, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00046277, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 2.192198615190472, + "language_loss": 0.67314142, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69551873, + "num_input_tokens_seen": 255931140, + "step": 11862, + "time_per_iteration": 2.6580750942230225 + }, + { + "auxiliary_loss_clip": 0.01133251, + "auxiliary_loss_mlp": 0.01102498, + "balance_loss_clip": 1.00191402, + "balance_loss_mlp": 1.00055003, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 2.4892375591665665, + "language_loss": 0.67097378, + "learning_rate": 8.022029939445214e-07, + "loss": 0.69333124, + "num_input_tokens_seen": 255951665, + "step": 11863, + "time_per_iteration": 2.679049491882324 + }, + { + "auxiliary_loss_clip": 0.01100415, + "auxiliary_loss_mlp": 0.01104144, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00057518, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 1.9009135804421777, + "language_loss": 0.65621203, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67825758, + "num_input_tokens_seen": 255970055, + "step": 11864, + "time_per_iteration": 2.727846145629883 + }, + { + "auxiliary_loss_clip": 0.01150308, + "auxiliary_loss_mlp": 0.01104107, + "balance_loss_clip": 1.00203454, + "balance_loss_mlp": 1.00063324, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 2.3360933522832226, + "language_loss": 0.86097372, + "learning_rate": 8.015793035467697e-07, + "loss": 0.88351786, + "num_input_tokens_seen": 255987720, + "step": 11865, + "time_per_iteration": 2.5846054553985596 + }, + { + "auxiliary_loss_clip": 0.01118882, + "auxiliary_loss_mlp": 0.01102502, + "balance_loss_clip": 1.00182021, + "balance_loss_mlp": 1.00045872, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 2.3574720951935686, + "language_loss": 0.7505095, + "learning_rate": 8.012675265083304e-07, + "loss": 0.77272332, + "num_input_tokens_seen": 256005490, + "step": 11866, + "time_per_iteration": 2.6314737796783447 + }, + { + "auxiliary_loss_clip": 0.01119136, + "auxiliary_loss_mlp": 0.01104523, + "balance_loss_clip": 1.00201261, + "balance_loss_mlp": 1.00047755, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 2.208717141446268, + "language_loss": 0.7069214, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72915804, + "num_input_tokens_seen": 256026030, + "step": 11867, + "time_per_iteration": 4.117161989212036 + }, + { + "auxiliary_loss_clip": 0.01147959, + "auxiliary_loss_mlp": 0.01102528, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00048459, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 4.2395051980502645, + "language_loss": 0.71995366, + "learning_rate": 8.006441088114397e-07, + "loss": 0.74245858, + "num_input_tokens_seen": 256043680, + "step": 11868, + "time_per_iteration": 2.565556526184082 + }, + { + "auxiliary_loss_clip": 0.01099306, + "auxiliary_loss_mlp": 0.01103718, + "balance_loss_clip": 1.0018034, + "balance_loss_mlp": 1.00033939, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 11.025247928098855, + "language_loss": 0.6603111, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68234134, + "num_input_tokens_seen": 256059705, + "step": 11869, + "time_per_iteration": 4.029053211212158 + }, + { + "auxiliary_loss_clip": 0.011335, + "auxiliary_loss_mlp": 0.01103094, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00038362, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.5978504110371874, + "language_loss": 0.77818781, + "learning_rate": 8.000208730333298e-07, + "loss": 0.8005538, + "num_input_tokens_seen": 256079785, + "step": 11870, + "time_per_iteration": 2.6693267822265625 + }, + { + "auxiliary_loss_clip": 0.011017, + "auxiliary_loss_mlp": 0.01103895, + "balance_loss_clip": 1.00189745, + "balance_loss_mlp": 1.00051701, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.6614416956816351, + "language_loss": 0.81448996, + "learning_rate": 7.997093233933597e-07, + "loss": 0.83654594, + "num_input_tokens_seen": 256099000, + "step": 11871, + "time_per_iteration": 2.71527099609375 + }, + { + "auxiliary_loss_clip": 0.01116329, + "auxiliary_loss_mlp": 0.01104251, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.00068212, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 2.6862743956830815, + "language_loss": 0.79045254, + "learning_rate": 7.993978192685331e-07, + "loss": 0.81265837, + "num_input_tokens_seen": 256117985, + "step": 11872, + "time_per_iteration": 2.6598551273345947 + }, + { + "auxiliary_loss_clip": 0.01148408, + "auxiliary_loss_mlp": 0.01104295, + "balance_loss_clip": 1.00184238, + "balance_loss_mlp": 1.00043941, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 3.178446258728924, + "language_loss": 0.83861673, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86114371, + "num_input_tokens_seen": 256134350, + "step": 11873, + "time_per_iteration": 2.5579583644866943 + }, + { + "auxiliary_loss_clip": 0.01116545, + "auxiliary_loss_mlp": 0.01103144, + "balance_loss_clip": 1.00173831, + "balance_loss_mlp": 1.00043321, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 1.9845535469622866, + "language_loss": 0.85973716, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88193405, + "num_input_tokens_seen": 256150610, + "step": 11874, + "time_per_iteration": 2.6156604290008545 + }, + { + "auxiliary_loss_clip": 0.01149554, + "auxiliary_loss_mlp": 0.01104012, + "balance_loss_clip": 1.00185406, + "balance_loss_mlp": 1.00044298, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 2.1165874170839682, + "language_loss": 0.83131033, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85384595, + "num_input_tokens_seen": 256168620, + "step": 11875, + "time_per_iteration": 2.5497851371765137 + }, + { + "auxiliary_loss_clip": 0.01133415, + "auxiliary_loss_mlp": 0.01104427, + "balance_loss_clip": 1.0018822, + "balance_loss_mlp": 1.00057197, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.9803781696294296, + "language_loss": 0.69779027, + "learning_rate": 7.981522581568721e-07, + "loss": 0.72016865, + "num_input_tokens_seen": 256186700, + "step": 11876, + "time_per_iteration": 2.602728843688965 + }, + { + "auxiliary_loss_clip": 0.01164949, + "auxiliary_loss_mlp": 0.01103297, + "balance_loss_clip": 1.00195682, + "balance_loss_mlp": 1.00049067, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 2.0104681898046275, + "language_loss": 0.77740812, + "learning_rate": 7.978409817849079e-07, + "loss": 0.80009061, + "num_input_tokens_seen": 256205390, + "step": 11877, + "time_per_iteration": 2.513625144958496 + }, + { + "auxiliary_loss_clip": 0.01148068, + "auxiliary_loss_mlp": 0.01102821, + "balance_loss_clip": 1.00191462, + "balance_loss_mlp": 1.00049186, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 2.1313126158359204, + "language_loss": 0.6925509, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71505976, + "num_input_tokens_seen": 256224575, + "step": 11878, + "time_per_iteration": 2.560638904571533 + }, + { + "auxiliary_loss_clip": 0.01114125, + "auxiliary_loss_mlp": 0.01102498, + "balance_loss_clip": 1.00181603, + "balance_loss_mlp": 1.00055003, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 1.9144426905249876, + "language_loss": 0.67281258, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69497883, + "num_input_tokens_seen": 256242130, + "step": 11879, + "time_per_iteration": 2.672656297683716 + }, + { + "auxiliary_loss_clip": 0.01101771, + "auxiliary_loss_mlp": 0.01103591, + "balance_loss_clip": 1.00177681, + "balance_loss_mlp": 1.00040364, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 1.993463828565993, + "language_loss": 0.68923795, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71129161, + "num_input_tokens_seen": 256261920, + "step": 11880, + "time_per_iteration": 2.694578170776367 + }, + { + "auxiliary_loss_clip": 0.01133643, + "auxiliary_loss_mlp": 0.01104233, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00056899, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 3.0468796201024424, + "language_loss": 0.80761564, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82999444, + "num_input_tokens_seen": 256277970, + "step": 11881, + "time_per_iteration": 2.634662628173828 + }, + { + "auxiliary_loss_clip": 0.01118485, + "auxiliary_loss_mlp": 0.01103747, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00055933, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.7160149062816554, + "language_loss": 0.63588482, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65810716, + "num_input_tokens_seen": 256298205, + "step": 11882, + "time_per_iteration": 2.7463104724884033 + }, + { + "auxiliary_loss_clip": 0.01164848, + "auxiliary_loss_mlp": 0.01103685, + "balance_loss_clip": 1.00198245, + "balance_loss_mlp": 1.00040221, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 1.7503555933145185, + "language_loss": 0.68915689, + "learning_rate": 7.959742812719304e-07, + "loss": 0.7118423, + "num_input_tokens_seen": 256316685, + "step": 11883, + "time_per_iteration": 2.511861801147461 + }, + { + "auxiliary_loss_clip": 0.01150014, + "auxiliary_loss_mlp": 0.01103835, + "balance_loss_clip": 1.00199294, + "balance_loss_mlp": 1.0005517, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 1.9588271463073654, + "language_loss": 0.77580649, + "learning_rate": 7.956633242496788e-07, + "loss": 0.79834497, + "num_input_tokens_seen": 256334205, + "step": 11884, + "time_per_iteration": 2.605978488922119 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01104211, + "balance_loss_clip": 1.00186098, + "balance_loss_mlp": 1.00045097, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 2.172336092407925, + "language_loss": 0.74039972, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76294577, + "num_input_tokens_seen": 256353340, + "step": 11885, + "time_per_iteration": 2.584876537322998 + }, + { + "auxiliary_loss_clip": 0.01127713, + "auxiliary_loss_mlp": 0.01078278, + "balance_loss_clip": 1.00079489, + "balance_loss_mlp": 1.00007653, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8815120084704142, + "language_loss": 0.66365939, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68571931, + "num_input_tokens_seen": 256411550, + "step": 11886, + "time_per_iteration": 3.137631416320801 + }, + { + "auxiliary_loss_clip": 0.01098906, + "auxiliary_loss_mlp": 0.01102302, + "balance_loss_clip": 1.00164318, + "balance_loss_mlp": 1.00035441, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 1.7507955429861386, + "language_loss": 0.7543751, + "learning_rate": 7.947307272414874e-07, + "loss": 0.7763871, + "num_input_tokens_seen": 256430360, + "step": 11887, + "time_per_iteration": 2.6897377967834473 + }, + { + "auxiliary_loss_clip": 0.01147558, + "auxiliary_loss_mlp": 0.01102901, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00038075, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.561162972230485, + "language_loss": 0.71453923, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73704386, + "num_input_tokens_seen": 256449750, + "step": 11888, + "time_per_iteration": 2.542884349822998 + }, + { + "auxiliary_loss_clip": 0.01150275, + "auxiliary_loss_mlp": 0.01103487, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00058579, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 147.9586565821841, + "language_loss": 0.84434807, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86688572, + "num_input_tokens_seen": 256467330, + "step": 11889, + "time_per_iteration": 2.565129041671753 + }, + { + "auxiliary_loss_clip": 0.01099761, + "auxiliary_loss_mlp": 0.01103249, + "balance_loss_clip": 1.00167501, + "balance_loss_mlp": 1.00044298, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 1.882062252317216, + "language_loss": 0.75950503, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78153515, + "num_input_tokens_seen": 256485705, + "step": 11890, + "time_per_iteration": 4.175903081893921 + }, + { + "auxiliary_loss_clip": 0.0111837, + "auxiliary_loss_mlp": 0.01103032, + "balance_loss_clip": 1.00182879, + "balance_loss_mlp": 1.00051165, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 2.401109049211083, + "language_loss": 0.73896468, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76117867, + "num_input_tokens_seen": 256504755, + "step": 11891, + "time_per_iteration": 2.6651952266693115 + }, + { + "auxiliary_loss_clip": 0.01101931, + "auxiliary_loss_mlp": 0.01103711, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00052381, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 2.0217158854128634, + "language_loss": 0.68035758, + "learning_rate": 7.931773131302211e-07, + "loss": 0.70241404, + "num_input_tokens_seen": 256523670, + "step": 11892, + "time_per_iteration": 2.702946662902832 + }, + { + "auxiliary_loss_clip": 0.01115276, + "auxiliary_loss_mlp": 0.01105158, + "balance_loss_clip": 1.00172257, + "balance_loss_mlp": 1.00053954, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 2.1382291467666388, + "language_loss": 0.74002117, + "learning_rate": 7.928667675493632e-07, + "loss": 0.76222551, + "num_input_tokens_seen": 256542225, + "step": 11893, + "time_per_iteration": 2.654179573059082 + }, + { + "auxiliary_loss_clip": 0.0116519, + "auxiliary_loss_mlp": 0.01104464, + "balance_loss_clip": 1.00205839, + "balance_loss_mlp": 1.00051367, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.290900558422401, + "language_loss": 0.66172433, + "learning_rate": 7.925562677431185e-07, + "loss": 0.68442094, + "num_input_tokens_seen": 256560730, + "step": 11894, + "time_per_iteration": 3.8766117095947266 + }, + { + "auxiliary_loss_clip": 0.01117186, + "auxiliary_loss_mlp": 0.01103596, + "balance_loss_clip": 1.00176978, + "balance_loss_mlp": 1.00050402, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 2.853197724982347, + "language_loss": 0.77630138, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79850924, + "num_input_tokens_seen": 256580505, + "step": 11895, + "time_per_iteration": 2.6892824172973633 + }, + { + "auxiliary_loss_clip": 0.01148043, + "auxiliary_loss_mlp": 0.01102936, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.00041556, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 1.8152973794109224, + "language_loss": 0.69034654, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71285635, + "num_input_tokens_seen": 256597330, + "step": 11896, + "time_per_iteration": 2.60123872756958 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.01103876, + "balance_loss_clip": 1.00170803, + "balance_loss_mlp": 1.00059342, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 1.7506278460838913, + "language_loss": 0.86426115, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88661516, + "num_input_tokens_seen": 256616030, + "step": 11897, + "time_per_iteration": 2.694850444793701 + }, + { + "auxiliary_loss_clip": 0.01132821, + "auxiliary_loss_mlp": 0.01102542, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00049901, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 2.0549701305407133, + "language_loss": 0.78394026, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80629385, + "num_input_tokens_seen": 256635570, + "step": 11898, + "time_per_iteration": 2.623929023742676 + }, + { + "auxiliary_loss_clip": 0.01133491, + "auxiliary_loss_mlp": 0.01104672, + "balance_loss_clip": 1.00196636, + "balance_loss_mlp": 1.00043607, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.6952463163107843, + "language_loss": 0.73158622, + "learning_rate": 7.910044557431302e-07, + "loss": 0.75396788, + "num_input_tokens_seen": 256655290, + "step": 11899, + "time_per_iteration": 2.6666853427886963 + }, + { + "auxiliary_loss_clip": 0.01150095, + "auxiliary_loss_mlp": 0.01103698, + "balance_loss_clip": 1.00178647, + "balance_loss_mlp": 1.0006057, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 2.5826156594928467, + "language_loss": 0.75623679, + "learning_rate": 7.906942308317614e-07, + "loss": 0.77877474, + "num_input_tokens_seen": 256671605, + "step": 11900, + "time_per_iteration": 2.5507428646087646 + }, + { + "auxiliary_loss_clip": 0.01148323, + "auxiliary_loss_mlp": 0.01103402, + "balance_loss_clip": 1.00201488, + "balance_loss_mlp": 1.00050104, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.8538434835263038, + "language_loss": 0.80954456, + "learning_rate": 7.903840517773886e-07, + "loss": 0.83206189, + "num_input_tokens_seen": 256689680, + "step": 11901, + "time_per_iteration": 2.5459818840026855 + }, + { + "auxiliary_loss_clip": 0.01117976, + "auxiliary_loss_mlp": 0.01104448, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00049758, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 3.5078173176754732, + "language_loss": 0.81721497, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83943915, + "num_input_tokens_seen": 256707760, + "step": 11902, + "time_per_iteration": 2.598830223083496 + }, + { + "auxiliary_loss_clip": 0.01116446, + "auxiliary_loss_mlp": 0.01103717, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00033844, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 2.4218142215961844, + "language_loss": 0.68224889, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70445049, + "num_input_tokens_seen": 256724150, + "step": 11903, + "time_per_iteration": 2.6359899044036865 + }, + { + "auxiliary_loss_clip": 0.01116838, + "auxiliary_loss_mlp": 0.01101898, + "balance_loss_clip": 1.00164223, + "balance_loss_mlp": 1.00042677, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.6832626327173297, + "language_loss": 0.75691319, + "learning_rate": 7.894537898738589e-07, + "loss": 0.77910054, + "num_input_tokens_seen": 256742780, + "step": 11904, + "time_per_iteration": 4.135519027709961 + }, + { + "auxiliary_loss_clip": 0.01131551, + "auxiliary_loss_mlp": 0.01103731, + "balance_loss_clip": 1.00190568, + "balance_loss_mlp": 1.00054348, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 1.9889525179682748, + "language_loss": 0.72505689, + "learning_rate": 7.891437943650727e-07, + "loss": 0.7474097, + "num_input_tokens_seen": 256761355, + "step": 11905, + "time_per_iteration": 2.5640695095062256 + }, + { + "auxiliary_loss_clip": 0.01114807, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.00057054, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.724127891135691, + "language_loss": 0.78097063, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80315149, + "num_input_tokens_seen": 256781335, + "step": 11906, + "time_per_iteration": 2.655980110168457 + }, + { + "auxiliary_loss_clip": 0.01125462, + "auxiliary_loss_mlp": 0.01078546, + "balance_loss_clip": 1.00074387, + "balance_loss_mlp": 0.99996293, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.7335121664535006, + "language_loss": 0.55313218, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57517225, + "num_input_tokens_seen": 256838890, + "step": 11907, + "time_per_iteration": 3.138270616531372 + }, + { + "auxiliary_loss_clip": 0.01150344, + "auxiliary_loss_mlp": 0.0110292, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00049508, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 2.072000254997135, + "language_loss": 0.69800806, + "learning_rate": 7.882140833804593e-07, + "loss": 0.72054064, + "num_input_tokens_seen": 256858145, + "step": 11908, + "time_per_iteration": 3.9384212493896484 + }, + { + "auxiliary_loss_clip": 0.01105791, + "auxiliary_loss_mlp": 0.01103898, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00051928, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.6367190894429053, + "language_loss": 0.70989239, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73198926, + "num_input_tokens_seen": 256878545, + "step": 11909, + "time_per_iteration": 2.718311309814453 + }, + { + "auxiliary_loss_clip": 0.01148276, + "auxiliary_loss_mlp": 0.01103686, + "balance_loss_clip": 1.00189412, + "balance_loss_mlp": 1.00049853, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 1.9921012357632333, + "language_loss": 0.75254869, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77506828, + "num_input_tokens_seen": 256899920, + "step": 11910, + "time_per_iteration": 2.644557476043701 + }, + { + "auxiliary_loss_clip": 0.01131073, + "auxiliary_loss_mlp": 0.01103377, + "balance_loss_clip": 1.00170302, + "balance_loss_mlp": 1.00047576, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.4527529061967004, + "language_loss": 0.76776201, + "learning_rate": 7.872847859552251e-07, + "loss": 0.79010653, + "num_input_tokens_seen": 256918460, + "step": 11911, + "time_per_iteration": 2.5955705642700195 + }, + { + "auxiliary_loss_clip": 0.01118523, + "auxiliary_loss_mlp": 0.01104076, + "balance_loss_clip": 1.00195098, + "balance_loss_mlp": 1.00041175, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.7157765617086178, + "language_loss": 0.58894074, + "learning_rate": 7.869751121037192e-07, + "loss": 0.61116672, + "num_input_tokens_seen": 256942015, + "step": 11912, + "time_per_iteration": 3.0275399684906006 + }, + { + "auxiliary_loss_clip": 0.01148123, + "auxiliary_loss_mlp": 0.01103512, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00041938, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 2.082748281136718, + "language_loss": 0.78041863, + "learning_rate": 7.866654842502376e-07, + "loss": 0.80293494, + "num_input_tokens_seen": 256961065, + "step": 11913, + "time_per_iteration": 2.5742411613464355 + }, + { + "auxiliary_loss_clip": 0.01131314, + "auxiliary_loss_mlp": 0.01101657, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00047255, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.6028634261973933, + "language_loss": 0.74359244, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76592219, + "num_input_tokens_seen": 256982165, + "step": 11914, + "time_per_iteration": 2.6202054023742676 + }, + { + "auxiliary_loss_clip": 0.01116082, + "auxiliary_loss_mlp": 0.01102614, + "balance_loss_clip": 1.00179803, + "balance_loss_mlp": 1.00047553, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.5953814437225324, + "language_loss": 0.74299705, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76518404, + "num_input_tokens_seen": 256999825, + "step": 11915, + "time_per_iteration": 2.639709949493408 + }, + { + "auxiliary_loss_clip": 0.01164806, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00051558, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 5.750283163404888, + "language_loss": 0.80820084, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83087921, + "num_input_tokens_seen": 257017450, + "step": 11916, + "time_per_iteration": 2.491790294647217 + }, + { + "auxiliary_loss_clip": 0.01068492, + "auxiliary_loss_mlp": 0.01103438, + "balance_loss_clip": 1.00149798, + "balance_loss_mlp": 1.00053596, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.8248133203165369, + "language_loss": 0.68084252, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70256174, + "num_input_tokens_seen": 257035465, + "step": 11917, + "time_per_iteration": 2.742328405380249 + }, + { + "auxiliary_loss_clip": 0.01133518, + "auxiliary_loss_mlp": 0.01103022, + "balance_loss_clip": 1.00183427, + "balance_loss_mlp": 1.00040638, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 1.8233174259358715, + "language_loss": 0.75721163, + "learning_rate": 7.851180353640896e-07, + "loss": 0.77957702, + "num_input_tokens_seen": 257053750, + "step": 11918, + "time_per_iteration": 2.6076407432556152 + }, + { + "auxiliary_loss_clip": 0.01125927, + "auxiliary_loss_mlp": 0.01077787, + "balance_loss_clip": 1.00093353, + "balance_loss_mlp": 0.99996704, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6306213498000321, + "language_loss": 0.53939551, + "learning_rate": 7.848086837452639e-07, + "loss": 0.56143266, + "num_input_tokens_seen": 257121215, + "step": 11919, + "time_per_iteration": 3.207554578781128 + }, + { + "auxiliary_loss_clip": 0.01131503, + "auxiliary_loss_mlp": 0.01104459, + "balance_loss_clip": 1.00194907, + "balance_loss_mlp": 1.00050855, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 38.15513147395759, + "language_loss": 0.68963957, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71199918, + "num_input_tokens_seen": 257143370, + "step": 11920, + "time_per_iteration": 2.651592969894409 + }, + { + "auxiliary_loss_clip": 0.01134863, + "auxiliary_loss_mlp": 0.01102571, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00043225, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 1.8492916789864735, + "language_loss": 0.75223684, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77461123, + "num_input_tokens_seen": 257162160, + "step": 11921, + "time_per_iteration": 2.656146287918091 + }, + { + "auxiliary_loss_clip": 0.0111676, + "auxiliary_loss_mlp": 0.01104846, + "balance_loss_clip": 1.00184405, + "balance_loss_mlp": 1.00051427, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 11.19679786986279, + "language_loss": 0.7537815, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77599758, + "num_input_tokens_seen": 257179300, + "step": 11922, + "time_per_iteration": 2.6306278705596924 + }, + { + "auxiliary_loss_clip": 0.01130049, + "auxiliary_loss_mlp": 0.01077818, + "balance_loss_clip": 1.00081289, + "balance_loss_mlp": 0.99999827, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.74915083485507, + "language_loss": 0.55129731, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57337606, + "num_input_tokens_seen": 257235470, + "step": 11923, + "time_per_iteration": 2.9687650203704834 + }, + { + "auxiliary_loss_clip": 0.01116861, + "auxiliary_loss_mlp": 0.01104081, + "balance_loss_clip": 1.00173008, + "balance_loss_mlp": 1.00051188, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.452882602209828, + "language_loss": 0.76920438, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79141378, + "num_input_tokens_seen": 257255850, + "step": 11924, + "time_per_iteration": 2.712731122970581 + }, + { + "auxiliary_loss_clip": 0.01114632, + "auxiliary_loss_mlp": 0.01102475, + "balance_loss_clip": 1.00172555, + "balance_loss_mlp": 1.00062203, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.893338339677608, + "language_loss": 0.6828388, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70500994, + "num_input_tokens_seen": 257275425, + "step": 11925, + "time_per_iteration": 2.6559035778045654 + }, + { + "auxiliary_loss_clip": 0.01131696, + "auxiliary_loss_mlp": 0.01102025, + "balance_loss_clip": 1.00174248, + "balance_loss_mlp": 1.00045848, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 2.129581731778519, + "language_loss": 0.77514315, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79748029, + "num_input_tokens_seen": 257295740, + "step": 11926, + "time_per_iteration": 2.685365915298462 + }, + { + "auxiliary_loss_clip": 0.01148496, + "auxiliary_loss_mlp": 0.00747424, + "balance_loss_clip": 1.00171685, + "balance_loss_mlp": 1.00049448, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 3.5023995299091912, + "language_loss": 0.77063787, + "learning_rate": 7.823355306660093e-07, + "loss": 0.78959709, + "num_input_tokens_seen": 257315970, + "step": 11927, + "time_per_iteration": 2.589686393737793 + }, + { + "auxiliary_loss_clip": 0.01150139, + "auxiliary_loss_mlp": 0.0110237, + "balance_loss_clip": 1.00201297, + "balance_loss_mlp": 1.00042212, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.6342414062594592, + "language_loss": 0.68989837, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71242344, + "num_input_tokens_seen": 257334230, + "step": 11928, + "time_per_iteration": 4.052880048751831 + }, + { + "auxiliary_loss_clip": 0.0110266, + "auxiliary_loss_mlp": 0.01102191, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00043344, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.9390971793503324, + "language_loss": 0.6535629, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67561144, + "num_input_tokens_seen": 257352145, + "step": 11929, + "time_per_iteration": 2.7113969326019287 + }, + { + "auxiliary_loss_clip": 0.01135561, + "auxiliary_loss_mlp": 0.01103728, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00054014, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 1.8485607523434533, + "language_loss": 0.69339371, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71578658, + "num_input_tokens_seen": 257371460, + "step": 11930, + "time_per_iteration": 2.575265407562256 + }, + { + "auxiliary_loss_clip": 0.01116325, + "auxiliary_loss_mlp": 0.01103459, + "balance_loss_clip": 1.00173211, + "balance_loss_mlp": 1.00036669, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 2.0229714128305716, + "language_loss": 0.80318815, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82538599, + "num_input_tokens_seen": 257390800, + "step": 11931, + "time_per_iteration": 2.634340763092041 + }, + { + "auxiliary_loss_clip": 0.01148538, + "auxiliary_loss_mlp": 0.01102348, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00049567, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.3814436573560274, + "language_loss": 0.78472364, + "learning_rate": 7.80791310264143e-07, + "loss": 0.8072325, + "num_input_tokens_seen": 257407495, + "step": 11932, + "time_per_iteration": 3.9112346172332764 + }, + { + "auxiliary_loss_clip": 0.0114827, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.0004456, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 1.5921021242300086, + "language_loss": 0.74951077, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77201736, + "num_input_tokens_seen": 257429675, + "step": 11933, + "time_per_iteration": 2.6183605194091797 + }, + { + "auxiliary_loss_clip": 0.01164919, + "auxiliary_loss_mlp": 0.01105157, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00044382, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.331158884956383, + "language_loss": 0.69264227, + "learning_rate": 7.801739456490388e-07, + "loss": 0.715343, + "num_input_tokens_seen": 257442765, + "step": 11934, + "time_per_iteration": 2.4850525856018066 + }, + { + "auxiliary_loss_clip": 0.01149458, + "auxiliary_loss_mlp": 0.01104009, + "balance_loss_clip": 1.00174356, + "balance_loss_mlp": 1.00044036, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 1.918638953989172, + "language_loss": 0.86645257, + "learning_rate": 7.798653327195237e-07, + "loss": 0.8889873, + "num_input_tokens_seen": 257459310, + "step": 11935, + "time_per_iteration": 2.5824978351593018 + }, + { + "auxiliary_loss_clip": 0.0112013, + "auxiliary_loss_mlp": 0.01103972, + "balance_loss_clip": 1.00210333, + "balance_loss_mlp": 1.00049829, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.5452660056928669, + "language_loss": 0.74085993, + "learning_rate": 7.795567660576388e-07, + "loss": 0.76310098, + "num_input_tokens_seen": 257484750, + "step": 11936, + "time_per_iteration": 2.8285651206970215 + }, + { + "auxiliary_loss_clip": 0.01158876, + "auxiliary_loss_mlp": 0.0107784, + "balance_loss_clip": 1.00079167, + "balance_loss_mlp": 1.00002062, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7562583237561735, + "language_loss": 0.55979359, + "learning_rate": 7.79248245675082e-07, + "loss": 0.58216083, + "num_input_tokens_seen": 257543110, + "step": 11937, + "time_per_iteration": 3.069875478744507 + }, + { + "auxiliary_loss_clip": 0.01148412, + "auxiliary_loss_mlp": 0.01103919, + "balance_loss_clip": 1.00195253, + "balance_loss_mlp": 1.00054073, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 2.519643842784039, + "language_loss": 0.54673308, + "learning_rate": 7.789397715835542e-07, + "loss": 0.56925637, + "num_input_tokens_seen": 257567410, + "step": 11938, + "time_per_iteration": 2.6433229446411133 + }, + { + "auxiliary_loss_clip": 0.01147732, + "auxiliary_loss_mlp": 0.01101768, + "balance_loss_clip": 1.00171793, + "balance_loss_mlp": 1.00039279, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.5045848238023196, + "language_loss": 0.76609957, + "learning_rate": 7.786313437947527e-07, + "loss": 0.78859454, + "num_input_tokens_seen": 257586270, + "step": 11939, + "time_per_iteration": 2.5552194118499756 + }, + { + "auxiliary_loss_clip": 0.01127648, + "auxiliary_loss_mlp": 0.01078099, + "balance_loss_clip": 1.00070739, + "balance_loss_mlp": 0.99989796, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.753393495620709, + "language_loss": 0.61443412, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63649154, + "num_input_tokens_seen": 257647415, + "step": 11940, + "time_per_iteration": 3.1079750061035156 + }, + { + "auxiliary_loss_clip": 0.011165, + "auxiliary_loss_mlp": 0.01102727, + "balance_loss_clip": 1.00176501, + "balance_loss_mlp": 1.00039816, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.7319785732573163, + "language_loss": 0.59215754, + "learning_rate": 7.780146271721097e-07, + "loss": 0.61434984, + "num_input_tokens_seen": 257669795, + "step": 11941, + "time_per_iteration": 2.7061970233917236 + }, + { + "auxiliary_loss_clip": 0.01132725, + "auxiliary_loss_mlp": 0.01103478, + "balance_loss_clip": 1.00181818, + "balance_loss_mlp": 1.00038624, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 2.0660343391799105, + "language_loss": 0.79262269, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81498468, + "num_input_tokens_seen": 257687415, + "step": 11942, + "time_per_iteration": 4.083225250244141 + }, + { + "auxiliary_loss_clip": 0.0114805, + "auxiliary_loss_mlp": 0.01103793, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00060487, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 3.173653436095552, + "language_loss": 0.66269875, + "learning_rate": 7.773980959006968e-07, + "loss": 0.68521714, + "num_input_tokens_seen": 257706215, + "step": 11943, + "time_per_iteration": 2.6043856143951416 + }, + { + "auxiliary_loss_clip": 0.01164684, + "auxiliary_loss_mlp": 0.01103062, + "balance_loss_clip": 1.00186074, + "balance_loss_mlp": 1.00044668, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.7373451100027104, + "language_loss": 0.78931522, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81199276, + "num_input_tokens_seen": 257724740, + "step": 11944, + "time_per_iteration": 2.5348665714263916 + }, + { + "auxiliary_loss_clip": 0.01133362, + "auxiliary_loss_mlp": 0.00747412, + "balance_loss_clip": 1.00195646, + "balance_loss_mlp": 1.00049937, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.096865076959307, + "language_loss": 0.62671572, + "learning_rate": 7.767817500740277e-07, + "loss": 0.64552343, + "num_input_tokens_seen": 257742060, + "step": 11945, + "time_per_iteration": 3.9742558002471924 + }, + { + "auxiliary_loss_clip": 0.01142731, + "auxiliary_loss_mlp": 0.01077756, + "balance_loss_clip": 1.00083876, + "balance_loss_mlp": 0.99993658, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7047787328132636, + "language_loss": 0.5111419, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53334677, + "num_input_tokens_seen": 257802250, + "step": 11946, + "time_per_iteration": 3.123749256134033 + }, + { + "auxiliary_loss_clip": 0.01117185, + "auxiliary_loss_mlp": 0.01104437, + "balance_loss_clip": 1.00178647, + "balance_loss_mlp": 1.00048649, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 1.6937267293783491, + "language_loss": 0.74557167, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76778787, + "num_input_tokens_seen": 257821155, + "step": 11947, + "time_per_iteration": 2.6652019023895264 + }, + { + "auxiliary_loss_clip": 0.01099232, + "auxiliary_loss_mlp": 0.00747258, + "balance_loss_clip": 1.00161815, + "balance_loss_mlp": 1.00049293, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.5380315724076756, + "language_loss": 0.72407252, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74253738, + "num_input_tokens_seen": 257839905, + "step": 11948, + "time_per_iteration": 2.6719582080841064 + }, + { + "auxiliary_loss_clip": 0.01135423, + "auxiliary_loss_mlp": 0.01104112, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00054252, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.811266611553126, + "language_loss": 0.71752822, + "learning_rate": 7.755496151288483e-07, + "loss": 0.7399236, + "num_input_tokens_seen": 257860055, + "step": 11949, + "time_per_iteration": 2.6112420558929443 + }, + { + "auxiliary_loss_clip": 0.0116482, + "auxiliary_loss_mlp": 0.00747312, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00043845, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 1.732059016892416, + "language_loss": 0.76157463, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78069597, + "num_input_tokens_seen": 257879315, + "step": 11950, + "time_per_iteration": 2.5921690464019775 + }, + { + "auxiliary_loss_clip": 0.01164858, + "auxiliary_loss_mlp": 0.0110393, + "balance_loss_clip": 1.0019623, + "balance_loss_mlp": 1.00045633, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.5326631256118235, + "language_loss": 0.67246711, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69515502, + "num_input_tokens_seen": 257896570, + "step": 11951, + "time_per_iteration": 2.4999356269836426 + }, + { + "auxiliary_loss_clip": 0.01133577, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_clip": 1.00191402, + "balance_loss_mlp": 1.00051749, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.7030147808207938, + "language_loss": 0.78516984, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80754834, + "num_input_tokens_seen": 257916855, + "step": 11952, + "time_per_iteration": 2.6054916381835938 + }, + { + "auxiliary_loss_clip": 0.01150271, + "auxiliary_loss_mlp": 0.01104174, + "balance_loss_clip": 1.00189579, + "balance_loss_mlp": 1.00050998, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 2.086488680302078, + "language_loss": 0.7486369, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77118134, + "num_input_tokens_seen": 257937140, + "step": 11953, + "time_per_iteration": 2.5957703590393066 + }, + { + "auxiliary_loss_clip": 0.01149522, + "auxiliary_loss_mlp": 0.01104272, + "balance_loss_clip": 1.00180924, + "balance_loss_mlp": 1.00041699, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 2.0400998230334366, + "language_loss": 0.73079443, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75333238, + "num_input_tokens_seen": 257956785, + "step": 11954, + "time_per_iteration": 2.5572004318237305 + }, + { + "auxiliary_loss_clip": 0.01131414, + "auxiliary_loss_mlp": 0.01103645, + "balance_loss_clip": 1.00197828, + "balance_loss_mlp": 1.00055313, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.6976044059798665, + "language_loss": 0.74342096, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76577157, + "num_input_tokens_seen": 257975455, + "step": 11955, + "time_per_iteration": 2.5890796184539795 + }, + { + "auxiliary_loss_clip": 0.01118928, + "auxiliary_loss_mlp": 0.01103409, + "balance_loss_clip": 1.00171328, + "balance_loss_mlp": 1.00041223, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.844826351731144, + "language_loss": 0.7328769, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75510025, + "num_input_tokens_seen": 257996850, + "step": 11956, + "time_per_iteration": 2.70589017868042 + }, + { + "auxiliary_loss_clip": 0.01070317, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_clip": 1.00160575, + "balance_loss_mlp": 1.00048184, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 1.8914226398186884, + "language_loss": 0.71187264, + "learning_rate": 7.730875746869987e-07, + "loss": 0.73361444, + "num_input_tokens_seen": 258016145, + "step": 11957, + "time_per_iteration": 2.8024511337280273 + }, + { + "auxiliary_loss_clip": 0.01086648, + "auxiliary_loss_mlp": 0.01104005, + "balance_loss_clip": 1.00157595, + "balance_loss_mlp": 1.00053167, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 2.009984120951958, + "language_loss": 0.73367226, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75557876, + "num_input_tokens_seen": 258035420, + "step": 11958, + "time_per_iteration": 2.811389923095703 + }, + { + "auxiliary_loss_clip": 0.01150047, + "auxiliary_loss_mlp": 0.01102539, + "balance_loss_clip": 1.00191689, + "balance_loss_mlp": 1.00059092, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 4.090899565392837, + "language_loss": 0.84239322, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86491907, + "num_input_tokens_seen": 258053520, + "step": 11959, + "time_per_iteration": 2.6562907695770264 + }, + { + "auxiliary_loss_clip": 0.01164822, + "auxiliary_loss_mlp": 0.01103793, + "balance_loss_clip": 1.00192857, + "balance_loss_mlp": 1.00051045, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 5.122060071590041, + "language_loss": 0.81753933, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84022558, + "num_input_tokens_seen": 258073020, + "step": 11960, + "time_per_iteration": 2.573892116546631 + }, + { + "auxiliary_loss_clip": 0.01133496, + "auxiliary_loss_mlp": 0.01103078, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00055814, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 2.3029880986759554, + "language_loss": 0.7757194, + "learning_rate": 7.718576706841013e-07, + "loss": 0.79808509, + "num_input_tokens_seen": 258093155, + "step": 11961, + "time_per_iteration": 2.6504766941070557 + }, + { + "auxiliary_loss_clip": 0.01147998, + "auxiliary_loss_mlp": 0.01103118, + "balance_loss_clip": 1.00193274, + "balance_loss_mlp": 1.00059795, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 3.274469501543857, + "language_loss": 0.75180948, + "learning_rate": 7.715503110824326e-07, + "loss": 0.7743206, + "num_input_tokens_seen": 258113905, + "step": 11962, + "time_per_iteration": 2.573554039001465 + }, + { + "auxiliary_loss_clip": 0.01147556, + "auxiliary_loss_mlp": 0.01103297, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.0003953, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.580370170799957, + "language_loss": 0.75294179, + "learning_rate": 7.712429980637001e-07, + "loss": 0.77545035, + "num_input_tokens_seen": 258132820, + "step": 11963, + "time_per_iteration": 2.6133430004119873 + }, + { + "auxiliary_loss_clip": 0.01116777, + "auxiliary_loss_mlp": 0.01105095, + "balance_loss_clip": 1.00191355, + "balance_loss_mlp": 1.0004766, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 3.476275887090379, + "language_loss": 0.80583775, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82805651, + "num_input_tokens_seen": 258148055, + "step": 11964, + "time_per_iteration": 2.707437515258789 + }, + { + "auxiliary_loss_clip": 0.01148394, + "auxiliary_loss_mlp": 0.01102608, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.00046945, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 2.0061443384046656, + "language_loss": 0.74819404, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77070403, + "num_input_tokens_seen": 258165995, + "step": 11965, + "time_per_iteration": 2.5683228969573975 + }, + { + "auxiliary_loss_clip": 0.01131765, + "auxiliary_loss_mlp": 0.01103181, + "balance_loss_clip": 1.00180554, + "balance_loss_mlp": 1.00046992, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.5971507087213592, + "language_loss": 0.77806711, + "learning_rate": 7.703213386216377e-07, + "loss": 0.80041647, + "num_input_tokens_seen": 258186165, + "step": 11966, + "time_per_iteration": 4.045959234237671 + }, + { + "auxiliary_loss_clip": 0.0113522, + "auxiliary_loss_mlp": 0.01103214, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00040793, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 2.5727606245960453, + "language_loss": 0.73118556, + "learning_rate": 7.700142120511619e-07, + "loss": 0.7535699, + "num_input_tokens_seen": 258204595, + "step": 11967, + "time_per_iteration": 2.642845630645752 + }, + { + "auxiliary_loss_clip": 0.01133257, + "auxiliary_loss_mlp": 0.01101603, + "balance_loss_clip": 1.00195849, + "balance_loss_mlp": 1.00060844, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 2.2438142016303737, + "language_loss": 0.81386584, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83621442, + "num_input_tokens_seen": 258223110, + "step": 11968, + "time_per_iteration": 2.6191046237945557 + }, + { + "auxiliary_loss_clip": 0.01132353, + "auxiliary_loss_mlp": 0.01102535, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00039697, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 3.4010880431940818, + "language_loss": 0.76602006, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78836894, + "num_input_tokens_seen": 258242660, + "step": 11969, + "time_per_iteration": 2.626096248626709 + }, + { + "auxiliary_loss_clip": 0.01118818, + "auxiliary_loss_mlp": 0.01103217, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.00041127, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.877053066334702, + "language_loss": 0.70877707, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73099744, + "num_input_tokens_seen": 258261850, + "step": 11970, + "time_per_iteration": 4.003031492233276 + }, + { + "auxiliary_loss_clip": 0.01127652, + "auxiliary_loss_mlp": 0.01078181, + "balance_loss_clip": 1.00074029, + "balance_loss_mlp": 0.99997979, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9185546595457782, + "language_loss": 0.60816061, + "learning_rate": 7.68786172297538e-07, + "loss": 0.63021898, + "num_input_tokens_seen": 258312570, + "step": 11971, + "time_per_iteration": 3.0696861743927 + }, + { + "auxiliary_loss_clip": 0.01164951, + "auxiliary_loss_mlp": 0.01104153, + "balance_loss_clip": 1.00184989, + "balance_loss_mlp": 1.000489, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 1.8756255123887478, + "language_loss": 0.80048686, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82317787, + "num_input_tokens_seen": 258331600, + "step": 11972, + "time_per_iteration": 2.544780731201172 + }, + { + "auxiliary_loss_clip": 0.01135123, + "auxiliary_loss_mlp": 0.01103989, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.00051546, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 6.704906229791568, + "language_loss": 0.75913787, + "learning_rate": 7.681724325006733e-07, + "loss": 0.78152901, + "num_input_tokens_seen": 258351785, + "step": 11973, + "time_per_iteration": 2.6328701972961426 + }, + { + "auxiliary_loss_clip": 0.0109615, + "auxiliary_loss_mlp": 0.01077781, + "balance_loss_clip": 1.00070655, + "balance_loss_mlp": 0.99996108, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8456551995515519, + "language_loss": 0.57178497, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59352434, + "num_input_tokens_seen": 258404035, + "step": 11974, + "time_per_iteration": 3.061892509460449 + }, + { + "auxiliary_loss_clip": 0.01131806, + "auxiliary_loss_mlp": 0.01103621, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 1.00043321, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 2.1743179454826156, + "language_loss": 0.61188108, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63423532, + "num_input_tokens_seen": 258424850, + "step": 11975, + "time_per_iteration": 2.667069435119629 + }, + { + "auxiliary_loss_clip": 0.01149925, + "auxiliary_loss_mlp": 0.01102264, + "balance_loss_clip": 1.00179887, + "balance_loss_mlp": 1.000507, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.9175502367631387, + "language_loss": 0.67444324, + "learning_rate": 7.672521731671425e-07, + "loss": 0.6969651, + "num_input_tokens_seen": 258445485, + "step": 11976, + "time_per_iteration": 2.5890824794769287 + }, + { + "auxiliary_loss_clip": 0.01118221, + "auxiliary_loss_mlp": 0.01103153, + "balance_loss_clip": 1.00181162, + "balance_loss_mlp": 1.0003469, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 2.374301868739361, + "language_loss": 0.66977555, + "learning_rate": 7.669455135323004e-07, + "loss": 0.6919893, + "num_input_tokens_seen": 258464505, + "step": 11977, + "time_per_iteration": 2.6535801887512207 + }, + { + "auxiliary_loss_clip": 0.01133502, + "auxiliary_loss_mlp": 0.01104014, + "balance_loss_clip": 1.00182569, + "balance_loss_mlp": 1.00044489, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.5444913824716469, + "language_loss": 0.75323987, + "learning_rate": 7.666389006550074e-07, + "loss": 0.7756151, + "num_input_tokens_seen": 258487190, + "step": 11978, + "time_per_iteration": 2.659261465072632 + }, + { + "auxiliary_loss_clip": 0.01164727, + "auxiliary_loss_mlp": 0.01102622, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.0003885, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 1.7642318188577624, + "language_loss": 0.78644645, + "learning_rate": 7.663323345468908e-07, + "loss": 0.80911994, + "num_input_tokens_seen": 258503790, + "step": 11979, + "time_per_iteration": 2.553213119506836 + }, + { + "auxiliary_loss_clip": 0.01150167, + "auxiliary_loss_mlp": 0.01103786, + "balance_loss_clip": 1.00193715, + "balance_loss_mlp": 1.00050306, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.5562958241063531, + "language_loss": 0.64785349, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67039299, + "num_input_tokens_seen": 258527335, + "step": 11980, + "time_per_iteration": 4.1146790981292725 + }, + { + "auxiliary_loss_clip": 0.0115009, + "auxiliary_loss_mlp": 0.01103095, + "balance_loss_clip": 1.00189412, + "balance_loss_mlp": 1.00047994, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 2.208565098580054, + "language_loss": 0.66689199, + "learning_rate": 7.657193426846871e-07, + "loss": 0.68942386, + "num_input_tokens_seen": 258546690, + "step": 11981, + "time_per_iteration": 2.5937633514404297 + }, + { + "auxiliary_loss_clip": 0.01135341, + "auxiliary_loss_mlp": 0.01103939, + "balance_loss_clip": 1.00203347, + "balance_loss_mlp": 1.000561, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.8199556047519614, + "language_loss": 0.7382704, + "learning_rate": 7.65412916953843e-07, + "loss": 0.76066327, + "num_input_tokens_seen": 258566340, + "step": 11982, + "time_per_iteration": 2.6049368381500244 + }, + { + "auxiliary_loss_clip": 0.01133679, + "auxiliary_loss_mlp": 0.00747393, + "balance_loss_clip": 1.00185251, + "balance_loss_mlp": 1.00047493, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 2.002198062511011, + "language_loss": 0.66144949, + "learning_rate": 7.65106538038665e-07, + "loss": 0.68026018, + "num_input_tokens_seen": 258584455, + "step": 11983, + "time_per_iteration": 3.996455669403076 + }, + { + "auxiliary_loss_clip": 0.01132006, + "auxiliary_loss_mlp": 0.01103108, + "balance_loss_clip": 1.00170159, + "balance_loss_mlp": 1.00049317, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.520049492194114, + "language_loss": 0.66509652, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68744767, + "num_input_tokens_seen": 258604725, + "step": 11984, + "time_per_iteration": 2.6327664852142334 + }, + { + "auxiliary_loss_clip": 0.01150147, + "auxiliary_loss_mlp": 0.01103719, + "balance_loss_clip": 1.0020709, + "balance_loss_mlp": 1.00053191, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.6569835072629147, + "language_loss": 0.73827147, + "learning_rate": 7.644939207017771e-07, + "loss": 0.76081014, + "num_input_tokens_seen": 258622885, + "step": 11985, + "time_per_iteration": 2.581392288208008 + }, + { + "auxiliary_loss_clip": 0.01148181, + "auxiliary_loss_mlp": 0.01102701, + "balance_loss_clip": 1.00186718, + "balance_loss_mlp": 1.00046742, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 2.2047964404570326, + "language_loss": 0.62678856, + "learning_rate": 7.641876823032977e-07, + "loss": 0.64929736, + "num_input_tokens_seen": 258644305, + "step": 11986, + "time_per_iteration": 2.6246423721313477 + }, + { + "auxiliary_loss_clip": 0.01133332, + "auxiliary_loss_mlp": 0.01103907, + "balance_loss_clip": 1.00186086, + "balance_loss_mlp": 1.00052881, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 2.0500553183814696, + "language_loss": 0.72683191, + "learning_rate": 7.638814907669455e-07, + "loss": 0.74920428, + "num_input_tokens_seen": 258661775, + "step": 11987, + "time_per_iteration": 2.5977585315704346 + }, + { + "auxiliary_loss_clip": 0.01130866, + "auxiliary_loss_mlp": 0.01103406, + "balance_loss_clip": 1.00174785, + "balance_loss_mlp": 1.00040913, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.8215297805071486, + "language_loss": 0.78572357, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80806637, + "num_input_tokens_seen": 258679830, + "step": 11988, + "time_per_iteration": 2.604112386703491 + }, + { + "auxiliary_loss_clip": 0.01164781, + "auxiliary_loss_mlp": 0.01102873, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00044811, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 2.168792944474132, + "language_loss": 0.79233158, + "learning_rate": 7.632692483270618e-07, + "loss": 0.81500816, + "num_input_tokens_seen": 258697415, + "step": 11989, + "time_per_iteration": 2.537156581878662 + }, + { + "auxiliary_loss_clip": 0.01164715, + "auxiliary_loss_mlp": 0.01102721, + "balance_loss_clip": 1.00194371, + "balance_loss_mlp": 1.0003922, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 1.8117039683171625, + "language_loss": 0.82417738, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84685177, + "num_input_tokens_seen": 258716755, + "step": 11990, + "time_per_iteration": 2.5730838775634766 + }, + { + "auxiliary_loss_clip": 0.01132936, + "auxiliary_loss_mlp": 0.01103126, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00051069, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 43.74378993254058, + "language_loss": 0.76407218, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78643274, + "num_input_tokens_seen": 258733270, + "step": 11991, + "time_per_iteration": 2.6187591552734375 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01101883, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00041175, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 1.7232028704906415, + "language_loss": 0.72626758, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74845123, + "num_input_tokens_seen": 258755270, + "step": 11992, + "time_per_iteration": 2.7029013633728027 + }, + { + "auxiliary_loss_clip": 0.0114965, + "auxiliary_loss_mlp": 0.01103708, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00042474, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 4.944538053857788, + "language_loss": 0.66289556, + "learning_rate": 7.620453263035755e-07, + "loss": 0.6854291, + "num_input_tokens_seen": 258775340, + "step": 11993, + "time_per_iteration": 2.5811331272125244 + }, + { + "auxiliary_loss_clip": 0.01149541, + "auxiliary_loss_mlp": 0.01103739, + "balance_loss_clip": 1.00178361, + "balance_loss_mlp": 1.00045609, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 2.3163271648449966, + "language_loss": 0.65911734, + "learning_rate": 7.61739463127115e-07, + "loss": 0.6816501, + "num_input_tokens_seen": 258794580, + "step": 11994, + "time_per_iteration": 2.59024977684021 + }, + { + "auxiliary_loss_clip": 0.01150273, + "auxiliary_loss_mlp": 0.01104111, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00044656, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 2.257608818455812, + "language_loss": 0.66488922, + "learning_rate": 7.614336469056172e-07, + "loss": 0.68743306, + "num_input_tokens_seen": 258812330, + "step": 11995, + "time_per_iteration": 2.5334525108337402 + }, + { + "auxiliary_loss_clip": 0.01132734, + "auxiliary_loss_mlp": 0.01102982, + "balance_loss_clip": 1.001858, + "balance_loss_mlp": 1.00046241, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.6742668258917222, + "language_loss": 0.79528618, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81764334, + "num_input_tokens_seen": 258831770, + "step": 11996, + "time_per_iteration": 2.631757974624634 + }, + { + "auxiliary_loss_clip": 0.0116477, + "auxiliary_loss_mlp": 0.01103675, + "balance_loss_clip": 1.00196838, + "balance_loss_mlp": 1.00039244, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 1.9148281899110202, + "language_loss": 0.81486553, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83754998, + "num_input_tokens_seen": 258849090, + "step": 11997, + "time_per_iteration": 2.580138921737671 + }, + { + "auxiliary_loss_clip": 0.01164817, + "auxiliary_loss_mlp": 0.01103366, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00046468, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 2.54798956447781, + "language_loss": 0.66721636, + "learning_rate": 7.605164800868646e-07, + "loss": 0.68989813, + "num_input_tokens_seen": 258868230, + "step": 11998, + "time_per_iteration": 2.5261569023132324 + }, + { + "auxiliary_loss_clip": 0.01164851, + "auxiliary_loss_mlp": 0.01102787, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00045776, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 1.8790614091041646, + "language_loss": 0.72522485, + "learning_rate": 7.602108518011696e-07, + "loss": 0.7479012, + "num_input_tokens_seen": 258885525, + "step": 11999, + "time_per_iteration": 2.516451835632324 + }, + { + "auxiliary_loss_clip": 0.01133091, + "auxiliary_loss_mlp": 0.01102928, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00040758, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.5260399788335035, + "language_loss": 0.83101475, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85337496, + "num_input_tokens_seen": 258903245, + "step": 12000, + "time_per_iteration": 2.57578444480896 + }, + { + "auxiliary_loss_clip": 0.01148169, + "auxiliary_loss_mlp": 0.01103641, + "balance_loss_clip": 1.00186896, + "balance_loss_mlp": 1.00054884, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 1.9642706939021426, + "language_loss": 0.77038217, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79290026, + "num_input_tokens_seen": 258921245, + "step": 12001, + "time_per_iteration": 2.56463885307312 + }, + { + "auxiliary_loss_clip": 0.01150223, + "auxiliary_loss_mlp": 0.01102774, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00053978, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.970529112280425, + "language_loss": 0.81616747, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83869743, + "num_input_tokens_seen": 258939425, + "step": 12002, + "time_per_iteration": 2.5762369632720947 + }, + { + "auxiliary_loss_clip": 0.01148037, + "auxiliary_loss_mlp": 0.01104238, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00047791, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 2.0073034069971953, + "language_loss": 0.62068915, + "learning_rate": 7.589888089035462e-07, + "loss": 0.64321196, + "num_input_tokens_seen": 258960710, + "step": 12003, + "time_per_iteration": 2.7641613483428955 + }, + { + "auxiliary_loss_clip": 0.01164983, + "auxiliary_loss_mlp": 0.01104955, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00052774, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.139382126003483, + "language_loss": 0.69062203, + "learning_rate": 7.586834157983544e-07, + "loss": 0.71332139, + "num_input_tokens_seen": 258978475, + "step": 12004, + "time_per_iteration": 3.930516481399536 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.01078044, + "balance_loss_clip": 1.00153744, + "balance_loss_mlp": 1.00022399, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8578592665071498, + "language_loss": 0.5411396, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56319064, + "num_input_tokens_seen": 259037520, + "step": 12005, + "time_per_iteration": 3.111509323120117 + }, + { + "auxiliary_loss_clip": 0.01115765, + "auxiliary_loss_mlp": 0.01103578, + "balance_loss_clip": 1.00183105, + "balance_loss_mlp": 1.00048542, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.7111518053661607, + "language_loss": 0.63157046, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65376389, + "num_input_tokens_seen": 259061325, + "step": 12006, + "time_per_iteration": 2.7972447872161865 + }, + { + "auxiliary_loss_clip": 0.01133712, + "auxiliary_loss_mlp": 0.0110272, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.0004859, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.8311471999233766, + "language_loss": 0.91940361, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94176793, + "num_input_tokens_seen": 259078135, + "step": 12007, + "time_per_iteration": 2.6101269721984863 + }, + { + "auxiliary_loss_clip": 0.01118264, + "auxiliary_loss_mlp": 0.0110435, + "balance_loss_clip": 1.00168014, + "balance_loss_mlp": 1.00059032, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 1.850330951945998, + "language_loss": 0.6397267, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66195285, + "num_input_tokens_seen": 259095910, + "step": 12008, + "time_per_iteration": 3.9712095260620117 + }, + { + "auxiliary_loss_clip": 0.01147544, + "auxiliary_loss_mlp": 0.01103589, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 2.929827965914815, + "language_loss": 0.78717268, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80968404, + "num_input_tokens_seen": 259114225, + "step": 12009, + "time_per_iteration": 2.5921173095703125 + }, + { + "auxiliary_loss_clip": 0.01148009, + "auxiliary_loss_mlp": 0.01104223, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.00055873, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.9314102480880415, + "language_loss": 0.63999689, + "learning_rate": 7.568520460602297e-07, + "loss": 0.66251922, + "num_input_tokens_seen": 259134660, + "step": 12010, + "time_per_iteration": 2.5980114936828613 + }, + { + "auxiliary_loss_clip": 0.01164888, + "auxiliary_loss_mlp": 0.01103955, + "balance_loss_clip": 1.00195944, + "balance_loss_mlp": 1.00048137, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 2.0111214733612868, + "language_loss": 0.77009821, + "learning_rate": 7.565469826940742e-07, + "loss": 0.7927866, + "num_input_tokens_seen": 259153300, + "step": 12011, + "time_per_iteration": 2.549872875213623 + }, + { + "auxiliary_loss_clip": 0.01150289, + "auxiliary_loss_mlp": 0.01102654, + "balance_loss_clip": 1.00205374, + "balance_loss_mlp": 1.00051498, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.663618238495224, + "language_loss": 0.79272151, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81525099, + "num_input_tokens_seen": 259172115, + "step": 12012, + "time_per_iteration": 2.567579507827759 + }, + { + "auxiliary_loss_clip": 0.0113124, + "auxiliary_loss_mlp": 0.01103017, + "balance_loss_clip": 1.00178206, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.9867761341183414, + "language_loss": 0.75537896, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77772152, + "num_input_tokens_seen": 259191345, + "step": 12013, + "time_per_iteration": 2.58474063873291 + }, + { + "auxiliary_loss_clip": 0.01164666, + "auxiliary_loss_mlp": 0.01103633, + "balance_loss_clip": 1.00194883, + "balance_loss_mlp": 1.00034964, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.588671777590944, + "language_loss": 0.76095891, + "learning_rate": 7.556320755530484e-07, + "loss": 0.78364193, + "num_input_tokens_seen": 259211700, + "step": 12014, + "time_per_iteration": 2.5473525524139404 + }, + { + "auxiliary_loss_clip": 0.01149633, + "auxiliary_loss_mlp": 0.01104133, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.00046873, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.7217896665229975, + "language_loss": 0.86401844, + "learning_rate": 7.553272008637346e-07, + "loss": 0.88655609, + "num_input_tokens_seen": 259233825, + "step": 12015, + "time_per_iteration": 2.623060464859009 + }, + { + "auxiliary_loss_clip": 0.01147915, + "auxiliary_loss_mlp": 0.01102785, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00055122, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.7661952162922143, + "language_loss": 0.78259349, + "learning_rate": 7.55022373372538e-07, + "loss": 0.8051005, + "num_input_tokens_seen": 259253055, + "step": 12016, + "time_per_iteration": 2.5431692600250244 + }, + { + "auxiliary_loss_clip": 0.01118702, + "auxiliary_loss_mlp": 0.01103409, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.0006032, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.5708467841547642, + "language_loss": 0.77805412, + "learning_rate": 7.547175930910186e-07, + "loss": 0.80027521, + "num_input_tokens_seen": 259273420, + "step": 12017, + "time_per_iteration": 4.1530725955963135 + }, + { + "auxiliary_loss_clip": 0.01164604, + "auxiliary_loss_mlp": 0.01102161, + "balance_loss_clip": 1.00181663, + "balance_loss_mlp": 1.00040424, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 2.198460855383326, + "language_loss": 0.73721915, + "learning_rate": 7.54412860030732e-07, + "loss": 0.7598868, + "num_input_tokens_seen": 259291000, + "step": 12018, + "time_per_iteration": 2.539512872695923 + }, + { + "auxiliary_loss_clip": 0.01116403, + "auxiliary_loss_mlp": 0.01101712, + "balance_loss_clip": 1.00181508, + "balance_loss_mlp": 1.00052738, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 1.6559404540746314, + "language_loss": 0.7761634, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79834455, + "num_input_tokens_seen": 259312390, + "step": 12019, + "time_per_iteration": 2.673811674118042 + }, + { + "auxiliary_loss_clip": 0.01132894, + "auxiliary_loss_mlp": 0.01103017, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.0004015, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.7999629917180986, + "language_loss": 0.74199325, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76435232, + "num_input_tokens_seen": 259332645, + "step": 12020, + "time_per_iteration": 2.7276575565338135 + }, + { + "auxiliary_loss_clip": 0.01149954, + "auxiliary_loss_mlp": 0.01103796, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00051343, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.5748799555483004, + "language_loss": 0.77497, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79750746, + "num_input_tokens_seen": 259353810, + "step": 12021, + "time_per_iteration": 4.032118320465088 + }, + { + "auxiliary_loss_clip": 0.01116534, + "auxiliary_loss_mlp": 0.01103278, + "balance_loss_clip": 1.00185752, + "balance_loss_mlp": 1.00056744, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 1.9614250910545323, + "language_loss": 0.68532348, + "learning_rate": 7.531944002330073e-07, + "loss": 0.70752162, + "num_input_tokens_seen": 259372460, + "step": 12022, + "time_per_iteration": 2.6501333713531494 + }, + { + "auxiliary_loss_clip": 0.01149273, + "auxiliary_loss_mlp": 0.0110371, + "balance_loss_clip": 1.00181842, + "balance_loss_mlp": 1.00042748, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.926176832926942, + "language_loss": 0.69195956, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71448934, + "num_input_tokens_seen": 259393275, + "step": 12023, + "time_per_iteration": 2.639547109603882 + }, + { + "auxiliary_loss_clip": 0.01134889, + "auxiliary_loss_mlp": 0.01103207, + "balance_loss_clip": 1.00173736, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 2.06451352343669, + "language_loss": 0.71051598, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73289698, + "num_input_tokens_seen": 259416205, + "step": 12024, + "time_per_iteration": 2.6720306873321533 + }, + { + "auxiliary_loss_clip": 0.01114697, + "auxiliary_loss_mlp": 0.01102278, + "balance_loss_clip": 1.00165939, + "balance_loss_mlp": 1.00061643, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 1.778453182677524, + "language_loss": 0.75363487, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77580464, + "num_input_tokens_seen": 259433115, + "step": 12025, + "time_per_iteration": 2.6044209003448486 + }, + { + "auxiliary_loss_clip": 0.0114985, + "auxiliary_loss_mlp": 0.01102586, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00044775, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 1.9744012815790952, + "language_loss": 0.75830084, + "learning_rate": 7.519766968991395e-07, + "loss": 0.7808252, + "num_input_tokens_seen": 259450475, + "step": 12026, + "time_per_iteration": 2.541821002960205 + }, + { + "auxiliary_loss_clip": 0.01148123, + "auxiliary_loss_mlp": 0.01103809, + "balance_loss_clip": 1.00190651, + "balance_loss_mlp": 1.00052595, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 1.8246875677803571, + "language_loss": 0.66989273, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69241202, + "num_input_tokens_seen": 259469355, + "step": 12027, + "time_per_iteration": 2.6805787086486816 + }, + { + "auxiliary_loss_clip": 0.01098603, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_clip": 1.00161457, + "balance_loss_mlp": 1.00044227, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 2.33796954674, + "language_loss": 0.79245818, + "learning_rate": 7.513681291370469e-07, + "loss": 0.8144834, + "num_input_tokens_seen": 259486565, + "step": 12028, + "time_per_iteration": 2.6888723373413086 + }, + { + "auxiliary_loss_clip": 0.01119966, + "auxiliary_loss_mlp": 0.01102554, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.00041556, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 3.1320802134263444, + "language_loss": 0.82087541, + "learning_rate": 7.510639162726e-07, + "loss": 0.84310067, + "num_input_tokens_seen": 259505070, + "step": 12029, + "time_per_iteration": 2.6446216106414795 + }, + { + "auxiliary_loss_clip": 0.01127397, + "auxiliary_loss_mlp": 0.01078283, + "balance_loss_clip": 1.00079083, + "balance_loss_mlp": 1.0000813, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8043922920886006, + "language_loss": 0.61828262, + "learning_rate": 7.507597507679347e-07, + "loss": 0.64033937, + "num_input_tokens_seen": 259569135, + "step": 12030, + "time_per_iteration": 3.219194173812866 + }, + { + "auxiliary_loss_clip": 0.01149751, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00038075, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.5737268502178205, + "language_loss": 0.7769959, + "learning_rate": 7.504556326345859e-07, + "loss": 0.79951668, + "num_input_tokens_seen": 259587035, + "step": 12031, + "time_per_iteration": 2.5403940677642822 + }, + { + "auxiliary_loss_clip": 0.01148201, + "auxiliary_loss_mlp": 0.01104034, + "balance_loss_clip": 1.00191057, + "balance_loss_mlp": 1.00036979, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 1.7522817767046697, + "language_loss": 0.81307948, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83560181, + "num_input_tokens_seen": 259606140, + "step": 12032, + "time_per_iteration": 2.596350908279419 + }, + { + "auxiliary_loss_clip": 0.01119045, + "auxiliary_loss_mlp": 0.01104495, + "balance_loss_clip": 1.0018456, + "balance_loss_mlp": 1.00054455, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.7155753827380582, + "language_loss": 0.7495079, + "learning_rate": 7.498475385279592e-07, + "loss": 0.7717433, + "num_input_tokens_seen": 259624275, + "step": 12033, + "time_per_iteration": 2.6299521923065186 + }, + { + "auxiliary_loss_clip": 0.01116493, + "auxiliary_loss_mlp": 0.01102618, + "balance_loss_clip": 1.00170612, + "balance_loss_mlp": 1.00038385, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.7934393997813267, + "language_loss": 0.75102949, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77322054, + "num_input_tokens_seen": 259643465, + "step": 12034, + "time_per_iteration": 2.6432011127471924 + }, + { + "auxiliary_loss_clip": 0.01131701, + "auxiliary_loss_mlp": 0.01101954, + "balance_loss_clip": 1.00174713, + "balance_loss_mlp": 1.00053024, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 2.02352002458693, + "language_loss": 0.80289042, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82522702, + "num_input_tokens_seen": 259662500, + "step": 12035, + "time_per_iteration": 2.687185764312744 + }, + { + "auxiliary_loss_clip": 0.01088775, + "auxiliary_loss_mlp": 0.01103147, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00053144, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 2.09167713084477, + "language_loss": 0.6090042, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63092339, + "num_input_tokens_seen": 259680140, + "step": 12036, + "time_per_iteration": 2.7539072036743164 + }, + { + "auxiliary_loss_clip": 0.01147414, + "auxiliary_loss_mlp": 0.01102322, + "balance_loss_clip": 1.00180423, + "balance_loss_mlp": 1.00056529, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.6788120802274895, + "language_loss": 0.67440522, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69690257, + "num_input_tokens_seen": 259700160, + "step": 12037, + "time_per_iteration": 2.56252384185791 + }, + { + "auxiliary_loss_clip": 0.01164736, + "auxiliary_loss_mlp": 0.01103586, + "balance_loss_clip": 1.00198531, + "balance_loss_mlp": 1.00058937, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 5.548040919801743, + "language_loss": 0.71877682, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74146003, + "num_input_tokens_seen": 259720525, + "step": 12038, + "time_per_iteration": 2.667353868484497 + }, + { + "auxiliary_loss_clip": 0.01164887, + "auxiliary_loss_mlp": 0.01103626, + "balance_loss_clip": 1.00202286, + "balance_loss_mlp": 1.00043845, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.6684029197662327, + "language_loss": 0.72096086, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74364603, + "num_input_tokens_seen": 259738680, + "step": 12039, + "time_per_iteration": 2.5060229301452637 + }, + { + "auxiliary_loss_clip": 0.01164697, + "auxiliary_loss_mlp": 0.01103391, + "balance_loss_clip": 1.00187862, + "balance_loss_mlp": 1.00058508, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 1.7777364649706093, + "language_loss": 0.75644577, + "learning_rate": 7.477207030458513e-07, + "loss": 0.77912664, + "num_input_tokens_seen": 259758790, + "step": 12040, + "time_per_iteration": 2.5823869705200195 + }, + { + "auxiliary_loss_clip": 0.01116745, + "auxiliary_loss_mlp": 0.01103451, + "balance_loss_clip": 1.00170064, + "balance_loss_mlp": 1.00054979, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 1.8843800906682509, + "language_loss": 0.76359677, + "learning_rate": 7.474170592596301e-07, + "loss": 0.78579879, + "num_input_tokens_seen": 259777370, + "step": 12041, + "time_per_iteration": 2.638505220413208 + }, + { + "auxiliary_loss_clip": 0.01149921, + "auxiliary_loss_mlp": 0.01103212, + "balance_loss_clip": 1.001773, + "balance_loss_mlp": 1.00040615, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.133977048642898, + "language_loss": 0.63394856, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65647984, + "num_input_tokens_seen": 259794665, + "step": 12042, + "time_per_iteration": 3.9791042804718018 + }, + { + "auxiliary_loss_clip": 0.01117062, + "auxiliary_loss_mlp": 0.01103446, + "balance_loss_clip": 1.0017463, + "balance_loss_mlp": 1.00044942, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 1.9854668125745016, + "language_loss": 0.83756948, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85977459, + "num_input_tokens_seen": 259811110, + "step": 12043, + "time_per_iteration": 2.644911289215088 + }, + { + "auxiliary_loss_clip": 0.01114815, + "auxiliary_loss_mlp": 0.01104257, + "balance_loss_clip": 1.00177217, + "balance_loss_mlp": 1.00040197, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.6699753571919411, + "language_loss": 0.64361548, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66580623, + "num_input_tokens_seen": 259831080, + "step": 12044, + "time_per_iteration": 2.653287649154663 + }, + { + "auxiliary_loss_clip": 0.01165029, + "auxiliary_loss_mlp": 0.01103965, + "balance_loss_clip": 1.00213051, + "balance_loss_mlp": 1.00058639, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.5454188335366046, + "language_loss": 0.81739247, + "learning_rate": 7.462029592105658e-07, + "loss": 0.84008247, + "num_input_tokens_seen": 259850135, + "step": 12045, + "time_per_iteration": 3.893394947052002 + }, + { + "auxiliary_loss_clip": 0.01164657, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00049591, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.552931327820169, + "language_loss": 0.7203216, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74299836, + "num_input_tokens_seen": 259868185, + "step": 12046, + "time_per_iteration": 2.506277561187744 + }, + { + "auxiliary_loss_clip": 0.01118289, + "auxiliary_loss_mlp": 0.01103956, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00057733, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 2.561096971200729, + "language_loss": 0.71380186, + "learning_rate": 7.455961944046553e-07, + "loss": 0.73602426, + "num_input_tokens_seen": 259887055, + "step": 12047, + "time_per_iteration": 2.696706533432007 + }, + { + "auxiliary_loss_clip": 0.011354, + "auxiliary_loss_mlp": 0.01104206, + "balance_loss_clip": 1.00203085, + "balance_loss_mlp": 1.00044632, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 2.053630875582716, + "language_loss": 0.69689411, + "learning_rate": 7.45292883346627e-07, + "loss": 0.71929014, + "num_input_tokens_seen": 259908295, + "step": 12048, + "time_per_iteration": 2.6424789428710938 + }, + { + "auxiliary_loss_clip": 0.01129737, + "auxiliary_loss_mlp": 0.01077819, + "balance_loss_clip": 1.00068772, + "balance_loss_mlp": 0.99999923, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8263978287251094, + "language_loss": 0.53761351, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55968904, + "num_input_tokens_seen": 259968475, + "step": 12049, + "time_per_iteration": 3.172487735748291 + }, + { + "auxiliary_loss_clip": 0.01134902, + "auxiliary_loss_mlp": 0.01104849, + "balance_loss_clip": 1.00199378, + "balance_loss_mlp": 1.00042176, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 2.552014709541181, + "language_loss": 0.60155618, + "learning_rate": 7.446864039779258e-07, + "loss": 0.6239537, + "num_input_tokens_seen": 259984865, + "step": 12050, + "time_per_iteration": 2.553075075149536 + }, + { + "auxiliary_loss_clip": 0.01096819, + "auxiliary_loss_mlp": 0.0107788, + "balance_loss_clip": 1.00067472, + "balance_loss_mlp": 1.00006044, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7143944540971232, + "language_loss": 0.53233981, + "learning_rate": 7.443832356902528e-07, + "loss": 0.5540868, + "num_input_tokens_seen": 260046735, + "step": 12051, + "time_per_iteration": 3.212343692779541 + }, + { + "auxiliary_loss_clip": 0.01150096, + "auxiliary_loss_mlp": 0.01102753, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00061464, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.5020894461137169, + "language_loss": 0.72521961, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74774814, + "num_input_tokens_seen": 260067950, + "step": 12052, + "time_per_iteration": 2.6209051609039307 + }, + { + "auxiliary_loss_clip": 0.01149982, + "auxiliary_loss_mlp": 0.01103437, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00053596, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 3.787007330418277, + "language_loss": 0.74157149, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76410568, + "num_input_tokens_seen": 260087730, + "step": 12053, + "time_per_iteration": 2.630906820297241 + }, + { + "auxiliary_loss_clip": 0.01117322, + "auxiliary_loss_mlp": 0.01103107, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00039637, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 1.8683983143945373, + "language_loss": 0.77727562, + "learning_rate": 7.434740165518898e-07, + "loss": 0.7994799, + "num_input_tokens_seen": 260107760, + "step": 12054, + "time_per_iteration": 2.641031503677368 + }, + { + "auxiliary_loss_clip": 0.01118526, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.00055027, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.7032579453083825, + "language_loss": 0.6815958, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70381272, + "num_input_tokens_seen": 260123660, + "step": 12055, + "time_per_iteration": 4.03033447265625 + }, + { + "auxiliary_loss_clip": 0.01117219, + "auxiliary_loss_mlp": 0.01102613, + "balance_loss_clip": 1.00162935, + "balance_loss_mlp": 1.00047421, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.750523877929357, + "language_loss": 0.73928249, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76148081, + "num_input_tokens_seen": 260142690, + "step": 12056, + "time_per_iteration": 2.630174398422241 + }, + { + "auxiliary_loss_clip": 0.01164675, + "auxiliary_loss_mlp": 0.01101772, + "balance_loss_clip": 1.00187242, + "balance_loss_mlp": 1.00039625, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.4870779792826714, + "language_loss": 0.7070502, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72971469, + "num_input_tokens_seen": 260162590, + "step": 12057, + "time_per_iteration": 2.5557234287261963 + }, + { + "auxiliary_loss_clip": 0.0110144, + "auxiliary_loss_mlp": 0.01103816, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.00053275, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 1.8551880448420863, + "language_loss": 0.624982, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64703465, + "num_input_tokens_seen": 260181065, + "step": 12058, + "time_per_iteration": 4.204615354537964 + }, + { + "auxiliary_loss_clip": 0.01103363, + "auxiliary_loss_mlp": 0.0110304, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00042439, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 2.706477553055853, + "language_loss": 0.74564105, + "learning_rate": 7.419596044262535e-07, + "loss": 0.76770508, + "num_input_tokens_seen": 260200330, + "step": 12059, + "time_per_iteration": 2.6863458156585693 + }, + { + "auxiliary_loss_clip": 0.01149021, + "auxiliary_loss_mlp": 0.01102066, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00059557, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.6862643211781458, + "language_loss": 0.79329026, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81580114, + "num_input_tokens_seen": 260219975, + "step": 12060, + "time_per_iteration": 2.6503853797912598 + }, + { + "auxiliary_loss_clip": 0.0114789, + "auxiliary_loss_mlp": 0.01102803, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00047421, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 1.881565259941791, + "language_loss": 0.76471519, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78722215, + "num_input_tokens_seen": 260242025, + "step": 12061, + "time_per_iteration": 2.6055150032043457 + }, + { + "auxiliary_loss_clip": 0.01164752, + "auxiliary_loss_mlp": 0.00747219, + "balance_loss_clip": 1.00200951, + "balance_loss_mlp": 1.00052881, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.67401620532375, + "language_loss": 0.81433403, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83345377, + "num_input_tokens_seen": 260260015, + "step": 12062, + "time_per_iteration": 2.526155471801758 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01104669, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00043201, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 6.177176304127506, + "language_loss": 0.69223583, + "learning_rate": 7.407489333471262e-07, + "loss": 0.71432483, + "num_input_tokens_seen": 260278635, + "step": 12063, + "time_per_iteration": 2.710665702819824 + }, + { + "auxiliary_loss_clip": 0.01116561, + "auxiliary_loss_mlp": 0.01101819, + "balance_loss_clip": 1.00170517, + "balance_loss_mlp": 1.00034809, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.5226766162142455, + "language_loss": 0.69936788, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72155166, + "num_input_tokens_seen": 260298510, + "step": 12064, + "time_per_iteration": 2.63624906539917 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_clip": 1.00182784, + "balance_loss_mlp": 1.00046492, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 1.8251018340844904, + "language_loss": 0.90136123, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92371762, + "num_input_tokens_seen": 260317405, + "step": 12065, + "time_per_iteration": 2.584773302078247 + }, + { + "auxiliary_loss_clip": 0.01143862, + "auxiliary_loss_mlp": 0.01077806, + "balance_loss_clip": 1.00066864, + "balance_loss_mlp": 0.99998611, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.6484462343169065, + "language_loss": 0.56102818, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58324486, + "num_input_tokens_seen": 260388085, + "step": 12066, + "time_per_iteration": 3.2750730514526367 + }, + { + "auxiliary_loss_clip": 0.01098792, + "auxiliary_loss_mlp": 0.0110258, + "balance_loss_clip": 1.00158131, + "balance_loss_mlp": 1.00044107, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 2.0763802021845432, + "language_loss": 0.7691083, + "learning_rate": 7.395390262827897e-07, + "loss": 0.79112196, + "num_input_tokens_seen": 260406165, + "step": 12067, + "time_per_iteration": 2.7339084148406982 + }, + { + "auxiliary_loss_clip": 0.01126953, + "auxiliary_loss_mlp": 0.01077898, + "balance_loss_clip": 1.00076485, + "balance_loss_mlp": 1.00007772, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7212556399657236, + "language_loss": 0.57055229, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59260082, + "num_input_tokens_seen": 260461365, + "step": 12068, + "time_per_iteration": 3.03587007522583 + }, + { + "auxiliary_loss_clip": 0.01096435, + "auxiliary_loss_mlp": 0.01078078, + "balance_loss_clip": 1.00081587, + "balance_loss_mlp": 1.00025833, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6595056273134242, + "language_loss": 0.55409557, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57584071, + "num_input_tokens_seen": 260523795, + "step": 12069, + "time_per_iteration": 3.227827548980713 + }, + { + "auxiliary_loss_clip": 0.01115844, + "auxiliary_loss_mlp": 0.01101834, + "balance_loss_clip": 1.00179195, + "balance_loss_mlp": 1.00045824, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 1.9578516525778484, + "language_loss": 0.80067897, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82285577, + "num_input_tokens_seen": 260544765, + "step": 12070, + "time_per_iteration": 2.679234027862549 + }, + { + "auxiliary_loss_clip": 0.0113318, + "auxiliary_loss_mlp": 0.01102067, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00050068, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 2.072631131946963, + "language_loss": 0.72437716, + "learning_rate": 7.383298839673197e-07, + "loss": 0.74672961, + "num_input_tokens_seen": 260564340, + "step": 12071, + "time_per_iteration": 2.6431307792663574 + }, + { + "auxiliary_loss_clip": 0.01164677, + "auxiliary_loss_mlp": 0.01102942, + "balance_loss_clip": 1.00193131, + "balance_loss_mlp": 1.00061321, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 2.6305803676482826, + "language_loss": 0.70192868, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72460485, + "num_input_tokens_seen": 260582565, + "step": 12072, + "time_per_iteration": 2.5186619758605957 + }, + { + "auxiliary_loss_clip": 0.01116902, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_clip": 1.00179231, + "balance_loss_mlp": 1.00052238, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.9527939972539956, + "language_loss": 0.78853595, + "learning_rate": 7.377255998196821e-07, + "loss": 0.81074011, + "num_input_tokens_seen": 260601700, + "step": 12073, + "time_per_iteration": 2.6667873859405518 + }, + { + "auxiliary_loss_clip": 0.01133424, + "auxiliary_loss_mlp": 0.01102798, + "balance_loss_clip": 1.00186002, + "balance_loss_mlp": 1.00037336, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.508554227479584, + "language_loss": 0.70280743, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72516966, + "num_input_tokens_seen": 260623040, + "step": 12074, + "time_per_iteration": 2.696659564971924 + }, + { + "auxiliary_loss_clip": 0.01135621, + "auxiliary_loss_mlp": 0.01103374, + "balance_loss_clip": 1.00196898, + "balance_loss_mlp": 1.00037682, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 2.312934200224251, + "language_loss": 0.74179369, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76418364, + "num_input_tokens_seen": 260642735, + "step": 12075, + "time_per_iteration": 2.650409460067749 + }, + { + "auxiliary_loss_clip": 0.01148494, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_clip": 1.00180483, + "balance_loss_mlp": 1.00042534, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.646299790609234, + "language_loss": 0.63829595, + "learning_rate": 7.368195326186458e-07, + "loss": 0.66081321, + "num_input_tokens_seen": 260669935, + "step": 12076, + "time_per_iteration": 2.9246773719787598 + }, + { + "auxiliary_loss_clip": 0.01119341, + "auxiliary_loss_mlp": 0.0110326, + "balance_loss_clip": 1.00175929, + "balance_loss_mlp": 1.0003581, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 1.8674256430725817, + "language_loss": 0.78925902, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81148505, + "num_input_tokens_seen": 260689605, + "step": 12077, + "time_per_iteration": 2.6665780544281006 + }, + { + "auxiliary_loss_clip": 0.01158723, + "auxiliary_loss_mlp": 0.00745441, + "balance_loss_clip": 1.00070977, + "balance_loss_mlp": 1.00014448, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8705282360302973, + "language_loss": 0.65029061, + "learning_rate": 7.362157272985163e-07, + "loss": 0.66933221, + "num_input_tokens_seen": 260748265, + "step": 12078, + "time_per_iteration": 3.0937001705169678 + }, + { + "auxiliary_loss_clip": 0.01142075, + "auxiliary_loss_mlp": 0.01077897, + "balance_loss_clip": 1.00060713, + "balance_loss_mlp": 1.00007713, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7128535852961837, + "language_loss": 0.59259486, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61479449, + "num_input_tokens_seen": 260816715, + "step": 12079, + "time_per_iteration": 4.639646768569946 + }, + { + "auxiliary_loss_clip": 0.01099226, + "auxiliary_loss_mlp": 0.01102484, + "balance_loss_clip": 1.00170612, + "balance_loss_mlp": 1.0004406, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 1.9222626054018364, + "language_loss": 0.65050781, + "learning_rate": 7.356121136696895e-07, + "loss": 0.67252493, + "num_input_tokens_seen": 260836765, + "step": 12080, + "time_per_iteration": 2.7701385021209717 + }, + { + "auxiliary_loss_clip": 0.01101838, + "auxiliary_loss_mlp": 0.01102672, + "balance_loss_clip": 1.00160718, + "balance_loss_mlp": 1.00034261, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 2.4981042148798687, + "language_loss": 0.69875342, + "learning_rate": 7.35310378768128e-07, + "loss": 0.72079849, + "num_input_tokens_seen": 260854610, + "step": 12081, + "time_per_iteration": 2.685502529144287 + }, + { + "auxiliary_loss_clip": 0.01165014, + "auxiliary_loss_mlp": 0.01103407, + "balance_loss_clip": 1.00204217, + "balance_loss_mlp": 1.00050592, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 1.98250462742032, + "language_loss": 0.81517416, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83785832, + "num_input_tokens_seen": 260871620, + "step": 12082, + "time_per_iteration": 3.9447624683380127 + }, + { + "auxiliary_loss_clip": 0.01150207, + "auxiliary_loss_mlp": 0.01104913, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.0004853, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.8383151484587132, + "language_loss": 0.77372825, + "learning_rate": 7.347070528479158e-07, + "loss": 0.79627949, + "num_input_tokens_seen": 260890490, + "step": 12083, + "time_per_iteration": 2.709268808364868 + }, + { + "auxiliary_loss_clip": 0.01164978, + "auxiliary_loss_mlp": 0.01103081, + "balance_loss_clip": 1.00203872, + "balance_loss_mlp": 1.00046611, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.9710428722995121, + "language_loss": 0.73108602, + "learning_rate": 7.344054618521433e-07, + "loss": 0.7537666, + "num_input_tokens_seen": 260909700, + "step": 12084, + "time_per_iteration": 2.5708467960357666 + }, + { + "auxiliary_loss_clip": 0.01164902, + "auxiliary_loss_mlp": 0.01103612, + "balance_loss_clip": 1.0019691, + "balance_loss_mlp": 1.00042462, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 2.9072739955493807, + "language_loss": 0.77816236, + "learning_rate": 7.34103918847843e-07, + "loss": 0.80084747, + "num_input_tokens_seen": 260929090, + "step": 12085, + "time_per_iteration": 2.583993911743164 + }, + { + "auxiliary_loss_clip": 0.01148154, + "auxiliary_loss_mlp": 0.01103882, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00050378, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.620192439643025, + "language_loss": 0.72443163, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74695206, + "num_input_tokens_seen": 260946615, + "step": 12086, + "time_per_iteration": 2.602094888687134 + }, + { + "auxiliary_loss_clip": 0.01118223, + "auxiliary_loss_mlp": 0.0110353, + "balance_loss_clip": 1.00188124, + "balance_loss_mlp": 1.0005337, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 1.630004338569707, + "language_loss": 0.69374359, + "learning_rate": 7.335009768593938e-07, + "loss": 0.7159611, + "num_input_tokens_seen": 260968515, + "step": 12087, + "time_per_iteration": 2.703871250152588 + }, + { + "auxiliary_loss_clip": 0.01165004, + "auxiliary_loss_mlp": 0.01104701, + "balance_loss_clip": 1.00202692, + "balance_loss_mlp": 1.00046444, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 4.957438006001939, + "language_loss": 0.78954065, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81223762, + "num_input_tokens_seen": 260986790, + "step": 12088, + "time_per_iteration": 2.541236162185669 + }, + { + "auxiliary_loss_clip": 0.01148267, + "auxiliary_loss_mlp": 0.01103581, + "balance_loss_clip": 1.00170302, + "balance_loss_mlp": 1.0005846, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.7176388093580648, + "language_loss": 0.74040097, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76291943, + "num_input_tokens_seen": 261004925, + "step": 12089, + "time_per_iteration": 2.5178170204162598 + }, + { + "auxiliary_loss_clip": 0.01135543, + "auxiliary_loss_mlp": 0.01103511, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00060916, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.7758818516049453, + "language_loss": 0.71035755, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73274803, + "num_input_tokens_seen": 261023895, + "step": 12090, + "time_per_iteration": 2.66727352142334 + }, + { + "auxiliary_loss_clip": 0.01083737, + "auxiliary_loss_mlp": 0.01104873, + "balance_loss_clip": 1.00160372, + "balance_loss_mlp": 1.0004456, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.8744857354492268, + "language_loss": 0.77391613, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79580224, + "num_input_tokens_seen": 261045445, + "step": 12091, + "time_per_iteration": 2.807913064956665 + }, + { + "auxiliary_loss_clip": 0.01150151, + "auxiliary_loss_mlp": 0.00747425, + "balance_loss_clip": 1.00185812, + "balance_loss_mlp": 1.00057352, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 2.0244221699337555, + "language_loss": 0.7130568, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73203254, + "num_input_tokens_seen": 261064275, + "step": 12092, + "time_per_iteration": 4.027644634246826 + }, + { + "auxiliary_loss_clip": 0.011492, + "auxiliary_loss_mlp": 0.01103217, + "balance_loss_clip": 1.00187325, + "balance_loss_mlp": 1.00050616, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 1.9731696093437483, + "language_loss": 0.60822958, + "learning_rate": 7.31693303878184e-07, + "loss": 0.6307537, + "num_input_tokens_seen": 261083310, + "step": 12093, + "time_per_iteration": 2.6568350791931152 + }, + { + "auxiliary_loss_clip": 0.01131801, + "auxiliary_loss_mlp": 0.01103816, + "balance_loss_clip": 1.00187385, + "balance_loss_mlp": 1.00053287, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.5193323845276667, + "language_loss": 0.75516671, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77752292, + "num_input_tokens_seen": 261103460, + "step": 12094, + "time_per_iteration": 2.6264092922210693 + }, + { + "auxiliary_loss_clip": 0.01118925, + "auxiliary_loss_mlp": 0.01102307, + "balance_loss_clip": 1.00178087, + "balance_loss_mlp": 1.00045443, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 6.002930081152204, + "language_loss": 0.84755111, + "learning_rate": 7.310911308504808e-07, + "loss": 0.86976343, + "num_input_tokens_seen": 261121375, + "step": 12095, + "time_per_iteration": 2.6602249145507812 + }, + { + "auxiliary_loss_clip": 0.01147935, + "auxiliary_loss_mlp": 0.0110319, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00057459, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.6941567223245815, + "language_loss": 0.77648103, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79899228, + "num_input_tokens_seen": 261141105, + "step": 12096, + "time_per_iteration": 4.011425018310547 + }, + { + "auxiliary_loss_clip": 0.01164953, + "auxiliary_loss_mlp": 0.01102936, + "balance_loss_clip": 1.00207543, + "balance_loss_mlp": 1.00051105, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 2.1207089176464375, + "language_loss": 0.72332889, + "learning_rate": 7.30489150291381e-07, + "loss": 0.7460078, + "num_input_tokens_seen": 261159255, + "step": 12097, + "time_per_iteration": 2.523834466934204 + }, + { + "auxiliary_loss_clip": 0.01148069, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.00188839, + "balance_loss_mlp": 1.00049353, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 2.0556751155450046, + "language_loss": 0.77065694, + "learning_rate": 7.301882322160935e-07, + "loss": 0.7896111, + "num_input_tokens_seen": 261177960, + "step": 12098, + "time_per_iteration": 2.608929395675659 + }, + { + "auxiliary_loss_clip": 0.01132804, + "auxiliary_loss_mlp": 0.01103293, + "balance_loss_clip": 1.00168228, + "balance_loss_mlp": 1.00039124, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 1.944501284567607, + "language_loss": 0.67559892, + "learning_rate": 7.298873622921952e-07, + "loss": 0.6979599, + "num_input_tokens_seen": 261205660, + "step": 12099, + "time_per_iteration": 2.9993584156036377 + }, + { + "auxiliary_loss_clip": 0.01149929, + "auxiliary_loss_mlp": 0.01104102, + "balance_loss_clip": 1.00184655, + "balance_loss_mlp": 1.00053298, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 1.7509158410077459, + "language_loss": 0.72801816, + "learning_rate": 7.29586540531095e-07, + "loss": 0.7505585, + "num_input_tokens_seen": 261225185, + "step": 12100, + "time_per_iteration": 2.583646774291992 + }, + { + "auxiliary_loss_clip": 0.01150219, + "auxiliary_loss_mlp": 0.01104089, + "balance_loss_clip": 1.00199676, + "balance_loss_mlp": 1.00051999, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.3404408982754268, + "language_loss": 0.74855053, + "learning_rate": 7.292857669442005e-07, + "loss": 0.77109367, + "num_input_tokens_seen": 261247965, + "step": 12101, + "time_per_iteration": 2.6579387187957764 + }, + { + "auxiliary_loss_clip": 0.01116702, + "auxiliary_loss_mlp": 0.01102745, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.00051069, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 2.1348715492868466, + "language_loss": 0.82333368, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84552813, + "num_input_tokens_seen": 261267585, + "step": 12102, + "time_per_iteration": 2.672288656234741 + }, + { + "auxiliary_loss_clip": 0.01148026, + "auxiliary_loss_mlp": 0.01103167, + "balance_loss_clip": 1.00189412, + "balance_loss_mlp": 1.00055194, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.2573326199969626, + "language_loss": 0.82008719, + "learning_rate": 7.286843643386495e-07, + "loss": 0.84259915, + "num_input_tokens_seen": 261285200, + "step": 12103, + "time_per_iteration": 2.595551013946533 + }, + { + "auxiliary_loss_clip": 0.01131408, + "auxiliary_loss_mlp": 0.01103437, + "balance_loss_clip": 1.0018183, + "balance_loss_mlp": 1.00043976, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 1.768822894013313, + "language_loss": 0.66239667, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68474507, + "num_input_tokens_seen": 261303645, + "step": 12104, + "time_per_iteration": 2.6184794902801514 + }, + { + "auxiliary_loss_clip": 0.0111831, + "auxiliary_loss_mlp": 0.01101973, + "balance_loss_clip": 1.00177979, + "balance_loss_mlp": 1.00050211, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 1.7700671756198587, + "language_loss": 0.66373014, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68593299, + "num_input_tokens_seen": 261323265, + "step": 12105, + "time_per_iteration": 2.748629570007324 + }, + { + "auxiliary_loss_clip": 0.01164711, + "auxiliary_loss_mlp": 0.01102955, + "balance_loss_clip": 1.00195169, + "balance_loss_mlp": 1.0005306, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 1.9926237411024907, + "language_loss": 0.7593323, + "learning_rate": 7.27782622021939e-07, + "loss": 0.78200895, + "num_input_tokens_seen": 261339745, + "step": 12106, + "time_per_iteration": 2.5300118923187256 + }, + { + "auxiliary_loss_clip": 0.01149787, + "auxiliary_loss_mlp": 0.01104643, + "balance_loss_clip": 1.00192177, + "balance_loss_mlp": 1.00050187, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.234235903797896, + "language_loss": 0.70724684, + "learning_rate": 7.274821377197273e-07, + "loss": 0.7297911, + "num_input_tokens_seen": 261359310, + "step": 12107, + "time_per_iteration": 2.6553444862365723 + }, + { + "auxiliary_loss_clip": 0.01150171, + "auxiliary_loss_mlp": 0.01102873, + "balance_loss_clip": 1.00188231, + "balance_loss_mlp": 1.00054383, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.4846617739027874, + "language_loss": 0.75135773, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77388817, + "num_input_tokens_seen": 261384640, + "step": 12108, + "time_per_iteration": 2.884833335876465 + }, + { + "auxiliary_loss_clip": 0.01164749, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00054908, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.4929602040521657, + "language_loss": 0.6693747, + "learning_rate": 7.268813138887124e-07, + "loss": 0.69206142, + "num_input_tokens_seen": 261405290, + "step": 12109, + "time_per_iteration": 2.6612045764923096 + }, + { + "auxiliary_loss_clip": 0.01116014, + "auxiliary_loss_mlp": 0.01103999, + "balance_loss_clip": 1.0017488, + "balance_loss_mlp": 1.00052583, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 1.8810005490598891, + "language_loss": 0.63005054, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65225071, + "num_input_tokens_seen": 261419710, + "step": 12110, + "time_per_iteration": 2.623002290725708 + }, + { + "auxiliary_loss_clip": 0.01114835, + "auxiliary_loss_mlp": 0.01103999, + "balance_loss_clip": 1.00154448, + "balance_loss_mlp": 1.00042951, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 1.559998300119801, + "language_loss": 0.58105147, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60323977, + "num_input_tokens_seen": 261442385, + "step": 12111, + "time_per_iteration": 2.7432358264923096 + }, + { + "auxiliary_loss_clip": 0.01084281, + "auxiliary_loss_mlp": 0.01103041, + "balance_loss_clip": 1.0015862, + "balance_loss_mlp": 1.00042582, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 2.0044104050954425, + "language_loss": 0.74198228, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76385546, + "num_input_tokens_seen": 261459805, + "step": 12112, + "time_per_iteration": 2.743711233139038 + }, + { + "auxiliary_loss_clip": 0.01149986, + "auxiliary_loss_mlp": 0.01102311, + "balance_loss_clip": 1.00184309, + "balance_loss_mlp": 1.00055408, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 2.125998807620854, + "language_loss": 0.66216254, + "learning_rate": 7.25680245639237e-07, + "loss": 0.68468553, + "num_input_tokens_seen": 261477175, + "step": 12113, + "time_per_iteration": 2.5566246509552 + }, + { + "auxiliary_loss_clip": 0.01116274, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_clip": 1.00164902, + "balance_loss_mlp": 1.00051093, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 2.047075701771177, + "language_loss": 0.73535514, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75755298, + "num_input_tokens_seen": 261494990, + "step": 12114, + "time_per_iteration": 2.6417081356048584 + }, + { + "auxiliary_loss_clip": 0.01132935, + "auxiliary_loss_mlp": 0.01102811, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00057697, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 1.9371228086435617, + "language_loss": 0.68284392, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70520139, + "num_input_tokens_seen": 261514445, + "step": 12115, + "time_per_iteration": 2.6544814109802246 + }, + { + "auxiliary_loss_clip": 0.01164964, + "auxiliary_loss_mlp": 0.01104251, + "balance_loss_clip": 1.00197518, + "balance_loss_mlp": 1.00049114, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.7127944416065186, + "language_loss": 0.5975945, + "learning_rate": 7.247799517967674e-07, + "loss": 0.62028664, + "num_input_tokens_seen": 261533565, + "step": 12116, + "time_per_iteration": 2.529583215713501 + }, + { + "auxiliary_loss_clip": 0.01148452, + "auxiliary_loss_mlp": 0.01103786, + "balance_loss_clip": 1.00189769, + "balance_loss_mlp": 1.00050306, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 1.914138024817186, + "language_loss": 0.72976065, + "learning_rate": 7.2447995054705e-07, + "loss": 0.75228298, + "num_input_tokens_seen": 261553795, + "step": 12117, + "time_per_iteration": 3.9727442264556885 + }, + { + "auxiliary_loss_clip": 0.01149946, + "auxiliary_loss_mlp": 0.01103736, + "balance_loss_clip": 1.00190353, + "balance_loss_mlp": 1.000453, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 3.0508752073106575, + "language_loss": 0.69449335, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71703011, + "num_input_tokens_seen": 261572565, + "step": 12118, + "time_per_iteration": 2.5583484172821045 + }, + { + "auxiliary_loss_clip": 0.01101822, + "auxiliary_loss_mlp": 0.01103337, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00072169, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 2.341834650093816, + "language_loss": 0.84191918, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86397076, + "num_input_tokens_seen": 261590910, + "step": 12119, + "time_per_iteration": 2.6665539741516113 + }, + { + "auxiliary_loss_clip": 0.0116484, + "auxiliary_loss_mlp": 0.01103531, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00043929, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.484534627877468, + "language_loss": 0.81871152, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84139526, + "num_input_tokens_seen": 261606005, + "step": 12120, + "time_per_iteration": 2.5443379878997803 + }, + { + "auxiliary_loss_clip": 0.01117018, + "auxiliary_loss_mlp": 0.01102853, + "balance_loss_clip": 1.001858, + "balance_loss_mlp": 1.00071406, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 2.1512078018662213, + "language_loss": 0.78487146, + "learning_rate": 7.232804293403963e-07, + "loss": 0.80707014, + "num_input_tokens_seen": 261622305, + "step": 12121, + "time_per_iteration": 4.046996831893921 + }, + { + "auxiliary_loss_clip": 0.01164741, + "auxiliary_loss_mlp": 0.01103849, + "balance_loss_clip": 1.00181532, + "balance_loss_mlp": 1.00047112, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 2.2138465899275293, + "language_loss": 0.6954096, + "learning_rate": 7.229806700436441e-07, + "loss": 0.71809554, + "num_input_tokens_seen": 261642465, + "step": 12122, + "time_per_iteration": 2.592257499694824 + }, + { + "auxiliary_loss_clip": 0.01101785, + "auxiliary_loss_mlp": 0.0110216, + "balance_loss_clip": 1.00164676, + "balance_loss_mlp": 1.00040281, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 3.8414050862663336, + "language_loss": 0.87183666, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89387608, + "num_input_tokens_seen": 261661420, + "step": 12123, + "time_per_iteration": 2.7478370666503906 + }, + { + "auxiliary_loss_clip": 0.01118534, + "auxiliary_loss_mlp": 0.01103439, + "balance_loss_clip": 1.00179315, + "balance_loss_mlp": 1.00053728, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 3.5482121244103997, + "language_loss": 0.82869047, + "learning_rate": 7.223812967356065e-07, + "loss": 0.85091019, + "num_input_tokens_seen": 261680865, + "step": 12124, + "time_per_iteration": 2.6860897541046143 + }, + { + "auxiliary_loss_clip": 0.01134369, + "auxiliary_loss_mlp": 0.01103462, + "balance_loss_clip": 1.00187826, + "balance_loss_mlp": 1.00046515, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 4.035282179215689, + "language_loss": 0.66842943, + "learning_rate": 7.220816827470499e-07, + "loss": 0.6908077, + "num_input_tokens_seen": 261701455, + "step": 12125, + "time_per_iteration": 2.6596360206604004 + }, + { + "auxiliary_loss_clip": 0.01150065, + "auxiliary_loss_mlp": 0.01104187, + "balance_loss_clip": 1.00184262, + "balance_loss_mlp": 1.00061774, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 3.2419270274557914, + "language_loss": 0.75193393, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77447647, + "num_input_tokens_seen": 261721260, + "step": 12126, + "time_per_iteration": 2.62980580329895 + }, + { + "auxiliary_loss_clip": 0.01125572, + "auxiliary_loss_mlp": 0.01077448, + "balance_loss_clip": 1.00075197, + "balance_loss_mlp": 1.00000978, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8112427723024329, + "language_loss": 0.58582902, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60785925, + "num_input_tokens_seen": 261779370, + "step": 12127, + "time_per_iteration": 3.0814244747161865 + }, + { + "auxiliary_loss_clip": 0.01117984, + "auxiliary_loss_mlp": 0.01101629, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00044394, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 2.734765962937538, + "language_loss": 0.69055915, + "learning_rate": 7.21183131579562e-07, + "loss": 0.71275526, + "num_input_tokens_seen": 261798050, + "step": 12128, + "time_per_iteration": 2.685603141784668 + }, + { + "auxiliary_loss_clip": 0.01133481, + "auxiliary_loss_mlp": 0.01103098, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00048304, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 1.961460726635344, + "language_loss": 0.6560213, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67838711, + "num_input_tokens_seen": 261817660, + "step": 12129, + "time_per_iteration": 2.6586506366729736 + }, + { + "auxiliary_loss_clip": 0.01164556, + "auxiliary_loss_mlp": 0.01102426, + "balance_loss_clip": 1.00192726, + "balance_loss_mlp": 1.0003823, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 12.14881356885503, + "language_loss": 0.74385548, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76652533, + "num_input_tokens_seen": 261837935, + "step": 12130, + "time_per_iteration": 3.9467813968658447 + }, + { + "auxiliary_loss_clip": 0.01135314, + "auxiliary_loss_mlp": 0.01103192, + "balance_loss_clip": 1.00187147, + "balance_loss_mlp": 1.0005765, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.665999555205973, + "language_loss": 0.69742113, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71980619, + "num_input_tokens_seen": 261857575, + "step": 12131, + "time_per_iteration": 2.6628482341766357 + }, + { + "auxiliary_loss_clip": 0.01113962, + "auxiliary_loss_mlp": 0.01101823, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00054264, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.6414434070191632, + "language_loss": 0.77383125, + "learning_rate": 7.199857423093025e-07, + "loss": 0.7959891, + "num_input_tokens_seen": 261877265, + "step": 12132, + "time_per_iteration": 2.6312360763549805 + }, + { + "auxiliary_loss_clip": 0.01148207, + "auxiliary_loss_mlp": 0.01103007, + "balance_loss_clip": 1.00184977, + "balance_loss_mlp": 1.00058222, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 1.9744019173366887, + "language_loss": 0.7884233, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81093544, + "num_input_tokens_seen": 261893695, + "step": 12133, + "time_per_iteration": 2.5230321884155273 + }, + { + "auxiliary_loss_clip": 0.01102052, + "auxiliary_loss_mlp": 0.0110247, + "balance_loss_clip": 1.00165081, + "balance_loss_mlp": 1.00052249, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 1.8202215423751946, + "language_loss": 0.72326458, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74530971, + "num_input_tokens_seen": 261911825, + "step": 12134, + "time_per_iteration": 4.097760438919067 + }, + { + "auxiliary_loss_clip": 0.01132908, + "auxiliary_loss_mlp": 0.01104007, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00062883, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.6822099112615618, + "language_loss": 0.7130444, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73541355, + "num_input_tokens_seen": 261931190, + "step": 12135, + "time_per_iteration": 2.597050905227661 + }, + { + "auxiliary_loss_clip": 0.01118685, + "auxiliary_loss_mlp": 0.01103463, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00056183, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 1.8721838572506027, + "language_loss": 0.62315124, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64537263, + "num_input_tokens_seen": 261951240, + "step": 12136, + "time_per_iteration": 2.7489283084869385 + }, + { + "auxiliary_loss_clip": 0.0114822, + "auxiliary_loss_mlp": 0.00747307, + "balance_loss_clip": 1.00172758, + "balance_loss_mlp": 1.0004282, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 2.214845797052369, + "language_loss": 0.74953198, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76848727, + "num_input_tokens_seen": 261971605, + "step": 12137, + "time_per_iteration": 2.599416494369507 + }, + { + "auxiliary_loss_clip": 0.01148588, + "auxiliary_loss_mlp": 0.00747353, + "balance_loss_clip": 1.00194418, + "balance_loss_mlp": 1.00044405, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.6719012681563814, + "language_loss": 0.7423321, + "learning_rate": 7.181911147788069e-07, + "loss": 0.7612915, + "num_input_tokens_seen": 261990830, + "step": 12138, + "time_per_iteration": 2.616213798522949 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01102448, + "balance_loss_clip": 1.00164151, + "balance_loss_mlp": 1.00040507, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.447781966114373, + "language_loss": 0.72071201, + "learning_rate": 7.178921802463702e-07, + "loss": 0.74290121, + "num_input_tokens_seen": 262008190, + "step": 12139, + "time_per_iteration": 2.6266279220581055 + }, + { + "auxiliary_loss_clip": 0.01147684, + "auxiliary_loss_mlp": 0.01101895, + "balance_loss_clip": 1.00177979, + "balance_loss_mlp": 1.00051951, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.6053392553684058, + "language_loss": 0.73524839, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75774419, + "num_input_tokens_seen": 262030460, + "step": 12140, + "time_per_iteration": 2.646458864212036 + }, + { + "auxiliary_loss_clip": 0.01131266, + "auxiliary_loss_mlp": 0.01103781, + "balance_loss_clip": 1.0018177, + "balance_loss_mlp": 1.00049758, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.8812471613728532, + "language_loss": 0.55364799, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57599843, + "num_input_tokens_seen": 262050830, + "step": 12141, + "time_per_iteration": 2.682983636856079 + }, + { + "auxiliary_loss_clip": 0.01118068, + "auxiliary_loss_mlp": 0.0110259, + "balance_loss_clip": 1.00186992, + "balance_loss_mlp": 1.0004518, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.371487136212449, + "language_loss": 0.72547561, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74768221, + "num_input_tokens_seen": 262071245, + "step": 12142, + "time_per_iteration": 2.683856964111328 + }, + { + "auxiliary_loss_clip": 0.01164737, + "auxiliary_loss_mlp": 0.0110226, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.00059867, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.8225730002296257, + "language_loss": 0.73889709, + "learning_rate": 7.16696928406521e-07, + "loss": 0.76156712, + "num_input_tokens_seen": 262087525, + "step": 12143, + "time_per_iteration": 2.534496307373047 + }, + { + "auxiliary_loss_clip": 0.01117134, + "auxiliary_loss_mlp": 0.01103292, + "balance_loss_clip": 1.00176287, + "balance_loss_mlp": 1.00048602, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 1.878713937889012, + "language_loss": 0.66334748, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68555176, + "num_input_tokens_seen": 262107355, + "step": 12144, + "time_per_iteration": 2.6699576377868652 + }, + { + "auxiliary_loss_clip": 0.01131384, + "auxiliary_loss_mlp": 0.01103406, + "balance_loss_clip": 1.00180471, + "balance_loss_mlp": 1.00050414, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 2.991496645359057, + "language_loss": 0.79259992, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81494778, + "num_input_tokens_seen": 262125645, + "step": 12145, + "time_per_iteration": 2.758730411529541 + }, + { + "auxiliary_loss_clip": 0.01116818, + "auxiliary_loss_mlp": 0.01102443, + "balance_loss_clip": 1.00190413, + "balance_loss_mlp": 1.00049567, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.7852914132216036, + "language_loss": 0.91648054, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93867314, + "num_input_tokens_seen": 262144075, + "step": 12146, + "time_per_iteration": 2.6922824382781982 + }, + { + "auxiliary_loss_clip": 0.01164674, + "auxiliary_loss_mlp": 0.01102395, + "balance_loss_clip": 1.00204277, + "balance_loss_mlp": 1.00044703, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 1.8843335157309975, + "language_loss": 0.62178898, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64445966, + "num_input_tokens_seen": 262165940, + "step": 12147, + "time_per_iteration": 2.5769340991973877 + }, + { + "auxiliary_loss_clip": 0.01164899, + "auxiliary_loss_mlp": 0.01103221, + "balance_loss_clip": 1.00203228, + "balance_loss_mlp": 1.00050986, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.9938885736129748, + "language_loss": 0.75589776, + "learning_rate": 7.152039586086693e-07, + "loss": 0.778579, + "num_input_tokens_seen": 262184520, + "step": 12148, + "time_per_iteration": 2.518598794937134 + }, + { + "auxiliary_loss_clip": 0.01126687, + "auxiliary_loss_mlp": 0.00745336, + "balance_loss_clip": 1.00058651, + "balance_loss_mlp": 1.00001168, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.6890089567401315, + "language_loss": 0.56639665, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58511692, + "num_input_tokens_seen": 262247070, + "step": 12149, + "time_per_iteration": 3.1580982208251953 + }, + { + "auxiliary_loss_clip": 0.01133155, + "auxiliary_loss_mlp": 0.0110382, + "balance_loss_clip": 1.00183558, + "balance_loss_mlp": 1.00053692, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.7243381891468783, + "language_loss": 0.73712695, + "learning_rate": 7.146071116474451e-07, + "loss": 0.75949669, + "num_input_tokens_seen": 262266605, + "step": 12150, + "time_per_iteration": 2.6643590927124023 + }, + { + "auxiliary_loss_clip": 0.01164737, + "auxiliary_loss_mlp": 0.01103004, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.00038815, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 7.430609631056908, + "language_loss": 0.8414712, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86414874, + "num_input_tokens_seen": 262283880, + "step": 12151, + "time_per_iteration": 2.487255096435547 + }, + { + "auxiliary_loss_clip": 0.01120542, + "auxiliary_loss_mlp": 0.01103507, + "balance_loss_clip": 1.0019505, + "balance_loss_mlp": 1.00060511, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 1.8119577379346823, + "language_loss": 0.77842331, + "learning_rate": 7.14010459655127e-07, + "loss": 0.80066383, + "num_input_tokens_seen": 262304155, + "step": 12152, + "time_per_iteration": 2.6436736583709717 + }, + { + "auxiliary_loss_clip": 0.01114626, + "auxiliary_loss_mlp": 0.01103985, + "balance_loss_clip": 1.0017643, + "balance_loss_mlp": 1.00051093, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.6808170085019498, + "language_loss": 0.79714715, + "learning_rate": 7.137122068005919e-07, + "loss": 0.8193332, + "num_input_tokens_seen": 262325660, + "step": 12153, + "time_per_iteration": 2.6798667907714844 + }, + { + "auxiliary_loss_clip": 0.01148049, + "auxiliary_loss_mlp": 0.01103656, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.00046825, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.6913633511672996, + "language_loss": 0.67750531, + "learning_rate": 7.134140027222173e-07, + "loss": 0.70002234, + "num_input_tokens_seen": 262344075, + "step": 12154, + "time_per_iteration": 2.5255520343780518 + }, + { + "auxiliary_loss_clip": 0.01102053, + "auxiliary_loss_mlp": 0.01104001, + "balance_loss_clip": 1.00173843, + "balance_loss_mlp": 1.00043178, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 2.575145783836456, + "language_loss": 0.66145539, + "learning_rate": 7.131158474313128e-07, + "loss": 0.68351591, + "num_input_tokens_seen": 262363305, + "step": 12155, + "time_per_iteration": 4.0384368896484375 + }, + { + "auxiliary_loss_clip": 0.01132674, + "auxiliary_loss_mlp": 0.01101557, + "balance_loss_clip": 1.0016278, + "balance_loss_mlp": 1.00046778, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.9012527072974639, + "language_loss": 0.81623024, + "learning_rate": 7.128177409391851e-07, + "loss": 0.83857256, + "num_input_tokens_seen": 262380730, + "step": 12156, + "time_per_iteration": 2.611100196838379 + }, + { + "auxiliary_loss_clip": 0.01118106, + "auxiliary_loss_mlp": 0.01101747, + "balance_loss_clip": 1.00179899, + "balance_loss_mlp": 1.00046659, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 3.577179271917184, + "language_loss": 0.75689948, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77909803, + "num_input_tokens_seen": 262395480, + "step": 12157, + "time_per_iteration": 2.620875835418701 + }, + { + "auxiliary_loss_clip": 0.01148076, + "auxiliary_loss_mlp": 0.01101555, + "balance_loss_clip": 1.00175524, + "balance_loss_mlp": 1.0003705, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.483691073690068, + "language_loss": 0.72713172, + "learning_rate": 7.122216743964713e-07, + "loss": 0.74962807, + "num_input_tokens_seen": 262413340, + "step": 12158, + "time_per_iteration": 3.924041748046875 + }, + { + "auxiliary_loss_clip": 0.01133246, + "auxiliary_loss_mlp": 0.01103579, + "balance_loss_clip": 1.00189233, + "balance_loss_mlp": 1.00048637, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 3.3083797794068497, + "language_loss": 0.85865772, + "learning_rate": 7.119237143684896e-07, + "loss": 0.88102597, + "num_input_tokens_seen": 262433455, + "step": 12159, + "time_per_iteration": 2.6557419300079346 + }, + { + "auxiliary_loss_clip": 0.01132459, + "auxiliary_loss_mlp": 0.01103661, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00056839, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 2.3570971380781627, + "language_loss": 0.7389977, + "learning_rate": 7.116258031844895e-07, + "loss": 0.76135898, + "num_input_tokens_seen": 262450335, + "step": 12160, + "time_per_iteration": 2.5805130004882812 + }, + { + "auxiliary_loss_clip": 0.0114829, + "auxiliary_loss_mlp": 0.01103681, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00049329, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 2.042174543708777, + "language_loss": 0.72629678, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74881649, + "num_input_tokens_seen": 262468240, + "step": 12161, + "time_per_iteration": 2.5404584407806396 + }, + { + "auxiliary_loss_clip": 0.01118398, + "auxiliary_loss_mlp": 0.00747354, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.00039458, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 1.8523123972207252, + "language_loss": 0.69674736, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71540493, + "num_input_tokens_seen": 262487045, + "step": 12162, + "time_per_iteration": 2.6951117515563965 + }, + { + "auxiliary_loss_clip": 0.01147742, + "auxiliary_loss_mlp": 0.01103672, + "balance_loss_clip": 1.00204396, + "balance_loss_mlp": 1.00038874, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.8599997529117556, + "language_loss": 0.66854662, + "learning_rate": 7.107323628093382e-07, + "loss": 0.69106072, + "num_input_tokens_seen": 262504855, + "step": 12163, + "time_per_iteration": 2.6029131412506104 + }, + { + "auxiliary_loss_clip": 0.01131945, + "auxiliary_loss_mlp": 0.01102281, + "balance_loss_clip": 1.0017463, + "balance_loss_mlp": 1.00042844, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 2.021342716110525, + "language_loss": 0.6830942, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70543647, + "num_input_tokens_seen": 262524920, + "step": 12164, + "time_per_iteration": 2.6136975288391113 + }, + { + "auxiliary_loss_clip": 0.01099603, + "auxiliary_loss_mlp": 0.01102843, + "balance_loss_clip": 1.00172043, + "balance_loss_mlp": 1.00060868, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.5422028408578643, + "language_loss": 0.7317791, + "learning_rate": 7.101369803195391e-07, + "loss": 0.75380355, + "num_input_tokens_seen": 262545725, + "step": 12165, + "time_per_iteration": 2.7100093364715576 + }, + { + "auxiliary_loss_clip": 0.01150121, + "auxiliary_loss_mlp": 0.01103698, + "balance_loss_clip": 1.00190854, + "balance_loss_mlp": 1.00051069, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 2.8457867301547384, + "language_loss": 0.76625884, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78879702, + "num_input_tokens_seen": 262565480, + "step": 12166, + "time_per_iteration": 2.616037368774414 + }, + { + "auxiliary_loss_clip": 0.01134601, + "auxiliary_loss_mlp": 0.01102735, + "balance_loss_clip": 1.00203943, + "balance_loss_mlp": 1.00040531, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 1.7457896601644478, + "language_loss": 0.79963183, + "learning_rate": 7.095417934766781e-07, + "loss": 0.82200509, + "num_input_tokens_seen": 262584145, + "step": 12167, + "time_per_iteration": 4.106269121170044 + }, + { + "auxiliary_loss_clip": 0.01148255, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00073957, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.6562178988196459, + "language_loss": 0.77035868, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79286528, + "num_input_tokens_seen": 262604045, + "step": 12168, + "time_per_iteration": 2.581580400466919 + }, + { + "auxiliary_loss_clip": 0.01149818, + "auxiliary_loss_mlp": 0.01103369, + "balance_loss_clip": 1.00181627, + "balance_loss_mlp": 1.00046802, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 2.3552615549850797, + "language_loss": 0.81891203, + "learning_rate": 7.089468023710326e-07, + "loss": 0.8414439, + "num_input_tokens_seen": 262624540, + "step": 12169, + "time_per_iteration": 2.557767868041992 + }, + { + "auxiliary_loss_clip": 0.01148312, + "auxiliary_loss_mlp": 0.0110324, + "balance_loss_clip": 1.00180137, + "balance_loss_mlp": 1.0006249, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.769080876339074, + "language_loss": 0.70019174, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72270727, + "num_input_tokens_seen": 262644545, + "step": 12170, + "time_per_iteration": 2.6230804920196533 + }, + { + "auxiliary_loss_clip": 0.01164669, + "auxiliary_loss_mlp": 0.01102565, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00052214, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 1.9510454984101453, + "language_loss": 0.69714582, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71981812, + "num_input_tokens_seen": 262662570, + "step": 12171, + "time_per_iteration": 3.897498846054077 + }, + { + "auxiliary_loss_clip": 0.01164674, + "auxiliary_loss_mlp": 0.01102578, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00062978, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 1.6398542613626053, + "language_loss": 0.65643513, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67910767, + "num_input_tokens_seen": 262683245, + "step": 12172, + "time_per_iteration": 2.604048252105713 + }, + { + "auxiliary_loss_clip": 0.01164735, + "auxiliary_loss_mlp": 0.01103796, + "balance_loss_clip": 1.00197685, + "balance_loss_mlp": 1.00041723, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 3.063634423035229, + "language_loss": 0.61308581, + "learning_rate": 7.077574077323564e-07, + "loss": 0.6357711, + "num_input_tokens_seen": 262701585, + "step": 12173, + "time_per_iteration": 2.501736640930176 + }, + { + "auxiliary_loss_clip": 0.01100427, + "auxiliary_loss_mlp": 0.0110303, + "balance_loss_clip": 1.00171554, + "balance_loss_mlp": 1.00050998, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 1.8781077524738268, + "language_loss": 0.74056673, + "learning_rate": 7.074601815494243e-07, + "loss": 0.76260126, + "num_input_tokens_seen": 262719295, + "step": 12174, + "time_per_iteration": 2.727388620376587 + }, + { + "auxiliary_loss_clip": 0.01164737, + "auxiliary_loss_mlp": 0.01102197, + "balance_loss_clip": 1.00200868, + "balance_loss_mlp": 1.00034499, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.5017748499924055, + "language_loss": 0.80951464, + "learning_rate": 7.071630043797317e-07, + "loss": 0.83218408, + "num_input_tokens_seen": 262739995, + "step": 12175, + "time_per_iteration": 2.567852020263672 + }, + { + "auxiliary_loss_clip": 0.01132512, + "auxiliary_loss_mlp": 0.011024, + "balance_loss_clip": 1.00182307, + "balance_loss_mlp": 1.00045252, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 2.1308664089974014, + "language_loss": 0.7654177, + "learning_rate": 7.068658762345488e-07, + "loss": 0.78776681, + "num_input_tokens_seen": 262757680, + "step": 12176, + "time_per_iteration": 2.624953269958496 + }, + { + "auxiliary_loss_clip": 0.01148038, + "auxiliary_loss_mlp": 0.01102914, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00048923, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 1.5812031506667963, + "language_loss": 0.76760852, + "learning_rate": 7.065687971251399e-07, + "loss": 0.79011804, + "num_input_tokens_seen": 262776990, + "step": 12177, + "time_per_iteration": 2.609271764755249 + }, + { + "auxiliary_loss_clip": 0.01114449, + "auxiliary_loss_mlp": 0.01102474, + "balance_loss_clip": 1.00156629, + "balance_loss_mlp": 1.00052643, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.2257106525727246, + "language_loss": 0.74003839, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76220763, + "num_input_tokens_seen": 262795440, + "step": 12178, + "time_per_iteration": 2.648393154144287 + }, + { + "auxiliary_loss_clip": 0.01134747, + "auxiliary_loss_mlp": 0.01103658, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.00047064, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 2.1188468336955215, + "language_loss": 0.82278657, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84517062, + "num_input_tokens_seen": 262816385, + "step": 12179, + "time_per_iteration": 2.6628825664520264 + }, + { + "auxiliary_loss_clip": 0.01134949, + "auxiliary_loss_mlp": 0.01102412, + "balance_loss_clip": 1.00201619, + "balance_loss_mlp": 1.00055957, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 3.8662540628386375, + "language_loss": 0.74564093, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76801449, + "num_input_tokens_seen": 262834955, + "step": 12180, + "time_per_iteration": 2.5716753005981445 + }, + { + "auxiliary_loss_clip": 0.01147999, + "auxiliary_loss_mlp": 0.00747406, + "balance_loss_clip": 1.00169384, + "balance_loss_mlp": 1.00044978, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 2.1557585914384205, + "language_loss": 0.7994833, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81843734, + "num_input_tokens_seen": 262853555, + "step": 12181, + "time_per_iteration": 2.6590170860290527 + }, + { + "auxiliary_loss_clip": 0.01149635, + "auxiliary_loss_mlp": 0.00747349, + "balance_loss_clip": 1.00193822, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 1.7240780749853426, + "language_loss": 0.71848315, + "learning_rate": 7.050841375089506e-07, + "loss": 0.73745298, + "num_input_tokens_seen": 262870975, + "step": 12182, + "time_per_iteration": 2.5546302795410156 + }, + { + "auxiliary_loss_clip": 0.01164939, + "auxiliary_loss_mlp": 0.01103681, + "balance_loss_clip": 1.00210726, + "balance_loss_mlp": 1.00049305, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.639954693827653, + "language_loss": 0.71172285, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73440903, + "num_input_tokens_seen": 262892635, + "step": 12183, + "time_per_iteration": 2.6251680850982666 + }, + { + "auxiliary_loss_clip": 0.01148441, + "auxiliary_loss_mlp": 0.01104142, + "balance_loss_clip": 1.00182378, + "balance_loss_mlp": 1.00057244, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 2.347057190523795, + "language_loss": 0.72966635, + "learning_rate": 7.04490617307045e-07, + "loss": 0.75219214, + "num_input_tokens_seen": 262910725, + "step": 12184, + "time_per_iteration": 2.5547034740448 + }, + { + "auxiliary_loss_clip": 0.01127987, + "auxiliary_loss_mlp": 0.01077595, + "balance_loss_clip": 1.00080323, + "balance_loss_mlp": 1.00015628, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.7538019935868445, + "language_loss": 0.65179998, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67385578, + "num_input_tokens_seen": 262974150, + "step": 12185, + "time_per_iteration": 3.1574697494506836 + }, + { + "auxiliary_loss_clip": 0.01164675, + "auxiliary_loss_mlp": 0.01103348, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00035071, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 2.7443458975326487, + "language_loss": 0.80532235, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82800257, + "num_input_tokens_seen": 262993370, + "step": 12186, + "time_per_iteration": 2.542893648147583 + }, + { + "auxiliary_loss_clip": 0.01149816, + "auxiliary_loss_mlp": 0.01103418, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00051618, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.721127672899568, + "language_loss": 0.73017561, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75270796, + "num_input_tokens_seen": 263012665, + "step": 12187, + "time_per_iteration": 2.549713611602783 + }, + { + "auxiliary_loss_clip": 0.01164723, + "auxiliary_loss_mlp": 0.011038, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00042176, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.7733390704361323, + "language_loss": 0.88952392, + "learning_rate": 7.033041665033716e-07, + "loss": 0.91220915, + "num_input_tokens_seen": 263031475, + "step": 12188, + "time_per_iteration": 2.549870491027832 + }, + { + "auxiliary_loss_clip": 0.01103673, + "auxiliary_loss_mlp": 0.01103284, + "balance_loss_clip": 1.00181925, + "balance_loss_mlp": 1.0003829, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 2.5009054994170774, + "language_loss": 0.74490082, + "learning_rate": 7.030076767014284e-07, + "loss": 0.7669704, + "num_input_tokens_seen": 263051445, + "step": 12189, + "time_per_iteration": 2.706714153289795 + }, + { + "auxiliary_loss_clip": 0.01114961, + "auxiliary_loss_mlp": 0.01102576, + "balance_loss_clip": 1.00166082, + "balance_loss_mlp": 1.00034189, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 2.016532770249939, + "language_loss": 0.82645082, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84862626, + "num_input_tokens_seen": 263070835, + "step": 12190, + "time_per_iteration": 2.6933693885803223 + }, + { + "auxiliary_loss_clip": 0.0111693, + "auxiliary_loss_mlp": 0.0110419, + "balance_loss_clip": 1.00175178, + "balance_loss_mlp": 1.00062132, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 2.0520149396303617, + "language_loss": 0.7186476, + "learning_rate": 7.024148446550204e-07, + "loss": 0.74085879, + "num_input_tokens_seen": 263090070, + "step": 12191, + "time_per_iteration": 2.6583995819091797 + }, + { + "auxiliary_loss_clip": 0.01164752, + "auxiliary_loss_mlp": 0.01103102, + "balance_loss_clip": 1.00197911, + "balance_loss_mlp": 1.00048637, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5857882728648929, + "language_loss": 0.69007468, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71275318, + "num_input_tokens_seen": 263110030, + "step": 12192, + "time_per_iteration": 2.575634479522705 + }, + { + "auxiliary_loss_clip": 0.01147925, + "auxiliary_loss_mlp": 0.01103116, + "balance_loss_clip": 1.00182855, + "balance_loss_mlp": 1.00050068, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.6324400254451783, + "language_loss": 0.73423886, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75674921, + "num_input_tokens_seen": 263129735, + "step": 12193, + "time_per_iteration": 3.934354305267334 + }, + { + "auxiliary_loss_clip": 0.01149666, + "auxiliary_loss_mlp": 0.01103503, + "balance_loss_clip": 1.00198174, + "balance_loss_mlp": 1.00050557, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 1.6710859051987732, + "language_loss": 0.76991451, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79244626, + "num_input_tokens_seen": 263149100, + "step": 12194, + "time_per_iteration": 2.5462913513183594 + }, + { + "auxiliary_loss_clip": 0.01147912, + "auxiliary_loss_mlp": 0.01103906, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00043285, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 1.610576617112145, + "language_loss": 0.70529032, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72780854, + "num_input_tokens_seen": 263166620, + "step": 12195, + "time_per_iteration": 2.557788610458374 + }, + { + "auxiliary_loss_clip": 0.01164763, + "auxiliary_loss_mlp": 0.01102931, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00050592, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.871463390714944, + "language_loss": 0.72357512, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74625206, + "num_input_tokens_seen": 263184780, + "step": 12196, + "time_per_iteration": 3.9401514530181885 + }, + { + "auxiliary_loss_clip": 0.01164703, + "auxiliary_loss_mlp": 0.01102981, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00036538, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.6550755561110229, + "language_loss": 0.71498406, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73766094, + "num_input_tokens_seen": 263204625, + "step": 12197, + "time_per_iteration": 2.5972564220428467 + }, + { + "auxiliary_loss_clip": 0.01084405, + "auxiliary_loss_mlp": 0.00747455, + "balance_loss_clip": 1.00158727, + "balance_loss_mlp": 1.00041616, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 1.987471716474168, + "language_loss": 0.7820915, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80041015, + "num_input_tokens_seen": 263221565, + "step": 12198, + "time_per_iteration": 2.726247787475586 + }, + { + "auxiliary_loss_clip": 0.01086173, + "auxiliary_loss_mlp": 0.01101961, + "balance_loss_clip": 1.00167489, + "balance_loss_mlp": 1.00049019, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 1.856675906458639, + "language_loss": 0.7444669, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76634818, + "num_input_tokens_seen": 263240620, + "step": 12199, + "time_per_iteration": 2.7483878135681152 + }, + { + "auxiliary_loss_clip": 0.01133448, + "auxiliary_loss_mlp": 0.01103259, + "balance_loss_clip": 1.00187016, + "balance_loss_mlp": 1.00054872, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 3.3631127311723716, + "language_loss": 0.76874983, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79111695, + "num_input_tokens_seen": 263254365, + "step": 12200, + "time_per_iteration": 2.5710997581481934 + }, + { + "auxiliary_loss_clip": 0.01116564, + "auxiliary_loss_mlp": 0.01103295, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00058401, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 2.29110036771086, + "language_loss": 0.61485302, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63705164, + "num_input_tokens_seen": 263275880, + "step": 12201, + "time_per_iteration": 2.697512149810791 + }, + { + "auxiliary_loss_clip": 0.01118437, + "auxiliary_loss_mlp": 0.00747196, + "balance_loss_clip": 1.00172782, + "balance_loss_mlp": 1.00036836, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 3.7155885136968667, + "language_loss": 0.52052224, + "learning_rate": 6.991577889352264e-07, + "loss": 0.53917861, + "num_input_tokens_seen": 263298315, + "step": 12202, + "time_per_iteration": 2.7706403732299805 + }, + { + "auxiliary_loss_clip": 0.01133365, + "auxiliary_loss_mlp": 0.01102775, + "balance_loss_clip": 1.00175476, + "balance_loss_mlp": 1.00035012, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.8042876310931883, + "language_loss": 0.68446183, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70682317, + "num_input_tokens_seen": 263318615, + "step": 12203, + "time_per_iteration": 2.621054172515869 + }, + { + "auxiliary_loss_clip": 0.01131924, + "auxiliary_loss_mlp": 0.01104807, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00057101, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.604726385030324, + "language_loss": 0.66091543, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68328279, + "num_input_tokens_seen": 263336705, + "step": 12204, + "time_per_iteration": 4.152954578399658 + }, + { + "auxiliary_loss_clip": 0.01131145, + "auxiliary_loss_mlp": 0.01103018, + "balance_loss_clip": 1.00186872, + "balance_loss_mlp": 1.00059366, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 2.4767946080556174, + "language_loss": 0.77396256, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79630417, + "num_input_tokens_seen": 263355065, + "step": 12205, + "time_per_iteration": 2.618196725845337 + }, + { + "auxiliary_loss_clip": 0.01101625, + "auxiliary_loss_mlp": 0.01103218, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00050735, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 2.17577269639012, + "language_loss": 0.79807198, + "learning_rate": 6.979748840934601e-07, + "loss": 0.82012045, + "num_input_tokens_seen": 263374460, + "step": 12206, + "time_per_iteration": 2.839231491088867 + }, + { + "auxiliary_loss_clip": 0.01116222, + "auxiliary_loss_mlp": 0.01102444, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00049639, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 2.2065117359788, + "language_loss": 0.71727192, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73945856, + "num_input_tokens_seen": 263393610, + "step": 12207, + "time_per_iteration": 2.6970973014831543 + }, + { + "auxiliary_loss_clip": 0.01127706, + "auxiliary_loss_mlp": 0.01077468, + "balance_loss_clip": 1.00068474, + "balance_loss_mlp": 1.0000298, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7767998935947098, + "language_loss": 0.54828823, + "learning_rate": 6.97383727865263e-07, + "loss": 0.57033992, + "num_input_tokens_seen": 263450340, + "step": 12208, + "time_per_iteration": 3.242201089859009 + }, + { + "auxiliary_loss_clip": 0.01164702, + "auxiliary_loss_mlp": 0.01102608, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.00046921, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.36772137318066, + "language_loss": 0.80512655, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82779956, + "num_input_tokens_seen": 263471735, + "step": 12209, + "time_per_iteration": 2.5753567218780518 + }, + { + "auxiliary_loss_clip": 0.01164639, + "auxiliary_loss_mlp": 0.01102292, + "balance_loss_clip": 1.00189078, + "balance_loss_mlp": 1.00043917, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.5757408737755374, + "language_loss": 0.78685296, + "learning_rate": 6.96792769218423e-07, + "loss": 0.80952227, + "num_input_tokens_seen": 263493245, + "step": 12210, + "time_per_iteration": 3.9201762676239014 + }, + { + "auxiliary_loss_clip": 0.01164543, + "auxiliary_loss_mlp": 0.0110268, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00035059, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 1.8680492756333469, + "language_loss": 0.76253396, + "learning_rate": 6.964973640160236e-07, + "loss": 0.7852062, + "num_input_tokens_seen": 263511660, + "step": 12211, + "time_per_iteration": 2.5457420349121094 + }, + { + "auxiliary_loss_clip": 0.01133117, + "auxiliary_loss_mlp": 0.01103082, + "balance_loss_clip": 1.00177646, + "balance_loss_mlp": 1.00046706, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 1.903394357444615, + "language_loss": 0.7165001, + "learning_rate": 6.962020082425748e-07, + "loss": 0.73886216, + "num_input_tokens_seen": 263530875, + "step": 12212, + "time_per_iteration": 2.605304718017578 + }, + { + "auxiliary_loss_clip": 0.01164889, + "auxiliary_loss_mlp": 0.01102192, + "balance_loss_clip": 1.00210893, + "balance_loss_mlp": 1.00043511, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.5804072116461383, + "language_loss": 0.69156432, + "learning_rate": 6.959067019092766e-07, + "loss": 0.71423519, + "num_input_tokens_seen": 263551585, + "step": 12213, + "time_per_iteration": 2.5205183029174805 + }, + { + "auxiliary_loss_clip": 0.01158684, + "auxiliary_loss_mlp": 0.01077068, + "balance_loss_clip": 1.00078762, + "balance_loss_mlp": 1.00001097, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7247032851914181, + "language_loss": 0.54387999, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56623751, + "num_input_tokens_seen": 263609545, + "step": 12214, + "time_per_iteration": 2.978543996810913 + }, + { + "auxiliary_loss_clip": 0.01164831, + "auxiliary_loss_mlp": 0.01102772, + "balance_loss_clip": 1.00195074, + "balance_loss_mlp": 1.0004425, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 3.495467226954771, + "language_loss": 0.70303261, + "learning_rate": 6.953162376079233e-07, + "loss": 0.7257086, + "num_input_tokens_seen": 263627880, + "step": 12215, + "time_per_iteration": 2.4878711700439453 + }, + { + "auxiliary_loss_clip": 0.01133917, + "auxiliary_loss_mlp": 0.01102127, + "balance_loss_clip": 1.00192559, + "balance_loss_mlp": 1.00046504, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.65950734599576, + "language_loss": 0.72696412, + "learning_rate": 6.950210796622573e-07, + "loss": 0.74932462, + "num_input_tokens_seen": 263645665, + "step": 12216, + "time_per_iteration": 2.5693519115448 + }, + { + "auxiliary_loss_clip": 0.01164865, + "auxiliary_loss_mlp": 0.01104469, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00051856, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.6673580174373421, + "language_loss": 0.7825352, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80522859, + "num_input_tokens_seen": 263668170, + "step": 12217, + "time_per_iteration": 2.5455052852630615 + }, + { + "auxiliary_loss_clip": 0.01114778, + "auxiliary_loss_mlp": 0.01101874, + "balance_loss_clip": 1.00168705, + "balance_loss_mlp": 1.00049877, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 1.9828302126524977, + "language_loss": 0.77856958, + "learning_rate": 6.94430912236911e-07, + "loss": 0.80073607, + "num_input_tokens_seen": 263684190, + "step": 12218, + "time_per_iteration": 2.620379686355591 + }, + { + "auxiliary_loss_clip": 0.01102883, + "auxiliary_loss_mlp": 0.01102916, + "balance_loss_clip": 1.00167847, + "balance_loss_mlp": 1.00039589, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 1.8793342494749075, + "language_loss": 0.72049344, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74255139, + "num_input_tokens_seen": 263702095, + "step": 12219, + "time_per_iteration": 2.6846394538879395 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.0110177, + "balance_loss_clip": 1.00179422, + "balance_loss_mlp": 1.00049019, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.7754032906604627, + "language_loss": 0.75026357, + "learning_rate": 6.938409428408061e-07, + "loss": 0.77263248, + "num_input_tokens_seen": 263721385, + "step": 12220, + "time_per_iteration": 2.6132256984710693 + }, + { + "auxiliary_loss_clip": 0.01150104, + "auxiliary_loss_mlp": 0.01103004, + "balance_loss_clip": 1.00191772, + "balance_loss_mlp": 1.00048423, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.8710025033172555, + "language_loss": 0.66035974, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68289083, + "num_input_tokens_seen": 263737835, + "step": 12221, + "time_per_iteration": 2.555976629257202 + }, + { + "auxiliary_loss_clip": 0.01134619, + "auxiliary_loss_mlp": 0.01103163, + "balance_loss_clip": 1.00183678, + "balance_loss_mlp": 1.00045204, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 1.7198793203175602, + "language_loss": 0.69029379, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71267164, + "num_input_tokens_seen": 263756480, + "step": 12222, + "time_per_iteration": 2.63258957862854 + }, + { + "auxiliary_loss_clip": 0.01099278, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_clip": 1.00180483, + "balance_loss_mlp": 1.00034738, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 2.0377920273230714, + "language_loss": 0.65856063, + "learning_rate": 6.92956360247217e-07, + "loss": 0.68057728, + "num_input_tokens_seen": 263776440, + "step": 12223, + "time_per_iteration": 2.7414488792419434 + }, + { + "auxiliary_loss_clip": 0.01150143, + "auxiliary_loss_mlp": 0.01102323, + "balance_loss_clip": 1.00195694, + "balance_loss_mlp": 1.000471, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.8271146391132522, + "language_loss": 0.72235131, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74487603, + "num_input_tokens_seen": 263793700, + "step": 12224, + "time_per_iteration": 2.632873773574829 + }, + { + "auxiliary_loss_clip": 0.01116738, + "auxiliary_loss_mlp": 0.01102482, + "balance_loss_clip": 1.00170922, + "balance_loss_mlp": 1.00043869, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.7228536716557132, + "language_loss": 0.72638309, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74857533, + "num_input_tokens_seen": 263814620, + "step": 12225, + "time_per_iteration": 2.717592239379883 + }, + { + "auxiliary_loss_clip": 0.01164784, + "auxiliary_loss_mlp": 0.01103728, + "balance_loss_clip": 1.00184548, + "balance_loss_mlp": 1.00054049, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.7914215704228105, + "language_loss": 0.76169097, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78437614, + "num_input_tokens_seen": 263832725, + "step": 12226, + "time_per_iteration": 2.557455062866211 + }, + { + "auxiliary_loss_clip": 0.01133148, + "auxiliary_loss_mlp": 0.0110225, + "balance_loss_clip": 1.00184417, + "balance_loss_mlp": 1.00039804, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.7956332282317413, + "language_loss": 0.67414343, + "learning_rate": 6.917776107264008e-07, + "loss": 0.69649744, + "num_input_tokens_seen": 263853850, + "step": 12227, + "time_per_iteration": 2.6137263774871826 + }, + { + "auxiliary_loss_clip": 0.01149645, + "auxiliary_loss_mlp": 0.01103806, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00061822, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.4482982489601668, + "language_loss": 0.63567984, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65821439, + "num_input_tokens_seen": 263874760, + "step": 12228, + "time_per_iteration": 2.6063730716705322 + }, + { + "auxiliary_loss_clip": 0.01133392, + "auxiliary_loss_mlp": 0.01102024, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00045764, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 2.324304226547959, + "language_loss": 0.63341868, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65577286, + "num_input_tokens_seen": 263893390, + "step": 12229, + "time_per_iteration": 2.571645975112915 + }, + { + "auxiliary_loss_clip": 0.01131773, + "auxiliary_loss_mlp": 0.01103689, + "balance_loss_clip": 1.00178981, + "balance_loss_mlp": 1.00050163, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.7550636840813887, + "language_loss": 0.73552281, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75787747, + "num_input_tokens_seen": 263911180, + "step": 12230, + "time_per_iteration": 4.041917324066162 + }, + { + "auxiliary_loss_clip": 0.01084753, + "auxiliary_loss_mlp": 0.0110318, + "balance_loss_clip": 1.00158012, + "balance_loss_mlp": 1.00046968, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 2.0439187002720343, + "language_loss": 0.72377312, + "learning_rate": 6.90599654932332e-07, + "loss": 0.74565244, + "num_input_tokens_seen": 263928975, + "step": 12231, + "time_per_iteration": 2.6945719718933105 + }, + { + "auxiliary_loss_clip": 0.01148252, + "auxiliary_loss_mlp": 0.01104709, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00056767, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 2.2299864949396904, + "language_loss": 0.63940167, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66193128, + "num_input_tokens_seen": 263944495, + "step": 12232, + "time_per_iteration": 2.5623958110809326 + }, + { + "auxiliary_loss_clip": 0.01133421, + "auxiliary_loss_mlp": 0.01102937, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00051212, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.7622007711804635, + "language_loss": 0.75123382, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77359742, + "num_input_tokens_seen": 263961325, + "step": 12233, + "time_per_iteration": 4.063643455505371 + }, + { + "auxiliary_loss_clip": 0.01164683, + "auxiliary_loss_mlp": 0.01102769, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.0004878, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.955163599866963, + "language_loss": 0.7357558, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75843036, + "num_input_tokens_seen": 263980445, + "step": 12234, + "time_per_iteration": 2.5809824466705322 + }, + { + "auxiliary_loss_clip": 0.01150128, + "auxiliary_loss_mlp": 0.01104142, + "balance_loss_clip": 1.00194573, + "balance_loss_mlp": 1.00057292, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 2.5461997143372277, + "language_loss": 0.60166931, + "learning_rate": 6.894224935797017e-07, + "loss": 0.62421197, + "num_input_tokens_seen": 263999330, + "step": 12235, + "time_per_iteration": 2.5849335193634033 + }, + { + "auxiliary_loss_clip": 0.01133696, + "auxiliary_loss_mlp": 0.01101996, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00043011, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.688253155151933, + "language_loss": 0.8560816, + "learning_rate": 6.891283274567259e-07, + "loss": 0.87843853, + "num_input_tokens_seen": 264014150, + "step": 12236, + "time_per_iteration": 2.5636706352233887 + }, + { + "auxiliary_loss_clip": 0.01148156, + "auxiliary_loss_mlp": 0.00747291, + "balance_loss_clip": 1.00185907, + "balance_loss_mlp": 1.00040245, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 2.048607420675566, + "language_loss": 0.69271076, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71166521, + "num_input_tokens_seen": 264033140, + "step": 12237, + "time_per_iteration": 2.564481258392334 + }, + { + "auxiliary_loss_clip": 0.01053638, + "auxiliary_loss_mlp": 0.01102795, + "balance_loss_clip": 1.00163949, + "balance_loss_mlp": 1.00046551, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 2.6097088727247524, + "language_loss": 0.72120523, + "learning_rate": 6.885401443470839e-07, + "loss": 0.7427696, + "num_input_tokens_seen": 264052105, + "step": 12238, + "time_per_iteration": 2.9847402572631836 + }, + { + "auxiliary_loss_clip": 0.01118184, + "auxiliary_loss_mlp": 0.01104247, + "balance_loss_clip": 1.0017426, + "balance_loss_mlp": 1.00039172, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 2.0620238309223624, + "language_loss": 0.72633481, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74855918, + "num_input_tokens_seen": 264070690, + "step": 12239, + "time_per_iteration": 3.0217957496643066 + }, + { + "auxiliary_loss_clip": 0.01132511, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00051606, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.404507031792648, + "language_loss": 0.78972483, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81206983, + "num_input_tokens_seen": 264094225, + "step": 12240, + "time_per_iteration": 2.688603401184082 + }, + { + "auxiliary_loss_clip": 0.01149538, + "auxiliary_loss_mlp": 0.0110214, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00066948, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 1.7277186255176937, + "language_loss": 0.82827497, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85079181, + "num_input_tokens_seen": 264113190, + "step": 12241, + "time_per_iteration": 2.581254005432129 + }, + { + "auxiliary_loss_clip": 0.01149276, + "auxiliary_loss_mlp": 0.01102022, + "balance_loss_clip": 1.00179124, + "balance_loss_mlp": 1.00036025, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 2.046497752824815, + "language_loss": 0.78888631, + "learning_rate": 6.873643749852484e-07, + "loss": 0.81139928, + "num_input_tokens_seen": 264132050, + "step": 12242, + "time_per_iteration": 2.5681252479553223 + }, + { + "auxiliary_loss_clip": 0.01101696, + "auxiliary_loss_mlp": 0.01102024, + "balance_loss_clip": 1.00178516, + "balance_loss_mlp": 1.00045776, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 3.2637287819119525, + "language_loss": 0.79064, + "learning_rate": 6.870705570551145e-07, + "loss": 0.8126772, + "num_input_tokens_seen": 264152800, + "step": 12243, + "time_per_iteration": 4.143785715103149 + }, + { + "auxiliary_loss_clip": 0.01150249, + "auxiliary_loss_mlp": 0.01103886, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00050795, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 2.2200281662918737, + "language_loss": 0.74632096, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76886231, + "num_input_tokens_seen": 264169650, + "step": 12244, + "time_per_iteration": 2.604536533355713 + }, + { + "auxiliary_loss_clip": 0.01150313, + "auxiliary_loss_mlp": 0.01102386, + "balance_loss_clip": 1.00189495, + "balance_loss_mlp": 1.00043821, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.8488308238157665, + "language_loss": 0.69364226, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71616924, + "num_input_tokens_seen": 264190530, + "step": 12245, + "time_per_iteration": 2.6481637954711914 + }, + { + "auxiliary_loss_clip": 0.01116371, + "auxiliary_loss_mlp": 0.01102352, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.0004046, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 2.0823326900882155, + "language_loss": 0.73484659, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75703382, + "num_input_tokens_seen": 264210820, + "step": 12246, + "time_per_iteration": 2.67568302154541 + }, + { + "auxiliary_loss_clip": 0.01132461, + "auxiliary_loss_mlp": 0.0110122, + "balance_loss_clip": 1.00168777, + "balance_loss_mlp": 1.00032163, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 1.995513364547212, + "language_loss": 0.73796904, + "learning_rate": 6.858957833101266e-07, + "loss": 0.76030588, + "num_input_tokens_seen": 264227430, + "step": 12247, + "time_per_iteration": 2.6120827198028564 + }, + { + "auxiliary_loss_clip": 0.01147877, + "auxiliary_loss_mlp": 0.01101873, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00049698, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.6592983510696293, + "language_loss": 0.74484825, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76734573, + "num_input_tokens_seen": 264245230, + "step": 12248, + "time_per_iteration": 3.886634349822998 + }, + { + "auxiliary_loss_clip": 0.01132781, + "auxiliary_loss_mlp": 0.01103907, + "balance_loss_clip": 1.00180006, + "balance_loss_mlp": 1.00043344, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 1.8991041800078479, + "language_loss": 0.72730064, + "learning_rate": 6.853086953788727e-07, + "loss": 0.74966753, + "num_input_tokens_seen": 264263945, + "step": 12249, + "time_per_iteration": 2.625307321548462 + }, + { + "auxiliary_loss_clip": 0.0113379, + "auxiliary_loss_mlp": 0.01103329, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00042808, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 2.9187656826058848, + "language_loss": 0.76855326, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79092443, + "num_input_tokens_seen": 264281500, + "step": 12250, + "time_per_iteration": 2.5812864303588867 + }, + { + "auxiliary_loss_clip": 0.01100012, + "auxiliary_loss_mlp": 0.01102763, + "balance_loss_clip": 1.00161338, + "balance_loss_mlp": 1.00043356, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 1.5470842296391987, + "language_loss": 0.71466851, + "learning_rate": 6.8472180686052e-07, + "loss": 0.73669624, + "num_input_tokens_seen": 264301625, + "step": 12251, + "time_per_iteration": 2.736602306365967 + }, + { + "auxiliary_loss_clip": 0.01148278, + "auxiliary_loss_mlp": 0.0110259, + "balance_loss_clip": 1.00186956, + "balance_loss_mlp": 1.00045156, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.8105534645024495, + "language_loss": 0.65774471, + "learning_rate": 6.844284374090015e-07, + "loss": 0.68025339, + "num_input_tokens_seen": 264323975, + "step": 12252, + "time_per_iteration": 2.892364740371704 + }, + { + "auxiliary_loss_clip": 0.01104186, + "auxiliary_loss_mlp": 0.01103728, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00063586, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.6549894977488107, + "language_loss": 0.79138517, + "learning_rate": 6.841351178440884e-07, + "loss": 0.81346428, + "num_input_tokens_seen": 264343785, + "step": 12253, + "time_per_iteration": 2.724008083343506 + }, + { + "auxiliary_loss_clip": 0.01164658, + "auxiliary_loss_mlp": 0.00747239, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.00042415, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.1247507080704415, + "language_loss": 0.76095784, + "learning_rate": 6.83841848176905e-07, + "loss": 0.78007674, + "num_input_tokens_seen": 264361130, + "step": 12254, + "time_per_iteration": 2.4995322227478027 + }, + { + "auxiliary_loss_clip": 0.01133098, + "auxiliary_loss_mlp": 0.01103517, + "balance_loss_clip": 1.00180078, + "balance_loss_mlp": 1.00052059, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 5.820914519487285, + "language_loss": 0.69131339, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71367955, + "num_input_tokens_seen": 264376965, + "step": 12255, + "time_per_iteration": 2.582963466644287 + }, + { + "auxiliary_loss_clip": 0.01147948, + "auxiliary_loss_mlp": 0.01103684, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.00040126, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 1.5353152142642148, + "language_loss": 0.74993765, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77245396, + "num_input_tokens_seen": 264396310, + "step": 12256, + "time_per_iteration": 2.6324269771575928 + }, + { + "auxiliary_loss_clip": 0.01148193, + "auxiliary_loss_mlp": 0.01102792, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00046325, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.7214151476818516, + "language_loss": 0.73847324, + "learning_rate": 6.829623386729182e-07, + "loss": 0.76098311, + "num_input_tokens_seen": 264418085, + "step": 12257, + "time_per_iteration": 2.6744909286499023 + }, + { + "auxiliary_loss_clip": 0.0115013, + "auxiliary_loss_mlp": 0.01102949, + "balance_loss_clip": 1.00190389, + "balance_loss_mlp": 1.00062037, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.4997649260014152, + "language_loss": 0.7813642, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80389512, + "num_input_tokens_seen": 264437595, + "step": 12258, + "time_per_iteration": 2.594146251678467 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01102896, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00047123, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.429900649708886, + "language_loss": 0.66427106, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68678343, + "num_input_tokens_seen": 264457385, + "step": 12259, + "time_per_iteration": 2.5855226516723633 + }, + { + "auxiliary_loss_clip": 0.011481, + "auxiliary_loss_mlp": 0.01103165, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00045455, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.7313493533819213, + "language_loss": 0.73590112, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75841367, + "num_input_tokens_seen": 264477205, + "step": 12260, + "time_per_iteration": 2.617696762084961 + }, + { + "auxiliary_loss_clip": 0.01150233, + "auxiliary_loss_mlp": 0.01103659, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00056648, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 2.437920971127015, + "language_loss": 0.73319387, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75573283, + "num_input_tokens_seen": 264497195, + "step": 12261, + "time_per_iteration": 2.568950653076172 + }, + { + "auxiliary_loss_clip": 0.0113329, + "auxiliary_loss_mlp": 0.01104296, + "balance_loss_clip": 1.00189567, + "balance_loss_mlp": 1.00053656, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 2.5440039411338566, + "language_loss": 0.67312157, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69549745, + "num_input_tokens_seen": 264516950, + "step": 12262, + "time_per_iteration": 2.6298787593841553 + }, + { + "auxiliary_loss_clip": 0.01164737, + "auxiliary_loss_mlp": 0.01103091, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00047576, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 1.9263035392966337, + "language_loss": 0.88672411, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90940237, + "num_input_tokens_seen": 264532675, + "step": 12263, + "time_per_iteration": 2.5141208171844482 + }, + { + "auxiliary_loss_clip": 0.01164355, + "auxiliary_loss_mlp": 0.01101191, + "balance_loss_clip": 1.00185478, + "balance_loss_mlp": 1.00038791, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 1.434700107713543, + "language_loss": 0.67502689, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69768238, + "num_input_tokens_seen": 264555635, + "step": 12264, + "time_per_iteration": 2.5895144939422607 + }, + { + "auxiliary_loss_clip": 0.01164663, + "auxiliary_loss_mlp": 0.0110139, + "balance_loss_clip": 1.00187349, + "balance_loss_mlp": 1.00049114, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 1.7614250922562964, + "language_loss": 0.79918206, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82184255, + "num_input_tokens_seen": 264573140, + "step": 12265, + "time_per_iteration": 2.607553482055664 + }, + { + "auxiliary_loss_clip": 0.01150152, + "auxiliary_loss_mlp": 0.01103558, + "balance_loss_clip": 1.00184941, + "balance_loss_mlp": 1.00037086, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.6222874082023935, + "language_loss": 0.74470955, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76724666, + "num_input_tokens_seen": 264591610, + "step": 12266, + "time_per_iteration": 2.622664451599121 + }, + { + "auxiliary_loss_clip": 0.01148404, + "auxiliary_loss_mlp": 0.01103202, + "balance_loss_clip": 1.0017643, + "balance_loss_mlp": 1.0004915, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.6413672613935724, + "language_loss": 0.72939932, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75191545, + "num_input_tokens_seen": 264611170, + "step": 12267, + "time_per_iteration": 2.6230127811431885 + }, + { + "auxiliary_loss_clip": 0.0111392, + "auxiliary_loss_mlp": 0.01101833, + "balance_loss_clip": 1.00165582, + "balance_loss_mlp": 1.00045717, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 2.7318834026073953, + "language_loss": 0.82816595, + "learning_rate": 6.797413183219923e-07, + "loss": 0.8503235, + "num_input_tokens_seen": 264629365, + "step": 12268, + "time_per_iteration": 4.803835153579712 + }, + { + "auxiliary_loss_clip": 0.0116467, + "auxiliary_loss_mlp": 0.01103017, + "balance_loss_clip": 1.00193751, + "balance_loss_mlp": 1.00059247, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.6909334640503526, + "language_loss": 0.73294121, + "learning_rate": 6.794487984541677e-07, + "loss": 0.7556181, + "num_input_tokens_seen": 264647915, + "step": 12269, + "time_per_iteration": 2.5149543285369873 + }, + { + "auxiliary_loss_clip": 0.01135506, + "auxiliary_loss_mlp": 0.01104268, + "balance_loss_clip": 1.00193226, + "balance_loss_mlp": 1.00050783, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 2.1265406794577277, + "language_loss": 0.70662749, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72902524, + "num_input_tokens_seen": 264669620, + "step": 12270, + "time_per_iteration": 2.726290702819824 + }, + { + "auxiliary_loss_clip": 0.01150018, + "auxiliary_loss_mlp": 0.01102953, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00052845, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.8086812459728707, + "language_loss": 0.69537747, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71790719, + "num_input_tokens_seen": 264689345, + "step": 12271, + "time_per_iteration": 4.037028789520264 + }, + { + "auxiliary_loss_clip": 0.01133364, + "auxiliary_loss_mlp": 0.0110292, + "balance_loss_clip": 1.00193465, + "balance_loss_mlp": 1.00049543, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 2.413093219692908, + "language_loss": 0.67682648, + "learning_rate": 6.785715393476586e-07, + "loss": 0.69918931, + "num_input_tokens_seen": 264707625, + "step": 12272, + "time_per_iteration": 2.6456899642944336 + }, + { + "auxiliary_loss_clip": 0.0113071, + "auxiliary_loss_mlp": 0.01101745, + "balance_loss_clip": 1.00179076, + "balance_loss_mlp": 1.00036883, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 1.7765244139497216, + "language_loss": 0.78070951, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80303413, + "num_input_tokens_seen": 264725575, + "step": 12273, + "time_per_iteration": 2.568695306777954 + }, + { + "auxiliary_loss_clip": 0.01164601, + "auxiliary_loss_mlp": 0.01103005, + "balance_loss_clip": 1.00185907, + "balance_loss_mlp": 1.00048482, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.7901350843845776, + "language_loss": 0.83619714, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85887313, + "num_input_tokens_seen": 264742855, + "step": 12274, + "time_per_iteration": 2.5045864582061768 + }, + { + "auxiliary_loss_clip": 0.01131879, + "auxiliary_loss_mlp": 0.00747521, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00059664, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 1.8067284884898736, + "language_loss": 0.7367413, + "learning_rate": 6.776947312194341e-07, + "loss": 0.7555353, + "num_input_tokens_seen": 264761155, + "step": 12275, + "time_per_iteration": 2.610617160797119 + }, + { + "auxiliary_loss_clip": 0.01116746, + "auxiliary_loss_mlp": 0.01104125, + "balance_loss_clip": 1.00179696, + "balance_loss_mlp": 1.00055623, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.9255195136531387, + "language_loss": 0.73501289, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75722158, + "num_input_tokens_seen": 264780660, + "step": 12276, + "time_per_iteration": 2.6550393104553223 + }, + { + "auxiliary_loss_clip": 0.0116478, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.000319, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 2.630982480684184, + "language_loss": 0.77910441, + "learning_rate": 6.771104431585551e-07, + "loss": 0.80178058, + "num_input_tokens_seen": 264798850, + "step": 12277, + "time_per_iteration": 2.5106725692749023 + }, + { + "auxiliary_loss_clip": 0.01164551, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00061059, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 1.8015899967688394, + "language_loss": 0.78898704, + "learning_rate": 6.768183743687338e-07, + "loss": 0.81166291, + "num_input_tokens_seen": 264816795, + "step": 12278, + "time_per_iteration": 2.567896604537964 + }, + { + "auxiliary_loss_clip": 0.01148244, + "auxiliary_loss_mlp": 0.00747276, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.00040865, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.128540086679487, + "language_loss": 0.71875727, + "learning_rate": 6.765263557540921e-07, + "loss": 0.7377125, + "num_input_tokens_seen": 264834105, + "step": 12279, + "time_per_iteration": 2.5280723571777344 + }, + { + "auxiliary_loss_clip": 0.01149216, + "auxiliary_loss_mlp": 0.01102947, + "balance_loss_clip": 1.00180256, + "balance_loss_mlp": 1.00052214, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 3.069509365941108, + "language_loss": 0.86073947, + "learning_rate": 6.762343873257034e-07, + "loss": 0.88326108, + "num_input_tokens_seen": 264850895, + "step": 12280, + "time_per_iteration": 4.022377014160156 + }, + { + "auxiliary_loss_clip": 0.01116767, + "auxiliary_loss_mlp": 0.01103096, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.00048089, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 1.8869789222051696, + "language_loss": 0.72477692, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74697554, + "num_input_tokens_seen": 264869505, + "step": 12281, + "time_per_iteration": 2.6419413089752197 + }, + { + "auxiliary_loss_clip": 0.01101347, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_clip": 1.00174928, + "balance_loss_mlp": 1.00044811, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.684523032928027, + "language_loss": 0.60589457, + "learning_rate": 6.756506010719711e-07, + "loss": 0.6279434, + "num_input_tokens_seen": 264886915, + "step": 12282, + "time_per_iteration": 2.71179461479187 + }, + { + "auxiliary_loss_clip": 0.01114559, + "auxiliary_loss_mlp": 0.01104249, + "balance_loss_clip": 1.00171018, + "balance_loss_mlp": 1.00053692, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 1.671976509353974, + "language_loss": 0.67949283, + "learning_rate": 6.753587832687632e-07, + "loss": 0.7016809, + "num_input_tokens_seen": 264910350, + "step": 12283, + "time_per_iteration": 2.724846363067627 + }, + { + "auxiliary_loss_clip": 0.01164685, + "auxiliary_loss_mlp": 0.00747388, + "balance_loss_clip": 1.00194347, + "balance_loss_mlp": 1.00047159, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.7116368386770038, + "language_loss": 0.75858867, + "learning_rate": 6.750670156960832e-07, + "loss": 0.77770936, + "num_input_tokens_seen": 264930705, + "step": 12284, + "time_per_iteration": 2.6854145526885986 + }, + { + "auxiliary_loss_clip": 0.0115012, + "auxiliary_loss_mlp": 0.011033, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.00049388, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 1.8758987247799326, + "language_loss": 0.69437021, + "learning_rate": 6.747752983649954e-07, + "loss": 0.7169044, + "num_input_tokens_seen": 264946975, + "step": 12285, + "time_per_iteration": 3.972520589828491 + }, + { + "auxiliary_loss_clip": 0.01132748, + "auxiliary_loss_mlp": 0.01104158, + "balance_loss_clip": 1.00171971, + "balance_loss_mlp": 1.00049376, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 1.8635336829696436, + "language_loss": 0.79567468, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81804383, + "num_input_tokens_seen": 264967665, + "step": 12286, + "time_per_iteration": 2.750627040863037 + }, + { + "auxiliary_loss_clip": 0.01099945, + "auxiliary_loss_mlp": 0.01102779, + "balance_loss_clip": 1.00160503, + "balance_loss_mlp": 1.00044954, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 1.952105617822162, + "language_loss": 0.6534093, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67543656, + "num_input_tokens_seen": 264985480, + "step": 12287, + "time_per_iteration": 2.670584201812744 + }, + { + "auxiliary_loss_clip": 0.0113243, + "auxiliary_loss_mlp": 0.01102213, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00036037, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 2.0659169079255864, + "language_loss": 0.76812881, + "learning_rate": 6.739004479318903e-07, + "loss": 0.79047525, + "num_input_tokens_seen": 265004790, + "step": 12288, + "time_per_iteration": 2.6532857418060303 + }, + { + "auxiliary_loss_clip": 0.01148145, + "auxiliary_loss_mlp": 0.00747597, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00059772, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.5859725990035802, + "language_loss": 0.58077651, + "learning_rate": 6.736089316777684e-07, + "loss": 0.59973395, + "num_input_tokens_seen": 265028790, + "step": 12289, + "time_per_iteration": 2.7764222621917725 + }, + { + "auxiliary_loss_clip": 0.01158652, + "auxiliary_loss_mlp": 0.00745401, + "balance_loss_clip": 1.00072932, + "balance_loss_mlp": 1.00025547, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6397050858330666, + "language_loss": 0.49257433, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51161492, + "num_input_tokens_seen": 265096660, + "step": 12290, + "time_per_iteration": 3.21647572517395 + }, + { + "auxiliary_loss_clip": 0.01149803, + "auxiliary_loss_mlp": 0.01104109, + "balance_loss_clip": 1.00195825, + "balance_loss_mlp": 1.00044429, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 2.0854772686436074, + "language_loss": 0.67103088, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69357002, + "num_input_tokens_seen": 265116375, + "step": 12291, + "time_per_iteration": 2.583220958709717 + }, + { + "auxiliary_loss_clip": 0.0109401, + "auxiliary_loss_mlp": 0.01077893, + "balance_loss_clip": 1.00072885, + "balance_loss_mlp": 1.00007308, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9861288553150966, + "language_loss": 0.60864359, + "learning_rate": 6.727346847409052e-07, + "loss": 0.63036257, + "num_input_tokens_seen": 265161230, + "step": 12292, + "time_per_iteration": 2.8904426097869873 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01102825, + "balance_loss_clip": 1.00179374, + "balance_loss_mlp": 1.00049579, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 1.84093737975031, + "language_loss": 0.67403483, + "learning_rate": 6.724433697406191e-07, + "loss": 0.69607425, + "num_input_tokens_seen": 265182515, + "step": 12293, + "time_per_iteration": 2.7887470722198486 + }, + { + "auxiliary_loss_clip": 0.01149358, + "auxiliary_loss_mlp": 0.01104033, + "balance_loss_clip": 1.00188506, + "balance_loss_mlp": 1.0004636, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 2.317958485610997, + "language_loss": 0.83892089, + "learning_rate": 6.721521050814134e-07, + "loss": 0.86145484, + "num_input_tokens_seen": 265198160, + "step": 12294, + "time_per_iteration": 2.5553057193756104 + }, + { + "auxiliary_loss_clip": 0.0111631, + "auxiliary_loss_mlp": 0.01102494, + "balance_loss_clip": 1.00167751, + "balance_loss_mlp": 1.00045061, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.5790733250481452, + "language_loss": 0.73075223, + "learning_rate": 6.718608907743337e-07, + "loss": 0.7529403, + "num_input_tokens_seen": 265218480, + "step": 12295, + "time_per_iteration": 2.750563144683838 + }, + { + "auxiliary_loss_clip": 0.01149049, + "auxiliary_loss_mlp": 0.01102114, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.00073814, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 2.208864878494229, + "language_loss": 0.78853899, + "learning_rate": 6.715697268304215e-07, + "loss": 0.81105065, + "num_input_tokens_seen": 265240165, + "step": 12296, + "time_per_iteration": 2.690469980239868 + }, + { + "auxiliary_loss_clip": 0.01164707, + "auxiliary_loss_mlp": 0.01102404, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00045657, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 2.28674645973652, + "language_loss": 0.66847384, + "learning_rate": 6.712786132607182e-07, + "loss": 0.69114494, + "num_input_tokens_seen": 265263295, + "step": 12297, + "time_per_iteration": 2.697967290878296 + }, + { + "auxiliary_loss_clip": 0.0113341, + "auxiliary_loss_mlp": 0.01103026, + "balance_loss_clip": 1.00175393, + "balance_loss_mlp": 1.00060165, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 1.5916182757234518, + "language_loss": 0.68690038, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70926476, + "num_input_tokens_seen": 265282740, + "step": 12298, + "time_per_iteration": 2.6072943210601807 + }, + { + "auxiliary_loss_clip": 0.01134364, + "auxiliary_loss_mlp": 0.0110315, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00043929, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 2.0107274814734146, + "language_loss": 0.74907434, + "learning_rate": 6.706965372880946e-07, + "loss": 0.77144957, + "num_input_tokens_seen": 265300175, + "step": 12299, + "time_per_iteration": 2.6655960083007812 + }, + { + "auxiliary_loss_clip": 0.01128597, + "auxiliary_loss_mlp": 0.01077908, + "balance_loss_clip": 1.00162947, + "balance_loss_mlp": 1.00008833, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 1.4286574770186629, + "language_loss": 0.60833049, + "learning_rate": 6.704055749072455e-07, + "loss": 0.63039553, + "num_input_tokens_seen": 265363275, + "step": 12300, + "time_per_iteration": 3.253984212875366 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01103416, + "balance_loss_clip": 1.00195909, + "balance_loss_mlp": 1.00051427, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 2.4567050761123, + "language_loss": 0.8039335, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82629788, + "num_input_tokens_seen": 265382935, + "step": 12301, + "time_per_iteration": 2.6316139698028564 + }, + { + "auxiliary_loss_clip": 0.01164657, + "auxiliary_loss_mlp": 0.01102263, + "balance_loss_clip": 1.0019207, + "balance_loss_mlp": 1.00050545, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.745622644127866, + "language_loss": 0.72853309, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75120223, + "num_input_tokens_seen": 265403245, + "step": 12302, + "time_per_iteration": 2.5507681369781494 + }, + { + "auxiliary_loss_clip": 0.01164835, + "auxiliary_loss_mlp": 0.01103552, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00065064, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 1.9015125598426827, + "language_loss": 0.74276483, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76544875, + "num_input_tokens_seen": 265423105, + "step": 12303, + "time_per_iteration": 2.5950756072998047 + }, + { + "auxiliary_loss_clip": 0.01164413, + "auxiliary_loss_mlp": 0.01102063, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00040114, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.8319897855834428, + "language_loss": 0.5438298, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56649458, + "num_input_tokens_seen": 265443445, + "step": 12304, + "time_per_iteration": 2.5862643718719482 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01103022, + "balance_loss_clip": 1.00182557, + "balance_loss_mlp": 1.00059783, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 7.015493426550742, + "language_loss": 0.84458864, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86695087, + "num_input_tokens_seen": 265462085, + "step": 12305, + "time_per_iteration": 4.050261735916138 + }, + { + "auxiliary_loss_clip": 0.01128054, + "auxiliary_loss_mlp": 0.01077426, + "balance_loss_clip": 1.00080013, + "balance_loss_mlp": 0.9999876, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8645773274113493, + "language_loss": 0.57681316, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59886795, + "num_input_tokens_seen": 265521190, + "step": 12306, + "time_per_iteration": 3.190011501312256 + }, + { + "auxiliary_loss_clip": 0.01131162, + "auxiliary_loss_mlp": 0.01103211, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.00050008, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 2.079712465469422, + "language_loss": 0.81374174, + "learning_rate": 6.683702505728355e-07, + "loss": 0.8360855, + "num_input_tokens_seen": 265539705, + "step": 12307, + "time_per_iteration": 2.5905840396881104 + }, + { + "auxiliary_loss_clip": 0.0114799, + "auxiliary_loss_mlp": 0.01102213, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.0005039, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.7190577969257947, + "language_loss": 0.7020359, + "learning_rate": 6.680796918475893e-07, + "loss": 0.72453797, + "num_input_tokens_seen": 265555855, + "step": 12308, + "time_per_iteration": 4.0074920654296875 + }, + { + "auxiliary_loss_clip": 0.01132731, + "auxiliary_loss_mlp": 0.01101888, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00051188, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.787581789233217, + "language_loss": 0.81523252, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83757865, + "num_input_tokens_seen": 265575455, + "step": 12309, + "time_per_iteration": 2.660099983215332 + }, + { + "auxiliary_loss_clip": 0.01148102, + "auxiliary_loss_mlp": 0.01103117, + "balance_loss_clip": 1.00178814, + "balance_loss_mlp": 1.00050211, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 1.7273989056876893, + "language_loss": 0.72611082, + "learning_rate": 6.674987259277692e-07, + "loss": 0.74862301, + "num_input_tokens_seen": 265595250, + "step": 12310, + "time_per_iteration": 2.578932523727417 + }, + { + "auxiliary_loss_clip": 0.01117918, + "auxiliary_loss_mlp": 0.01103598, + "balance_loss_clip": 1.00194979, + "balance_loss_mlp": 1.00060105, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.31857702809708, + "language_loss": 0.88648492, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90870005, + "num_input_tokens_seen": 265606945, + "step": 12311, + "time_per_iteration": 2.682116746902466 + }, + { + "auxiliary_loss_clip": 0.01083378, + "auxiliary_loss_mlp": 0.01103024, + "balance_loss_clip": 1.00148118, + "balance_loss_mlp": 1.00040889, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 3.293101853185224, + "language_loss": 0.8006916, + "learning_rate": 6.669179621222738e-07, + "loss": 0.8225556, + "num_input_tokens_seen": 265626115, + "step": 12312, + "time_per_iteration": 2.7539522647857666 + }, + { + "auxiliary_loss_clip": 0.01103941, + "auxiliary_loss_mlp": 0.01102924, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.00059438, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 1.7199445514546545, + "language_loss": 0.7871573, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80922592, + "num_input_tokens_seen": 265646520, + "step": 12313, + "time_per_iteration": 2.7126240730285645 + }, + { + "auxiliary_loss_clip": 0.01099658, + "auxiliary_loss_mlp": 0.0110312, + "balance_loss_clip": 1.00168931, + "balance_loss_mlp": 1.00050473, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 8.48982838258307, + "language_loss": 0.79040688, + "learning_rate": 6.663374005191937e-07, + "loss": 0.81243467, + "num_input_tokens_seen": 265661875, + "step": 12314, + "time_per_iteration": 2.630429983139038 + }, + { + "auxiliary_loss_clip": 0.01141745, + "auxiliary_loss_mlp": 0.01077053, + "balance_loss_clip": 1.00086164, + "balance_loss_mlp": 0.99999601, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8169996557270991, + "language_loss": 0.551292, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57347995, + "num_input_tokens_seen": 265721255, + "step": 12315, + "time_per_iteration": 3.1027028560638428 + }, + { + "auxiliary_loss_clip": 0.01147843, + "auxiliary_loss_mlp": 0.01102995, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00047457, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 2.407801419071798, + "language_loss": 0.79281139, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81531978, + "num_input_tokens_seen": 265743970, + "step": 12316, + "time_per_iteration": 2.706874132156372 + }, + { + "auxiliary_loss_clip": 0.01147878, + "auxiliary_loss_mlp": 0.01102705, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.000471, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.8304385586406553, + "language_loss": 0.74975729, + "learning_rate": 6.654669374367275e-07, + "loss": 0.77226317, + "num_input_tokens_seen": 265760890, + "step": 12317, + "time_per_iteration": 2.6190552711486816 + }, + { + "auxiliary_loss_clip": 0.01132969, + "auxiliary_loss_mlp": 0.01101859, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00048351, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 2.3179421760151793, + "language_loss": 0.81453454, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83688283, + "num_input_tokens_seen": 265779600, + "step": 12318, + "time_per_iteration": 4.1387410163879395 + }, + { + "auxiliary_loss_clip": 0.01132807, + "auxiliary_loss_mlp": 0.01102974, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00035906, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 3.8524962953383115, + "language_loss": 0.7675699, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78992772, + "num_input_tokens_seen": 265797030, + "step": 12319, + "time_per_iteration": 2.596662998199463 + }, + { + "auxiliary_loss_clip": 0.01134629, + "auxiliary_loss_mlp": 0.01102257, + "balance_loss_clip": 1.00197184, + "balance_loss_mlp": 1.00049973, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 2.1213368312003182, + "language_loss": 0.63866186, + "learning_rate": 6.64596929804897e-07, + "loss": 0.66103077, + "num_input_tokens_seen": 265815055, + "step": 12320, + "time_per_iteration": 2.6409313678741455 + }, + { + "auxiliary_loss_clip": 0.01147999, + "auxiliary_loss_mlp": 0.0110494, + "balance_loss_clip": 1.00180614, + "balance_loss_mlp": 1.00060856, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.368531333351881, + "language_loss": 0.82068837, + "learning_rate": 6.643070285235288e-07, + "loss": 0.84321773, + "num_input_tokens_seen": 265828480, + "step": 12321, + "time_per_iteration": 2.6486656665802 + }, + { + "auxiliary_loss_clip": 0.01131262, + "auxiliary_loss_mlp": 0.0110478, + "balance_loss_clip": 1.00169301, + "balance_loss_mlp": 1.00073457, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 1.883778031944603, + "language_loss": 0.7208457, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74320614, + "num_input_tokens_seen": 265845825, + "step": 12322, + "time_per_iteration": 2.671633005142212 + }, + { + "auxiliary_loss_clip": 0.01149683, + "auxiliary_loss_mlp": 0.00747327, + "balance_loss_clip": 1.001948, + "balance_loss_mlp": 1.00046706, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.7072612651454662, + "language_loss": 0.64144558, + "learning_rate": 6.637273779206183e-07, + "loss": 0.66041565, + "num_input_tokens_seen": 265866335, + "step": 12323, + "time_per_iteration": 2.622185230255127 + }, + { + "auxiliary_loss_clip": 0.01116293, + "auxiliary_loss_mlp": 0.01103043, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00042748, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.3543271620166877, + "language_loss": 0.75682539, + "learning_rate": 6.634376286210559e-07, + "loss": 0.77901876, + "num_input_tokens_seen": 265888945, + "step": 12324, + "time_per_iteration": 4.148586988449097 + }, + { + "auxiliary_loss_clip": 0.01131714, + "auxiliary_loss_mlp": 0.01102559, + "balance_loss_clip": 1.00160718, + "balance_loss_mlp": 1.00032496, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.8733289083160525, + "language_loss": 0.74738121, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76972389, + "num_input_tokens_seen": 265908030, + "step": 12325, + "time_per_iteration": 2.6895551681518555 + }, + { + "auxiliary_loss_clip": 0.01099769, + "auxiliary_loss_mlp": 0.01103413, + "balance_loss_clip": 1.001634, + "balance_loss_mlp": 1.00051188, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.9111304337385204, + "language_loss": 0.68441761, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70644945, + "num_input_tokens_seen": 265927030, + "step": 12326, + "time_per_iteration": 2.710951566696167 + }, + { + "auxiliary_loss_clip": 0.01114513, + "auxiliary_loss_mlp": 0.01102214, + "balance_loss_clip": 1.00174153, + "balance_loss_mlp": 1.00055218, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 1.650771014901999, + "language_loss": 0.893911, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91607821, + "num_input_tokens_seen": 265945490, + "step": 12327, + "time_per_iteration": 2.68936824798584 + }, + { + "auxiliary_loss_clip": 0.01164527, + "auxiliary_loss_mlp": 0.01102799, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00047016, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.839184397970835, + "language_loss": 0.85506022, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87773353, + "num_input_tokens_seen": 265963265, + "step": 12328, + "time_per_iteration": 2.5141794681549072 + }, + { + "auxiliary_loss_clip": 0.01147792, + "auxiliary_loss_mlp": 0.01101607, + "balance_loss_clip": 1.00192034, + "balance_loss_mlp": 1.00042212, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.7514580864573495, + "language_loss": 0.6658169, + "learning_rate": 6.619896425816103e-07, + "loss": 0.68831086, + "num_input_tokens_seen": 265982270, + "step": 12329, + "time_per_iteration": 2.576960802078247 + }, + { + "auxiliary_loss_clip": 0.01116806, + "auxiliary_loss_mlp": 0.01103728, + "balance_loss_clip": 1.00183499, + "balance_loss_mlp": 1.00034976, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.5438026300931256, + "language_loss": 0.66626412, + "learning_rate": 6.617001975422647e-07, + "loss": 0.68846941, + "num_input_tokens_seen": 266003835, + "step": 12330, + "time_per_iteration": 2.68815016746521 + }, + { + "auxiliary_loss_clip": 0.01117553, + "auxiliary_loss_mlp": 0.01104341, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00048542, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 2.0232472049194588, + "language_loss": 0.85483956, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87705851, + "num_input_tokens_seen": 266021595, + "step": 12331, + "time_per_iteration": 2.618523120880127 + }, + { + "auxiliary_loss_clip": 0.01072044, + "auxiliary_loss_mlp": 0.01102742, + "balance_loss_clip": 1.00170481, + "balance_loss_mlp": 1.00041294, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 3.329498054016139, + "language_loss": 0.69567168, + "learning_rate": 6.611214597199364e-07, + "loss": 0.71741951, + "num_input_tokens_seen": 266039860, + "step": 12332, + "time_per_iteration": 2.8051578998565674 + }, + { + "auxiliary_loss_clip": 0.0116486, + "auxiliary_loss_mlp": 0.01102344, + "balance_loss_clip": 1.0020225, + "balance_loss_mlp": 1.00058675, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 1.7510638785365125, + "language_loss": 0.62996221, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65263426, + "num_input_tokens_seen": 266058050, + "step": 12333, + "time_per_iteration": 2.614258289337158 + }, + { + "auxiliary_loss_clip": 0.01131714, + "auxiliary_loss_mlp": 0.01101498, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00050402, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.7042784806501803, + "language_loss": 0.71064353, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73297572, + "num_input_tokens_seen": 266078060, + "step": 12334, + "time_per_iteration": 2.664907217025757 + }, + { + "auxiliary_loss_clip": 0.01116526, + "auxiliary_loss_mlp": 0.01102927, + "balance_loss_clip": 1.00192976, + "balance_loss_mlp": 1.00050235, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.6527649170011416, + "language_loss": 0.82344067, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84563518, + "num_input_tokens_seen": 266097110, + "step": 12335, + "time_per_iteration": 2.632859230041504 + }, + { + "auxiliary_loss_clip": 0.01164726, + "auxiliary_loss_mlp": 0.01103569, + "balance_loss_clip": 1.00193584, + "balance_loss_mlp": 1.0005722, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.6820173227852764, + "language_loss": 0.74976784, + "learning_rate": 6.599645934079259e-07, + "loss": 0.7724508, + "num_input_tokens_seen": 266110870, + "step": 12336, + "time_per_iteration": 2.5003063678741455 + }, + { + "auxiliary_loss_clip": 0.01101893, + "auxiliary_loss_mlp": 0.0110293, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.00041032, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 1.8324935202799537, + "language_loss": 0.733693, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75574124, + "num_input_tokens_seen": 266127845, + "step": 12337, + "time_per_iteration": 2.741928815841675 + }, + { + "auxiliary_loss_clip": 0.01133374, + "auxiliary_loss_mlp": 0.0110243, + "balance_loss_clip": 1.00187266, + "balance_loss_mlp": 1.00048256, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.7350900796649604, + "language_loss": 0.76367724, + "learning_rate": 6.593864650937186e-07, + "loss": 0.7860353, + "num_input_tokens_seen": 266145400, + "step": 12338, + "time_per_iteration": 2.64308762550354 + }, + { + "auxiliary_loss_clip": 0.0114978, + "auxiliary_loss_mlp": 0.01101504, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00041497, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.6925619218070735, + "language_loss": 0.72934002, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75185287, + "num_input_tokens_seen": 266164430, + "step": 12339, + "time_per_iteration": 2.573136806488037 + }, + { + "auxiliary_loss_clip": 0.01134359, + "auxiliary_loss_mlp": 0.0110253, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00048697, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 2.148665596961087, + "language_loss": 0.79958189, + "learning_rate": 6.588085401243077e-07, + "loss": 0.82195079, + "num_input_tokens_seen": 266183855, + "step": 12340, + "time_per_iteration": 2.6688315868377686 + }, + { + "auxiliary_loss_clip": 0.01117905, + "auxiliary_loss_mlp": 0.01103023, + "balance_loss_clip": 1.00171471, + "balance_loss_mlp": 1.00050259, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.53901727949163, + "language_loss": 0.75503778, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77724707, + "num_input_tokens_seen": 266202085, + "step": 12341, + "time_per_iteration": 2.6316914558410645 + }, + { + "auxiliary_loss_clip": 0.01132625, + "auxiliary_loss_mlp": 0.01101048, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.0005306, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.38283281709068, + "language_loss": 0.79984832, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82218504, + "num_input_tokens_seen": 266223445, + "step": 12342, + "time_per_iteration": 2.7285609245300293 + }, + { + "auxiliary_loss_clip": 0.01117912, + "auxiliary_loss_mlp": 0.01102944, + "balance_loss_clip": 1.00177538, + "balance_loss_mlp": 1.00042462, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.7082321344717455, + "language_loss": 0.77306741, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79527593, + "num_input_tokens_seen": 266246575, + "step": 12343, + "time_per_iteration": 3.037792921066284 + }, + { + "auxiliary_loss_clip": 0.01132836, + "auxiliary_loss_mlp": 0.01102793, + "balance_loss_clip": 1.00183332, + "balance_loss_mlp": 1.0004642, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 1.8995393602983004, + "language_loss": 0.67667913, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69903541, + "num_input_tokens_seen": 266266055, + "step": 12344, + "time_per_iteration": 4.09035062789917 + }, + { + "auxiliary_loss_clip": 0.01097666, + "auxiliary_loss_mlp": 0.0110368, + "balance_loss_clip": 1.00160182, + "balance_loss_mlp": 1.0004921, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.2313034089161086, + "language_loss": 0.81408948, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83610296, + "num_input_tokens_seen": 266282240, + "step": 12345, + "time_per_iteration": 2.8004777431488037 + }, + { + "auxiliary_loss_clip": 0.01119522, + "auxiliary_loss_mlp": 0.01102701, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00046682, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.9492610945816615, + "language_loss": 0.70621222, + "learning_rate": 6.570759861612988e-07, + "loss": 0.72843444, + "num_input_tokens_seen": 266300980, + "step": 12346, + "time_per_iteration": 4.101367950439453 + }, + { + "auxiliary_loss_clip": 0.01148027, + "auxiliary_loss_mlp": 0.01102498, + "balance_loss_clip": 1.00187671, + "balance_loss_mlp": 1.00054979, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.4848299582419826, + "language_loss": 0.73243624, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75494146, + "num_input_tokens_seen": 266322215, + "step": 12347, + "time_per_iteration": 2.6660714149475098 + }, + { + "auxiliary_loss_clip": 0.01133429, + "auxiliary_loss_mlp": 0.01103427, + "balance_loss_clip": 1.0018084, + "balance_loss_mlp": 1.00043058, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 2.9067133681837873, + "language_loss": 0.80990988, + "learning_rate": 6.564988754473642e-07, + "loss": 0.83227849, + "num_input_tokens_seen": 266341600, + "step": 12348, + "time_per_iteration": 2.5797219276428223 + }, + { + "auxiliary_loss_clip": 0.01164425, + "auxiliary_loss_mlp": 0.01102215, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.00055349, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.5767979110804724, + "language_loss": 0.72306585, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74573225, + "num_input_tokens_seen": 266362895, + "step": 12349, + "time_per_iteration": 2.649834156036377 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01103222, + "balance_loss_clip": 1.00176048, + "balance_loss_mlp": 1.00051093, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 31.539525413568647, + "language_loss": 0.79266578, + "learning_rate": 6.559219685162165e-07, + "loss": 0.81502998, + "num_input_tokens_seen": 266384015, + "step": 12350, + "time_per_iteration": 2.651555061340332 + }, + { + "auxiliary_loss_clip": 0.01100642, + "auxiliary_loss_mlp": 0.01102296, + "balance_loss_clip": 1.00176263, + "balance_loss_mlp": 1.0005393, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 2.106868443981883, + "language_loss": 0.75204444, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77407384, + "num_input_tokens_seen": 266405990, + "step": 12351, + "time_per_iteration": 2.7838268280029297 + }, + { + "auxiliary_loss_clip": 0.01086325, + "auxiliary_loss_mlp": 0.0110212, + "balance_loss_clip": 1.00166631, + "balance_loss_mlp": 1.00041032, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 1.9310542146894378, + "language_loss": 0.81668699, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83857143, + "num_input_tokens_seen": 266424260, + "step": 12352, + "time_per_iteration": 2.7095048427581787 + }, + { + "auxiliary_loss_clip": 0.01149093, + "auxiliary_loss_mlp": 0.01102141, + "balance_loss_clip": 1.00181413, + "balance_loss_mlp": 1.00057483, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 2.3809418758868084, + "language_loss": 0.71800721, + "learning_rate": 6.550569904036307e-07, + "loss": 0.74051952, + "num_input_tokens_seen": 266444580, + "step": 12353, + "time_per_iteration": 2.5846240520477295 + }, + { + "auxiliary_loss_clip": 0.0114996, + "auxiliary_loss_mlp": 0.01102936, + "balance_loss_clip": 1.00191927, + "balance_loss_mlp": 1.00051153, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 1.882404318397057, + "language_loss": 0.7216177, + "learning_rate": 6.547687663522739e-07, + "loss": 0.7441467, + "num_input_tokens_seen": 266465640, + "step": 12354, + "time_per_iteration": 2.609966278076172 + }, + { + "auxiliary_loss_clip": 0.01142508, + "auxiliary_loss_mlp": 0.01077151, + "balance_loss_clip": 1.00081265, + "balance_loss_mlp": 1.0000937, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.6930915267731538, + "language_loss": 0.59553194, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61772847, + "num_input_tokens_seen": 266531950, + "step": 12355, + "time_per_iteration": 3.2682549953460693 + }, + { + "auxiliary_loss_clip": 0.01164613, + "auxiliary_loss_mlp": 0.01101933, + "balance_loss_clip": 1.00190938, + "balance_loss_mlp": 1.00036681, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.778856208353488, + "language_loss": 0.67484027, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69750571, + "num_input_tokens_seen": 266550665, + "step": 12356, + "time_per_iteration": 3.9981770515441895 + }, + { + "auxiliary_loss_clip": 0.0114998, + "auxiliary_loss_mlp": 0.00747384, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00043344, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 1.6430714885526183, + "language_loss": 0.71994364, + "learning_rate": 6.539044003097301e-07, + "loss": 0.73891723, + "num_input_tokens_seen": 266572455, + "step": 12357, + "time_per_iteration": 2.8100664615631104 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.01101127, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00051475, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 1.7833667969561446, + "language_loss": 0.65262496, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67496848, + "num_input_tokens_seen": 266590895, + "step": 12358, + "time_per_iteration": 2.641937494277954 + }, + { + "auxiliary_loss_clip": 0.01100073, + "auxiliary_loss_mlp": 0.0110289, + "balance_loss_clip": 1.00165641, + "balance_loss_mlp": 1.00046575, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 1.831311263640214, + "language_loss": 0.80637079, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82840049, + "num_input_tokens_seen": 266607660, + "step": 12359, + "time_per_iteration": 2.7142438888549805 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01102257, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.0005002, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 2.731926940239826, + "language_loss": 0.68449807, + "learning_rate": 6.530404936638956e-07, + "loss": 0.70700449, + "num_input_tokens_seen": 266624260, + "step": 12360, + "time_per_iteration": 2.5840697288513184 + }, + { + "auxiliary_loss_clip": 0.01148126, + "auxiliary_loss_mlp": 0.00747331, + "balance_loss_clip": 1.00169861, + "balance_loss_mlp": 1.00044465, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.6232811124035214, + "language_loss": 0.7284202, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74737477, + "num_input_tokens_seen": 266644210, + "step": 12361, + "time_per_iteration": 2.6160643100738525 + }, + { + "auxiliary_loss_clip": 0.0110201, + "auxiliary_loss_mlp": 0.01102368, + "balance_loss_clip": 1.0017041, + "balance_loss_mlp": 1.00042057, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.989825201065731, + "language_loss": 0.56236279, + "learning_rate": 6.524648112660027e-07, + "loss": 0.5844065, + "num_input_tokens_seen": 266664230, + "step": 12362, + "time_per_iteration": 4.161105394363403 + }, + { + "auxiliary_loss_clip": 0.01118222, + "auxiliary_loss_mlp": 0.01101366, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00046742, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.8694513888387259, + "language_loss": 0.77311945, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79531527, + "num_input_tokens_seen": 266683270, + "step": 12363, + "time_per_iteration": 2.650986671447754 + }, + { + "auxiliary_loss_clip": 0.01133316, + "auxiliary_loss_mlp": 0.01101954, + "balance_loss_clip": 1.0017345, + "balance_loss_mlp": 1.00057805, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 2.2092243061437844, + "language_loss": 0.77663815, + "learning_rate": 6.518893332627862e-07, + "loss": 0.79899085, + "num_input_tokens_seen": 266701235, + "step": 12364, + "time_per_iteration": 2.60278058052063 + }, + { + "auxiliary_loss_clip": 0.0114788, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00058341, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.5565864335982826, + "language_loss": 0.78368497, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80618286, + "num_input_tokens_seen": 266721495, + "step": 12365, + "time_per_iteration": 2.69637131690979 + }, + { + "auxiliary_loss_clip": 0.0113311, + "auxiliary_loss_mlp": 0.01102383, + "balance_loss_clip": 1.00168848, + "balance_loss_mlp": 1.00043488, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.843137588627983, + "language_loss": 0.76544207, + "learning_rate": 6.513140597415346e-07, + "loss": 0.78779703, + "num_input_tokens_seen": 266747400, + "step": 12366, + "time_per_iteration": 2.871833324432373 + }, + { + "auxiliary_loss_clip": 0.01149711, + "auxiliary_loss_mlp": 0.01101556, + "balance_loss_clip": 1.0019176, + "balance_loss_mlp": 1.00046682, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.6977764057789917, + "language_loss": 0.7145983, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73711097, + "num_input_tokens_seen": 266767630, + "step": 12367, + "time_per_iteration": 2.590641736984253 + }, + { + "auxiliary_loss_clip": 0.01118652, + "auxiliary_loss_mlp": 0.01103296, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.00058544, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 2.0583777725785577, + "language_loss": 0.74627662, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76849616, + "num_input_tokens_seen": 266788015, + "step": 12368, + "time_per_iteration": 2.6480305194854736 + }, + { + "auxiliary_loss_clip": 0.01147689, + "auxiliary_loss_mlp": 0.01101763, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00048292, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 1.8063745880067767, + "language_loss": 0.69559407, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71808863, + "num_input_tokens_seen": 266809010, + "step": 12369, + "time_per_iteration": 2.726809501647949 + }, + { + "auxiliary_loss_clip": 0.01131475, + "auxiliary_loss_mlp": 0.00747275, + "balance_loss_clip": 1.00187957, + "balance_loss_mlp": 1.00036085, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 1.95578634754107, + "language_loss": 0.75502825, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77381569, + "num_input_tokens_seen": 266825390, + "step": 12370, + "time_per_iteration": 2.5991415977478027 + }, + { + "auxiliary_loss_clip": 0.01164662, + "auxiliary_loss_mlp": 0.01102049, + "balance_loss_clip": 1.00202, + "balance_loss_mlp": 1.00057864, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.5064962972841833, + "language_loss": 0.7824775, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80514467, + "num_input_tokens_seen": 266844675, + "step": 12371, + "time_per_iteration": 2.5356080532073975 + }, + { + "auxiliary_loss_clip": 0.01133146, + "auxiliary_loss_mlp": 0.01102343, + "balance_loss_clip": 1.00174773, + "balance_loss_mlp": 1.00039542, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.6163140229131474, + "language_loss": 0.69777042, + "learning_rate": 6.495894669419857e-07, + "loss": 0.72012532, + "num_input_tokens_seen": 266865160, + "step": 12372, + "time_per_iteration": 2.650749444961548 + }, + { + "auxiliary_loss_clip": 0.01133703, + "auxiliary_loss_mlp": 0.01102128, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00046635, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 2.088492480384656, + "language_loss": 0.75029051, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77264881, + "num_input_tokens_seen": 266883285, + "step": 12373, + "time_per_iteration": 2.5807735919952393 + }, + { + "auxiliary_loss_clip": 0.01102423, + "auxiliary_loss_mlp": 0.01103218, + "balance_loss_clip": 1.00169265, + "balance_loss_mlp": 1.00050735, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 3.919782845263118, + "language_loss": 0.77263099, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79468739, + "num_input_tokens_seen": 266900960, + "step": 12374, + "time_per_iteration": 2.672877788543701 + }, + { + "auxiliary_loss_clip": 0.01086195, + "auxiliary_loss_mlp": 0.01102319, + "balance_loss_clip": 1.00189245, + "balance_loss_mlp": 1.00046635, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.246670032962595, + "language_loss": 0.76714504, + "learning_rate": 6.487278616990774e-07, + "loss": 0.78903019, + "num_input_tokens_seen": 266917710, + "step": 12375, + "time_per_iteration": 2.6794989109039307 + }, + { + "auxiliary_loss_clip": 0.01148009, + "auxiliary_loss_mlp": 0.01101337, + "balance_loss_clip": 1.00182521, + "balance_loss_mlp": 1.00053334, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.8256951226575964, + "language_loss": 0.77406287, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79655629, + "num_input_tokens_seen": 266934220, + "step": 12376, + "time_per_iteration": 2.540295362472534 + }, + { + "auxiliary_loss_clip": 0.01118176, + "auxiliary_loss_mlp": 0.01102117, + "balance_loss_clip": 1.00179136, + "balance_loss_mlp": 1.00036025, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.7052737546200272, + "language_loss": 0.79274857, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81495148, + "num_input_tokens_seen": 266955210, + "step": 12377, + "time_per_iteration": 2.7364423274993896 + }, + { + "auxiliary_loss_clip": 0.01148282, + "auxiliary_loss_mlp": 0.01102396, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00044799, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 2.2169012570069575, + "language_loss": 0.67574906, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69825578, + "num_input_tokens_seen": 266976555, + "step": 12378, + "time_per_iteration": 2.9373202323913574 + }, + { + "auxiliary_loss_clip": 0.01116345, + "auxiliary_loss_mlp": 0.01102975, + "balance_loss_clip": 1.00166321, + "balance_loss_mlp": 1.00055075, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 2.0629480179517548, + "language_loss": 0.7182641, + "learning_rate": 6.475797721245648e-07, + "loss": 0.7404573, + "num_input_tokens_seen": 266997640, + "step": 12379, + "time_per_iteration": 2.7159814834594727 + }, + { + "auxiliary_loss_clip": 0.01116135, + "auxiliary_loss_mlp": 0.00747365, + "balance_loss_clip": 1.00174809, + "balance_loss_mlp": 1.00047588, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 2.175711305511339, + "language_loss": 0.65455484, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67318982, + "num_input_tokens_seen": 267016165, + "step": 12380, + "time_per_iteration": 2.6591808795928955 + }, + { + "auxiliary_loss_clip": 0.01148029, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00058043, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 2.0505662418087724, + "language_loss": 0.78687143, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80937797, + "num_input_tokens_seen": 267034075, + "step": 12381, + "time_per_iteration": 3.9579954147338867 + }, + { + "auxiliary_loss_clip": 0.01116265, + "auxiliary_loss_mlp": 0.01103418, + "balance_loss_clip": 1.00175929, + "balance_loss_mlp": 1.00061202, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 2.234105786508144, + "language_loss": 0.7291671, + "learning_rate": 6.467192433866411e-07, + "loss": 0.75136399, + "num_input_tokens_seen": 267053645, + "step": 12382, + "time_per_iteration": 2.6444711685180664 + }, + { + "auxiliary_loss_clip": 0.01110507, + "auxiliary_loss_mlp": 0.01077085, + "balance_loss_clip": 1.00058711, + "balance_loss_mlp": 1.00002849, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.656888056002975, + "language_loss": 0.54668081, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56855679, + "num_input_tokens_seen": 267121830, + "step": 12383, + "time_per_iteration": 3.415623426437378 + }, + { + "auxiliary_loss_clip": 0.01133428, + "auxiliary_loss_mlp": 0.01102604, + "balance_loss_clip": 1.00168443, + "balance_loss_mlp": 1.00056064, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 2.197839545123562, + "language_loss": 0.75186491, + "learning_rate": 6.461458141259395e-07, + "loss": 0.77422523, + "num_input_tokens_seen": 267141145, + "step": 12384, + "time_per_iteration": 4.060142755508423 + }, + { + "auxiliary_loss_clip": 0.01148302, + "auxiliary_loss_mlp": 0.01101677, + "balance_loss_clip": 1.00175774, + "balance_loss_mlp": 1.00039744, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 1.9948314057516474, + "language_loss": 0.79440117, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81690097, + "num_input_tokens_seen": 267159280, + "step": 12385, + "time_per_iteration": 2.579008102416992 + }, + { + "auxiliary_loss_clip": 0.01114745, + "auxiliary_loss_mlp": 0.01104049, + "balance_loss_clip": 1.00177073, + "balance_loss_mlp": 1.00057578, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 6.286413383454434, + "language_loss": 0.81442738, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83661532, + "num_input_tokens_seen": 267179390, + "step": 12386, + "time_per_iteration": 2.641904830932617 + }, + { + "auxiliary_loss_clip": 0.01149875, + "auxiliary_loss_mlp": 0.01101985, + "balance_loss_clip": 1.00209498, + "balance_loss_mlp": 1.00051439, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.8198370846860774, + "language_loss": 0.70948005, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73199868, + "num_input_tokens_seen": 267198165, + "step": 12387, + "time_per_iteration": 2.6028006076812744 + }, + { + "auxiliary_loss_clip": 0.01114963, + "auxiliary_loss_mlp": 0.01102497, + "balance_loss_clip": 1.00165558, + "balance_loss_mlp": 1.00045407, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 2.02154508747507, + "language_loss": 0.70196027, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72413492, + "num_input_tokens_seen": 267214520, + "step": 12388, + "time_per_iteration": 2.588212490081787 + }, + { + "auxiliary_loss_clip": 0.01149974, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_clip": 1.00199103, + "balance_loss_mlp": 1.0004009, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.5031620300890245, + "language_loss": 0.85070139, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87322462, + "num_input_tokens_seen": 267236555, + "step": 12389, + "time_per_iteration": 2.599355936050415 + }, + { + "auxiliary_loss_clip": 0.01101475, + "auxiliary_loss_mlp": 0.01102135, + "balance_loss_clip": 1.00159812, + "balance_loss_mlp": 1.00056863, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 1.803858071738217, + "language_loss": 0.79007697, + "learning_rate": 6.444267588104526e-07, + "loss": 0.81211305, + "num_input_tokens_seen": 267254800, + "step": 12390, + "time_per_iteration": 2.687051773071289 + }, + { + "auxiliary_loss_clip": 0.01133438, + "auxiliary_loss_mlp": 0.01103089, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00047374, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.8071154020546802, + "language_loss": 0.84538269, + "learning_rate": 6.441404294400014e-07, + "loss": 0.8677479, + "num_input_tokens_seen": 267274610, + "step": 12391, + "time_per_iteration": 2.5945537090301514 + }, + { + "auxiliary_loss_clip": 0.01164659, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_clip": 1.00194526, + "balance_loss_mlp": 1.00052738, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 3.158488060243014, + "language_loss": 0.73296106, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75562572, + "num_input_tokens_seen": 267292600, + "step": 12392, + "time_per_iteration": 2.508485794067383 + }, + { + "auxiliary_loss_clip": 0.01147734, + "auxiliary_loss_mlp": 0.0110158, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00058603, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.7333151891579233, + "language_loss": 0.76399529, + "learning_rate": 6.435679249529487e-07, + "loss": 0.78648841, + "num_input_tokens_seen": 267311295, + "step": 12393, + "time_per_iteration": 2.546613931655884 + }, + { + "auxiliary_loss_clip": 0.01147878, + "auxiliary_loss_mlp": 0.01103436, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00062966, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 1.9684690666450102, + "language_loss": 0.72600895, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74852204, + "num_input_tokens_seen": 267328390, + "step": 12394, + "time_per_iteration": 3.9897689819335938 + }, + { + "auxiliary_loss_clip": 0.01082989, + "auxiliary_loss_mlp": 0.00747094, + "balance_loss_clip": 1.00158632, + "balance_loss_mlp": 1.00036192, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.955600975487772, + "language_loss": 0.81398237, + "learning_rate": 6.429956262100535e-07, + "loss": 0.8322832, + "num_input_tokens_seen": 267348185, + "step": 12395, + "time_per_iteration": 2.7127559185028076 + }, + { + "auxiliary_loss_clip": 0.01148196, + "auxiliary_loss_mlp": 0.01102783, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00045419, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 2.064463382225863, + "language_loss": 0.71537745, + "learning_rate": 6.427095540197937e-07, + "loss": 0.7378872, + "num_input_tokens_seen": 267367010, + "step": 12396, + "time_per_iteration": 2.566871166229248 + }, + { + "auxiliary_loss_clip": 0.01099761, + "auxiliary_loss_mlp": 0.01103222, + "balance_loss_clip": 1.00167167, + "balance_loss_mlp": 1.00041628, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 2.001740307141748, + "language_loss": 0.67948931, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70151913, + "num_input_tokens_seen": 267386605, + "step": 12397, + "time_per_iteration": 2.7074971199035645 + }, + { + "auxiliary_loss_clip": 0.01164439, + "auxiliary_loss_mlp": 0.01102188, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.0006218, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 2.0674428573974426, + "language_loss": 0.76768613, + "learning_rate": 6.421375640558908e-07, + "loss": 0.79035246, + "num_input_tokens_seen": 267404135, + "step": 12398, + "time_per_iteration": 2.53950572013855 + }, + { + "auxiliary_loss_clip": 0.0114784, + "auxiliary_loss_mlp": 0.01101881, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00050533, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.9031906082676266, + "language_loss": 0.78033602, + "learning_rate": 6.418516463039363e-07, + "loss": 0.8028332, + "num_input_tokens_seen": 267423120, + "step": 12399, + "time_per_iteration": 3.960020065307617 + }, + { + "auxiliary_loss_clip": 0.01135142, + "auxiliary_loss_mlp": 0.01102186, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00052392, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 2.1344412963547414, + "language_loss": 0.74141192, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76378512, + "num_input_tokens_seen": 267441250, + "step": 12400, + "time_per_iteration": 2.5624444484710693 + }, + { + "auxiliary_loss_clip": 0.01147694, + "auxiliary_loss_mlp": 0.0110273, + "balance_loss_clip": 1.00180721, + "balance_loss_mlp": 1.00049627, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.7944944603477846, + "language_loss": 0.82258976, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84509403, + "num_input_tokens_seen": 267462820, + "step": 12401, + "time_per_iteration": 2.632120370864868 + }, + { + "auxiliary_loss_clip": 0.01114415, + "auxiliary_loss_mlp": 0.0110177, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00049019, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 1.9834088244274732, + "language_loss": 0.64433742, + "learning_rate": 6.409942020981611e-07, + "loss": 0.66649926, + "num_input_tokens_seen": 267483065, + "step": 12402, + "time_per_iteration": 2.6509995460510254 + }, + { + "auxiliary_loss_clip": 0.01117706, + "auxiliary_loss_mlp": 0.01101308, + "balance_loss_clip": 1.00165391, + "balance_loss_mlp": 1.00050485, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.5049514616576296, + "language_loss": 0.72951567, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75170577, + "num_input_tokens_seen": 267504825, + "step": 12403, + "time_per_iteration": 2.760305881500244 + }, + { + "auxiliary_loss_clip": 0.01108264, + "auxiliary_loss_mlp": 0.0107706, + "balance_loss_clip": 1.00068343, + "balance_loss_mlp": 1.00000274, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8296343460367139, + "language_loss": 0.58809203, + "learning_rate": 6.404228302777621e-07, + "loss": 0.6099453, + "num_input_tokens_seen": 267559260, + "step": 12404, + "time_per_iteration": 3.0205280780792236 + }, + { + "auxiliary_loss_clip": 0.0116457, + "auxiliary_loss_mlp": 0.01101811, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00053132, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 2.306906254536771, + "language_loss": 0.77603543, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79869926, + "num_input_tokens_seen": 267578720, + "step": 12405, + "time_per_iteration": 2.54228138923645 + }, + { + "auxiliary_loss_clip": 0.01135009, + "auxiliary_loss_mlp": 0.01102001, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00062549, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.6823232615079697, + "language_loss": 0.69195509, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71432519, + "num_input_tokens_seen": 267598250, + "step": 12406, + "time_per_iteration": 2.5875403881073 + }, + { + "auxiliary_loss_clip": 0.01070433, + "auxiliary_loss_mlp": 0.01103498, + "balance_loss_clip": 1.00162256, + "balance_loss_mlp": 1.00069237, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 2.2864180163943724, + "language_loss": 0.64754045, + "learning_rate": 6.39566159239002e-07, + "loss": 0.66927981, + "num_input_tokens_seen": 267615430, + "step": 12407, + "time_per_iteration": 2.7968435287475586 + }, + { + "auxiliary_loss_clip": 0.0111715, + "auxiliary_loss_mlp": 0.0110339, + "balance_loss_clip": 1.00177658, + "balance_loss_mlp": 1.000489, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.7091007849344766, + "language_loss": 0.72283554, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74504089, + "num_input_tokens_seen": 267635075, + "step": 12408, + "time_per_iteration": 2.8984382152557373 + }, + { + "auxiliary_loss_clip": 0.01149628, + "auxiliary_loss_mlp": 0.01102563, + "balance_loss_clip": 1.00190711, + "balance_loss_mlp": 1.00051951, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 2.2707194889007742, + "language_loss": 0.72865701, + "learning_rate": 6.38995303134053e-07, + "loss": 0.75117898, + "num_input_tokens_seen": 267654105, + "step": 12409, + "time_per_iteration": 2.636267900466919 + }, + { + "auxiliary_loss_clip": 0.01149851, + "auxiliary_loss_mlp": 0.01101128, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.0005157, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.6221140151086861, + "language_loss": 0.66093671, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68344653, + "num_input_tokens_seen": 267673090, + "step": 12410, + "time_per_iteration": 2.554870367050171 + }, + { + "auxiliary_loss_clip": 0.0114807, + "auxiliary_loss_mlp": 0.00747248, + "balance_loss_clip": 1.00181043, + "balance_loss_mlp": 1.00035453, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 1.9536597412313181, + "language_loss": 0.84772944, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86668265, + "num_input_tokens_seen": 267690605, + "step": 12411, + "time_per_iteration": 2.5501554012298584 + }, + { + "auxiliary_loss_clip": 0.01116972, + "auxiliary_loss_mlp": 0.01101795, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.00041914, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 4.231011788374101, + "language_loss": 0.7782234, + "learning_rate": 6.381394060744339e-07, + "loss": 0.80041105, + "num_input_tokens_seen": 267710540, + "step": 12412, + "time_per_iteration": 2.676722288131714 + }, + { + "auxiliary_loss_clip": 0.01118968, + "auxiliary_loss_mlp": 0.01102301, + "balance_loss_clip": 1.00194812, + "balance_loss_mlp": 1.00054383, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.9087287656692424, + "language_loss": 0.62392402, + "learning_rate": 6.378542103239188e-07, + "loss": 0.6461367, + "num_input_tokens_seen": 267730780, + "step": 12413, + "time_per_iteration": 2.727484703063965 + }, + { + "auxiliary_loss_clip": 0.01144181, + "auxiliary_loss_mlp": 0.00745239, + "balance_loss_clip": 1.0007838, + "balance_loss_mlp": 1.00001585, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7156537681765994, + "language_loss": 0.54901749, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56791168, + "num_input_tokens_seen": 267794240, + "step": 12414, + "time_per_iteration": 3.1698901653289795 + }, + { + "auxiliary_loss_clip": 0.01133305, + "auxiliary_loss_mlp": 0.01103025, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00040996, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 2.1775960470517224, + "language_loss": 0.54528069, + "learning_rate": 6.372839737918154e-07, + "loss": 0.567644, + "num_input_tokens_seen": 267817190, + "step": 12415, + "time_per_iteration": 2.7086431980133057 + }, + { + "auxiliary_loss_clip": 0.01086379, + "auxiliary_loss_mlp": 0.0110277, + "balance_loss_clip": 1.00180662, + "balance_loss_mlp": 1.00048828, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 2.133325196346278, + "language_loss": 0.74930483, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77119637, + "num_input_tokens_seen": 267836245, + "step": 12416, + "time_per_iteration": 2.7451412677764893 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01102387, + "balance_loss_clip": 1.00171471, + "balance_loss_mlp": 1.0005343, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.6105819665201762, + "language_loss": 0.69463122, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71667278, + "num_input_tokens_seen": 267858310, + "step": 12417, + "time_per_iteration": 2.889871120452881 + }, + { + "auxiliary_loss_clip": 0.01116308, + "auxiliary_loss_mlp": 0.01102833, + "balance_loss_clip": 1.00180471, + "balance_loss_mlp": 1.00050378, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 1.7243758551878885, + "language_loss": 0.73883891, + "learning_rate": 6.364290065781392e-07, + "loss": 0.76103032, + "num_input_tokens_seen": 267876345, + "step": 12418, + "time_per_iteration": 4.015305757522583 + }, + { + "auxiliary_loss_clip": 0.0114786, + "auxiliary_loss_mlp": 0.01102737, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00050282, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.67800027355012, + "language_loss": 0.69224608, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71475208, + "num_input_tokens_seen": 267896740, + "step": 12419, + "time_per_iteration": 2.563992738723755 + }, + { + "auxiliary_loss_clip": 0.01164397, + "auxiliary_loss_mlp": 0.01101734, + "balance_loss_clip": 1.00192809, + "balance_loss_mlp": 1.00054884, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.7059563308130834, + "language_loss": 0.74532545, + "learning_rate": 6.358592869514216e-07, + "loss": 0.76798677, + "num_input_tokens_seen": 267914765, + "step": 12420, + "time_per_iteration": 2.5543251037597656 + }, + { + "auxiliary_loss_clip": 0.01149955, + "auxiliary_loss_mlp": 0.0110297, + "balance_loss_clip": 1.00195384, + "balance_loss_mlp": 1.00035501, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 2.585567701391798, + "language_loss": 0.6721791, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69470835, + "num_input_tokens_seen": 267934085, + "step": 12421, + "time_per_iteration": 2.554813861846924 + }, + { + "auxiliary_loss_clip": 0.01131497, + "auxiliary_loss_mlp": 0.01103777, + "balance_loss_clip": 1.00168872, + "balance_loss_mlp": 1.00049448, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.731327916587546, + "language_loss": 0.7273339, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74968666, + "num_input_tokens_seen": 267955170, + "step": 12422, + "time_per_iteration": 4.112053632736206 + }, + { + "auxiliary_loss_clip": 0.01116137, + "auxiliary_loss_mlp": 0.01102995, + "balance_loss_clip": 1.00180709, + "balance_loss_mlp": 1.00057065, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 2.0536763074399587, + "language_loss": 0.74864084, + "learning_rate": 6.350050955009796e-07, + "loss": 0.77083218, + "num_input_tokens_seen": 267974980, + "step": 12423, + "time_per_iteration": 2.6913888454437256 + }, + { + "auxiliary_loss_clip": 0.01147847, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00037897, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.3097920417148743, + "language_loss": 0.67864788, + "learning_rate": 6.347204685245929e-07, + "loss": 0.70114106, + "num_input_tokens_seen": 267994985, + "step": 12424, + "time_per_iteration": 2.58150053024292 + }, + { + "auxiliary_loss_clip": 0.01150093, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_clip": 1.00202322, + "balance_loss_mlp": 1.00043464, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 2.7176766181885226, + "language_loss": 0.74585915, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76838583, + "num_input_tokens_seen": 268014985, + "step": 12425, + "time_per_iteration": 2.6914806365966797 + }, + { + "auxiliary_loss_clip": 0.01116019, + "auxiliary_loss_mlp": 0.01101999, + "balance_loss_clip": 1.0016675, + "balance_loss_mlp": 1.00043273, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 1.9696616624986332, + "language_loss": 0.69130445, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71348464, + "num_input_tokens_seen": 268034395, + "step": 12426, + "time_per_iteration": 2.6479032039642334 + }, + { + "auxiliary_loss_clip": 0.0111369, + "auxiliary_loss_mlp": 0.01102167, + "balance_loss_clip": 1.00167096, + "balance_loss_mlp": 1.00060046, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.4861192643166987, + "language_loss": 0.65778339, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67994195, + "num_input_tokens_seen": 268054485, + "step": 12427, + "time_per_iteration": 2.608215093612671 + }, + { + "auxiliary_loss_clip": 0.01164641, + "auxiliary_loss_mlp": 0.01102363, + "balance_loss_clip": 1.00194788, + "balance_loss_mlp": 1.00051093, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.8614749739295817, + "language_loss": 0.74592972, + "learning_rate": 6.335824784423118e-07, + "loss": 0.76859969, + "num_input_tokens_seen": 268072250, + "step": 12428, + "time_per_iteration": 2.480921506881714 + }, + { + "auxiliary_loss_clip": 0.01149667, + "auxiliary_loss_mlp": 0.01103825, + "balance_loss_clip": 1.00191844, + "balance_loss_mlp": 1.00044644, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 1.78262126659586, + "language_loss": 0.580311, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60284591, + "num_input_tokens_seen": 268089840, + "step": 12429, + "time_per_iteration": 2.529219388961792 + }, + { + "auxiliary_loss_clip": 0.01148292, + "auxiliary_loss_mlp": 0.01103454, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00045753, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.8535712487023224, + "language_loss": 0.60622156, + "learning_rate": 6.330137942461595e-07, + "loss": 0.628739, + "num_input_tokens_seen": 268109360, + "step": 12430, + "time_per_iteration": 2.6222288608551025 + }, + { + "auxiliary_loss_clip": 0.01134629, + "auxiliary_loss_mlp": 0.01102318, + "balance_loss_clip": 1.00176263, + "balance_loss_mlp": 1.00065637, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 2.3722444151439337, + "language_loss": 0.75700611, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77937555, + "num_input_tokens_seen": 268131840, + "step": 12431, + "time_per_iteration": 2.650682210922241 + }, + { + "auxiliary_loss_clip": 0.0114775, + "auxiliary_loss_mlp": 0.0110301, + "balance_loss_clip": 1.00181293, + "balance_loss_mlp": 1.00039434, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 2.1107815487395225, + "language_loss": 0.75553429, + "learning_rate": 6.32445317395021e-07, + "loss": 0.7780419, + "num_input_tokens_seen": 268148300, + "step": 12432, + "time_per_iteration": 3.947416067123413 + }, + { + "auxiliary_loss_clip": 0.01131676, + "auxiliary_loss_mlp": 0.01104395, + "balance_loss_clip": 1.00181508, + "balance_loss_mlp": 1.00054038, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 2.0299167996964287, + "language_loss": 0.70234746, + "learning_rate": 6.321611567507787e-07, + "loss": 0.7247082, + "num_input_tokens_seen": 268166450, + "step": 12433, + "time_per_iteration": 2.5607805252075195 + }, + { + "auxiliary_loss_clip": 0.01099897, + "auxiliary_loss_mlp": 0.01102579, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00053596, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.6885210716462467, + "language_loss": 0.6724084, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69443315, + "num_input_tokens_seen": 268186165, + "step": 12434, + "time_per_iteration": 2.6885833740234375 + }, + { + "auxiliary_loss_clip": 0.01164277, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00057733, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.4524138386870298, + "language_loss": 0.7977457, + "learning_rate": 6.315929910788263e-07, + "loss": 0.82039845, + "num_input_tokens_seen": 268208145, + "step": 12435, + "time_per_iteration": 2.569735050201416 + }, + { + "auxiliary_loss_clip": 0.01117742, + "auxiliary_loss_mlp": 0.01102741, + "balance_loss_clip": 1.00166857, + "balance_loss_mlp": 1.00050735, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 3.1187189832703193, + "language_loss": 0.67562181, + "learning_rate": 6.313089860726604e-07, + "loss": 0.69782662, + "num_input_tokens_seen": 268228345, + "step": 12436, + "time_per_iteration": 2.7091727256774902 + }, + { + "auxiliary_loss_clip": 0.01120558, + "auxiliary_loss_mlp": 0.01103026, + "balance_loss_clip": 1.00168371, + "balance_loss_mlp": 1.00050581, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 1.562328541799245, + "language_loss": 0.70702881, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72926462, + "num_input_tokens_seen": 268250260, + "step": 12437, + "time_per_iteration": 4.138742446899414 + }, + { + "auxiliary_loss_clip": 0.01117556, + "auxiliary_loss_mlp": 0.01101306, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00040722, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 2.0706667996616552, + "language_loss": 0.67009062, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69227922, + "num_input_tokens_seen": 268268440, + "step": 12438, + "time_per_iteration": 2.6723251342773438 + }, + { + "auxiliary_loss_clip": 0.01133413, + "auxiliary_loss_mlp": 0.01101999, + "balance_loss_clip": 1.00180602, + "balance_loss_mlp": 1.00062323, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.8587074196702067, + "language_loss": 0.80790156, + "learning_rate": 6.304572825026344e-07, + "loss": 0.83025563, + "num_input_tokens_seen": 268285765, + "step": 12439, + "time_per_iteration": 2.5633928775787354 + }, + { + "auxiliary_loss_clip": 0.0111642, + "auxiliary_loss_mlp": 0.01101639, + "balance_loss_clip": 1.00169826, + "balance_loss_mlp": 1.00055003, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 2.046011232585584, + "language_loss": 0.70448959, + "learning_rate": 6.301734851646674e-07, + "loss": 0.72667015, + "num_input_tokens_seen": 268304015, + "step": 12440, + "time_per_iteration": 2.627243757247925 + }, + { + "auxiliary_loss_clip": 0.01132354, + "auxiliary_loss_mlp": 0.01101061, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00049615, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.803996990183884, + "language_loss": 0.73876095, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76109511, + "num_input_tokens_seen": 268323290, + "step": 12441, + "time_per_iteration": 2.598057508468628 + }, + { + "auxiliary_loss_clip": 0.01148455, + "auxiliary_loss_mlp": 0.00747371, + "balance_loss_clip": 1.00191891, + "balance_loss_mlp": 1.0004375, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.182560341658816, + "language_loss": 0.82687533, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84583366, + "num_input_tokens_seen": 268339490, + "step": 12442, + "time_per_iteration": 2.617398977279663 + }, + { + "auxiliary_loss_clip": 0.01084695, + "auxiliary_loss_mlp": 0.01103705, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00061309, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 1.8413075243098824, + "language_loss": 0.62298155, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64486551, + "num_input_tokens_seen": 268359865, + "step": 12443, + "time_per_iteration": 2.8077921867370605 + }, + { + "auxiliary_loss_clip": 0.01116768, + "auxiliary_loss_mlp": 0.01101738, + "balance_loss_clip": 1.00169694, + "balance_loss_mlp": 1.00036216, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 2.301717449944804, + "language_loss": 0.71694088, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73912591, + "num_input_tokens_seen": 268377065, + "step": 12444, + "time_per_iteration": 2.65134596824646 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01102236, + "balance_loss_clip": 1.00189424, + "balance_loss_mlp": 1.00047863, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.6363820503947704, + "language_loss": 0.68849546, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71053863, + "num_input_tokens_seen": 268396935, + "step": 12445, + "time_per_iteration": 2.7719218730926514 + }, + { + "auxiliary_loss_clip": 0.01147861, + "auxiliary_loss_mlp": 0.01101562, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00037682, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.7613891939605903, + "language_loss": 0.73983693, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76233119, + "num_input_tokens_seen": 268414460, + "step": 12446, + "time_per_iteration": 2.573824644088745 + }, + { + "auxiliary_loss_clip": 0.01132766, + "auxiliary_loss_mlp": 0.00747459, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00054526, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 2.2228247552198384, + "language_loss": 0.7346341, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75343639, + "num_input_tokens_seen": 268432225, + "step": 12447, + "time_per_iteration": 2.5675852298736572 + }, + { + "auxiliary_loss_clip": 0.01100676, + "auxiliary_loss_mlp": 0.01102794, + "balance_loss_clip": 1.00167787, + "balance_loss_mlp": 1.00046468, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.8974759079449335, + "language_loss": 0.72108734, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74312198, + "num_input_tokens_seen": 268449270, + "step": 12448, + "time_per_iteration": 2.707998275756836 + }, + { + "auxiliary_loss_clip": 0.0116458, + "auxiliary_loss_mlp": 0.01102133, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.00056696, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 1.79727530413929, + "language_loss": 0.73828745, + "learning_rate": 6.276216478918543e-07, + "loss": 0.76095456, + "num_input_tokens_seen": 268467250, + "step": 12449, + "time_per_iteration": 2.5311155319213867 + }, + { + "auxiliary_loss_clip": 0.01117611, + "auxiliary_loss_mlp": 0.01103228, + "balance_loss_clip": 1.0019213, + "balance_loss_mlp": 1.00051773, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 2.552933514855128, + "language_loss": 0.61159813, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63380647, + "num_input_tokens_seen": 268487270, + "step": 12450, + "time_per_iteration": 2.6588175296783447 + }, + { + "auxiliary_loss_clip": 0.01164313, + "auxiliary_loss_mlp": 0.01101683, + "balance_loss_clip": 1.00178051, + "balance_loss_mlp": 1.00049782, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 1.8375633036161316, + "language_loss": 0.70400953, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72666949, + "num_input_tokens_seen": 268508020, + "step": 12451, + "time_per_iteration": 2.5598514080047607 + }, + { + "auxiliary_loss_clip": 0.01149548, + "auxiliary_loss_mlp": 0.01103311, + "balance_loss_clip": 1.00182545, + "balance_loss_mlp": 1.00040913, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 2.596525989074832, + "language_loss": 0.80800891, + "learning_rate": 6.267719718136988e-07, + "loss": 0.8305375, + "num_input_tokens_seen": 268527375, + "step": 12452, + "time_per_iteration": 2.6248269081115723 + }, + { + "auxiliary_loss_clip": 0.01164976, + "auxiliary_loss_mlp": 0.011037, + "balance_loss_clip": 1.00214493, + "balance_loss_mlp": 1.0005126, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 1.993679347762075, + "language_loss": 0.7135154, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73620212, + "num_input_tokens_seen": 268544870, + "step": 12453, + "time_per_iteration": 2.525951623916626 + }, + { + "auxiliary_loss_clip": 0.01131453, + "auxiliary_loss_mlp": 0.01102958, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.00053382, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.5916594717577743, + "language_loss": 0.74002314, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76236725, + "num_input_tokens_seen": 268564580, + "step": 12454, + "time_per_iteration": 2.6224982738494873 + }, + { + "auxiliary_loss_clip": 0.01129197, + "auxiliary_loss_mlp": 0.01076336, + "balance_loss_clip": 1.00086021, + "balance_loss_mlp": 1.00004184, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7318638558460829, + "language_loss": 0.59358579, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61564112, + "num_input_tokens_seen": 268629550, + "step": 12455, + "time_per_iteration": 3.3109052181243896 + }, + { + "auxiliary_loss_clip": 0.01118607, + "auxiliary_loss_mlp": 0.01101819, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00034785, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 1.9692080393324634, + "language_loss": 0.79642141, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81862569, + "num_input_tokens_seen": 268646645, + "step": 12456, + "time_per_iteration": 4.068426132202148 + }, + { + "auxiliary_loss_clip": 0.01143771, + "auxiliary_loss_mlp": 0.01076655, + "balance_loss_clip": 1.00080669, + "balance_loss_mlp": 0.99998009, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.8429307894161129, + "language_loss": 0.61477315, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63697743, + "num_input_tokens_seen": 268702275, + "step": 12457, + "time_per_iteration": 3.0672550201416016 + }, + { + "auxiliary_loss_clip": 0.01134428, + "auxiliary_loss_mlp": 0.01102407, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00045943, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 1.9553689030737667, + "language_loss": 0.67182469, + "learning_rate": 6.250740259166711e-07, + "loss": 0.69419301, + "num_input_tokens_seen": 268716265, + "step": 12458, + "time_per_iteration": 2.6176230907440186 + }, + { + "auxiliary_loss_clip": 0.01100078, + "auxiliary_loss_mlp": 0.01102194, + "balance_loss_clip": 1.0015744, + "balance_loss_mlp": 1.00053239, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 3.379673112543979, + "language_loss": 0.79847908, + "learning_rate": 6.247912173519106e-07, + "loss": 0.8205018, + "num_input_tokens_seen": 268734330, + "step": 12459, + "time_per_iteration": 4.183409690856934 + }, + { + "auxiliary_loss_clip": 0.01118725, + "auxiliary_loss_mlp": 0.01101803, + "balance_loss_clip": 1.00186622, + "balance_loss_mlp": 1.00061822, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.5193236275539592, + "language_loss": 0.80211461, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82431996, + "num_input_tokens_seen": 268753500, + "step": 12460, + "time_per_iteration": 2.6478500366210938 + }, + { + "auxiliary_loss_clip": 0.0113346, + "auxiliary_loss_mlp": 0.01102416, + "balance_loss_clip": 1.0019331, + "balance_loss_mlp": 1.00056338, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.9631832022415263, + "language_loss": 0.86069906, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88305777, + "num_input_tokens_seen": 268772055, + "step": 12461, + "time_per_iteration": 2.653305768966675 + }, + { + "auxiliary_loss_clip": 0.01148126, + "auxiliary_loss_mlp": 0.01101705, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00051975, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 3.7918436201994763, + "language_loss": 0.69890249, + "learning_rate": 6.239431045888435e-07, + "loss": 0.72140086, + "num_input_tokens_seen": 268792265, + "step": 12462, + "time_per_iteration": 2.621769428253174 + }, + { + "auxiliary_loss_clip": 0.01164525, + "auxiliary_loss_mlp": 0.01102182, + "balance_loss_clip": 1.00196648, + "balance_loss_mlp": 1.00042462, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.3856860108687394, + "language_loss": 0.70022523, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72289234, + "num_input_tokens_seen": 268812735, + "step": 12463, + "time_per_iteration": 2.596756935119629 + }, + { + "auxiliary_loss_clip": 0.01116453, + "auxiliary_loss_mlp": 0.01102365, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.00051272, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.7473586668992158, + "language_loss": 0.77351272, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79570091, + "num_input_tokens_seen": 268833090, + "step": 12464, + "time_per_iteration": 2.761014938354492 + }, + { + "auxiliary_loss_clip": 0.0113112, + "auxiliary_loss_mlp": 0.0110152, + "balance_loss_clip": 1.00166035, + "balance_loss_mlp": 1.00043011, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.6804913580954082, + "language_loss": 0.78364551, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80597186, + "num_input_tokens_seen": 268851880, + "step": 12465, + "time_per_iteration": 2.669161796569824 + }, + { + "auxiliary_loss_clip": 0.01117035, + "auxiliary_loss_mlp": 0.01103474, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.00047696, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.61421258077512, + "language_loss": 0.74074483, + "learning_rate": 6.22813018144422e-07, + "loss": 0.76294994, + "num_input_tokens_seen": 268867910, + "step": 12466, + "time_per_iteration": 2.578735828399658 + }, + { + "auxiliary_loss_clip": 0.0115013, + "auxiliary_loss_mlp": 0.01102571, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00052834, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 2.1239649317220493, + "language_loss": 0.66395795, + "learning_rate": 6.22530627064209e-07, + "loss": 0.68648499, + "num_input_tokens_seen": 268887260, + "step": 12467, + "time_per_iteration": 2.54768967628479 + }, + { + "auxiliary_loss_clip": 0.01101003, + "auxiliary_loss_mlp": 0.0074724, + "balance_loss_clip": 1.00169373, + "balance_loss_mlp": 1.00037634, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 4.63230374294104, + "language_loss": 0.76629102, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78477347, + "num_input_tokens_seen": 268902520, + "step": 12468, + "time_per_iteration": 2.65258526802063 + }, + { + "auxiliary_loss_clip": 0.01115733, + "auxiliary_loss_mlp": 0.01101733, + "balance_loss_clip": 1.00154209, + "balance_loss_mlp": 1.0004524, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 2.125723730195766, + "language_loss": 0.69064695, + "learning_rate": 6.219660016158201e-07, + "loss": 0.7128216, + "num_input_tokens_seen": 268920970, + "step": 12469, + "time_per_iteration": 4.050732374191284 + }, + { + "auxiliary_loss_clip": 0.01132902, + "auxiliary_loss_mlp": 0.01102467, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00042367, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 1.8371739772820657, + "language_loss": 0.68871665, + "learning_rate": 6.216837672690543e-07, + "loss": 0.7110703, + "num_input_tokens_seen": 268936600, + "step": 12470, + "time_per_iteration": 2.622087240219116 + }, + { + "auxiliary_loss_clip": 0.01132703, + "auxiliary_loss_mlp": 0.01103233, + "balance_loss_clip": 1.00190008, + "balance_loss_mlp": 1.00052226, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 1.7691799298277486, + "language_loss": 0.74795818, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77031755, + "num_input_tokens_seen": 268956560, + "step": 12471, + "time_per_iteration": 2.6076486110687256 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01102761, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00043154, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 2.1556012611474085, + "language_loss": 0.77108037, + "learning_rate": 6.211194553838929e-07, + "loss": 0.79343724, + "num_input_tokens_seen": 268973945, + "step": 12472, + "time_per_iteration": 2.564937114715576 + }, + { + "auxiliary_loss_clip": 0.01147913, + "auxiliary_loss_mlp": 0.00747352, + "balance_loss_clip": 1.001917, + "balance_loss_mlp": 1.00043821, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.5872929323281166, + "language_loss": 0.84366626, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86261892, + "num_input_tokens_seen": 268993245, + "step": 12473, + "time_per_iteration": 2.5679285526275635 + }, + { + "auxiliary_loss_clip": 0.01118479, + "auxiliary_loss_mlp": 0.01103202, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00049114, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 2.0622668553865093, + "language_loss": 0.73683858, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75905538, + "num_input_tokens_seen": 269012125, + "step": 12474, + "time_per_iteration": 4.025373697280884 + }, + { + "auxiliary_loss_clip": 0.01133256, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_clip": 1.00178003, + "balance_loss_mlp": 1.00053406, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 2.1437119057614256, + "language_loss": 0.74551558, + "learning_rate": 6.202733797375492e-07, + "loss": 0.76788151, + "num_input_tokens_seen": 269030545, + "step": 12475, + "time_per_iteration": 2.6784861087799072 + }, + { + "auxiliary_loss_clip": 0.01150214, + "auxiliary_loss_mlp": 0.01104634, + "balance_loss_clip": 1.00189888, + "balance_loss_mlp": 1.00068331, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 2.1193836858586215, + "language_loss": 0.79943198, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82198042, + "num_input_tokens_seen": 269048180, + "step": 12476, + "time_per_iteration": 2.5923428535461426 + }, + { + "auxiliary_loss_clip": 0.01116863, + "auxiliary_loss_mlp": 0.01102134, + "balance_loss_clip": 1.00178874, + "balance_loss_mlp": 1.00047207, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.89185253006817, + "language_loss": 0.77478844, + "learning_rate": 6.19709590885688e-07, + "loss": 0.79697841, + "num_input_tokens_seen": 269068600, + "step": 12477, + "time_per_iteration": 2.668348789215088 + }, + { + "auxiliary_loss_clip": 0.01126974, + "auxiliary_loss_mlp": 0.01076623, + "balance_loss_clip": 1.0006175, + "balance_loss_mlp": 0.99994773, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.80557038092952, + "language_loss": 0.54464692, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56668288, + "num_input_tokens_seen": 269119045, + "step": 12478, + "time_per_iteration": 3.172255516052246 + }, + { + "auxiliary_loss_clip": 0.01133283, + "auxiliary_loss_mlp": 0.01102174, + "balance_loss_clip": 1.00174999, + "balance_loss_mlp": 1.00051284, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.5812230717980649, + "language_loss": 0.80272436, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82507896, + "num_input_tokens_seen": 269136755, + "step": 12479, + "time_per_iteration": 2.616851806640625 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_clip": 1.00188363, + "balance_loss_mlp": 1.0005784, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 5.93119093415431, + "language_loss": 0.62892699, + "learning_rate": 6.188643001902369e-07, + "loss": 0.65144575, + "num_input_tokens_seen": 269156120, + "step": 12480, + "time_per_iteration": 2.5913403034210205 + }, + { + "auxiliary_loss_clip": 0.01133009, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_clip": 1.00171065, + "balance_loss_mlp": 1.00069809, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.7872448168134478, + "language_loss": 0.77935529, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80170035, + "num_input_tokens_seen": 269175650, + "step": 12481, + "time_per_iteration": 2.658585786819458 + }, + { + "auxiliary_loss_clip": 0.01119086, + "auxiliary_loss_mlp": 0.0110231, + "balance_loss_clip": 1.00180638, + "balance_loss_mlp": 1.00045753, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 2.4654984248364427, + "language_loss": 0.71137881, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73359275, + "num_input_tokens_seen": 269197080, + "step": 12482, + "time_per_iteration": 2.7523210048675537 + }, + { + "auxiliary_loss_clip": 0.01164652, + "auxiliary_loss_mlp": 0.01102487, + "balance_loss_clip": 1.00192571, + "balance_loss_mlp": 1.00063503, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.824122158563631, + "language_loss": 0.7003597, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72303104, + "num_input_tokens_seen": 269218600, + "step": 12483, + "time_per_iteration": 2.573648452758789 + }, + { + "auxiliary_loss_clip": 0.01164476, + "auxiliary_loss_mlp": 0.01102099, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00043702, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 2.149038985480152, + "language_loss": 0.74176455, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76443028, + "num_input_tokens_seen": 269239245, + "step": 12484, + "time_per_iteration": 2.5513813495635986 + }, + { + "auxiliary_loss_clip": 0.01132061, + "auxiliary_loss_mlp": 0.01101901, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.00042987, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 2.166233042295337, + "language_loss": 0.84379917, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86613882, + "num_input_tokens_seen": 269258520, + "step": 12485, + "time_per_iteration": 2.605663299560547 + }, + { + "auxiliary_loss_clip": 0.01116227, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_clip": 1.00164676, + "balance_loss_mlp": 1.0004009, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.912419669376751, + "language_loss": 0.78298068, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80516642, + "num_input_tokens_seen": 269278320, + "step": 12486, + "time_per_iteration": 2.6683080196380615 + }, + { + "auxiliary_loss_clip": 0.01149962, + "auxiliary_loss_mlp": 0.01102727, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.00049269, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 2.114981174916502, + "language_loss": 0.72482908, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74735594, + "num_input_tokens_seen": 269298025, + "step": 12487, + "time_per_iteration": 2.5915725231170654 + }, + { + "auxiliary_loss_clip": 0.01133266, + "auxiliary_loss_mlp": 0.01102676, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00044227, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.713752677378302, + "language_loss": 0.66797042, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69032991, + "num_input_tokens_seen": 269316770, + "step": 12488, + "time_per_iteration": 2.6278579235076904 + }, + { + "auxiliary_loss_clip": 0.01083739, + "auxiliary_loss_mlp": 0.01102777, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.00063801, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 2.359221024504411, + "language_loss": 0.77337337, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79523849, + "num_input_tokens_seen": 269334755, + "step": 12489, + "time_per_iteration": 2.6850268840789795 + }, + { + "auxiliary_loss_clip": 0.01147762, + "auxiliary_loss_mlp": 0.01102534, + "balance_loss_clip": 1.00193262, + "balance_loss_mlp": 1.0004909, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 1.6106632941575962, + "language_loss": 0.75067687, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77317989, + "num_input_tokens_seen": 269353810, + "step": 12490, + "time_per_iteration": 2.6403443813323975 + }, + { + "auxiliary_loss_clip": 0.01164531, + "auxiliary_loss_mlp": 0.01102621, + "balance_loss_clip": 1.00204861, + "balance_loss_mlp": 1.00057757, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 3.398905220485415, + "language_loss": 0.78399539, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80666691, + "num_input_tokens_seen": 269372910, + "step": 12491, + "time_per_iteration": 2.524296522140503 + }, + { + "auxiliary_loss_clip": 0.01149846, + "auxiliary_loss_mlp": 0.01102565, + "balance_loss_clip": 1.00192583, + "balance_loss_mlp": 1.00042677, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 1.9755146243512745, + "language_loss": 0.76281095, + "learning_rate": 6.154878538430899e-07, + "loss": 0.78533506, + "num_input_tokens_seen": 269391545, + "step": 12492, + "time_per_iteration": 2.6280462741851807 + }, + { + "auxiliary_loss_clip": 0.01117575, + "auxiliary_loss_mlp": 0.0110181, + "balance_loss_clip": 1.00164461, + "balance_loss_mlp": 1.0004344, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 1.8137121909348037, + "language_loss": 0.71199811, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73419195, + "num_input_tokens_seen": 269408530, + "step": 12493, + "time_per_iteration": 2.600013494491577 + }, + { + "auxiliary_loss_clip": 0.01147384, + "auxiliary_loss_mlp": 0.00747259, + "balance_loss_clip": 1.00187981, + "balance_loss_mlp": 1.00036848, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.6621564172112162, + "language_loss": 0.8039546, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82290101, + "num_input_tokens_seen": 269425930, + "step": 12494, + "time_per_iteration": 3.9107506275177 + }, + { + "auxiliary_loss_clip": 0.0116466, + "auxiliary_loss_mlp": 0.01102526, + "balance_loss_clip": 1.00194049, + "balance_loss_mlp": 1.00038743, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 4.687208061818498, + "language_loss": 0.7850678, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80773962, + "num_input_tokens_seen": 269443945, + "step": 12495, + "time_per_iteration": 2.4940404891967773 + }, + { + "auxiliary_loss_clip": 0.0116449, + "auxiliary_loss_mlp": 0.00747233, + "balance_loss_clip": 1.00190532, + "balance_loss_mlp": 1.00033796, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 2.328583517105899, + "language_loss": 0.71144426, + "learning_rate": 6.143640508441898e-07, + "loss": 0.73056149, + "num_input_tokens_seen": 269463625, + "step": 12496, + "time_per_iteration": 2.5446479320526123 + }, + { + "auxiliary_loss_clip": 0.01101306, + "auxiliary_loss_mlp": 0.01101457, + "balance_loss_clip": 1.00169051, + "balance_loss_mlp": 1.0005579, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.8800477834729639, + "language_loss": 0.78210866, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80413628, + "num_input_tokens_seen": 269483415, + "step": 12497, + "time_per_iteration": 4.096110105514526 + }, + { + "auxiliary_loss_clip": 0.01147741, + "auxiliary_loss_mlp": 0.01101952, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00057685, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.5126627184589707, + "language_loss": 0.76544839, + "learning_rate": 6.13802464562855e-07, + "loss": 0.78794533, + "num_input_tokens_seen": 269504635, + "step": 12498, + "time_per_iteration": 2.621281862258911 + }, + { + "auxiliary_loss_clip": 0.01133188, + "auxiliary_loss_mlp": 0.01101544, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.0004549, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.9483209288911816, + "language_loss": 0.73712039, + "learning_rate": 6.135217502639878e-07, + "loss": 0.75946772, + "num_input_tokens_seen": 269523955, + "step": 12499, + "time_per_iteration": 2.593384027481079 + }, + { + "auxiliary_loss_clip": 0.01149308, + "auxiliary_loss_mlp": 0.01101896, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00042558, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.0348793652485218, + "language_loss": 0.79796946, + "learning_rate": 6.132410885405148e-07, + "loss": 0.82048148, + "num_input_tokens_seen": 269544410, + "step": 12500, + "time_per_iteration": 2.614014148712158 + }, + { + "auxiliary_loss_clip": 0.0114946, + "auxiliary_loss_mlp": 0.01104606, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00046504, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 1.9049221444526838, + "language_loss": 0.73403513, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75657582, + "num_input_tokens_seen": 269563315, + "step": 12501, + "time_per_iteration": 2.544682025909424 + }, + { + "auxiliary_loss_clip": 0.01132587, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_clip": 1.00171685, + "balance_loss_mlp": 1.00038433, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.707678302886986, + "language_loss": 0.7810868, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80342257, + "num_input_tokens_seen": 269583950, + "step": 12502, + "time_per_iteration": 2.6393744945526123 + }, + { + "auxiliary_loss_clip": 0.01131449, + "auxiliary_loss_mlp": 0.01102447, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00064254, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.296292080893317, + "language_loss": 0.7073704, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72970939, + "num_input_tokens_seen": 269600120, + "step": 12503, + "time_per_iteration": 2.64805006980896 + }, + { + "auxiliary_loss_clip": 0.01158651, + "auxiliary_loss_mlp": 0.01076675, + "balance_loss_clip": 1.00080156, + "balance_loss_mlp": 0.99999982, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9684621918561618, + "language_loss": 0.63915825, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66151154, + "num_input_tokens_seen": 269659815, + "step": 12504, + "time_per_iteration": 3.0269739627838135 + }, + { + "auxiliary_loss_clip": 0.01118073, + "auxiliary_loss_mlp": 0.01101501, + "balance_loss_clip": 1.0017997, + "balance_loss_mlp": 1.0005064, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 1.636537055871358, + "language_loss": 0.68815416, + "learning_rate": 6.118385689264896e-07, + "loss": 0.71034992, + "num_input_tokens_seen": 269684565, + "step": 12505, + "time_per_iteration": 2.8009965419769287 + }, + { + "auxiliary_loss_clip": 0.01141719, + "auxiliary_loss_mlp": 0.00745388, + "balance_loss_clip": 1.00092423, + "balance_loss_mlp": 1.00015378, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6417840886567063, + "language_loss": 0.55099344, + "learning_rate": 6.11558222878809e-07, + "loss": 0.56986451, + "num_input_tokens_seen": 269752325, + "step": 12506, + "time_per_iteration": 3.243720531463623 + }, + { + "auxiliary_loss_clip": 0.01149842, + "auxiliary_loss_mlp": 0.01102993, + "balance_loss_clip": 1.00198734, + "balance_loss_mlp": 1.00056791, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 2.466029301300952, + "language_loss": 0.77899933, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80152774, + "num_input_tokens_seen": 269770630, + "step": 12507, + "time_per_iteration": 3.9477181434631348 + }, + { + "auxiliary_loss_clip": 0.01132878, + "auxiliary_loss_mlp": 0.01102225, + "balance_loss_clip": 1.00182676, + "balance_loss_mlp": 1.00056362, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 2.078956855743123, + "language_loss": 0.71289021, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73524129, + "num_input_tokens_seen": 269787280, + "step": 12508, + "time_per_iteration": 2.6732540130615234 + }, + { + "auxiliary_loss_clip": 0.01149918, + "auxiliary_loss_mlp": 0.01101954, + "balance_loss_clip": 1.0019629, + "balance_loss_mlp": 1.00048339, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 13.037936958279584, + "language_loss": 0.72216129, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74467999, + "num_input_tokens_seen": 269805205, + "step": 12509, + "time_per_iteration": 2.5337564945220947 + }, + { + "auxiliary_loss_clip": 0.01164655, + "auxiliary_loss_mlp": 0.01103713, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00062084, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.894936498885081, + "language_loss": 0.61928618, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64196992, + "num_input_tokens_seen": 269824820, + "step": 12510, + "time_per_iteration": 2.5674169063568115 + }, + { + "auxiliary_loss_clip": 0.01148044, + "auxiliary_loss_mlp": 0.01102227, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.00066054, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 2.217875622738785, + "language_loss": 0.81616175, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83866453, + "num_input_tokens_seen": 269842825, + "step": 12511, + "time_per_iteration": 2.556273937225342 + }, + { + "auxiliary_loss_clip": 0.01133184, + "auxiliary_loss_mlp": 0.01103176, + "balance_loss_clip": 1.00183082, + "balance_loss_mlp": 1.0005604, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.8026030666840396, + "language_loss": 0.75835037, + "learning_rate": 6.098772526115412e-07, + "loss": 0.78071398, + "num_input_tokens_seen": 269859000, + "step": 12512, + "time_per_iteration": 3.9422221183776855 + }, + { + "auxiliary_loss_clip": 0.01147991, + "auxiliary_loss_mlp": 0.01100658, + "balance_loss_clip": 1.00178277, + "balance_loss_mlp": 1.00047469, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.7959032828171135, + "language_loss": 0.82317442, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84566092, + "num_input_tokens_seen": 269878895, + "step": 12513, + "time_per_iteration": 2.592473030090332 + }, + { + "auxiliary_loss_clip": 0.01148215, + "auxiliary_loss_mlp": 0.01102665, + "balance_loss_clip": 1.00178409, + "balance_loss_mlp": 1.00052679, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 7.328528696498674, + "language_loss": 0.75151801, + "learning_rate": 6.093173507845771e-07, + "loss": 0.77402687, + "num_input_tokens_seen": 269897280, + "step": 12514, + "time_per_iteration": 2.5464468002319336 + }, + { + "auxiliary_loss_clip": 0.01148002, + "auxiliary_loss_mlp": 0.01101572, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00038671, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 2.745904377200479, + "language_loss": 0.69050825, + "learning_rate": 6.090374789680271e-07, + "loss": 0.71300399, + "num_input_tokens_seen": 269914640, + "step": 12515, + "time_per_iteration": 2.5622475147247314 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.01101918, + "balance_loss_clip": 1.00176179, + "balance_loss_mlp": 1.00063741, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 2.083835261756325, + "language_loss": 0.70500416, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72750163, + "num_input_tokens_seen": 269934960, + "step": 12516, + "time_per_iteration": 2.6815292835235596 + }, + { + "auxiliary_loss_clip": 0.0109949, + "auxiliary_loss_mlp": 0.01101008, + "balance_loss_clip": 1.00159812, + "balance_loss_mlp": 1.00029969, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.6866114621645958, + "language_loss": 0.89438093, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91638595, + "num_input_tokens_seen": 269956655, + "step": 12517, + "time_per_iteration": 2.7518134117126465 + }, + { + "auxiliary_loss_clip": 0.01131145, + "auxiliary_loss_mlp": 0.01103305, + "balance_loss_clip": 1.00179243, + "balance_loss_mlp": 1.00049889, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.524797323010005, + "language_loss": 0.74398202, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76632655, + "num_input_tokens_seen": 269976835, + "step": 12518, + "time_per_iteration": 2.6622707843780518 + }, + { + "auxiliary_loss_clip": 0.01099479, + "auxiliary_loss_mlp": 0.01077926, + "balance_loss_clip": 1.00168061, + "balance_loss_mlp": 1.0001061, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.7028174066591524, + "language_loss": 0.55658996, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57836401, + "num_input_tokens_seen": 270040630, + "step": 12519, + "time_per_iteration": 3.3452508449554443 + }, + { + "auxiliary_loss_clip": 0.01148203, + "auxiliary_loss_mlp": 0.01101766, + "balance_loss_clip": 1.00185013, + "balance_loss_mlp": 1.00048542, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.8798044437031387, + "language_loss": 0.78007591, + "learning_rate": 6.07638911279029e-07, + "loss": 0.80257559, + "num_input_tokens_seen": 270059695, + "step": 12520, + "time_per_iteration": 2.687786102294922 + }, + { + "auxiliary_loss_clip": 0.01150002, + "auxiliary_loss_mlp": 0.01101452, + "balance_loss_clip": 1.00182402, + "balance_loss_mlp": 1.00055385, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 2.4443464861417064, + "language_loss": 0.73694181, + "learning_rate": 6.07359356094229e-07, + "loss": 0.7594564, + "num_input_tokens_seen": 270078420, + "step": 12521, + "time_per_iteration": 2.63912034034729 + }, + { + "auxiliary_loss_clip": 0.01132723, + "auxiliary_loss_mlp": 0.01104214, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00054955, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 1.833935676081193, + "language_loss": 0.67476082, + "learning_rate": 6.070798537185016e-07, + "loss": 0.69713014, + "num_input_tokens_seen": 270097040, + "step": 12522, + "time_per_iteration": 2.721297025680542 + }, + { + "auxiliary_loss_clip": 0.01147961, + "auxiliary_loss_mlp": 0.01102787, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.0007441, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 1.580847864699783, + "language_loss": 0.78210014, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80460763, + "num_input_tokens_seen": 270116365, + "step": 12523, + "time_per_iteration": 2.5999596118927 + }, + { + "auxiliary_loss_clip": 0.01164559, + "auxiliary_loss_mlp": 0.0110164, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00045538, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 1.915894390363025, + "language_loss": 0.8084712, + "learning_rate": 6.065210074366571e-07, + "loss": 0.83113313, + "num_input_tokens_seen": 270135395, + "step": 12524, + "time_per_iteration": 2.567188262939453 + }, + { + "auxiliary_loss_clip": 0.01149706, + "auxiliary_loss_mlp": 0.0074719, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00044, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.7364287365876245, + "language_loss": 0.74294406, + "learning_rate": 6.062416635517326e-07, + "loss": 0.761913, + "num_input_tokens_seen": 270156425, + "step": 12525, + "time_per_iteration": 2.587639093399048 + }, + { + "auxiliary_loss_clip": 0.01115912, + "auxiliary_loss_mlp": 0.01101958, + "balance_loss_clip": 1.00182164, + "balance_loss_mlp": 1.0005821, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.8524272347003834, + "language_loss": 0.71886969, + "learning_rate": 6.059623725182641e-07, + "loss": 0.74104834, + "num_input_tokens_seen": 270176905, + "step": 12526, + "time_per_iteration": 2.733642339706421 + }, + { + "auxiliary_loss_clip": 0.01132806, + "auxiliary_loss_mlp": 0.01101365, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00037146, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 2.094163503182918, + "language_loss": 0.72661823, + "learning_rate": 6.056831343468414e-07, + "loss": 0.74895996, + "num_input_tokens_seen": 270196640, + "step": 12527, + "time_per_iteration": 2.640117883682251 + }, + { + "auxiliary_loss_clip": 0.01114642, + "auxiliary_loss_mlp": 0.01101584, + "balance_loss_clip": 1.00170934, + "balance_loss_mlp": 1.00039947, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.9804358316663948, + "language_loss": 0.80639505, + "learning_rate": 6.054039490480539e-07, + "loss": 0.82855737, + "num_input_tokens_seen": 270213905, + "step": 12528, + "time_per_iteration": 2.634030818939209 + }, + { + "auxiliary_loss_clip": 0.01086778, + "auxiliary_loss_mlp": 0.0110236, + "balance_loss_clip": 1.00166786, + "balance_loss_mlp": 1.00050735, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 2.0591370460566596, + "language_loss": 0.85585833, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87774968, + "num_input_tokens_seen": 270231995, + "step": 12529, + "time_per_iteration": 2.6848843097686768 + }, + { + "auxiliary_loss_clip": 0.01115934, + "auxiliary_loss_mlp": 0.01103246, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00053573, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 2.9112720787633166, + "language_loss": 0.73489922, + "learning_rate": 6.048457371107303e-07, + "loss": 0.7570911, + "num_input_tokens_seen": 270251480, + "step": 12530, + "time_per_iteration": 2.656662940979004 + }, + { + "auxiliary_loss_clip": 0.01096962, + "auxiliary_loss_mlp": 0.01078009, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.00018907, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8322277364062073, + "language_loss": 0.63694763, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65869737, + "num_input_tokens_seen": 270306480, + "step": 12531, + "time_per_iteration": 3.105501651763916 + }, + { + "auxiliary_loss_clip": 0.01131124, + "auxiliary_loss_mlp": 0.01103167, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00036073, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 2.0821316972931947, + "language_loss": 0.69774222, + "learning_rate": 6.042877367909633e-07, + "loss": 0.72008508, + "num_input_tokens_seen": 270324595, + "step": 12532, + "time_per_iteration": 3.943328380584717 + }, + { + "auxiliary_loss_clip": 0.01134163, + "auxiliary_loss_mlp": 0.01100964, + "balance_loss_clip": 1.00193822, + "balance_loss_mlp": 1.00044703, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.8798275557483257, + "language_loss": 0.77991742, + "learning_rate": 6.040088160141132e-07, + "loss": 0.80226868, + "num_input_tokens_seen": 270344375, + "step": 12533, + "time_per_iteration": 2.6094977855682373 + }, + { + "auxiliary_loss_clip": 0.01144214, + "auxiliary_loss_mlp": 0.01076596, + "balance_loss_clip": 1.00082123, + "balance_loss_mlp": 0.99992031, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.8689401400707192, + "language_loss": 0.57341146, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59561956, + "num_input_tokens_seen": 270405235, + "step": 12534, + "time_per_iteration": 3.1604228019714355 + }, + { + "auxiliary_loss_clip": 0.01133155, + "auxiliary_loss_mlp": 0.01101984, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.00032258, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.7463961842638578, + "language_loss": 0.71143192, + "learning_rate": 6.03451133279365e-07, + "loss": 0.7337833, + "num_input_tokens_seen": 270425820, + "step": 12535, + "time_per_iteration": 3.9983129501342773 + }, + { + "auxiliary_loss_clip": 0.01133099, + "auxiliary_loss_mlp": 0.01102671, + "balance_loss_clip": 1.00169849, + "balance_loss_mlp": 1.0004375, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.7511455876743327, + "language_loss": 0.80927724, + "learning_rate": 6.031723713426135e-07, + "loss": 0.831635, + "num_input_tokens_seen": 270447120, + "step": 12536, + "time_per_iteration": 2.6183555126190186 + }, + { + "auxiliary_loss_clip": 0.01134568, + "auxiliary_loss_mlp": 0.01101794, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.00051343, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 4.208018393127227, + "language_loss": 0.74476779, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76713139, + "num_input_tokens_seen": 270468680, + "step": 12537, + "time_per_iteration": 2.6586191654205322 + }, + { + "auxiliary_loss_clip": 0.01164586, + "auxiliary_loss_mlp": 0.01102433, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00058067, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.6432018416373764, + "language_loss": 0.73827744, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76094759, + "num_input_tokens_seen": 270486310, + "step": 12538, + "time_per_iteration": 2.5216095447540283 + }, + { + "auxiliary_loss_clip": 0.01114552, + "auxiliary_loss_mlp": 0.01102076, + "balance_loss_clip": 1.00177073, + "balance_loss_mlp": 1.00051022, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 2.0732625463378, + "language_loss": 0.67437208, + "learning_rate": 6.023364033816956e-07, + "loss": 0.69653839, + "num_input_tokens_seen": 270507210, + "step": 12539, + "time_per_iteration": 2.6658775806427 + }, + { + "auxiliary_loss_clip": 0.01164405, + "auxiliary_loss_mlp": 0.0110139, + "balance_loss_clip": 1.00185955, + "balance_loss_mlp": 1.00039542, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 2.353021305991569, + "language_loss": 0.74759692, + "learning_rate": 6.020578533797229e-07, + "loss": 0.77025485, + "num_input_tokens_seen": 270525250, + "step": 12540, + "time_per_iteration": 2.5691113471984863 + }, + { + "auxiliary_loss_clip": 0.01164644, + "auxiliary_loss_mlp": 0.01101809, + "balance_loss_clip": 1.00194407, + "balance_loss_mlp": 1.00043368, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 2.5442146241612114, + "language_loss": 0.72672963, + "learning_rate": 6.017793563878566e-07, + "loss": 0.74939418, + "num_input_tokens_seen": 270539295, + "step": 12541, + "time_per_iteration": 2.4952890872955322 + }, + { + "auxiliary_loss_clip": 0.01164515, + "auxiliary_loss_mlp": 0.01101431, + "balance_loss_clip": 1.00186992, + "balance_loss_mlp": 1.00043726, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 1.6193136199201084, + "language_loss": 0.7187742, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74143362, + "num_input_tokens_seen": 270562815, + "step": 12542, + "time_per_iteration": 2.739391803741455 + }, + { + "auxiliary_loss_clip": 0.01133026, + "auxiliary_loss_mlp": 0.01101768, + "balance_loss_clip": 1.00179386, + "balance_loss_mlp": 1.00039256, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 2.112084853820145, + "language_loss": 0.84461737, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86696535, + "num_input_tokens_seen": 270579055, + "step": 12543, + "time_per_iteration": 2.577909231185913 + }, + { + "auxiliary_loss_clip": 0.01113919, + "auxiliary_loss_mlp": 0.01102478, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00043511, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.3073958968217645, + "language_loss": 0.73498923, + "learning_rate": 6.009441835784927e-07, + "loss": 0.75715321, + "num_input_tokens_seen": 270599080, + "step": 12544, + "time_per_iteration": 2.6903889179229736 + }, + { + "auxiliary_loss_clip": 0.01147838, + "auxiliary_loss_mlp": 0.0110163, + "balance_loss_clip": 1.00177789, + "balance_loss_mlp": 1.00054049, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 2.1747247830310044, + "language_loss": 0.68375432, + "learning_rate": 6.006658987326383e-07, + "loss": 0.706249, + "num_input_tokens_seen": 270618715, + "step": 12545, + "time_per_iteration": 4.034446477890015 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01101757, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00047719, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 2.9360841298902725, + "language_loss": 0.68971902, + "learning_rate": 6.003876669496728e-07, + "loss": 0.71208739, + "num_input_tokens_seen": 270635695, + "step": 12546, + "time_per_iteration": 2.598893880844116 + }, + { + "auxiliary_loss_clip": 0.01147911, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_clip": 1.00182188, + "balance_loss_mlp": 1.00059116, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.9497548909540647, + "language_loss": 0.73406804, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75657731, + "num_input_tokens_seen": 270654325, + "step": 12547, + "time_per_iteration": 2.567474126815796 + }, + { + "auxiliary_loss_clip": 0.01164393, + "auxiliary_loss_mlp": 0.0110183, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00050211, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 2.0436617420591254, + "language_loss": 0.67747629, + "learning_rate": 5.998313626146099e-07, + "loss": 0.70013851, + "num_input_tokens_seen": 270674260, + "step": 12548, + "time_per_iteration": 2.530026912689209 + }, + { + "auxiliary_loss_clip": 0.01131733, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_clip": 1.0017544, + "balance_loss_mlp": 1.00044274, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.7863808671892851, + "language_loss": 0.87377298, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89611417, + "num_input_tokens_seen": 270692200, + "step": 12549, + "time_per_iteration": 4.003859281539917 + }, + { + "auxiliary_loss_clip": 0.01099295, + "auxiliary_loss_mlp": 0.01100591, + "balance_loss_clip": 1.00160813, + "balance_loss_mlp": 1.00045538, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.9794020394319936, + "language_loss": 0.77148318, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79348201, + "num_input_tokens_seen": 270709675, + "step": 12550, + "time_per_iteration": 2.706597328186035 + }, + { + "auxiliary_loss_clip": 0.01164554, + "auxiliary_loss_mlp": 0.0110209, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.0004282, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.512294589185539, + "language_loss": 0.69645858, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71912503, + "num_input_tokens_seen": 270733055, + "step": 12551, + "time_per_iteration": 2.5686118602752686 + }, + { + "auxiliary_loss_clip": 0.01133357, + "auxiliary_loss_mlp": 0.01102778, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00054431, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 3.1154208310555913, + "language_loss": 0.8617816, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88414294, + "num_input_tokens_seen": 270749275, + "step": 12552, + "time_per_iteration": 2.6746339797973633 + }, + { + "auxiliary_loss_clip": 0.01149476, + "auxiliary_loss_mlp": 0.01102297, + "balance_loss_clip": 1.00191927, + "balance_loss_mlp": 1.00044453, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 2.6038766954383616, + "language_loss": 0.78595841, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80847615, + "num_input_tokens_seen": 270768230, + "step": 12553, + "time_per_iteration": 2.5730173587799072 + }, + { + "auxiliary_loss_clip": 0.01147774, + "auxiliary_loss_mlp": 0.01103252, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00044572, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.3434803770364674, + "language_loss": 0.63017142, + "learning_rate": 5.981637242156135e-07, + "loss": 0.65268171, + "num_input_tokens_seen": 270786285, + "step": 12554, + "time_per_iteration": 2.6652495861053467 + }, + { + "auxiliary_loss_clip": 0.01134393, + "auxiliary_loss_mlp": 0.01101783, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00050318, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.95914514230266, + "language_loss": 0.73437899, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75674075, + "num_input_tokens_seen": 270805505, + "step": 12555, + "time_per_iteration": 2.6511776447296143 + }, + { + "auxiliary_loss_clip": 0.01131694, + "auxiliary_loss_mlp": 0.01103105, + "balance_loss_clip": 1.00186479, + "balance_loss_mlp": 1.00039387, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 5.123747740351394, + "language_loss": 0.78485441, + "learning_rate": 5.976082698990645e-07, + "loss": 0.8072024, + "num_input_tokens_seen": 270824610, + "step": 12556, + "time_per_iteration": 2.635931968688965 + }, + { + "auxiliary_loss_clip": 0.01141968, + "auxiliary_loss_mlp": 0.01076995, + "balance_loss_clip": 1.00070965, + "balance_loss_mlp": 0.99993861, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.74700186682754, + "language_loss": 0.50429249, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52648211, + "num_input_tokens_seen": 270886155, + "step": 12557, + "time_per_iteration": 3.117635726928711 + }, + { + "auxiliary_loss_clip": 0.01148101, + "auxiliary_loss_mlp": 0.01102798, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00046873, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 3.177868622901092, + "language_loss": 0.71244764, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73495662, + "num_input_tokens_seen": 270905325, + "step": 12558, + "time_per_iteration": 2.6831467151641846 + }, + { + "auxiliary_loss_clip": 0.01135044, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00043917, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.9395364238866255, + "language_loss": 0.80244631, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82481778, + "num_input_tokens_seen": 270927535, + "step": 12559, + "time_per_iteration": 2.7273833751678467 + }, + { + "auxiliary_loss_clip": 0.0109832, + "auxiliary_loss_mlp": 0.01103303, + "balance_loss_clip": 1.00168145, + "balance_loss_mlp": 1.00040138, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.6444162201400403, + "language_loss": 0.78589308, + "learning_rate": 5.96497999496199e-07, + "loss": 0.80790931, + "num_input_tokens_seen": 270946920, + "step": 12560, + "time_per_iteration": 2.780756711959839 + }, + { + "auxiliary_loss_clip": 0.01100985, + "auxiliary_loss_mlp": 0.01102186, + "balance_loss_clip": 1.00167084, + "balance_loss_mlp": 1.00052428, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 1.6494039091553412, + "language_loss": 0.70755833, + "learning_rate": 5.96220564921515e-07, + "loss": 0.72959006, + "num_input_tokens_seen": 270965705, + "step": 12561, + "time_per_iteration": 2.6758439540863037 + }, + { + "auxiliary_loss_clip": 0.01132989, + "auxiliary_loss_mlp": 0.00747294, + "balance_loss_clip": 1.00173247, + "balance_loss_mlp": 1.00050569, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.9736207274068773, + "language_loss": 0.75929409, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77809691, + "num_input_tokens_seen": 270986550, + "step": 12562, + "time_per_iteration": 2.685452699661255 + }, + { + "auxiliary_loss_clip": 0.01132813, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_clip": 1.00172818, + "balance_loss_mlp": 1.00054371, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 1.8694333110735784, + "language_loss": 0.75636542, + "learning_rate": 5.956658554770371e-07, + "loss": 0.7787199, + "num_input_tokens_seen": 271006250, + "step": 12563, + "time_per_iteration": 2.6503024101257324 + }, + { + "auxiliary_loss_clip": 0.0112107, + "auxiliary_loss_mlp": 0.01105319, + "balance_loss_clip": 1.00192714, + "balance_loss_mlp": 1.00051045, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.4174874375215767, + "language_loss": 0.67028105, + "learning_rate": 5.953885806282768e-07, + "loss": 0.692545, + "num_input_tokens_seen": 271025575, + "step": 12564, + "time_per_iteration": 2.783787965774536 + }, + { + "auxiliary_loss_clip": 0.01133457, + "auxiliary_loss_mlp": 0.01102641, + "balance_loss_clip": 1.00187743, + "balance_loss_mlp": 1.00050259, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 2.2471683323623624, + "language_loss": 0.68432975, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70669073, + "num_input_tokens_seen": 271045805, + "step": 12565, + "time_per_iteration": 2.6140801906585693 + }, + { + "auxiliary_loss_clip": 0.0113122, + "auxiliary_loss_mlp": 0.01104297, + "balance_loss_clip": 1.00166488, + "balance_loss_mlp": 1.00044155, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 3.06424657539262, + "language_loss": 0.75276351, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77511871, + "num_input_tokens_seen": 271066065, + "step": 12566, + "time_per_iteration": 2.7043306827545166 + }, + { + "auxiliary_loss_clip": 0.0115005, + "auxiliary_loss_mlp": 0.01103187, + "balance_loss_clip": 1.00188363, + "balance_loss_mlp": 1.00057185, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 2.8033415960323875, + "language_loss": 0.74195063, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76448298, + "num_input_tokens_seen": 271085870, + "step": 12567, + "time_per_iteration": 2.573615550994873 + }, + { + "auxiliary_loss_clip": 0.01164613, + "auxiliary_loss_mlp": 0.01102396, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00044823, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.8465348135464765, + "language_loss": 0.63025111, + "learning_rate": 5.942800139684073e-07, + "loss": 0.6529212, + "num_input_tokens_seen": 271104260, + "step": 12568, + "time_per_iteration": 2.577338457107544 + }, + { + "auxiliary_loss_clip": 0.01053505, + "auxiliary_loss_mlp": 0.01102676, + "balance_loss_clip": 1.00145233, + "balance_loss_mlp": 1.00044227, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 2.322057625916169, + "language_loss": 0.66500592, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68656778, + "num_input_tokens_seen": 271125745, + "step": 12569, + "time_per_iteration": 4.629219055175781 + }, + { + "auxiliary_loss_clip": 0.01147521, + "auxiliary_loss_mlp": 0.01103109, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00068474, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.9033228061430965, + "language_loss": 0.67345339, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69595975, + "num_input_tokens_seen": 271147145, + "step": 12570, + "time_per_iteration": 2.9792635440826416 + }, + { + "auxiliary_loss_clip": 0.01164551, + "auxiliary_loss_mlp": 0.01102456, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00050867, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 2.0181758076865757, + "language_loss": 0.71654063, + "learning_rate": 5.934491486396647e-07, + "loss": 0.73921072, + "num_input_tokens_seen": 271170865, + "step": 12571, + "time_per_iteration": 2.837472677230835 + }, + { + "auxiliary_loss_clip": 0.01102089, + "auxiliary_loss_mlp": 0.01103093, + "balance_loss_clip": 1.00167954, + "balance_loss_mlp": 1.00038195, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.6351134654707893, + "language_loss": 0.73630655, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75835836, + "num_input_tokens_seen": 271191450, + "step": 12572, + "time_per_iteration": 2.75718355178833 + }, + { + "auxiliary_loss_clip": 0.0113161, + "auxiliary_loss_mlp": 0.01102952, + "balance_loss_clip": 1.00175154, + "balance_loss_mlp": 1.00052714, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 2.105994253784951, + "language_loss": 0.7633208, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78566647, + "num_input_tokens_seen": 271207335, + "step": 12573, + "time_per_iteration": 3.978018283843994 + }, + { + "auxiliary_loss_clip": 0.01119153, + "auxiliary_loss_mlp": 0.01102945, + "balance_loss_clip": 1.00178134, + "balance_loss_mlp": 1.00042534, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 2.023814624798767, + "language_loss": 0.69046032, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71268129, + "num_input_tokens_seen": 271226895, + "step": 12574, + "time_per_iteration": 2.708062171936035 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01102397, + "balance_loss_clip": 1.00167441, + "balance_loss_mlp": 1.00054443, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 3.939452760930282, + "language_loss": 0.71881777, + "learning_rate": 5.923420749619974e-07, + "loss": 0.74102712, + "num_input_tokens_seen": 271244375, + "step": 12575, + "time_per_iteration": 2.6486012935638428 + }, + { + "auxiliary_loss_clip": 0.0116447, + "auxiliary_loss_mlp": 0.00747349, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.0003891, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.6189349664795514, + "language_loss": 0.72320801, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74232614, + "num_input_tokens_seen": 271259530, + "step": 12576, + "time_per_iteration": 2.490011692047119 + }, + { + "auxiliary_loss_clip": 0.01115759, + "auxiliary_loss_mlp": 0.01102302, + "balance_loss_clip": 1.00173688, + "balance_loss_mlp": 1.00040197, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 1.8521646313118654, + "language_loss": 0.6728881, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69506872, + "num_input_tokens_seen": 271276835, + "step": 12577, + "time_per_iteration": 2.625021457672119 + }, + { + "auxiliary_loss_clip": 0.01133101, + "auxiliary_loss_mlp": 0.01101795, + "balance_loss_clip": 1.00183523, + "balance_loss_mlp": 1.00051475, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.6988338540159573, + "language_loss": 0.78284705, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80519605, + "num_input_tokens_seen": 271296275, + "step": 12578, + "time_per_iteration": 2.612267017364502 + }, + { + "auxiliary_loss_clip": 0.01147988, + "auxiliary_loss_mlp": 0.01101718, + "balance_loss_clip": 1.00181234, + "balance_loss_mlp": 1.0005331, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 2.386994257599257, + "language_loss": 0.75713885, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77963591, + "num_input_tokens_seen": 271315685, + "step": 12579, + "time_per_iteration": 2.6184961795806885 + }, + { + "auxiliary_loss_clip": 0.01100206, + "auxiliary_loss_mlp": 0.01102746, + "balance_loss_clip": 1.00169611, + "balance_loss_mlp": 1.00051212, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 1.9613744255604966, + "language_loss": 0.62633079, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64836025, + "num_input_tokens_seen": 271336790, + "step": 12580, + "time_per_iteration": 2.8449010848999023 + }, + { + "auxiliary_loss_clip": 0.01101628, + "auxiliary_loss_mlp": 0.01102006, + "balance_loss_clip": 1.00176489, + "balance_loss_mlp": 1.00043988, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.8997440431152237, + "language_loss": 0.75016677, + "learning_rate": 5.906830660110691e-07, + "loss": 0.77220315, + "num_input_tokens_seen": 271355470, + "step": 12581, + "time_per_iteration": 2.73445200920105 + }, + { + "auxiliary_loss_clip": 0.01114977, + "auxiliary_loss_mlp": 0.0110255, + "balance_loss_clip": 1.00177312, + "balance_loss_mlp": 1.00041127, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 1.9107858267622808, + "language_loss": 0.6263938, + "learning_rate": 5.904067515031412e-07, + "loss": 0.64856905, + "num_input_tokens_seen": 271375810, + "step": 12582, + "time_per_iteration": 4.136514186859131 + }, + { + "auxiliary_loss_clip": 0.01158637, + "auxiliary_loss_mlp": 0.01076641, + "balance_loss_clip": 1.00082994, + "balance_loss_mlp": 0.99996608, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9931597742470676, + "language_loss": 0.60716909, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62952185, + "num_input_tokens_seen": 271424775, + "step": 12583, + "time_per_iteration": 2.876451253890991 + }, + { + "auxiliary_loss_clip": 0.01131343, + "auxiliary_loss_mlp": 0.01102592, + "balance_loss_clip": 1.00164342, + "balance_loss_mlp": 1.00054884, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.031252847893469, + "language_loss": 0.78769279, + "learning_rate": 5.898542828535125e-07, + "loss": 0.81003213, + "num_input_tokens_seen": 271440500, + "step": 12584, + "time_per_iteration": 2.6314167976379395 + }, + { + "auxiliary_loss_clip": 0.01134804, + "auxiliary_loss_mlp": 0.0110255, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.0005064, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 1.8975339086168914, + "language_loss": 0.77493346, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79730701, + "num_input_tokens_seen": 271458180, + "step": 12585, + "time_per_iteration": 2.6780667304992676 + }, + { + "auxiliary_loss_clip": 0.01164699, + "auxiliary_loss_mlp": 0.01102462, + "balance_loss_clip": 1.00200653, + "balance_loss_mlp": 1.00041842, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.8202328709164024, + "language_loss": 0.82837307, + "learning_rate": 5.893020280953493e-07, + "loss": 0.85104477, + "num_input_tokens_seen": 271475730, + "step": 12586, + "time_per_iteration": 2.5514914989471436 + }, + { + "auxiliary_loss_clip": 0.01164605, + "auxiliary_loss_mlp": 0.01102817, + "balance_loss_clip": 1.00192153, + "balance_loss_mlp": 1.00048828, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 2.2260671004631556, + "language_loss": 0.83619297, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85886717, + "num_input_tokens_seen": 271495030, + "step": 12587, + "time_per_iteration": 3.9976320266723633 + }, + { + "auxiliary_loss_clip": 0.01116389, + "auxiliary_loss_mlp": 0.01101292, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.00029755, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.6640576948898813, + "language_loss": 0.71074122, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73291802, + "num_input_tokens_seen": 271515355, + "step": 12588, + "time_per_iteration": 2.6492319107055664 + }, + { + "auxiliary_loss_clip": 0.0116469, + "auxiliary_loss_mlp": 0.00747393, + "balance_loss_clip": 1.001881, + "balance_loss_mlp": 1.00043702, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.7662648633845623, + "language_loss": 0.69042212, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70954299, + "num_input_tokens_seen": 271535090, + "step": 12589, + "time_per_iteration": 2.5923211574554443 + }, + { + "auxiliary_loss_clip": 0.01147631, + "auxiliary_loss_mlp": 0.0110182, + "balance_loss_clip": 1.00172973, + "balance_loss_mlp": 1.00044477, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 1.6624966451993857, + "language_loss": 0.92329884, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94579339, + "num_input_tokens_seen": 271551075, + "step": 12590, + "time_per_iteration": 2.5672500133514404 + }, + { + "auxiliary_loss_clip": 0.01132925, + "auxiliary_loss_mlp": 0.01102153, + "balance_loss_clip": 1.00186622, + "balance_loss_mlp": 1.00053859, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 1.7709418688670002, + "language_loss": 0.65602601, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67837673, + "num_input_tokens_seen": 271571035, + "step": 12591, + "time_per_iteration": 2.731215715408325 + }, + { + "auxiliary_loss_clip": 0.01147498, + "auxiliary_loss_mlp": 0.01102248, + "balance_loss_clip": 1.00202286, + "balance_loss_mlp": 1.00030065, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 2.837216801405024, + "language_loss": 0.73706466, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75956213, + "num_input_tokens_seen": 271592950, + "step": 12592, + "time_per_iteration": 2.611879348754883 + }, + { + "auxiliary_loss_clip": 0.01147838, + "auxiliary_loss_mlp": 0.01101859, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00057828, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.4176522881370635, + "language_loss": 0.71746659, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73996359, + "num_input_tokens_seen": 271608835, + "step": 12593, + "time_per_iteration": 2.5542633533477783 + }, + { + "auxiliary_loss_clip": 0.01164692, + "auxiliary_loss_mlp": 0.01102424, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00038123, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 12.879851067372405, + "language_loss": 0.66205966, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68473077, + "num_input_tokens_seen": 271627730, + "step": 12594, + "time_per_iteration": 2.51316237449646 + }, + { + "auxiliary_loss_clip": 0.01114983, + "auxiliary_loss_mlp": 0.01103077, + "balance_loss_clip": 1.00169241, + "balance_loss_mlp": 1.00046182, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 2.527618529032907, + "language_loss": 0.8092159, + "learning_rate": 5.86819530835722e-07, + "loss": 0.83139646, + "num_input_tokens_seen": 271646415, + "step": 12595, + "time_per_iteration": 2.7742886543273926 + }, + { + "auxiliary_loss_clip": 0.01115374, + "auxiliary_loss_mlp": 0.01101501, + "balance_loss_clip": 1.00166237, + "balance_loss_mlp": 1.00050724, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 3.355797342166557, + "language_loss": 0.71894485, + "learning_rate": 5.865439656071993e-07, + "loss": 0.74111354, + "num_input_tokens_seen": 271666240, + "step": 12596, + "time_per_iteration": 2.6980550289154053 + }, + { + "auxiliary_loss_clip": 0.01036599, + "auxiliary_loss_mlp": 0.0110105, + "balance_loss_clip": 1.00151396, + "balance_loss_mlp": 1.00043702, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.602951270923162, + "language_loss": 0.80667979, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82805622, + "num_input_tokens_seen": 271686370, + "step": 12597, + "time_per_iteration": 3.0674471855163574 + }, + { + "auxiliary_loss_clip": 0.01115914, + "auxiliary_loss_mlp": 0.011032, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.00048876, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 1.8145265174101974, + "language_loss": 0.83258194, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85477304, + "num_input_tokens_seen": 271705050, + "step": 12598, + "time_per_iteration": 2.8483211994171143 + }, + { + "auxiliary_loss_clip": 0.0113153, + "auxiliary_loss_mlp": 0.01102044, + "balance_loss_clip": 1.00178766, + "balance_loss_mlp": 1.00038266, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.8375033882845546, + "language_loss": 0.6251356, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64747131, + "num_input_tokens_seen": 271724915, + "step": 12599, + "time_per_iteration": 2.6444082260131836 + }, + { + "auxiliary_loss_clip": 0.01134352, + "auxiliary_loss_mlp": 0.00747392, + "balance_loss_clip": 1.00197005, + "balance_loss_mlp": 1.0004003, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.351814645131566, + "language_loss": 0.63894713, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65776455, + "num_input_tokens_seen": 271742410, + "step": 12600, + "time_per_iteration": 2.5910215377807617 + }, + { + "auxiliary_loss_clip": 0.01132943, + "auxiliary_loss_mlp": 0.01102188, + "balance_loss_clip": 1.00190246, + "balance_loss_mlp": 1.00052667, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 2.0302966376889113, + "language_loss": 0.6630187, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68536997, + "num_input_tokens_seen": 271761425, + "step": 12601, + "time_per_iteration": 2.979959726333618 + }, + { + "auxiliary_loss_clip": 0.01134823, + "auxiliary_loss_mlp": 0.01101471, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.00047696, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.881376937284284, + "language_loss": 0.67647707, + "learning_rate": 5.848917001679335e-07, + "loss": 0.69884002, + "num_input_tokens_seen": 271780875, + "step": 12602, + "time_per_iteration": 2.75185489654541 + }, + { + "auxiliary_loss_clip": 0.01149095, + "auxiliary_loss_mlp": 0.0110211, + "balance_loss_clip": 1.00196326, + "balance_loss_mlp": 1.00063944, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 1.7481321058672552, + "language_loss": 0.67039812, + "learning_rate": 5.846165103474967e-07, + "loss": 0.69291025, + "num_input_tokens_seen": 271799490, + "step": 12603, + "time_per_iteration": 2.577789306640625 + }, + { + "auxiliary_loss_clip": 0.0113509, + "auxiliary_loss_mlp": 0.01101304, + "balance_loss_clip": 1.00176537, + "balance_loss_mlp": 1.00059617, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 2.194498804053399, + "language_loss": 0.61744308, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63980699, + "num_input_tokens_seen": 271817040, + "step": 12604, + "time_per_iteration": 2.584886074066162 + }, + { + "auxiliary_loss_clip": 0.01164604, + "auxiliary_loss_mlp": 0.01101877, + "balance_loss_clip": 1.00204051, + "balance_loss_mlp": 1.00069189, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.8894941205521991, + "language_loss": 0.79798901, + "learning_rate": 5.840662917315076e-07, + "loss": 0.8206538, + "num_input_tokens_seen": 271835480, + "step": 12605, + "time_per_iteration": 2.522986888885498 + }, + { + "auxiliary_loss_clip": 0.01164561, + "auxiliary_loss_mlp": 0.0110262, + "balance_loss_clip": 1.00188208, + "balance_loss_mlp": 1.00038648, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 2.6109635800911812, + "language_loss": 0.79409522, + "learning_rate": 5.837912629568198e-07, + "loss": 0.81676704, + "num_input_tokens_seen": 271849835, + "step": 12606, + "time_per_iteration": 2.4992682933807373 + }, + { + "auxiliary_loss_clip": 0.01149546, + "auxiliary_loss_mlp": 0.01100666, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00043452, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.3919355459339307, + "language_loss": 0.73097241, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75347459, + "num_input_tokens_seen": 271869560, + "step": 12607, + "time_per_iteration": 3.941347122192383 + }, + { + "auxiliary_loss_clip": 0.01131526, + "auxiliary_loss_mlp": 0.01103256, + "balance_loss_clip": 1.00176609, + "balance_loss_mlp": 1.0004499, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 3.11041377343407, + "language_loss": 0.74855393, + "learning_rate": 5.83241366526202e-07, + "loss": 0.7709018, + "num_input_tokens_seen": 271887950, + "step": 12608, + "time_per_iteration": 2.5636067390441895 + }, + { + "auxiliary_loss_clip": 0.01114274, + "auxiliary_loss_mlp": 0.0074719, + "balance_loss_clip": 1.00163031, + "balance_loss_mlp": 1.0003705, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.6510120077780799, + "language_loss": 0.71956217, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73817682, + "num_input_tokens_seen": 271907700, + "step": 12609, + "time_per_iteration": 2.6671769618988037 + }, + { + "auxiliary_loss_clip": 0.01164475, + "auxiliary_loss_mlp": 0.01102166, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00040865, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 2.1828109613978497, + "language_loss": 0.81241679, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83508313, + "num_input_tokens_seen": 271926840, + "step": 12610, + "time_per_iteration": 3.90122127532959 + }, + { + "auxiliary_loss_clip": 0.0113402, + "auxiliary_loss_mlp": 0.01103417, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.00041974, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 4.051333599842518, + "language_loss": 0.70388323, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72625756, + "num_input_tokens_seen": 271946465, + "step": 12611, + "time_per_iteration": 2.6114444732666016 + }, + { + "auxiliary_loss_clip": 0.01164468, + "auxiliary_loss_mlp": 0.01102023, + "balance_loss_clip": 1.00189078, + "balance_loss_mlp": 1.00045657, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.8300652214654642, + "language_loss": 0.70913374, + "learning_rate": 5.821422184318893e-07, + "loss": 0.73179865, + "num_input_tokens_seen": 271967295, + "step": 12612, + "time_per_iteration": 2.54044246673584 + }, + { + "auxiliary_loss_clip": 0.01083176, + "auxiliary_loss_mlp": 0.01103343, + "balance_loss_clip": 1.00158942, + "balance_loss_mlp": 1.00063229, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.7273108969808908, + "language_loss": 0.59788281, + "learning_rate": 5.818675657955397e-07, + "loss": 0.619748, + "num_input_tokens_seen": 271987960, + "step": 12613, + "time_per_iteration": 2.7342519760131836 + }, + { + "auxiliary_loss_clip": 0.01134113, + "auxiliary_loss_mlp": 0.01101999, + "balance_loss_clip": 1.00181818, + "balance_loss_mlp": 1.0005281, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.7707581649406494, + "language_loss": 0.59717029, + "learning_rate": 5.815929669349135e-07, + "loss": 0.61953139, + "num_input_tokens_seen": 272011780, + "step": 12614, + "time_per_iteration": 2.7007391452789307 + }, + { + "auxiliary_loss_clip": 0.01116934, + "auxiliary_loss_mlp": 0.01102674, + "balance_loss_clip": 1.00164819, + "balance_loss_mlp": 1.00043964, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.8268758617259695, + "language_loss": 0.73589849, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75809461, + "num_input_tokens_seen": 272030825, + "step": 12615, + "time_per_iteration": 2.6263554096221924 + }, + { + "auxiliary_loss_clip": 0.01126939, + "auxiliary_loss_mlp": 0.01076652, + "balance_loss_clip": 1.00091398, + "balance_loss_mlp": 0.99997663, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8115483094630342, + "language_loss": 0.67689651, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69893247, + "num_input_tokens_seen": 272095825, + "step": 12616, + "time_per_iteration": 3.235987424850464 + }, + { + "auxiliary_loss_clip": 0.01118216, + "auxiliary_loss_mlp": 0.01103054, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.0005343, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.7816865689974997, + "language_loss": 0.84637809, + "learning_rate": 5.807694931114979e-07, + "loss": 0.86859071, + "num_input_tokens_seen": 272113950, + "step": 12617, + "time_per_iteration": 2.6168510913848877 + }, + { + "auxiliary_loss_clip": 0.01114569, + "auxiliary_loss_mlp": 0.01102152, + "balance_loss_clip": 1.00171852, + "balance_loss_mlp": 1.00058615, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 2.4276803435215086, + "language_loss": 0.74488026, + "learning_rate": 5.804951094578757e-07, + "loss": 0.76704752, + "num_input_tokens_seen": 272130315, + "step": 12618, + "time_per_iteration": 2.631819486618042 + }, + { + "auxiliary_loss_clip": 0.01131471, + "auxiliary_loss_mlp": 0.01103552, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00045967, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 2.057075971625107, + "language_loss": 0.7733531, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79570335, + "num_input_tokens_seen": 272149080, + "step": 12619, + "time_per_iteration": 2.590785026550293 + }, + { + "auxiliary_loss_clip": 0.0111665, + "auxiliary_loss_mlp": 0.01101403, + "balance_loss_clip": 1.0017451, + "balance_loss_mlp": 1.00050426, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 2.508826059676197, + "language_loss": 0.82602465, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84820521, + "num_input_tokens_seen": 272168285, + "step": 12620, + "time_per_iteration": 4.098154783248901 + }, + { + "auxiliary_loss_clip": 0.01133185, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_clip": 1.00172007, + "balance_loss_mlp": 1.0005219, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 4.098996263390048, + "language_loss": 0.82515818, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84752619, + "num_input_tokens_seen": 272184585, + "step": 12621, + "time_per_iteration": 2.590121030807495 + }, + { + "auxiliary_loss_clip": 0.01132454, + "auxiliary_loss_mlp": 0.01102292, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00053489, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 1.9918209790223358, + "language_loss": 0.73566687, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75801432, + "num_input_tokens_seen": 272200205, + "step": 12622, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.01141924, + "auxiliary_loss_mlp": 0.01076638, + "balance_loss_clip": 1.00082469, + "balance_loss_mlp": 0.99996263, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8222226618759185, + "language_loss": 0.60777938, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62996507, + "num_input_tokens_seen": 272259670, + "step": 12623, + "time_per_iteration": 3.153391122817993 + }, + { + "auxiliary_loss_clip": 0.01164624, + "auxiliary_loss_mlp": 0.01102607, + "balance_loss_clip": 1.00200856, + "balance_loss_mlp": 1.0005641, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 1.9356722225357619, + "language_loss": 0.6749149, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69758725, + "num_input_tokens_seen": 272277925, + "step": 12624, + "time_per_iteration": 2.54844331741333 + }, + { + "auxiliary_loss_clip": 0.01164455, + "auxiliary_loss_mlp": 0.01101635, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.00035477, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.8695400461297356, + "language_loss": 0.75672096, + "learning_rate": 5.785759316424196e-07, + "loss": 0.77938187, + "num_input_tokens_seen": 272296010, + "step": 12625, + "time_per_iteration": 3.905367136001587 + }, + { + "auxiliary_loss_clip": 0.01134448, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_clip": 1.00188732, + "balance_loss_mlp": 1.00059664, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 2.238086497785443, + "language_loss": 0.63344347, + "learning_rate": 5.783019789020977e-07, + "loss": 0.65581626, + "num_input_tokens_seen": 272318330, + "step": 12626, + "time_per_iteration": 2.683814764022827 + }, + { + "auxiliary_loss_clip": 0.01102461, + "auxiliary_loss_mlp": 0.00747246, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.00040376, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 2.028097217948513, + "language_loss": 0.73769975, + "learning_rate": 5.780280800727084e-07, + "loss": 0.75619686, + "num_input_tokens_seen": 272335265, + "step": 12627, + "time_per_iteration": 2.670701503753662 + }, + { + "auxiliary_loss_clip": 0.01147664, + "auxiliary_loss_mlp": 0.01101904, + "balance_loss_clip": 1.00174987, + "balance_loss_mlp": 1.00052798, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 2.2526202769894996, + "language_loss": 0.68628407, + "learning_rate": 5.777542351646356e-07, + "loss": 0.70877975, + "num_input_tokens_seen": 272354795, + "step": 12628, + "time_per_iteration": 2.63661527633667 + }, + { + "auxiliary_loss_clip": 0.01148436, + "auxiliary_loss_mlp": 0.01104139, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00047445, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 2.0175001757915654, + "language_loss": 0.6333437, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65586948, + "num_input_tokens_seen": 272372875, + "step": 12629, + "time_per_iteration": 2.5509629249572754 + }, + { + "auxiliary_loss_clip": 0.01133277, + "auxiliary_loss_mlp": 0.01101403, + "balance_loss_clip": 1.00171757, + "balance_loss_mlp": 1.00050426, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.807395144174357, + "language_loss": 0.7779085, + "learning_rate": 5.772067071539786e-07, + "loss": 0.8002553, + "num_input_tokens_seen": 272394715, + "step": 12630, + "time_per_iteration": 2.6359024047851562 + }, + { + "auxiliary_loss_clip": 0.01158523, + "auxiliary_loss_mlp": 0.01076238, + "balance_loss_clip": 1.0007751, + "balance_loss_mlp": 0.99994391, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8102671050064373, + "language_loss": 0.61405551, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63640308, + "num_input_tokens_seen": 272458775, + "step": 12631, + "time_per_iteration": 3.161876678466797 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.0019238, + "balance_loss_mlp": 1.0004952, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.6471006970937032, + "language_loss": 0.73898184, + "learning_rate": 5.766593949531767e-07, + "loss": 0.75762415, + "num_input_tokens_seen": 272479355, + "step": 12632, + "time_per_iteration": 2.6966800689697266 + }, + { + "auxiliary_loss_clip": 0.01131299, + "auxiliary_loss_mlp": 0.01102472, + "balance_loss_clip": 1.00180089, + "balance_loss_mlp": 1.00052428, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 1.834972818055277, + "language_loss": 0.74488544, + "learning_rate": 5.763858198074154e-07, + "loss": 0.76722324, + "num_input_tokens_seen": 272493555, + "step": 12633, + "time_per_iteration": 2.6393823623657227 + }, + { + "auxiliary_loss_clip": 0.01131193, + "auxiliary_loss_mlp": 0.0110227, + "balance_loss_clip": 1.0019716, + "balance_loss_mlp": 1.0005132, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 1.8569877086249682, + "language_loss": 0.73355544, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75589013, + "num_input_tokens_seen": 272508925, + "step": 12634, + "time_per_iteration": 2.6296517848968506 + }, + { + "auxiliary_loss_clip": 0.01164708, + "auxiliary_loss_mlp": 0.01102688, + "balance_loss_clip": 1.00209105, + "balance_loss_mlp": 1.00054932, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.7243939277276616, + "language_loss": 0.64543045, + "learning_rate": 5.758388314770408e-07, + "loss": 0.66810441, + "num_input_tokens_seen": 272528805, + "step": 12635, + "time_per_iteration": 2.6036975383758545 + }, + { + "auxiliary_loss_clip": 0.01102001, + "auxiliary_loss_mlp": 0.01102687, + "balance_loss_clip": 1.00176489, + "balance_loss_mlp": 1.00045288, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 5.030963170474795, + "language_loss": 0.69122303, + "learning_rate": 5.7556541831317e-07, + "loss": 0.71326983, + "num_input_tokens_seen": 272546655, + "step": 12636, + "time_per_iteration": 2.671130895614624 + }, + { + "auxiliary_loss_clip": 0.0113351, + "auxiliary_loss_mlp": 0.01102716, + "balance_loss_clip": 1.0018729, + "balance_loss_mlp": 1.00048256, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 2.0543024783009805, + "language_loss": 0.81388026, + "learning_rate": 5.752920591640018e-07, + "loss": 0.8362425, + "num_input_tokens_seen": 272564010, + "step": 12637, + "time_per_iteration": 2.6198811531066895 + }, + { + "auxiliary_loss_clip": 0.01147853, + "auxiliary_loss_mlp": 0.01102536, + "balance_loss_clip": 1.00173128, + "balance_loss_mlp": 1.00039744, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 1.7551455320906855, + "language_loss": 0.66286111, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68536502, + "num_input_tokens_seen": 272585840, + "step": 12638, + "time_per_iteration": 2.6801209449768066 + }, + { + "auxiliary_loss_clip": 0.01164601, + "auxiliary_loss_mlp": 0.0110347, + "balance_loss_clip": 1.00188029, + "balance_loss_mlp": 1.00066352, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 3.8446054529673246, + "language_loss": 0.65318209, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67586279, + "num_input_tokens_seen": 272602300, + "step": 12639, + "time_per_iteration": 2.5048329830169678 + }, + { + "auxiliary_loss_clip": 0.01149828, + "auxiliary_loss_mlp": 0.01102648, + "balance_loss_clip": 1.00195336, + "balance_loss_mlp": 1.00050962, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 2.466476031021267, + "language_loss": 0.7043308, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72685558, + "num_input_tokens_seen": 272619595, + "step": 12640, + "time_per_iteration": 2.5685503482818604 + }, + { + "auxiliary_loss_clip": 0.01131263, + "auxiliary_loss_mlp": 0.01103817, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00043845, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.7757370004211377, + "language_loss": 0.67085254, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69320333, + "num_input_tokens_seen": 272638825, + "step": 12641, + "time_per_iteration": 2.668757200241089 + }, + { + "auxiliary_loss_clip": 0.01147855, + "auxiliary_loss_mlp": 0.01103211, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00040483, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.0774261452949063, + "language_loss": 0.66288841, + "learning_rate": 5.73926074001422e-07, + "loss": 0.68539906, + "num_input_tokens_seen": 272657240, + "step": 12642, + "time_per_iteration": 2.5931475162506104 + }, + { + "auxiliary_loss_clip": 0.01133768, + "auxiliary_loss_mlp": 0.01102227, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00047028, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 2.52662736255375, + "language_loss": 0.75655913, + "learning_rate": 5.736530391580765e-07, + "loss": 0.7789191, + "num_input_tokens_seen": 272677520, + "step": 12643, + "time_per_iteration": 2.6380388736724854 + }, + { + "auxiliary_loss_clip": 0.01116264, + "auxiliary_loss_mlp": 0.01102888, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.00055885, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 2.3279552065118416, + "language_loss": 0.78635734, + "learning_rate": 5.733800584019508e-07, + "loss": 0.80854893, + "num_input_tokens_seen": 272696770, + "step": 12644, + "time_per_iteration": 4.014946937561035 + }, + { + "auxiliary_loss_clip": 0.01135407, + "auxiliary_loss_mlp": 0.01102002, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00053072, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.4767843704383858, + "language_loss": 0.80226099, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82463515, + "num_input_tokens_seen": 272718340, + "step": 12645, + "time_per_iteration": 2.687654733657837 + }, + { + "auxiliary_loss_clip": 0.01133897, + "auxiliary_loss_mlp": 0.01103423, + "balance_loss_clip": 1.00190628, + "balance_loss_mlp": 1.00042582, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.6168288314398993, + "language_loss": 0.73152322, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75389636, + "num_input_tokens_seen": 272739575, + "step": 12646, + "time_per_iteration": 2.6484580039978027 + }, + { + "auxiliary_loss_clip": 0.01149806, + "auxiliary_loss_mlp": 0.01102025, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00045884, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 1.9981382372816452, + "language_loss": 0.67524874, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69776702, + "num_input_tokens_seen": 272758710, + "step": 12647, + "time_per_iteration": 4.072390556335449 + }, + { + "auxiliary_loss_clip": 0.01144022, + "auxiliary_loss_mlp": 0.01076655, + "balance_loss_clip": 1.00080752, + "balance_loss_mlp": 0.99997979, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6681262012710444, + "language_loss": 0.49026659, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51247334, + "num_input_tokens_seen": 272814855, + "step": 12648, + "time_per_iteration": 3.083062171936035 + }, + { + "auxiliary_loss_clip": 0.01147733, + "auxiliary_loss_mlp": 0.0110198, + "balance_loss_clip": 1.00180721, + "balance_loss_mlp": 1.00060439, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 1.637073818257986, + "language_loss": 0.76398468, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78648186, + "num_input_tokens_seen": 272834400, + "step": 12649, + "time_per_iteration": 2.5992727279663086 + }, + { + "auxiliary_loss_clip": 0.01116629, + "auxiliary_loss_mlp": 0.01102387, + "balance_loss_clip": 1.0015974, + "balance_loss_mlp": 1.00053477, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.560865226029497, + "language_loss": 0.68536884, + "learning_rate": 5.717433102763462e-07, + "loss": 0.70755899, + "num_input_tokens_seen": 272854760, + "step": 12650, + "time_per_iteration": 2.703279972076416 + }, + { + "auxiliary_loss_clip": 0.01141903, + "auxiliary_loss_mlp": 0.01076619, + "balance_loss_clip": 1.00069439, + "balance_loss_mlp": 0.99994326, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7500749175577384, + "language_loss": 0.62713653, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64932173, + "num_input_tokens_seen": 272919030, + "step": 12651, + "time_per_iteration": 3.145146369934082 + }, + { + "auxiliary_loss_clip": 0.01114594, + "auxiliary_loss_mlp": 0.0110256, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.00061202, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.3621584960813586, + "language_loss": 0.71567976, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73785126, + "num_input_tokens_seen": 272938925, + "step": 12652, + "time_per_iteration": 2.6655452251434326 + }, + { + "auxiliary_loss_clip": 0.01084944, + "auxiliary_loss_mlp": 0.0110254, + "balance_loss_clip": 1.00159597, + "balance_loss_mlp": 1.00068736, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 2.6122060761118293, + "language_loss": 0.8021006, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82397544, + "num_input_tokens_seen": 272954945, + "step": 12653, + "time_per_iteration": 2.7001147270202637 + }, + { + "auxiliary_loss_clip": 0.01164825, + "auxiliary_loss_mlp": 0.01104394, + "balance_loss_clip": 1.00197661, + "balance_loss_mlp": 1.00044322, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.5032597210033387, + "language_loss": 0.80025524, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82294744, + "num_input_tokens_seen": 272972855, + "step": 12654, + "time_per_iteration": 2.529442310333252 + }, + { + "auxiliary_loss_clip": 0.01116728, + "auxiliary_loss_mlp": 0.01102889, + "balance_loss_clip": 1.00165319, + "balance_loss_mlp": 1.00056016, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.0744234090702722, + "language_loss": 0.79219687, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81439304, + "num_input_tokens_seen": 272989895, + "step": 12655, + "time_per_iteration": 2.659841775894165 + }, + { + "auxiliary_loss_clip": 0.01147782, + "auxiliary_loss_mlp": 0.01101687, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00040698, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.4929384477135381, + "language_loss": 0.68689072, + "learning_rate": 5.701085118974505e-07, + "loss": 0.7093854, + "num_input_tokens_seen": 273011695, + "step": 12656, + "time_per_iteration": 2.632578134536743 + }, + { + "auxiliary_loss_clip": 0.01149975, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_clip": 1.00188577, + "balance_loss_mlp": 1.0004406, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 2.6612091716296318, + "language_loss": 0.73223507, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75477022, + "num_input_tokens_seen": 273028815, + "step": 12657, + "time_per_iteration": 2.5550148487091064 + }, + { + "auxiliary_loss_clip": 0.01129877, + "auxiliary_loss_mlp": 0.010766, + "balance_loss_clip": 1.00082803, + "balance_loss_mlp": 0.9999243, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8763609364292798, + "language_loss": 0.6492179, + "learning_rate": 5.695640127673347e-07, + "loss": 0.67128265, + "num_input_tokens_seen": 273084080, + "step": 12658, + "time_per_iteration": 4.551838636398315 + }, + { + "auxiliary_loss_clip": 0.01148077, + "auxiliary_loss_mlp": 0.0110222, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.00055838, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.6377840486771373, + "language_loss": 0.79303497, + "learning_rate": 5.692918445605293e-07, + "loss": 0.81553793, + "num_input_tokens_seen": 273102295, + "step": 12659, + "time_per_iteration": 2.5796658992767334 + }, + { + "auxiliary_loss_clip": 0.01147873, + "auxiliary_loss_mlp": 0.0110189, + "balance_loss_clip": 1.00180042, + "balance_loss_mlp": 1.00032377, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.5219388465937818, + "language_loss": 0.69145793, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71395552, + "num_input_tokens_seen": 273123400, + "step": 12660, + "time_per_iteration": 2.672243118286133 + }, + { + "auxiliary_loss_clip": 0.01164694, + "auxiliary_loss_mlp": 0.01102836, + "balance_loss_clip": 1.00194967, + "balance_loss_mlp": 1.00050688, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.8203409978836773, + "language_loss": 0.70517838, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72785366, + "num_input_tokens_seen": 273145150, + "step": 12661, + "time_per_iteration": 2.7010717391967773 + }, + { + "auxiliary_loss_clip": 0.01147892, + "auxiliary_loss_mlp": 0.0110243, + "balance_loss_clip": 1.00173283, + "balance_loss_mlp": 1.00048256, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.532675875586459, + "language_loss": 0.83450949, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85701275, + "num_input_tokens_seen": 273165180, + "step": 12662, + "time_per_iteration": 2.658548593521118 + }, + { + "auxiliary_loss_clip": 0.01131244, + "auxiliary_loss_mlp": 0.01102719, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.00067568, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.6410122168892753, + "language_loss": 0.68747509, + "learning_rate": 5.682037143624505e-07, + "loss": 0.70981467, + "num_input_tokens_seen": 273184005, + "step": 12663, + "time_per_iteration": 3.9986932277679443 + }, + { + "auxiliary_loss_clip": 0.01147898, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00045586, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.9339825486558344, + "language_loss": 0.70323408, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72573137, + "num_input_tokens_seen": 273203565, + "step": 12664, + "time_per_iteration": 2.566366195678711 + }, + { + "auxiliary_loss_clip": 0.01148538, + "auxiliary_loss_mlp": 0.01103972, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00059319, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 1.6191666093669579, + "language_loss": 0.79728758, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81981266, + "num_input_tokens_seen": 273221645, + "step": 12665, + "time_per_iteration": 2.576507329940796 + }, + { + "auxiliary_loss_clip": 0.01164524, + "auxiliary_loss_mlp": 0.00747186, + "balance_loss_clip": 1.00205159, + "balance_loss_mlp": 1.00041378, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 2.5358639957979268, + "language_loss": 0.87998068, + "learning_rate": 5.673881867632959e-07, + "loss": 0.89909774, + "num_input_tokens_seen": 273242040, + "step": 12666, + "time_per_iteration": 2.665452480316162 + }, + { + "auxiliary_loss_clip": 0.01101553, + "auxiliary_loss_mlp": 0.01103246, + "balance_loss_clip": 1.00169587, + "balance_loss_mlp": 1.00048733, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 2.9571782994740485, + "language_loss": 0.83210319, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85415119, + "num_input_tokens_seen": 273257365, + "step": 12667, + "time_per_iteration": 2.6596107482910156 + }, + { + "auxiliary_loss_clip": 0.01131356, + "auxiliary_loss_mlp": 0.01101477, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.00048304, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.6913418806308425, + "language_loss": 0.7869097, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80923808, + "num_input_tokens_seen": 273274710, + "step": 12668, + "time_per_iteration": 2.5727996826171875 + }, + { + "auxiliary_loss_clip": 0.01116708, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_clip": 1.00177288, + "balance_loss_mlp": 1.00052786, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 3.342488135080786, + "language_loss": 0.64452994, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66671503, + "num_input_tokens_seen": 273292870, + "step": 12669, + "time_per_iteration": 2.6342520713806152 + }, + { + "auxiliary_loss_clip": 0.01133572, + "auxiliary_loss_mlp": 0.01103746, + "balance_loss_clip": 1.00186837, + "balance_loss_mlp": 1.00055838, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 2.325813512057413, + "language_loss": 0.66333395, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68570709, + "num_input_tokens_seen": 273312375, + "step": 12670, + "time_per_iteration": 2.670170545578003 + }, + { + "auxiliary_loss_clip": 0.01148079, + "auxiliary_loss_mlp": 0.01103532, + "balance_loss_clip": 1.00188017, + "balance_loss_mlp": 1.00044012, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.6741515138263001, + "language_loss": 0.73122001, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75373614, + "num_input_tokens_seen": 273332590, + "step": 12671, + "time_per_iteration": 2.580364227294922 + }, + { + "auxiliary_loss_clip": 0.01120084, + "auxiliary_loss_mlp": 0.0110157, + "balance_loss_clip": 1.00170386, + "balance_loss_mlp": 1.00048101, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 1.7299272270310802, + "language_loss": 0.73351055, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75572711, + "num_input_tokens_seen": 273352885, + "step": 12672, + "time_per_iteration": 2.719313383102417 + }, + { + "auxiliary_loss_clip": 0.01127067, + "auxiliary_loss_mlp": 0.01076627, + "balance_loss_clip": 1.00101352, + "balance_loss_mlp": 0.9999513, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7597936629001047, + "language_loss": 0.56699955, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58903646, + "num_input_tokens_seen": 273411730, + "step": 12673, + "time_per_iteration": 3.1434030532836914 + }, + { + "auxiliary_loss_clip": 0.01147773, + "auxiliary_loss_mlp": 0.01102707, + "balance_loss_clip": 1.00189579, + "balance_loss_mlp": 1.00047278, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 1.9146659074062056, + "language_loss": 0.74845219, + "learning_rate": 5.652158375447102e-07, + "loss": 0.77095699, + "num_input_tokens_seen": 273430020, + "step": 12674, + "time_per_iteration": 2.5965683460235596 + }, + { + "auxiliary_loss_clip": 0.0113478, + "auxiliary_loss_mlp": 0.0110194, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00046921, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 2.2938064782911955, + "language_loss": 0.71936285, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74173003, + "num_input_tokens_seen": 273448690, + "step": 12675, + "time_per_iteration": 2.606480121612549 + }, + { + "auxiliary_loss_clip": 0.01149832, + "auxiliary_loss_mlp": 0.01102711, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.0005728, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 1.9869731674341105, + "language_loss": 0.73272526, + "learning_rate": 5.646732941057936e-07, + "loss": 0.75525075, + "num_input_tokens_seen": 273465190, + "step": 12676, + "time_per_iteration": 2.6036527156829834 + }, + { + "auxiliary_loss_clip": 0.01117251, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00046611, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.5516543008187447, + "language_loss": 0.5370537, + "learning_rate": 5.644021040227927e-07, + "loss": 0.55570126, + "num_input_tokens_seen": 273478620, + "step": 12677, + "time_per_iteration": 2.597764015197754 + }, + { + "auxiliary_loss_clip": 0.01103792, + "auxiliary_loss_mlp": 0.01103737, + "balance_loss_clip": 1.00189519, + "balance_loss_mlp": 1.00054944, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 2.386458811738558, + "language_loss": 0.78620714, + "learning_rate": 5.641309683778064e-07, + "loss": 0.80828243, + "num_input_tokens_seen": 273497635, + "step": 12678, + "time_per_iteration": 2.7153429985046387 + }, + { + "auxiliary_loss_clip": 0.01116473, + "auxiliary_loss_mlp": 0.01102621, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00057781, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 2.2855890431049954, + "language_loss": 0.77276886, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79495978, + "num_input_tokens_seen": 273513955, + "step": 12679, + "time_per_iteration": 2.620866537094116 + }, + { + "auxiliary_loss_clip": 0.01147741, + "auxiliary_loss_mlp": 0.0110245, + "balance_loss_clip": 1.00180829, + "balance_loss_mlp": 1.0004549, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.4861967988376468, + "language_loss": 0.8012538, + "learning_rate": 5.635888604430059e-07, + "loss": 0.8237558, + "num_input_tokens_seen": 273533970, + "step": 12680, + "time_per_iteration": 2.6617565155029297 + }, + { + "auxiliary_loss_clip": 0.01133142, + "auxiliary_loss_mlp": 0.01103767, + "balance_loss_clip": 1.00184751, + "balance_loss_mlp": 1.00038898, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 2.6509793942324653, + "language_loss": 0.62991953, + "learning_rate": 5.633178881737493e-07, + "loss": 0.65228862, + "num_input_tokens_seen": 273553090, + "step": 12681, + "time_per_iteration": 2.61307692527771 + }, + { + "auxiliary_loss_clip": 0.01115724, + "auxiliary_loss_mlp": 0.0110178, + "balance_loss_clip": 1.00160384, + "balance_loss_mlp": 1.00049949, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 1.7539042912841536, + "language_loss": 0.76111925, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78329432, + "num_input_tokens_seen": 273572460, + "step": 12682, + "time_per_iteration": 4.038892984390259 + }, + { + "auxiliary_loss_clip": 0.01132749, + "auxiliary_loss_mlp": 0.01101926, + "balance_loss_clip": 1.00184035, + "balance_loss_mlp": 1.00045514, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.4346508550914838, + "language_loss": 0.68352777, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70587456, + "num_input_tokens_seen": 273592815, + "step": 12683, + "time_per_iteration": 2.768779754638672 + }, + { + "auxiliary_loss_clip": 0.01117938, + "auxiliary_loss_mlp": 0.00747329, + "balance_loss_clip": 1.00163698, + "balance_loss_mlp": 1.00033534, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 3.02334491927733, + "language_loss": 0.83194041, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85059309, + "num_input_tokens_seen": 273611790, + "step": 12684, + "time_per_iteration": 4.163455009460449 + }, + { + "auxiliary_loss_clip": 0.01131288, + "auxiliary_loss_mlp": 0.01103193, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00057757, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 3.110551264912074, + "language_loss": 0.82553291, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84787774, + "num_input_tokens_seen": 273628340, + "step": 12685, + "time_per_iteration": 2.5706615447998047 + }, + { + "auxiliary_loss_clip": 0.01115985, + "auxiliary_loss_mlp": 0.00747264, + "balance_loss_clip": 1.00170159, + "balance_loss_mlp": 1.00042248, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 2.8179672208237, + "language_loss": 0.76883125, + "learning_rate": 5.619638442198422e-07, + "loss": 0.78746372, + "num_input_tokens_seen": 273646585, + "step": 12686, + "time_per_iteration": 2.8302485942840576 + }, + { + "auxiliary_loss_clip": 0.01102276, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_clip": 1.0018177, + "balance_loss_mlp": 1.00066853, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.6995671191530661, + "language_loss": 0.71916866, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74123091, + "num_input_tokens_seen": 273665410, + "step": 12687, + "time_per_iteration": 2.6794111728668213 + }, + { + "auxiliary_loss_clip": 0.01135027, + "auxiliary_loss_mlp": 0.01102865, + "balance_loss_clip": 1.00188768, + "balance_loss_mlp": 1.00063097, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 2.0303939530657846, + "language_loss": 0.64665818, + "learning_rate": 5.614226082797369e-07, + "loss": 0.6690371, + "num_input_tokens_seen": 273683035, + "step": 12688, + "time_per_iteration": 2.656628370285034 + }, + { + "auxiliary_loss_clip": 0.01149489, + "auxiliary_loss_mlp": 0.01102215, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.000458, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 1.936994519807252, + "language_loss": 0.70421135, + "learning_rate": 5.611520721310515e-07, + "loss": 0.72672838, + "num_input_tokens_seen": 273700130, + "step": 12689, + "time_per_iteration": 2.5244436264038086 + }, + { + "auxiliary_loss_clip": 0.01116875, + "auxiliary_loss_mlp": 0.01103861, + "balance_loss_clip": 1.0017525, + "balance_loss_mlp": 1.00067365, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 2.119654893430659, + "language_loss": 0.6981222, + "learning_rate": 5.608815905436238e-07, + "loss": 0.72032958, + "num_input_tokens_seen": 273720310, + "step": 12690, + "time_per_iteration": 2.7238714694976807 + }, + { + "auxiliary_loss_clip": 0.01133147, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_clip": 1.00185978, + "balance_loss_mlp": 1.00054181, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.5052163132672782, + "language_loss": 0.69634771, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71871549, + "num_input_tokens_seen": 273744475, + "step": 12691, + "time_per_iteration": 2.750941753387451 + }, + { + "auxiliary_loss_clip": 0.01148083, + "auxiliary_loss_mlp": 0.01102427, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00047934, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.5874827121592856, + "language_loss": 0.81398106, + "learning_rate": 5.603407910935662e-07, + "loss": 0.8364861, + "num_input_tokens_seen": 273764635, + "step": 12692, + "time_per_iteration": 2.6397671699523926 + }, + { + "auxiliary_loss_clip": 0.01116441, + "auxiliary_loss_mlp": 0.01103295, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.00048888, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.2457647925399513, + "language_loss": 0.76926494, + "learning_rate": 5.600704732514438e-07, + "loss": 0.7914623, + "num_input_tokens_seen": 273780115, + "step": 12693, + "time_per_iteration": 2.623105049133301 + }, + { + "auxiliary_loss_clip": 0.01114698, + "auxiliary_loss_mlp": 0.01104043, + "balance_loss_clip": 1.00168979, + "balance_loss_mlp": 1.00056958, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 3.311377903055248, + "language_loss": 0.72917032, + "learning_rate": 5.598002100115933e-07, + "loss": 0.75135779, + "num_input_tokens_seen": 273796605, + "step": 12694, + "time_per_iteration": 2.664547920227051 + }, + { + "auxiliary_loss_clip": 0.01147823, + "auxiliary_loss_mlp": 0.0110232, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00037169, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 3.7794932146210694, + "language_loss": 0.70406616, + "learning_rate": 5.595300013842625e-07, + "loss": 0.72656763, + "num_input_tokens_seen": 273816515, + "step": 12695, + "time_per_iteration": 4.035363435745239 + }, + { + "auxiliary_loss_clip": 0.01164593, + "auxiliary_loss_mlp": 0.01102155, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00049376, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.8334464510885788, + "language_loss": 0.72100389, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74367136, + "num_input_tokens_seen": 273837060, + "step": 12696, + "time_per_iteration": 2.57305645942688 + }, + { + "auxiliary_loss_clip": 0.01086575, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_clip": 1.00157428, + "balance_loss_mlp": 1.00049973, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.307359138905449, + "language_loss": 0.71554565, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73744261, + "num_input_tokens_seen": 273853365, + "step": 12697, + "time_per_iteration": 2.688929557800293 + }, + { + "auxiliary_loss_clip": 0.01114944, + "auxiliary_loss_mlp": 0.01102708, + "balance_loss_clip": 1.00187325, + "balance_loss_mlp": 1.00047398, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 2.0773802652826063, + "language_loss": 0.67053008, + "learning_rate": 5.587197032798461e-07, + "loss": 0.69270664, + "num_input_tokens_seen": 273870750, + "step": 12698, + "time_per_iteration": 2.6625187397003174 + }, + { + "auxiliary_loss_clip": 0.01150014, + "auxiliary_loss_mlp": 0.01102543, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.00049925, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 2.215623042195512, + "language_loss": 0.72000504, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74253058, + "num_input_tokens_seen": 273890890, + "step": 12699, + "time_per_iteration": 2.625603437423706 + }, + { + "auxiliary_loss_clip": 0.01134645, + "auxiliary_loss_mlp": 0.01101897, + "balance_loss_clip": 1.00180888, + "balance_loss_mlp": 1.00052142, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.6388362529138676, + "language_loss": 0.73075306, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75311852, + "num_input_tokens_seen": 273914015, + "step": 12700, + "time_per_iteration": 4.126269817352295 + }, + { + "auxiliary_loss_clip": 0.01164588, + "auxiliary_loss_mlp": 0.01102698, + "balance_loss_clip": 1.00187194, + "balance_loss_mlp": 1.00046456, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 2.0584157338568425, + "language_loss": 0.69378948, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71646237, + "num_input_tokens_seen": 273927415, + "step": 12701, + "time_per_iteration": 2.511171579360962 + }, + { + "auxiliary_loss_clip": 0.01131591, + "auxiliary_loss_mlp": 0.01102344, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.00049186, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 1.6476284964046914, + "language_loss": 0.64264762, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66498697, + "num_input_tokens_seen": 273946690, + "step": 12702, + "time_per_iteration": 2.679450035095215 + }, + { + "auxiliary_loss_clip": 0.01114956, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_clip": 1.00179052, + "balance_loss_mlp": 1.0004648, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 1.915085470809997, + "language_loss": 0.65758634, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67976773, + "num_input_tokens_seen": 273966870, + "step": 12703, + "time_per_iteration": 2.701688528060913 + }, + { + "auxiliary_loss_clip": 0.01133733, + "auxiliary_loss_mlp": 0.01101564, + "balance_loss_clip": 1.0019896, + "balance_loss_mlp": 1.00037909, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 2.4526723264454446, + "language_loss": 0.83964455, + "learning_rate": 5.571005829916668e-07, + "loss": 0.86199749, + "num_input_tokens_seen": 273986360, + "step": 12704, + "time_per_iteration": 2.620074510574341 + }, + { + "auxiliary_loss_clip": 0.01135165, + "auxiliary_loss_mlp": 0.01102211, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00064468, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.7354441124423954, + "language_loss": 0.67927253, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70164627, + "num_input_tokens_seen": 274009745, + "step": 12705, + "time_per_iteration": 2.6912872791290283 + }, + { + "auxiliary_loss_clip": 0.01133015, + "auxiliary_loss_mlp": 0.01102737, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00059879, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.784436020726662, + "language_loss": 0.73789048, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76024795, + "num_input_tokens_seen": 274028775, + "step": 12706, + "time_per_iteration": 2.6803979873657227 + }, + { + "auxiliary_loss_clip": 0.01150109, + "auxiliary_loss_mlp": 0.01103082, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00046611, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.8747486319437814, + "language_loss": 0.7818985, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80443043, + "num_input_tokens_seen": 274047520, + "step": 12707, + "time_per_iteration": 2.5402581691741943 + }, + { + "auxiliary_loss_clip": 0.01133066, + "auxiliary_loss_mlp": 0.01103125, + "balance_loss_clip": 1.00177848, + "balance_loss_mlp": 1.00041413, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 1.8457580900256387, + "language_loss": 0.79857385, + "learning_rate": 5.560222636275751e-07, + "loss": 0.82093573, + "num_input_tokens_seen": 274065350, + "step": 12708, + "time_per_iteration": 2.6199498176574707 + }, + { + "auxiliary_loss_clip": 0.01142124, + "auxiliary_loss_mlp": 0.01076669, + "balance_loss_clip": 1.00089312, + "balance_loss_mlp": 0.9999935, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.813213748750682, + "language_loss": 0.56435776, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58654571, + "num_input_tokens_seen": 274122315, + "step": 12709, + "time_per_iteration": 3.1747820377349854 + }, + { + "auxiliary_loss_clip": 0.01149865, + "auxiliary_loss_mlp": 0.01103001, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.00057602, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.8007811267936003, + "language_loss": 0.63533628, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65786493, + "num_input_tokens_seen": 274140555, + "step": 12710, + "time_per_iteration": 2.534857988357544 + }, + { + "auxiliary_loss_clip": 0.01099638, + "auxiliary_loss_mlp": 0.00747411, + "balance_loss_clip": 1.00176525, + "balance_loss_mlp": 1.00049305, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 2.500531584152544, + "language_loss": 0.65420526, + "learning_rate": 5.552140990044154e-07, + "loss": 0.67267579, + "num_input_tokens_seen": 274161125, + "step": 12711, + "time_per_iteration": 2.7176144123077393 + }, + { + "auxiliary_loss_clip": 0.01131154, + "auxiliary_loss_mlp": 0.01102595, + "balance_loss_clip": 1.00179327, + "balance_loss_mlp": 1.00055194, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.8199671701226345, + "language_loss": 0.73101795, + "learning_rate": 5.549448203559293e-07, + "loss": 0.75335544, + "num_input_tokens_seen": 274180835, + "step": 12712, + "time_per_iteration": 2.683156728744507 + }, + { + "auxiliary_loss_clip": 0.01114551, + "auxiliary_loss_mlp": 0.01103495, + "balance_loss_clip": 1.00174296, + "balance_loss_mlp": 1.00049806, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.551984854437669, + "language_loss": 0.80220276, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82438326, + "num_input_tokens_seen": 274201190, + "step": 12713, + "time_per_iteration": 2.687976121902466 + }, + { + "auxiliary_loss_clip": 0.01149529, + "auxiliary_loss_mlp": 0.00747411, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00045943, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.098433445487721, + "language_loss": 0.83572125, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85469067, + "num_input_tokens_seen": 274217595, + "step": 12714, + "time_per_iteration": 2.569237470626831 + }, + { + "auxiliary_loss_clip": 0.01149931, + "auxiliary_loss_mlp": 0.01103295, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00048947, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.6292411915752019, + "language_loss": 0.72651899, + "learning_rate": 5.541373132311287e-07, + "loss": 0.74905121, + "num_input_tokens_seen": 274237885, + "step": 12715, + "time_per_iteration": 2.585205316543579 + }, + { + "auxiliary_loss_clip": 0.0111661, + "auxiliary_loss_mlp": 0.01102371, + "balance_loss_clip": 1.00159085, + "balance_loss_mlp": 1.00042272, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 1.7652863673323775, + "language_loss": 0.62849486, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65068465, + "num_input_tokens_seen": 274258820, + "step": 12716, + "time_per_iteration": 2.6951446533203125 + }, + { + "auxiliary_loss_clip": 0.01164783, + "auxiliary_loss_mlp": 0.01104274, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.0006094, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 2.2849596729726462, + "language_loss": 0.79950356, + "learning_rate": 5.535992492672068e-07, + "loss": 0.82219422, + "num_input_tokens_seen": 274278835, + "step": 12717, + "time_per_iteration": 2.6303017139434814 + }, + { + "auxiliary_loss_clip": 0.01164506, + "auxiliary_loss_mlp": 0.01101968, + "balance_loss_clip": 1.00199127, + "balance_loss_mlp": 1.00059295, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.2026635665193535, + "language_loss": 0.66539836, + "learning_rate": 5.53330299551638e-07, + "loss": 0.68806314, + "num_input_tokens_seen": 274297110, + "step": 12718, + "time_per_iteration": 2.583843469619751 + }, + { + "auxiliary_loss_clip": 0.01115798, + "auxiliary_loss_mlp": 0.01102132, + "balance_loss_clip": 1.00173187, + "balance_loss_mlp": 1.00056624, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 6.69083817138974, + "language_loss": 0.77281421, + "learning_rate": 5.530614046939286e-07, + "loss": 0.79499352, + "num_input_tokens_seen": 274315610, + "step": 12719, + "time_per_iteration": 2.681286334991455 + }, + { + "auxiliary_loss_clip": 0.01164633, + "auxiliary_loss_mlp": 0.01102682, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00044835, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 3.1238386146505603, + "language_loss": 0.70247126, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72514445, + "num_input_tokens_seen": 274333975, + "step": 12720, + "time_per_iteration": 3.965759754180908 + }, + { + "auxiliary_loss_clip": 0.01114714, + "auxiliary_loss_mlp": 0.01102798, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.00056422, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.7286861162049867, + "language_loss": 0.73931146, + "learning_rate": 5.52523779592875e-07, + "loss": 0.76148653, + "num_input_tokens_seen": 274353695, + "step": 12721, + "time_per_iteration": 2.6657159328460693 + }, + { + "auxiliary_loss_clip": 0.01116679, + "auxiliary_loss_mlp": 0.01102984, + "balance_loss_clip": 1.00168395, + "balance_loss_mlp": 1.00055957, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.8086655319752694, + "language_loss": 0.73745632, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75965297, + "num_input_tokens_seen": 274371120, + "step": 12722, + "time_per_iteration": 4.13118577003479 + }, + { + "auxiliary_loss_clip": 0.01149782, + "auxiliary_loss_mlp": 0.01102884, + "balance_loss_clip": 1.00191772, + "balance_loss_mlp": 1.00055432, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 2.0196385710569844, + "language_loss": 0.74142277, + "learning_rate": 5.519863740455912e-07, + "loss": 0.76394945, + "num_input_tokens_seen": 274389665, + "step": 12723, + "time_per_iteration": 2.603097915649414 + }, + { + "auxiliary_loss_clip": 0.01164684, + "auxiliary_loss_mlp": 0.01102946, + "balance_loss_clip": 1.0018096, + "balance_loss_mlp": 1.00042593, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.809389173395872, + "language_loss": 0.7286545, + "learning_rate": 5.517177536300881e-07, + "loss": 0.75133079, + "num_input_tokens_seen": 274408750, + "step": 12724, + "time_per_iteration": 2.5701868534088135 + }, + { + "auxiliary_loss_clip": 0.01147791, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00042224, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.9526805348394212, + "language_loss": 0.83950424, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86200202, + "num_input_tokens_seen": 274424600, + "step": 12725, + "time_per_iteration": 2.539403200149536 + }, + { + "auxiliary_loss_clip": 0.01116676, + "auxiliary_loss_mlp": 0.01102875, + "balance_loss_clip": 1.00178444, + "balance_loss_mlp": 1.0004499, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.8106362000695881, + "language_loss": 0.77571386, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79790938, + "num_input_tokens_seen": 274443075, + "step": 12726, + "time_per_iteration": 2.6979081630706787 + }, + { + "auxiliary_loss_clip": 0.01147342, + "auxiliary_loss_mlp": 0.01102473, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00033486, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.6527683049508648, + "language_loss": 0.7043426, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72684073, + "num_input_tokens_seen": 274463240, + "step": 12727, + "time_per_iteration": 2.6751391887664795 + }, + { + "auxiliary_loss_clip": 0.01164485, + "auxiliary_loss_mlp": 0.01101274, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00047052, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 2.3709551635869204, + "language_loss": 0.79523122, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81788874, + "num_input_tokens_seen": 274482750, + "step": 12728, + "time_per_iteration": 2.5693960189819336 + }, + { + "auxiliary_loss_clip": 0.01164719, + "auxiliary_loss_mlp": 0.01103185, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00037944, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 1.7398664781180626, + "language_loss": 0.55395621, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57663524, + "num_input_tokens_seen": 274503545, + "step": 12729, + "time_per_iteration": 2.574751138687134 + }, + { + "auxiliary_loss_clip": 0.01131517, + "auxiliary_loss_mlp": 0.00747325, + "balance_loss_clip": 1.0017035, + "balance_loss_mlp": 1.00047743, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 1.8795527965647747, + "language_loss": 0.77973133, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79851973, + "num_input_tokens_seen": 274523825, + "step": 12730, + "time_per_iteration": 2.627380847930908 + }, + { + "auxiliary_loss_clip": 0.01148127, + "auxiliary_loss_mlp": 0.01103678, + "balance_loss_clip": 1.00199521, + "balance_loss_mlp": 1.00058615, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 1.9567385257542937, + "language_loss": 0.69373012, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71624827, + "num_input_tokens_seen": 274541625, + "step": 12731, + "time_per_iteration": 2.5445284843444824 + }, + { + "auxiliary_loss_clip": 0.01164639, + "auxiliary_loss_mlp": 0.01102757, + "balance_loss_clip": 1.00195622, + "balance_loss_mlp": 1.00052321, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.2581087769055417, + "language_loss": 0.70255148, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72522539, + "num_input_tokens_seen": 274557580, + "step": 12732, + "time_per_iteration": 2.4913878440856934 + }, + { + "auxiliary_loss_clip": 0.01132802, + "auxiliary_loss_mlp": 0.01103051, + "balance_loss_clip": 1.00186217, + "balance_loss_mlp": 1.00053072, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.889669086212629, + "language_loss": 0.78234804, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80470651, + "num_input_tokens_seen": 274578135, + "step": 12733, + "time_per_iteration": 4.109253883361816 + }, + { + "auxiliary_loss_clip": 0.01149624, + "auxiliary_loss_mlp": 0.01102561, + "balance_loss_clip": 1.00190437, + "balance_loss_mlp": 1.00056601, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 3.2616553851838397, + "language_loss": 0.77485681, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79737866, + "num_input_tokens_seen": 274595655, + "step": 12734, + "time_per_iteration": 2.567307710647583 + }, + { + "auxiliary_loss_clip": 0.01133456, + "auxiliary_loss_mlp": 0.01103446, + "balance_loss_clip": 1.00205445, + "balance_loss_mlp": 1.00044918, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.8731148781921692, + "language_loss": 0.73478508, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75715411, + "num_input_tokens_seen": 274616305, + "step": 12735, + "time_per_iteration": 2.72116756439209 + }, + { + "auxiliary_loss_clip": 0.01132536, + "auxiliary_loss_mlp": 0.01102353, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00050092, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.6275499459180414, + "language_loss": 0.72906333, + "learning_rate": 5.484985952378145e-07, + "loss": 0.75141227, + "num_input_tokens_seen": 274638110, + "step": 12736, + "time_per_iteration": 2.711294651031494 + }, + { + "auxiliary_loss_clip": 0.0114832, + "auxiliary_loss_mlp": 0.00747522, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00047588, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 1.9455311330342415, + "language_loss": 0.77294523, + "learning_rate": 5.482306895631728e-07, + "loss": 0.79190367, + "num_input_tokens_seen": 274656565, + "step": 12737, + "time_per_iteration": 2.542881965637207 + }, + { + "auxiliary_loss_clip": 0.01133305, + "auxiliary_loss_mlp": 0.0110229, + "balance_loss_clip": 1.0016948, + "balance_loss_mlp": 1.00053346, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 2.062803693828778, + "language_loss": 0.76638854, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78874457, + "num_input_tokens_seen": 274674215, + "step": 12738, + "time_per_iteration": 4.080326080322266 + }, + { + "auxiliary_loss_clip": 0.01133762, + "auxiliary_loss_mlp": 0.01103271, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.00041771, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.9277226405963128, + "language_loss": 0.62801534, + "learning_rate": 5.476950433777603e-07, + "loss": 0.65038568, + "num_input_tokens_seen": 274693445, + "step": 12739, + "time_per_iteration": 2.676084280014038 + }, + { + "auxiliary_loss_clip": 0.01164528, + "auxiliary_loss_mlp": 0.011028, + "balance_loss_clip": 1.00190091, + "balance_loss_mlp": 1.00047076, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 2.0945843772219903, + "language_loss": 0.78928924, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81196249, + "num_input_tokens_seen": 274712815, + "step": 12740, + "time_per_iteration": 2.508975028991699 + }, + { + "auxiliary_loss_clip": 0.01149353, + "auxiliary_loss_mlp": 0.01102903, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.00047874, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 2.0968971183999283, + "language_loss": 0.65656799, + "learning_rate": 5.471596174785429e-07, + "loss": 0.6790905, + "num_input_tokens_seen": 274732690, + "step": 12741, + "time_per_iteration": 2.6066064834594727 + }, + { + "auxiliary_loss_clip": 0.01133083, + "auxiliary_loss_mlp": 0.01102995, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00037944, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.6160421773522518, + "language_loss": 0.76086456, + "learning_rate": 5.468919871616386e-07, + "loss": 0.78322536, + "num_input_tokens_seen": 274752460, + "step": 12742, + "time_per_iteration": 2.6275432109832764 + }, + { + "auxiliary_loss_clip": 0.01130456, + "auxiliary_loss_mlp": 0.01101861, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00048566, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.375688520261622, + "language_loss": 0.76460612, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78692925, + "num_input_tokens_seen": 274773070, + "step": 12743, + "time_per_iteration": 2.6728286743164062 + }, + { + "auxiliary_loss_clip": 0.011334, + "auxiliary_loss_mlp": 0.01102053, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00043905, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 2.0400923549289747, + "language_loss": 0.74696416, + "learning_rate": 5.463568918439805e-07, + "loss": 0.7693187, + "num_input_tokens_seen": 274790220, + "step": 12744, + "time_per_iteration": 2.6164681911468506 + }, + { + "auxiliary_loss_clip": 0.0115007, + "auxiliary_loss_mlp": 0.01103029, + "balance_loss_clip": 1.00191188, + "balance_loss_mlp": 1.00050855, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.3009412103532485, + "language_loss": 0.71507978, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73761076, + "num_input_tokens_seen": 274805095, + "step": 12745, + "time_per_iteration": 2.5975959300994873 + }, + { + "auxiliary_loss_clip": 0.01149614, + "auxiliary_loss_mlp": 0.01102666, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.00043178, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.3631860243582317, + "language_loss": 0.77071095, + "learning_rate": 5.458220170154896e-07, + "loss": 0.79323375, + "num_input_tokens_seen": 274821800, + "step": 12746, + "time_per_iteration": 2.5510365962982178 + }, + { + "auxiliary_loss_clip": 0.01111722, + "auxiliary_loss_mlp": 0.01076283, + "balance_loss_clip": 1.00080776, + "balance_loss_mlp": 0.99998915, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6678644352580538, + "language_loss": 0.56795627, + "learning_rate": 5.455546623100362e-07, + "loss": 0.5898363, + "num_input_tokens_seen": 274886970, + "step": 12747, + "time_per_iteration": 3.244110107421875 + }, + { + "auxiliary_loss_clip": 0.01164551, + "auxiliary_loss_mlp": 0.01101707, + "balance_loss_clip": 1.00189507, + "balance_loss_mlp": 1.00052261, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.5496332501225354, + "language_loss": 0.72440839, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74707103, + "num_input_tokens_seen": 274907240, + "step": 12748, + "time_per_iteration": 2.5799524784088135 + }, + { + "auxiliary_loss_clip": 0.01118253, + "auxiliary_loss_mlp": 0.01102049, + "balance_loss_clip": 1.00171971, + "balance_loss_mlp": 1.00048244, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 1.8091200282893347, + "language_loss": 0.69238818, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71459115, + "num_input_tokens_seen": 274924650, + "step": 12749, + "time_per_iteration": 2.632452964782715 + }, + { + "auxiliary_loss_clip": 0.0114792, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_clip": 1.00175989, + "balance_loss_mlp": 1.0004015, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.5641705163915873, + "language_loss": 0.73549342, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75799894, + "num_input_tokens_seen": 274944550, + "step": 12750, + "time_per_iteration": 2.6223812103271484 + }, + { + "auxiliary_loss_clip": 0.01147452, + "auxiliary_loss_mlp": 0.01101537, + "balance_loss_clip": 1.00174582, + "balance_loss_mlp": 1.00054312, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 2.315927127223714, + "language_loss": 0.75780278, + "learning_rate": 5.444857951167026e-07, + "loss": 0.78029275, + "num_input_tokens_seen": 274961330, + "step": 12751, + "time_per_iteration": 2.624615430831909 + }, + { + "auxiliary_loss_clip": 0.0111778, + "auxiliary_loss_mlp": 0.01102412, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00055981, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 1.6694468015172246, + "language_loss": 0.61061049, + "learning_rate": 5.442187162761537e-07, + "loss": 0.63281244, + "num_input_tokens_seen": 274981655, + "step": 12752, + "time_per_iteration": 2.7088799476623535 + }, + { + "auxiliary_loss_clip": 0.01147912, + "auxiliary_loss_mlp": 0.01103555, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00055838, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 1.8954743801306733, + "language_loss": 0.69198835, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71450305, + "num_input_tokens_seen": 274999970, + "step": 12753, + "time_per_iteration": 2.608823537826538 + }, + { + "auxiliary_loss_clip": 0.01149973, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00054884, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 2.1306012176582954, + "language_loss": 0.62212658, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64465034, + "num_input_tokens_seen": 275015805, + "step": 12754, + "time_per_iteration": 2.573513984680176 + }, + { + "auxiliary_loss_clip": 0.01164587, + "auxiliary_loss_mlp": 0.01102302, + "balance_loss_clip": 1.00205994, + "balance_loss_mlp": 1.00044966, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 4.003774585071999, + "language_loss": 0.79140782, + "learning_rate": 5.434178110152401e-07, + "loss": 0.81407666, + "num_input_tokens_seen": 275031810, + "step": 12755, + "time_per_iteration": 2.5565340518951416 + }, + { + "auxiliary_loss_clip": 0.01164475, + "auxiliary_loss_mlp": 0.01102281, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00052381, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 1.821575089342018, + "language_loss": 0.70085627, + "learning_rate": 5.431509530489242e-07, + "loss": 0.72352386, + "num_input_tokens_seen": 275049325, + "step": 12756, + "time_per_iteration": 2.525536060333252 + }, + { + "auxiliary_loss_clip": 0.01147855, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_clip": 1.00182498, + "balance_loss_mlp": 1.00067592, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 1.5692603385907022, + "language_loss": 0.70043212, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72293693, + "num_input_tokens_seen": 275070865, + "step": 12757, + "time_per_iteration": 3.9978530406951904 + }, + { + "auxiliary_loss_clip": 0.0113096, + "auxiliary_loss_mlp": 0.01101998, + "balance_loss_clip": 1.00179994, + "balance_loss_mlp": 1.00062215, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 2.255436695998613, + "language_loss": 0.75938094, + "learning_rate": 5.426174028579955e-07, + "loss": 0.78171057, + "num_input_tokens_seen": 275088015, + "step": 12758, + "time_per_iteration": 2.6267201900482178 + }, + { + "auxiliary_loss_clip": 0.01149305, + "auxiliary_loss_mlp": 0.01102196, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00053453, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.6457887700351497, + "language_loss": 0.7613312, + "learning_rate": 5.423507106536156e-07, + "loss": 0.78384626, + "num_input_tokens_seen": 275106975, + "step": 12759, + "time_per_iteration": 2.5695650577545166 + }, + { + "auxiliary_loss_clip": 0.01133177, + "auxiliary_loss_mlp": 0.01101623, + "balance_loss_clip": 1.00169158, + "balance_loss_mlp": 1.00043845, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 1.9015137721458402, + "language_loss": 0.68500698, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70735502, + "num_input_tokens_seen": 275129560, + "step": 12760, + "time_per_iteration": 2.733236789703369 + }, + { + "auxiliary_loss_clip": 0.01132984, + "auxiliary_loss_mlp": 0.01102475, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00052774, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.38089673819357, + "language_loss": 0.79222775, + "learning_rate": 5.418174920775871e-07, + "loss": 0.81458235, + "num_input_tokens_seen": 275151180, + "step": 12761, + "time_per_iteration": 4.067000865936279 + }, + { + "auxiliary_loss_clip": 0.01133088, + "auxiliary_loss_mlp": 0.01101377, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00047851, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 2.0136188721621426, + "language_loss": 0.66190994, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68425453, + "num_input_tokens_seen": 275170605, + "step": 12762, + "time_per_iteration": 2.6017343997955322 + }, + { + "auxiliary_loss_clip": 0.0114773, + "auxiliary_loss_mlp": 0.01102404, + "balance_loss_clip": 1.00174975, + "balance_loss_mlp": 1.00045609, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 11.072953947768198, + "language_loss": 0.73908079, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76158214, + "num_input_tokens_seen": 275188750, + "step": 12763, + "time_per_iteration": 2.559990644454956 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01102612, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00047302, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.4584336802472264, + "language_loss": 0.70619076, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72852737, + "num_input_tokens_seen": 275211365, + "step": 12764, + "time_per_iteration": 2.7494091987609863 + }, + { + "auxiliary_loss_clip": 0.01149069, + "auxiliary_loss_mlp": 0.01101057, + "balance_loss_clip": 1.00187123, + "balance_loss_mlp": 1.00034928, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 2.4964284863792896, + "language_loss": 0.69451857, + "learning_rate": 5.40751718539491e-07, + "loss": 0.7170198, + "num_input_tokens_seen": 275231670, + "step": 12765, + "time_per_iteration": 2.653140068054199 + }, + { + "auxiliary_loss_clip": 0.01133805, + "auxiliary_loss_mlp": 0.01101198, + "balance_loss_clip": 1.00185025, + "balance_loss_mlp": 1.00039458, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 1.8389122992461886, + "language_loss": 0.6095612, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63191122, + "num_input_tokens_seen": 275249425, + "step": 12766, + "time_per_iteration": 2.5725934505462646 + }, + { + "auxiliary_loss_clip": 0.0111231, + "auxiliary_loss_mlp": 0.01077046, + "balance_loss_clip": 1.00157666, + "balance_loss_mlp": 0.99998933, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7688092270838777, + "language_loss": 0.60811174, + "learning_rate": 5.402191637390803e-07, + "loss": 0.63000524, + "num_input_tokens_seen": 275312485, + "step": 12767, + "time_per_iteration": 3.3292877674102783 + }, + { + "auxiliary_loss_clip": 0.01131433, + "auxiliary_loss_mlp": 0.0110123, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00042629, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.7944441660011534, + "language_loss": 0.69812226, + "learning_rate": 5.399529693663801e-07, + "loss": 0.72044885, + "num_input_tokens_seen": 275331680, + "step": 12768, + "time_per_iteration": 2.5974652767181396 + }, + { + "auxiliary_loss_clip": 0.0115, + "auxiliary_loss_mlp": 0.01103887, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00050879, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.639470875316276, + "language_loss": 0.7060712, + "learning_rate": 5.3968683035881e-07, + "loss": 0.72861004, + "num_input_tokens_seen": 275351615, + "step": 12769, + "time_per_iteration": 2.592782974243164 + }, + { + "auxiliary_loss_clip": 0.01147891, + "auxiliary_loss_mlp": 0.01102445, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00040221, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 1.813890758354462, + "language_loss": 0.80300587, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82550925, + "num_input_tokens_seen": 275368815, + "step": 12770, + "time_per_iteration": 2.577693223953247 + }, + { + "auxiliary_loss_clip": 0.01117823, + "auxiliary_loss_mlp": 0.01102244, + "balance_loss_clip": 1.00185442, + "balance_loss_mlp": 1.00048685, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.8211787103177421, + "language_loss": 0.7874974, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80969805, + "num_input_tokens_seen": 275389345, + "step": 12771, + "time_per_iteration": 2.780492067337036 + }, + { + "auxiliary_loss_clip": 0.01164552, + "auxiliary_loss_mlp": 0.0110153, + "balance_loss_clip": 1.00181198, + "balance_loss_mlp": 1.00044048, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.385992534887705, + "language_loss": 0.68193126, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70459205, + "num_input_tokens_seen": 275411240, + "step": 12772, + "time_per_iteration": 4.0595574378967285 + }, + { + "auxiliary_loss_clip": 0.01149601, + "auxiliary_loss_mlp": 0.01101678, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00039816, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.5943787385190757, + "language_loss": 0.73110408, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75361681, + "num_input_tokens_seen": 275432010, + "step": 12773, + "time_per_iteration": 2.635385274887085 + }, + { + "auxiliary_loss_clip": 0.01118331, + "auxiliary_loss_mlp": 0.01100956, + "balance_loss_clip": 1.0018127, + "balance_loss_mlp": 1.00039148, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.9181188153611515, + "language_loss": 0.81158412, + "learning_rate": 5.383569661510512e-07, + "loss": 0.83377695, + "num_input_tokens_seen": 275453710, + "step": 12774, + "time_per_iteration": 2.7103970050811768 + }, + { + "auxiliary_loss_clip": 0.01149782, + "auxiliary_loss_mlp": 0.00747233, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00041294, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.7322390162539028, + "language_loss": 0.7007882, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71975839, + "num_input_tokens_seen": 275472915, + "step": 12775, + "time_per_iteration": 4.022375106811523 + }, + { + "auxiliary_loss_clip": 0.01094082, + "auxiliary_loss_mlp": 0.01076589, + "balance_loss_clip": 1.00064182, + "balance_loss_mlp": 0.99991381, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.7109495760794522, + "language_loss": 0.56856847, + "learning_rate": 5.378254083769147e-07, + "loss": 0.59027517, + "num_input_tokens_seen": 275534785, + "step": 12776, + "time_per_iteration": 3.317384719848633 + }, + { + "auxiliary_loss_clip": 0.01149803, + "auxiliary_loss_mlp": 0.01102629, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00068092, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.9194389396648668, + "language_loss": 0.74001151, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76253587, + "num_input_tokens_seen": 275553205, + "step": 12777, + "time_per_iteration": 2.631317377090454 + }, + { + "auxiliary_loss_clip": 0.01118011, + "auxiliary_loss_mlp": 0.01101784, + "balance_loss_clip": 1.00180197, + "balance_loss_mlp": 1.00045598, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.5021249007880852, + "language_loss": 0.70300752, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72520548, + "num_input_tokens_seen": 275571490, + "step": 12778, + "time_per_iteration": 2.7051968574523926 + }, + { + "auxiliary_loss_clip": 0.01147787, + "auxiliary_loss_mlp": 0.01101988, + "balance_loss_clip": 1.00190198, + "balance_loss_mlp": 1.00051665, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 3.3622500075740698, + "language_loss": 0.70158041, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72407818, + "num_input_tokens_seen": 275589665, + "step": 12779, + "time_per_iteration": 2.607337236404419 + }, + { + "auxiliary_loss_clip": 0.01133469, + "auxiliary_loss_mlp": 0.01102385, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.0004369, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.581392135519769, + "language_loss": 0.58747482, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60983336, + "num_input_tokens_seen": 275615605, + "step": 12780, + "time_per_iteration": 3.07169508934021 + }, + { + "auxiliary_loss_clip": 0.01147975, + "auxiliary_loss_mlp": 0.01103937, + "balance_loss_clip": 1.00181353, + "balance_loss_mlp": 1.00055909, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 2.0354205824930083, + "language_loss": 0.68050176, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70302093, + "num_input_tokens_seen": 275634965, + "step": 12781, + "time_per_iteration": 2.6858582496643066 + }, + { + "auxiliary_loss_clip": 0.01101107, + "auxiliary_loss_mlp": 0.01102927, + "balance_loss_clip": 1.00179946, + "balance_loss_mlp": 1.00050211, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.5596666134400958, + "language_loss": 0.79580414, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81784451, + "num_input_tokens_seen": 275655785, + "step": 12782, + "time_per_iteration": 2.8511946201324463 + }, + { + "auxiliary_loss_clip": 0.01132581, + "auxiliary_loss_mlp": 0.01102225, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.0004679, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 1.6802947245819015, + "language_loss": 0.66366172, + "learning_rate": 5.35966703239153e-07, + "loss": 0.68600971, + "num_input_tokens_seen": 275676160, + "step": 12783, + "time_per_iteration": 2.680408239364624 + }, + { + "auxiliary_loss_clip": 0.0113261, + "auxiliary_loss_mlp": 0.01101942, + "balance_loss_clip": 1.00182414, + "balance_loss_mlp": 1.0006144, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 1.9096097812905823, + "language_loss": 0.68827015, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71061569, + "num_input_tokens_seen": 275695660, + "step": 12784, + "time_per_iteration": 2.634589910507202 + }, + { + "auxiliary_loss_clip": 0.01100046, + "auxiliary_loss_mlp": 0.01101222, + "balance_loss_clip": 1.00167167, + "balance_loss_mlp": 1.00041842, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 2.405785195444207, + "language_loss": 0.80018562, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82219827, + "num_input_tokens_seen": 275714025, + "step": 12785, + "time_per_iteration": 2.6971516609191895 + }, + { + "auxiliary_loss_clip": 0.01149694, + "auxiliary_loss_mlp": 0.01103003, + "balance_loss_clip": 1.0019294, + "balance_loss_mlp": 1.00048316, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.8110781241079725, + "language_loss": 0.77546322, + "learning_rate": 5.351709478659836e-07, + "loss": 0.7979902, + "num_input_tokens_seen": 275737300, + "step": 12786, + "time_per_iteration": 2.806792736053467 + }, + { + "auxiliary_loss_clip": 0.01164447, + "auxiliary_loss_mlp": 0.01101999, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00043249, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 1.840362965786567, + "language_loss": 0.58745146, + "learning_rate": 5.349058071544468e-07, + "loss": 0.61011589, + "num_input_tokens_seen": 275757895, + "step": 12787, + "time_per_iteration": 2.624379873275757 + }, + { + "auxiliary_loss_clip": 0.01133022, + "auxiliary_loss_mlp": 0.01101905, + "balance_loss_clip": 1.00177872, + "balance_loss_mlp": 1.00043416, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.6030953373149628, + "language_loss": 0.75847125, + "learning_rate": 5.346407219994292e-07, + "loss": 0.78082049, + "num_input_tokens_seen": 275776745, + "step": 12788, + "time_per_iteration": 2.709102153778076 + }, + { + "auxiliary_loss_clip": 0.01099309, + "auxiliary_loss_mlp": 0.00747383, + "balance_loss_clip": 1.00177169, + "balance_loss_mlp": 1.00048864, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.6670882138471477, + "language_loss": 0.66962337, + "learning_rate": 5.343756924109821e-07, + "loss": 0.68809032, + "num_input_tokens_seen": 275797205, + "step": 12789, + "time_per_iteration": 2.7067415714263916 + }, + { + "auxiliary_loss_clip": 0.01133184, + "auxiliary_loss_mlp": 0.01103299, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00058877, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 2.320528725451361, + "language_loss": 0.68781531, + "learning_rate": 5.341107183991553e-07, + "loss": 0.7101801, + "num_input_tokens_seen": 275817935, + "step": 12790, + "time_per_iteration": 2.7290520668029785 + }, + { + "auxiliary_loss_clip": 0.01130887, + "auxiliary_loss_mlp": 0.01101631, + "balance_loss_clip": 1.00172138, + "balance_loss_mlp": 1.00054145, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.8335781988380981, + "language_loss": 0.68474585, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70707101, + "num_input_tokens_seen": 275837145, + "step": 12791, + "time_per_iteration": 2.6033904552459717 + }, + { + "auxiliary_loss_clip": 0.01147284, + "auxiliary_loss_mlp": 0.01101726, + "balance_loss_clip": 1.00186884, + "balance_loss_mlp": 1.00054145, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.7417525986587157, + "language_loss": 0.79571402, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81820416, + "num_input_tokens_seen": 275855705, + "step": 12792, + "time_per_iteration": 2.5694563388824463 + }, + { + "auxiliary_loss_clip": 0.01114928, + "auxiliary_loss_mlp": 0.00747278, + "balance_loss_clip": 1.00179899, + "balance_loss_mlp": 1.00035608, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.976932263589341, + "language_loss": 0.730896, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74951816, + "num_input_tokens_seen": 275873930, + "step": 12793, + "time_per_iteration": 2.6686105728149414 + }, + { + "auxiliary_loss_clip": 0.01117084, + "auxiliary_loss_mlp": 0.01103332, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00052547, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.717431653095792, + "language_loss": 0.6380713, + "learning_rate": 5.330513783189803e-07, + "loss": 0.66027546, + "num_input_tokens_seen": 275895895, + "step": 12794, + "time_per_iteration": 2.832306146621704 + }, + { + "auxiliary_loss_clip": 0.01135426, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00056243, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.6521656741401818, + "language_loss": 0.76799339, + "learning_rate": 5.327866823409319e-07, + "loss": 0.79037178, + "num_input_tokens_seen": 275917825, + "step": 12795, + "time_per_iteration": 4.062920808792114 + }, + { + "auxiliary_loss_clip": 0.01114635, + "auxiliary_loss_mlp": 0.01104013, + "balance_loss_clip": 1.00176132, + "balance_loss_mlp": 1.00053978, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.499374458787249, + "language_loss": 0.71753544, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73972189, + "num_input_tokens_seen": 275937890, + "step": 12796, + "time_per_iteration": 2.7079153060913086 + }, + { + "auxiliary_loss_clip": 0.01164595, + "auxiliary_loss_mlp": 0.01102586, + "balance_loss_clip": 1.00194561, + "balance_loss_mlp": 1.00044775, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 2.5184006244831956, + "language_loss": 0.65140259, + "learning_rate": 5.32257457305499e-07, + "loss": 0.67407441, + "num_input_tokens_seen": 275954495, + "step": 12797, + "time_per_iteration": 2.5410966873168945 + }, + { + "auxiliary_loss_clip": 0.01116444, + "auxiliary_loss_mlp": 0.01102562, + "balance_loss_clip": 1.00188375, + "balance_loss_mlp": 1.00061405, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 2.264753403020208, + "language_loss": 0.91696191, + "learning_rate": 5.319929282681823e-07, + "loss": 0.939152, + "num_input_tokens_seen": 275972395, + "step": 12798, + "time_per_iteration": 4.065299987792969 + }, + { + "auxiliary_loss_clip": 0.01100788, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.00178242, + "balance_loss_mlp": 1.00033545, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 1.9865205423846342, + "language_loss": 0.82564878, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84768432, + "num_input_tokens_seen": 275989020, + "step": 12799, + "time_per_iteration": 2.6574301719665527 + }, + { + "auxiliary_loss_clip": 0.01085255, + "auxiliary_loss_mlp": 0.01103604, + "balance_loss_clip": 1.00175905, + "balance_loss_mlp": 1.00041699, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.0641650284757214, + "language_loss": 0.78107178, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80296034, + "num_input_tokens_seen": 276006525, + "step": 12800, + "time_per_iteration": 2.716022491455078 + }, + { + "auxiliary_loss_clip": 0.01131561, + "auxiliary_loss_mlp": 0.01103031, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00051081, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.5914529052759208, + "language_loss": 0.83970195, + "learning_rate": 5.31199675198198e-07, + "loss": 0.86204791, + "num_input_tokens_seen": 276027130, + "step": 12801, + "time_per_iteration": 2.641894578933716 + }, + { + "auxiliary_loss_clip": 0.01132818, + "auxiliary_loss_mlp": 0.01101526, + "balance_loss_clip": 1.00174904, + "balance_loss_mlp": 1.00053132, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 1.963972889281149, + "language_loss": 0.71858221, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74092567, + "num_input_tokens_seen": 276045715, + "step": 12802, + "time_per_iteration": 2.6555330753326416 + }, + { + "auxiliary_loss_clip": 0.0113508, + "auxiliary_loss_mlp": 0.01102002, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.0005306, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.7264207069577215, + "language_loss": 0.75322974, + "learning_rate": 5.306711182867747e-07, + "loss": 0.77560055, + "num_input_tokens_seen": 276065375, + "step": 12803, + "time_per_iteration": 2.6685173511505127 + }, + { + "auxiliary_loss_clip": 0.01127931, + "auxiliary_loss_mlp": 0.01076593, + "balance_loss_clip": 1.00082171, + "balance_loss_mlp": 0.99991798, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7418247942774285, + "language_loss": 0.5583055, + "learning_rate": 5.304069234017001e-07, + "loss": 0.58035064, + "num_input_tokens_seen": 276131405, + "step": 12804, + "time_per_iteration": 3.286254644393921 + }, + { + "auxiliary_loss_clip": 0.01127601, + "auxiliary_loss_mlp": 0.01076235, + "balance_loss_clip": 1.00073409, + "balance_loss_mlp": 0.99994069, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.7421180450315036, + "language_loss": 0.53987408, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56191242, + "num_input_tokens_seen": 276200755, + "step": 12805, + "time_per_iteration": 3.3295133113861084 + }, + { + "auxiliary_loss_clip": 0.01115846, + "auxiliary_loss_mlp": 0.01103029, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00050914, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 2.078916202839278, + "language_loss": 0.72857481, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75076354, + "num_input_tokens_seen": 276217880, + "step": 12806, + "time_per_iteration": 2.639233112335205 + }, + { + "auxiliary_loss_clip": 0.01132724, + "auxiliary_loss_mlp": 0.01102167, + "balance_loss_clip": 1.00180805, + "balance_loss_mlp": 1.00050485, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 2.0623279724215977, + "language_loss": 0.74833918, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77068806, + "num_input_tokens_seen": 276234810, + "step": 12807, + "time_per_iteration": 2.703587770462036 + }, + { + "auxiliary_loss_clip": 0.01148106, + "auxiliary_loss_mlp": 0.01103232, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00052166, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 3.329131248262002, + "language_loss": 0.79775029, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82026362, + "num_input_tokens_seen": 276252850, + "step": 12808, + "time_per_iteration": 2.572519063949585 + }, + { + "auxiliary_loss_clip": 0.01148536, + "auxiliary_loss_mlp": 0.01103881, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00050235, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 2.686756451204112, + "language_loss": 0.79208875, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81461287, + "num_input_tokens_seen": 276272525, + "step": 12809, + "time_per_iteration": 2.6428709030151367 + }, + { + "auxiliary_loss_clip": 0.011165, + "auxiliary_loss_mlp": 0.01101235, + "balance_loss_clip": 1.00172043, + "balance_loss_mlp": 1.0004319, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 2.4487615724685607, + "language_loss": 0.7055999, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72777724, + "num_input_tokens_seen": 276294210, + "step": 12810, + "time_per_iteration": 4.0993571281433105 + }, + { + "auxiliary_loss_clip": 0.01132574, + "auxiliary_loss_mlp": 0.01103343, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00044167, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 7.070611194263328, + "language_loss": 0.78384554, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80620474, + "num_input_tokens_seen": 276310290, + "step": 12811, + "time_per_iteration": 2.5746266841888428 + }, + { + "auxiliary_loss_clip": 0.0112857, + "auxiliary_loss_mlp": 0.01076255, + "balance_loss_clip": 1.00082636, + "balance_loss_mlp": 0.9999606, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8078461142300907, + "language_loss": 0.56739289, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58944118, + "num_input_tokens_seen": 276371715, + "step": 12812, + "time_per_iteration": 3.2123467922210693 + }, + { + "auxiliary_loss_clip": 0.01115822, + "auxiliary_loss_mlp": 0.01103256, + "balance_loss_clip": 1.00183523, + "balance_loss_mlp": 1.00064075, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.88802018620595, + "language_loss": 0.72213387, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74432468, + "num_input_tokens_seen": 276389895, + "step": 12813, + "time_per_iteration": 4.150543689727783 + }, + { + "auxiliary_loss_clip": 0.01147969, + "auxiliary_loss_mlp": 0.01103047, + "balance_loss_clip": 1.00180972, + "balance_loss_mlp": 1.0004313, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.7714069112299362, + "language_loss": 0.66170561, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68421578, + "num_input_tokens_seen": 276408990, + "step": 12814, + "time_per_iteration": 2.6259617805480957 + }, + { + "auxiliary_loss_clip": 0.01133295, + "auxiliary_loss_mlp": 0.01101681, + "balance_loss_clip": 1.00171113, + "balance_loss_mlp": 1.00049639, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.9367675164300375, + "language_loss": 0.66008341, + "learning_rate": 5.275044598581018e-07, + "loss": 0.68243313, + "num_input_tokens_seen": 276428190, + "step": 12815, + "time_per_iteration": 2.6417059898376465 + }, + { + "auxiliary_loss_clip": 0.01147831, + "auxiliary_loss_mlp": 0.01102799, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00046945, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 2.050313716356516, + "language_loss": 0.65040803, + "learning_rate": 5.272409343590322e-07, + "loss": 0.67291427, + "num_input_tokens_seen": 276446855, + "step": 12816, + "time_per_iteration": 2.5562119483947754 + }, + { + "auxiliary_loss_clip": 0.01148119, + "auxiliary_loss_mlp": 0.01101901, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00057268, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.356170842766777, + "language_loss": 0.72095382, + "learning_rate": 5.26977464707133e-07, + "loss": 0.7434541, + "num_input_tokens_seen": 276462000, + "step": 12817, + "time_per_iteration": 2.53505277633667 + }, + { + "auxiliary_loss_clip": 0.0109864, + "auxiliary_loss_mlp": 0.01102128, + "balance_loss_clip": 1.00165343, + "balance_loss_mlp": 1.00056207, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 1.8933111754601566, + "language_loss": 0.6119653, + "learning_rate": 5.267140509123957e-07, + "loss": 0.633973, + "num_input_tokens_seen": 276481190, + "step": 12818, + "time_per_iteration": 2.6737825870513916 + }, + { + "auxiliary_loss_clip": 0.01147909, + "auxiliary_loss_mlp": 0.01102223, + "balance_loss_clip": 1.00195789, + "balance_loss_mlp": 1.00046587, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 2.142997774624734, + "language_loss": 0.67552656, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69802785, + "num_input_tokens_seen": 276499520, + "step": 12819, + "time_per_iteration": 2.6118195056915283 + }, + { + "auxiliary_loss_clip": 0.01164636, + "auxiliary_loss_mlp": 0.01102926, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00050163, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 1.8844502401888743, + "language_loss": 0.57578731, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59846294, + "num_input_tokens_seen": 276519110, + "step": 12820, + "time_per_iteration": 2.6096315383911133 + }, + { + "auxiliary_loss_clip": 0.01131403, + "auxiliary_loss_mlp": 0.0110198, + "balance_loss_clip": 1.00173163, + "balance_loss_mlp": 1.00050879, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 1.896525997230924, + "language_loss": 0.80907524, + "learning_rate": 5.259241447710343e-07, + "loss": 0.8314091, + "num_input_tokens_seen": 276538805, + "step": 12821, + "time_per_iteration": 2.6905226707458496 + }, + { + "auxiliary_loss_clip": 0.01164592, + "auxiliary_loss_mlp": 0.01102541, + "balance_loss_clip": 1.00194299, + "balance_loss_mlp": 1.00059283, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.281385240189809, + "language_loss": 0.68616843, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70883977, + "num_input_tokens_seen": 276554770, + "step": 12822, + "time_per_iteration": 2.5485432147979736 + }, + { + "auxiliary_loss_clip": 0.01132937, + "auxiliary_loss_mlp": 0.01102142, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.0005753, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.8912006030540807, + "language_loss": 0.72227311, + "learning_rate": 5.253978201456733e-07, + "loss": 0.7446239, + "num_input_tokens_seen": 276574535, + "step": 12823, + "time_per_iteration": 2.729448080062866 + }, + { + "auxiliary_loss_clip": 0.01148102, + "auxiliary_loss_mlp": 0.01103645, + "balance_loss_clip": 1.00189161, + "balance_loss_mlp": 1.00055313, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 2.035616448513739, + "language_loss": 0.76842451, + "learning_rate": 5.251347417035969e-07, + "loss": 0.79094195, + "num_input_tokens_seen": 276592925, + "step": 12824, + "time_per_iteration": 2.563427209854126 + }, + { + "auxiliary_loss_clip": 0.01133439, + "auxiliary_loss_mlp": 0.01102598, + "balance_loss_clip": 1.00175691, + "balance_loss_mlp": 1.00036442, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 2.2540270326792755, + "language_loss": 0.72124505, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74360538, + "num_input_tokens_seen": 276610540, + "step": 12825, + "time_per_iteration": 2.6129183769226074 + }, + { + "auxiliary_loss_clip": 0.01164381, + "auxiliary_loss_mlp": 0.0110078, + "balance_loss_clip": 1.00198579, + "balance_loss_mlp": 1.00064445, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.4634720012606768, + "language_loss": 0.73577011, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75842172, + "num_input_tokens_seen": 276629200, + "step": 12826, + "time_per_iteration": 2.6283445358276367 + }, + { + "auxiliary_loss_clip": 0.01164517, + "auxiliary_loss_mlp": 0.01102727, + "balance_loss_clip": 1.0018177, + "balance_loss_mlp": 1.00058866, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.899640687042665, + "language_loss": 0.80794287, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83061528, + "num_input_tokens_seen": 276648655, + "step": 12827, + "time_per_iteration": 2.7077198028564453 + }, + { + "auxiliary_loss_clip": 0.01158524, + "auxiliary_loss_mlp": 0.0107586, + "balance_loss_clip": 1.0007652, + "balance_loss_mlp": 0.99994773, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.85558552296526, + "language_loss": 0.55210292, + "learning_rate": 5.240829873054051e-07, + "loss": 0.5744468, + "num_input_tokens_seen": 276716500, + "step": 12828, + "time_per_iteration": 3.288641929626465 + }, + { + "auxiliary_loss_clip": 0.01119554, + "auxiliary_loss_mlp": 0.01101174, + "balance_loss_clip": 1.0019567, + "balance_loss_mlp": 1.00046587, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.9241703364917389, + "language_loss": 0.69715303, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71936035, + "num_input_tokens_seen": 276733535, + "step": 12829, + "time_per_iteration": 2.6320455074310303 + }, + { + "auxiliary_loss_clip": 0.01132636, + "auxiliary_loss_mlp": 0.01103201, + "balance_loss_clip": 1.00201976, + "balance_loss_mlp": 1.00058603, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 17.61563487285485, + "language_loss": 0.79877692, + "learning_rate": 5.235574458679579e-07, + "loss": 0.82113528, + "num_input_tokens_seen": 276749575, + "step": 12830, + "time_per_iteration": 2.5875208377838135 + }, + { + "auxiliary_loss_clip": 0.01147806, + "auxiliary_loss_mlp": 0.01102192, + "balance_loss_clip": 1.00175369, + "balance_loss_mlp": 1.00053048, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.6282479514641628, + "language_loss": 0.78238386, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80488384, + "num_input_tokens_seen": 276769460, + "step": 12831, + "time_per_iteration": 2.599883794784546 + }, + { + "auxiliary_loss_clip": 0.01133192, + "auxiliary_loss_mlp": 0.01102083, + "balance_loss_clip": 1.00173974, + "balance_loss_mlp": 1.00042117, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.791440923311531, + "language_loss": 0.61031854, + "learning_rate": 5.230321283779071e-07, + "loss": 0.6326713, + "num_input_tokens_seen": 276790820, + "step": 12832, + "time_per_iteration": 2.6741864681243896 + }, + { + "auxiliary_loss_clip": 0.01131699, + "auxiliary_loss_mlp": 0.01102596, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00055337, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.559578232550576, + "language_loss": 0.79551202, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81785494, + "num_input_tokens_seen": 276811345, + "step": 12833, + "time_per_iteration": 4.04229211807251 + }, + { + "auxiliary_loss_clip": 0.01093521, + "auxiliary_loss_mlp": 0.01076545, + "balance_loss_clip": 1.00107503, + "balance_loss_mlp": 1.00025117, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8511622663592082, + "language_loss": 0.55343062, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57513124, + "num_input_tokens_seen": 276870950, + "step": 12834, + "time_per_iteration": 3.237727642059326 + }, + { + "auxiliary_loss_clip": 0.01103476, + "auxiliary_loss_mlp": 0.01102532, + "balance_loss_clip": 1.00170004, + "balance_loss_mlp": 1.00048876, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.1539084317654775, + "language_loss": 0.72929698, + "learning_rate": 5.222445722184903e-07, + "loss": 0.75135696, + "num_input_tokens_seen": 276890760, + "step": 12835, + "time_per_iteration": 2.696366548538208 + }, + { + "auxiliary_loss_clip": 0.01115785, + "auxiliary_loss_mlp": 0.00747302, + "balance_loss_clip": 1.00169694, + "balance_loss_mlp": 1.0004847, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 2.563611775285241, + "language_loss": 0.70151949, + "learning_rate": 5.219821655586814e-07, + "loss": 0.72015035, + "num_input_tokens_seen": 276909625, + "step": 12836, + "time_per_iteration": 4.176286220550537 + }, + { + "auxiliary_loss_clip": 0.01132541, + "auxiliary_loss_mlp": 0.0110194, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00046885, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 1.867511738330775, + "language_loss": 0.59306043, + "learning_rate": 5.217198149454575e-07, + "loss": 0.6154052, + "num_input_tokens_seen": 276930760, + "step": 12837, + "time_per_iteration": 2.8106801509857178 + }, + { + "auxiliary_loss_clip": 0.01143247, + "auxiliary_loss_mlp": 0.01076686, + "balance_loss_clip": 1.00151491, + "balance_loss_mlp": 1.00001073, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.858596671579872, + "language_loss": 0.55761236, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57981163, + "num_input_tokens_seen": 276989580, + "step": 12838, + "time_per_iteration": 3.1582322120666504 + }, + { + "auxiliary_loss_clip": 0.0114934, + "auxiliary_loss_mlp": 0.0110198, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00050914, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.273066726925118, + "language_loss": 0.69466412, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71717733, + "num_input_tokens_seen": 277005450, + "step": 12839, + "time_per_iteration": 2.6261520385742188 + }, + { + "auxiliary_loss_clip": 0.0114791, + "auxiliary_loss_mlp": 0.01101986, + "balance_loss_clip": 1.00191796, + "balance_loss_mlp": 1.00051546, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 1.8226019977475176, + "language_loss": 0.80131078, + "learning_rate": 5.209330994847647e-07, + "loss": 0.82380974, + "num_input_tokens_seen": 277023055, + "step": 12840, + "time_per_iteration": 2.6961894035339355 + }, + { + "auxiliary_loss_clip": 0.01148056, + "auxiliary_loss_mlp": 0.00747298, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00045407, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.7618940568152845, + "language_loss": 0.79651415, + "learning_rate": 5.206709731573402e-07, + "loss": 0.81546772, + "num_input_tokens_seen": 277041150, + "step": 12841, + "time_per_iteration": 2.6683380603790283 + }, + { + "auxiliary_loss_clip": 0.01115865, + "auxiliary_loss_mlp": 0.01102147, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.00048506, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 2.9262688436285966, + "language_loss": 0.76350367, + "learning_rate": 5.204089029262208e-07, + "loss": 0.78568375, + "num_input_tokens_seen": 277063895, + "step": 12842, + "time_per_iteration": 2.7447664737701416 + }, + { + "auxiliary_loss_clip": 0.01101526, + "auxiliary_loss_mlp": 0.00747392, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.00040913, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 2.610377662897976, + "language_loss": 0.68728912, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70577824, + "num_input_tokens_seen": 277084045, + "step": 12843, + "time_per_iteration": 2.7605669498443604 + }, + { + "auxiliary_loss_clip": 0.011334, + "auxiliary_loss_mlp": 0.01102885, + "balance_loss_clip": 1.00167549, + "balance_loss_mlp": 1.00046027, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 2.265991645661614, + "language_loss": 0.73907852, + "learning_rate": 5.198849307926465e-07, + "loss": 0.76144135, + "num_input_tokens_seen": 277102625, + "step": 12844, + "time_per_iteration": 2.676663398742676 + }, + { + "auxiliary_loss_clip": 0.01149638, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00054657, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.3645181712867565, + "language_loss": 0.71584487, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73836052, + "num_input_tokens_seen": 277123210, + "step": 12845, + "time_per_iteration": 2.664987802505493 + }, + { + "auxiliary_loss_clip": 0.01164416, + "auxiliary_loss_mlp": 0.01101852, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.0004766, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.802233966538744, + "language_loss": 0.64430481, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66696751, + "num_input_tokens_seen": 277144895, + "step": 12846, + "time_per_iteration": 2.651904821395874 + }, + { + "auxiliary_loss_clip": 0.01144103, + "auxiliary_loss_mlp": 0.0074538, + "balance_loss_clip": 1.00074983, + "balance_loss_mlp": 1.00012803, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.8228490612422659, + "language_loss": 0.61735004, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63624489, + "num_input_tokens_seen": 277205160, + "step": 12847, + "time_per_iteration": 4.553454399108887 + }, + { + "auxiliary_loss_clip": 0.01164695, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00045133, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.707328286989398, + "language_loss": 0.78969282, + "learning_rate": 5.188376601182732e-07, + "loss": 0.81235898, + "num_input_tokens_seen": 277223005, + "step": 12848, + "time_per_iteration": 2.5487685203552246 + }, + { + "auxiliary_loss_clip": 0.011159, + "auxiliary_loss_mlp": 0.01102858, + "balance_loss_clip": 1.00179088, + "balance_loss_mlp": 1.0004338, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 2.262843638905799, + "language_loss": 0.7291894, + "learning_rate": 5.185759828394261e-07, + "loss": 0.75137705, + "num_input_tokens_seen": 277241785, + "step": 12849, + "time_per_iteration": 2.6559598445892334 + }, + { + "auxiliary_loss_clip": 0.01164473, + "auxiliary_loss_mlp": 0.01102453, + "balance_loss_clip": 1.00187087, + "balance_loss_mlp": 1.00040996, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.9291056346854334, + "language_loss": 0.78301001, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80567932, + "num_input_tokens_seen": 277259050, + "step": 12850, + "time_per_iteration": 2.5188400745391846 + }, + { + "auxiliary_loss_clip": 0.01084526, + "auxiliary_loss_mlp": 0.00747283, + "balance_loss_clip": 1.00148952, + "balance_loss_mlp": 1.00038266, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.7022769193037228, + "language_loss": 0.79620886, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81452698, + "num_input_tokens_seen": 277278235, + "step": 12851, + "time_per_iteration": 4.276271820068359 + }, + { + "auxiliary_loss_clip": 0.01149932, + "auxiliary_loss_mlp": 0.01102229, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.00047231, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 4.154334761018278, + "language_loss": 0.73601353, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75853515, + "num_input_tokens_seen": 277298355, + "step": 12852, + "time_per_iteration": 2.5913636684417725 + }, + { + "auxiliary_loss_clip": 0.01164464, + "auxiliary_loss_mlp": 0.01101643, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00055397, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 2.3837926783595003, + "language_loss": 0.81890655, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84156764, + "num_input_tokens_seen": 277316095, + "step": 12853, + "time_per_iteration": 2.5158944129943848 + }, + { + "auxiliary_loss_clip": 0.01158512, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_clip": 1.00075281, + "balance_loss_mlp": 0.99991626, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.8090428652169018, + "language_loss": 0.54468942, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56703663, + "num_input_tokens_seen": 277380130, + "step": 12854, + "time_per_iteration": 3.1817734241485596 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01103432, + "balance_loss_clip": 1.00185955, + "balance_loss_mlp": 1.00043547, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 2.9509351789537406, + "language_loss": 0.7196542, + "learning_rate": 5.170070992041826e-07, + "loss": 0.74216676, + "num_input_tokens_seen": 277404015, + "step": 12855, + "time_per_iteration": 2.6969707012176514 + }, + { + "auxiliary_loss_clip": 0.01164407, + "auxiliary_loss_mlp": 0.01102023, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00045717, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.582874066126187, + "language_loss": 0.67777818, + "learning_rate": 5.167458153638254e-07, + "loss": 0.70044243, + "num_input_tokens_seen": 277421375, + "step": 12856, + "time_per_iteration": 2.5213236808776855 + }, + { + "auxiliary_loss_clip": 0.01118691, + "auxiliary_loss_mlp": 0.01101705, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00042462, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 2.061511793687444, + "language_loss": 0.78998512, + "learning_rate": 5.164845877686162e-07, + "loss": 0.8121891, + "num_input_tokens_seen": 277440170, + "step": 12857, + "time_per_iteration": 2.689701795578003 + }, + { + "auxiliary_loss_clip": 0.0108522, + "auxiliary_loss_mlp": 0.00747259, + "balance_loss_clip": 1.00162458, + "balance_loss_mlp": 1.00043988, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 1.802917004752364, + "language_loss": 0.78504413, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80336893, + "num_input_tokens_seen": 277456880, + "step": 12858, + "time_per_iteration": 2.7972540855407715 + }, + { + "auxiliary_loss_clip": 0.01164653, + "auxiliary_loss_mlp": 0.01102257, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00040436, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 2.038715303317967, + "language_loss": 0.77150607, + "learning_rate": 5.159623013532591e-07, + "loss": 0.79417515, + "num_input_tokens_seen": 277475365, + "step": 12859, + "time_per_iteration": 2.5708796977996826 + }, + { + "auxiliary_loss_clip": 0.01148231, + "auxiliary_loss_mlp": 0.01101329, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00047839, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.4004581121482498, + "language_loss": 0.67881989, + "learning_rate": 5.157012425529186e-07, + "loss": 0.70131552, + "num_input_tokens_seen": 277494975, + "step": 12860, + "time_per_iteration": 2.630528450012207 + }, + { + "auxiliary_loss_clip": 0.01164687, + "auxiliary_loss_mlp": 0.01103197, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.0004859, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.270998327235363, + "language_loss": 0.74100292, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76368177, + "num_input_tokens_seen": 277510520, + "step": 12861, + "time_per_iteration": 2.5310604572296143 + }, + { + "auxiliary_loss_clip": 0.01148191, + "auxiliary_loss_mlp": 0.01102273, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00042009, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.8693352571723265, + "language_loss": 0.74819744, + "learning_rate": 5.15179293816405e-07, + "loss": 0.77070206, + "num_input_tokens_seen": 277530505, + "step": 12862, + "time_per_iteration": 2.6219379901885986 + }, + { + "auxiliary_loss_clip": 0.01101499, + "auxiliary_loss_mlp": 0.01101632, + "balance_loss_clip": 1.00165951, + "balance_loss_mlp": 1.00044727, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.5133705363174255, + "language_loss": 0.82922935, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85126066, + "num_input_tokens_seen": 277550810, + "step": 12863, + "time_per_iteration": 2.7185401916503906 + }, + { + "auxiliary_loss_clip": 0.01164567, + "auxiliary_loss_mlp": 0.01102323, + "balance_loss_clip": 1.00198531, + "balance_loss_mlp": 1.00047004, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.5932692018140593, + "language_loss": 0.73126274, + "learning_rate": 5.146575702980898e-07, + "loss": 0.75393164, + "num_input_tokens_seen": 277567680, + "step": 12864, + "time_per_iteration": 2.543863534927368 + }, + { + "auxiliary_loss_clip": 0.01134313, + "auxiliary_loss_mlp": 0.0110209, + "balance_loss_clip": 1.00175488, + "balance_loss_mlp": 1.00042868, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.9878007163093674, + "language_loss": 0.82005382, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84241784, + "num_input_tokens_seen": 277588970, + "step": 12865, + "time_per_iteration": 2.7008461952209473 + }, + { + "auxiliary_loss_clip": 0.01164725, + "auxiliary_loss_mlp": 0.01103567, + "balance_loss_clip": 1.00197744, + "balance_loss_mlp": 1.00047517, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 2.349267131854116, + "language_loss": 0.7156812, + "learning_rate": 5.141360720771077e-07, + "loss": 0.73836416, + "num_input_tokens_seen": 277605450, + "step": 12866, + "time_per_iteration": 2.519284725189209 + }, + { + "auxiliary_loss_clip": 0.01099503, + "auxiliary_loss_mlp": 0.00747371, + "balance_loss_clip": 1.00177622, + "balance_loss_mlp": 1.00046575, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 3.0371227904236275, + "language_loss": 0.6511215, + "learning_rate": 5.138754074778371e-07, + "loss": 0.66959023, + "num_input_tokens_seen": 277622530, + "step": 12867, + "time_per_iteration": 2.717923879623413 + }, + { + "auxiliary_loss_clip": 0.01147831, + "auxiliary_loss_mlp": 0.01101728, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.00054264, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.4208410131552802, + "language_loss": 0.70823824, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73073387, + "num_input_tokens_seen": 277642700, + "step": 12868, + "time_per_iteration": 2.6064138412475586 + }, + { + "auxiliary_loss_clip": 0.01149183, + "auxiliary_loss_mlp": 0.01102132, + "balance_loss_clip": 1.00195718, + "balance_loss_mlp": 1.00056553, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 1.9292447857960513, + "language_loss": 0.78388345, + "learning_rate": 5.133542473511578e-07, + "loss": 0.80639666, + "num_input_tokens_seen": 277660005, + "step": 12869, + "time_per_iteration": 2.628260374069214 + }, + { + "auxiliary_loss_clip": 0.01148111, + "auxiliary_loss_mlp": 0.01101331, + "balance_loss_clip": 1.00185847, + "balance_loss_mlp": 1.00033689, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 1.7535163097483975, + "language_loss": 0.74050689, + "learning_rate": 5.130937518435124e-07, + "loss": 0.76300132, + "num_input_tokens_seen": 277682890, + "step": 12870, + "time_per_iteration": 4.091325759887695 + }, + { + "auxiliary_loss_clip": 0.01149397, + "auxiliary_loss_mlp": 0.01101913, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00044203, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 2.130258125854745, + "language_loss": 0.76159567, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78410876, + "num_input_tokens_seen": 277699330, + "step": 12871, + "time_per_iteration": 2.5355873107910156 + }, + { + "auxiliary_loss_clip": 0.01133093, + "auxiliary_loss_mlp": 0.01101426, + "balance_loss_clip": 1.0017668, + "balance_loss_mlp": 1.00047958, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.9402039692510304, + "language_loss": 0.69274867, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71509385, + "num_input_tokens_seen": 277718750, + "step": 12872, + "time_per_iteration": 2.658006429672241 + }, + { + "auxiliary_loss_clip": 0.01164505, + "auxiliary_loss_mlp": 0.01102184, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00042653, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.0243661036456317, + "language_loss": 0.84802258, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87068945, + "num_input_tokens_seen": 277734645, + "step": 12873, + "time_per_iteration": 4.029783487319946 + }, + { + "auxiliary_loss_clip": 0.01164555, + "auxiliary_loss_mlp": 0.01102872, + "balance_loss_clip": 1.00191748, + "balance_loss_mlp": 1.00049496, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.624811444422591, + "language_loss": 0.65283209, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67550635, + "num_input_tokens_seen": 277755535, + "step": 12874, + "time_per_iteration": 2.5864415168762207 + }, + { + "auxiliary_loss_clip": 0.01100722, + "auxiliary_loss_mlp": 0.01101719, + "balance_loss_clip": 1.00172186, + "balance_loss_mlp": 1.00034308, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.6642352583358622, + "language_loss": 0.62221384, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64423829, + "num_input_tokens_seen": 277775585, + "step": 12875, + "time_per_iteration": 2.6902413368225098 + }, + { + "auxiliary_loss_clip": 0.011498, + "auxiliary_loss_mlp": 0.01102312, + "balance_loss_clip": 1.00174618, + "balance_loss_mlp": 1.00045979, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 2.100472065419054, + "language_loss": 0.65359461, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67611569, + "num_input_tokens_seen": 277794795, + "step": 12876, + "time_per_iteration": 2.632166624069214 + }, + { + "auxiliary_loss_clip": 0.01133335, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_clip": 1.00172532, + "balance_loss_mlp": 1.00054765, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.336798812241943, + "language_loss": 0.71773803, + "learning_rate": 5.112718625846433e-07, + "loss": 0.74008298, + "num_input_tokens_seen": 277813235, + "step": 12877, + "time_per_iteration": 2.624553680419922 + }, + { + "auxiliary_loss_clip": 0.01118303, + "auxiliary_loss_mlp": 0.01103091, + "balance_loss_clip": 1.00180185, + "balance_loss_mlp": 1.000476, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.784969804881915, + "language_loss": 0.82916754, + "learning_rate": 5.110118184224736e-07, + "loss": 0.85138154, + "num_input_tokens_seen": 277832560, + "step": 12878, + "time_per_iteration": 2.6655983924865723 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01102536, + "balance_loss_clip": 1.00176692, + "balance_loss_mlp": 1.00049329, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.6370760748583304, + "language_loss": 0.7342844, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75662017, + "num_input_tokens_seen": 277850120, + "step": 12879, + "time_per_iteration": 2.6661200523376465 + }, + { + "auxiliary_loss_clip": 0.01132393, + "auxiliary_loss_mlp": 0.01100531, + "balance_loss_clip": 1.00173545, + "balance_loss_mlp": 1.0004909, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 2.0063746428220006, + "language_loss": 0.79421663, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81654596, + "num_input_tokens_seen": 277871020, + "step": 12880, + "time_per_iteration": 2.7447330951690674 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01101028, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.0006063, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.8256484032368394, + "language_loss": 0.70314908, + "learning_rate": 5.102320247508847e-07, + "loss": 0.7254709, + "num_input_tokens_seen": 277891525, + "step": 12881, + "time_per_iteration": 2.701345682144165 + }, + { + "auxiliary_loss_clip": 0.01133295, + "auxiliary_loss_mlp": 0.01103808, + "balance_loss_clip": 1.00176656, + "balance_loss_mlp": 1.00062037, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 3.4633254405639327, + "language_loss": 0.84578991, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86816096, + "num_input_tokens_seen": 277910425, + "step": 12882, + "time_per_iteration": 2.646580219268799 + }, + { + "auxiliary_loss_clip": 0.01110484, + "auxiliary_loss_mlp": 0.01077131, + "balance_loss_clip": 1.00161076, + "balance_loss_mlp": 1.00007427, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7669156467966934, + "language_loss": 0.60497797, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62685418, + "num_input_tokens_seen": 277972795, + "step": 12883, + "time_per_iteration": 3.211366891860962 + }, + { + "auxiliary_loss_clip": 0.01100081, + "auxiliary_loss_mlp": 0.01102218, + "balance_loss_clip": 1.0016557, + "balance_loss_mlp": 1.00046086, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 1.7275868520543236, + "language_loss": 0.72554272, + "learning_rate": 5.094527395086416e-07, + "loss": 0.74756569, + "num_input_tokens_seen": 277990675, + "step": 12884, + "time_per_iteration": 2.671203851699829 + }, + { + "auxiliary_loss_clip": 0.01147911, + "auxiliary_loss_mlp": 0.01102128, + "balance_loss_clip": 1.00190473, + "balance_loss_mlp": 1.00065756, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 1.508580978171977, + "language_loss": 0.81221747, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83471787, + "num_input_tokens_seen": 278010050, + "step": 12885, + "time_per_iteration": 4.069132328033447 + }, + { + "auxiliary_loss_clip": 0.01164303, + "auxiliary_loss_mlp": 0.01100625, + "balance_loss_clip": 1.00181174, + "balance_loss_mlp": 1.00048888, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 2.2782151077348565, + "language_loss": 0.64036018, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66300946, + "num_input_tokens_seen": 278030660, + "step": 12886, + "time_per_iteration": 2.6317522525787354 + }, + { + "auxiliary_loss_clip": 0.01117185, + "auxiliary_loss_mlp": 0.01101014, + "balance_loss_clip": 1.00183952, + "balance_loss_mlp": 1.0004971, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 1.9974755947348106, + "language_loss": 0.69215685, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71433884, + "num_input_tokens_seen": 278047645, + "step": 12887, + "time_per_iteration": 2.6573078632354736 + }, + { + "auxiliary_loss_clip": 0.01149189, + "auxiliary_loss_mlp": 0.01100535, + "balance_loss_clip": 1.00174427, + "balance_loss_mlp": 1.00039887, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 4.179646629763003, + "language_loss": 0.7035237, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72602093, + "num_input_tokens_seen": 278066170, + "step": 12888, + "time_per_iteration": 2.5799505710601807 + }, + { + "auxiliary_loss_clip": 0.01147963, + "auxiliary_loss_mlp": 0.01102613, + "balance_loss_clip": 1.00170159, + "balance_loss_mlp": 1.00047469, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 1.996916974018548, + "language_loss": 0.81856537, + "learning_rate": 5.081550613368279e-07, + "loss": 0.84107113, + "num_input_tokens_seen": 278085545, + "step": 12889, + "time_per_iteration": 4.034530878067017 + }, + { + "auxiliary_loss_clip": 0.01116179, + "auxiliary_loss_mlp": 0.01102406, + "balance_loss_clip": 1.00174642, + "balance_loss_mlp": 1.00055385, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 1.91755275306929, + "language_loss": 0.79559016, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81777596, + "num_input_tokens_seen": 278102995, + "step": 12890, + "time_per_iteration": 2.658613920211792 + }, + { + "auxiliary_loss_clip": 0.01118214, + "auxiliary_loss_mlp": 0.01101522, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.0003376, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 1.9804733582796028, + "language_loss": 0.66161728, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68381464, + "num_input_tokens_seen": 278121460, + "step": 12891, + "time_per_iteration": 2.7049148082733154 + }, + { + "auxiliary_loss_clip": 0.01149734, + "auxiliary_loss_mlp": 0.01102504, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.0004611, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.5556543501528488, + "language_loss": 0.78797555, + "learning_rate": 5.073771332059257e-07, + "loss": 0.81049794, + "num_input_tokens_seen": 278143905, + "step": 12892, + "time_per_iteration": 2.633829355239868 + }, + { + "auxiliary_loss_clip": 0.01147857, + "auxiliary_loss_mlp": 0.01102627, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00039339, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 2.155841109330861, + "language_loss": 0.67260182, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69510669, + "num_input_tokens_seen": 278160850, + "step": 12893, + "time_per_iteration": 2.574268341064453 + }, + { + "auxiliary_loss_clip": 0.01142777, + "auxiliary_loss_mlp": 0.01075888, + "balance_loss_clip": 1.00085902, + "balance_loss_mlp": 0.99997562, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8178738881282471, + "language_loss": 0.58452153, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60670817, + "num_input_tokens_seen": 278219950, + "step": 12894, + "time_per_iteration": 3.222916603088379 + }, + { + "auxiliary_loss_clip": 0.01132324, + "auxiliary_loss_mlp": 0.01102032, + "balance_loss_clip": 1.00179422, + "balance_loss_mlp": 1.00046539, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 2.1288290703224195, + "language_loss": 0.7848599, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80720347, + "num_input_tokens_seen": 278237805, + "step": 12895, + "time_per_iteration": 2.665515661239624 + }, + { + "auxiliary_loss_clip": 0.01116501, + "auxiliary_loss_mlp": 0.01101873, + "balance_loss_clip": 1.00170469, + "balance_loss_mlp": 1.00064087, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.8368244098953352, + "language_loss": 0.67758423, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69976795, + "num_input_tokens_seen": 278257660, + "step": 12896, + "time_per_iteration": 2.758275270462036 + }, + { + "auxiliary_loss_clip": 0.01130785, + "auxiliary_loss_mlp": 0.01102456, + "balance_loss_clip": 1.00181425, + "balance_loss_mlp": 1.00060344, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.8240697667939585, + "language_loss": 0.69044435, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71277672, + "num_input_tokens_seen": 278275110, + "step": 12897, + "time_per_iteration": 2.626903533935547 + }, + { + "auxiliary_loss_clip": 0.01164596, + "auxiliary_loss_mlp": 0.01102606, + "balance_loss_clip": 1.00198042, + "balance_loss_mlp": 1.00056314, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.753464710275933, + "language_loss": 0.75223261, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77490461, + "num_input_tokens_seen": 278293035, + "step": 12898, + "time_per_iteration": 2.5943615436553955 + }, + { + "auxiliary_loss_clip": 0.0114782, + "auxiliary_loss_mlp": 0.00747497, + "balance_loss_clip": 1.00182593, + "balance_loss_mlp": 1.00046766, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 3.281121643845967, + "language_loss": 0.70050144, + "learning_rate": 5.055639490399588e-07, + "loss": 0.71945465, + "num_input_tokens_seen": 278311010, + "step": 12899, + "time_per_iteration": 2.610470771789551 + }, + { + "auxiliary_loss_clip": 0.01117896, + "auxiliary_loss_mlp": 0.01102595, + "balance_loss_clip": 1.00179613, + "balance_loss_mlp": 1.00045657, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 2.112695641279265, + "language_loss": 0.74664795, + "learning_rate": 5.053051493286453e-07, + "loss": 0.76885289, + "num_input_tokens_seen": 278329900, + "step": 12900, + "time_per_iteration": 2.7123708724975586 + }, + { + "auxiliary_loss_clip": 0.01148079, + "auxiliary_loss_mlp": 0.01101441, + "balance_loss_clip": 1.00182033, + "balance_loss_mlp": 1.00054264, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 2.040105261367984, + "language_loss": 0.77501935, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79751456, + "num_input_tokens_seen": 278349980, + "step": 12901, + "time_per_iteration": 2.644747495651245 + }, + { + "auxiliary_loss_clip": 0.01147983, + "auxiliary_loss_mlp": 0.01102637, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00049829, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.3900897818006512, + "language_loss": 0.77329069, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79579693, + "num_input_tokens_seen": 278372485, + "step": 12902, + "time_per_iteration": 2.679438591003418 + }, + { + "auxiliary_loss_clip": 0.01149329, + "auxiliary_loss_mlp": 0.01101776, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00035238, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 1.896007135372512, + "language_loss": 0.72771955, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75023055, + "num_input_tokens_seen": 278391660, + "step": 12903, + "time_per_iteration": 2.6004934310913086 + }, + { + "auxiliary_loss_clip": 0.0113098, + "auxiliary_loss_mlp": 0.01102485, + "balance_loss_clip": 1.00180197, + "balance_loss_mlp": 1.00034678, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 2.010194985969721, + "language_loss": 0.76337123, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78570592, + "num_input_tokens_seen": 278409125, + "step": 12904, + "time_per_iteration": 2.701770067214966 + }, + { + "auxiliary_loss_clip": 0.01164387, + "auxiliary_loss_mlp": 0.01101328, + "balance_loss_clip": 1.00200367, + "balance_loss_mlp": 1.00042939, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.1372188837915114, + "language_loss": 0.68699205, + "learning_rate": 5.040120011529576e-07, + "loss": 0.70964921, + "num_input_tokens_seen": 278429450, + "step": 12905, + "time_per_iteration": 2.585308313369751 + }, + { + "auxiliary_loss_clip": 0.01147756, + "auxiliary_loss_mlp": 0.00747234, + "balance_loss_clip": 1.00195611, + "balance_loss_mlp": 1.00036919, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 2.5156433433681817, + "language_loss": 0.67325842, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69220841, + "num_input_tokens_seen": 278449925, + "step": 12906, + "time_per_iteration": 2.650275945663452 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01102736, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.0005976, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 1.873660320929291, + "language_loss": 0.81479883, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83701158, + "num_input_tokens_seen": 278467255, + "step": 12907, + "time_per_iteration": 2.663470506668091 + }, + { + "auxiliary_loss_clip": 0.01149799, + "auxiliary_loss_mlp": 0.01101311, + "balance_loss_clip": 1.00197542, + "balance_loss_mlp": 1.00050807, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.087530340261102, + "language_loss": 0.67592257, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69843364, + "num_input_tokens_seen": 278484250, + "step": 12908, + "time_per_iteration": 4.057271957397461 + }, + { + "auxiliary_loss_clip": 0.01116446, + "auxiliary_loss_mlp": 0.01102275, + "balance_loss_clip": 1.00162983, + "balance_loss_mlp": 1.00061321, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.7516553420340553, + "language_loss": 0.70442909, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72661632, + "num_input_tokens_seen": 278502740, + "step": 12909, + "time_per_iteration": 2.6393513679504395 + }, + { + "auxiliary_loss_clip": 0.01147176, + "auxiliary_loss_mlp": 0.01101299, + "balance_loss_clip": 1.00182629, + "balance_loss_mlp": 1.00054312, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 2.239830368915493, + "language_loss": 0.68127763, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70376241, + "num_input_tokens_seen": 278523890, + "step": 12910, + "time_per_iteration": 2.6021945476531982 + }, + { + "auxiliary_loss_clip": 0.01099647, + "auxiliary_loss_mlp": 0.01102443, + "balance_loss_clip": 1.00170124, + "balance_loss_mlp": 1.00059092, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 2.8050652577964246, + "language_loss": 0.71673012, + "learning_rate": 5.024620954742646e-07, + "loss": 0.73875105, + "num_input_tokens_seen": 278543185, + "step": 12911, + "time_per_iteration": 4.126030921936035 + }, + { + "auxiliary_loss_clip": 0.01164674, + "auxiliary_loss_mlp": 0.00747333, + "balance_loss_clip": 1.00205374, + "balance_loss_mlp": 1.0004952, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.90071433034927, + "language_loss": 0.63381636, + "learning_rate": 5.022039765577836e-07, + "loss": 0.6529364, + "num_input_tokens_seen": 278559220, + "step": 12912, + "time_per_iteration": 2.524510622024536 + }, + { + "auxiliary_loss_clip": 0.01125876, + "auxiliary_loss_mlp": 0.01076201, + "balance_loss_clip": 1.00095189, + "balance_loss_mlp": 0.99990731, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.7648312724224532, + "language_loss": 0.53228474, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55430555, + "num_input_tokens_seen": 278618185, + "step": 12913, + "time_per_iteration": 3.2479958534240723 + }, + { + "auxiliary_loss_clip": 0.01131238, + "auxiliary_loss_mlp": 0.01102148, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00048625, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 1.851948857747241, + "language_loss": 0.62335485, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64568871, + "num_input_tokens_seen": 278636210, + "step": 12914, + "time_per_iteration": 2.651430606842041 + }, + { + "auxiliary_loss_clip": 0.01133167, + "auxiliary_loss_mlp": 0.01102237, + "balance_loss_clip": 1.00178552, + "balance_loss_mlp": 1.0004797, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.8299536430088539, + "language_loss": 0.82302827, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84538227, + "num_input_tokens_seen": 278653305, + "step": 12915, + "time_per_iteration": 2.653395414352417 + }, + { + "auxiliary_loss_clip": 0.01148164, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_clip": 1.00173533, + "balance_loss_mlp": 1.00043488, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 2.7778792758825803, + "language_loss": 0.74698079, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76948816, + "num_input_tokens_seen": 278671850, + "step": 12916, + "time_per_iteration": 2.628196954727173 + }, + { + "auxiliary_loss_clip": 0.01085878, + "auxiliary_loss_mlp": 0.01101885, + "balance_loss_clip": 1.00157428, + "balance_loss_mlp": 1.00041401, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.4337754899405928, + "language_loss": 0.65834826, + "learning_rate": 5.009142341196919e-07, + "loss": 0.68022591, + "num_input_tokens_seen": 278697860, + "step": 12917, + "time_per_iteration": 3.126600980758667 + }, + { + "auxiliary_loss_clip": 0.01149263, + "auxiliary_loss_mlp": 0.01102731, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.00049663, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.444924825130526, + "language_loss": 0.64575541, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66827542, + "num_input_tokens_seen": 278720655, + "step": 12918, + "time_per_iteration": 3.2362685203552246 + }, + { + "auxiliary_loss_clip": 0.01164595, + "auxiliary_loss_mlp": 0.01101408, + "balance_loss_clip": 1.00195098, + "balance_loss_mlp": 1.0005573, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.278659607757313, + "language_loss": 0.73484576, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75750577, + "num_input_tokens_seen": 278737375, + "step": 12919, + "time_per_iteration": 2.6184282302856445 + }, + { + "auxiliary_loss_clip": 0.01099121, + "auxiliary_loss_mlp": 0.01103163, + "balance_loss_clip": 1.00169122, + "balance_loss_mlp": 1.00045204, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 2.5422613322064596, + "language_loss": 0.79094732, + "learning_rate": 5.001410707243792e-07, + "loss": 0.8129701, + "num_input_tokens_seen": 278756510, + "step": 12920, + "time_per_iteration": 2.7374308109283447 + }, + { + "auxiliary_loss_clip": 0.01147892, + "auxiliary_loss_mlp": 0.01101862, + "balance_loss_clip": 1.00188994, + "balance_loss_mlp": 1.00048614, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.5746461511910395, + "language_loss": 0.70797688, + "learning_rate": 4.998834633291829e-07, + "loss": 0.73047441, + "num_input_tokens_seen": 278775410, + "step": 12921, + "time_per_iteration": 2.639431953430176 + }, + { + "auxiliary_loss_clip": 0.01148404, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.0005089, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.648128265330026, + "language_loss": 0.76180696, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78431654, + "num_input_tokens_seen": 278794260, + "step": 12922, + "time_per_iteration": 2.6061086654663086 + }, + { + "auxiliary_loss_clip": 0.01101767, + "auxiliary_loss_mlp": 0.01102388, + "balance_loss_clip": 1.00173259, + "balance_loss_mlp": 1.00053549, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.9754675553750578, + "language_loss": 0.80502534, + "learning_rate": 4.993684192022625e-07, + "loss": 0.8270669, + "num_input_tokens_seen": 278813290, + "step": 12923, + "time_per_iteration": 4.174696207046509 + }, + { + "auxiliary_loss_clip": 0.01114664, + "auxiliary_loss_mlp": 0.01101807, + "balance_loss_clip": 1.00162554, + "balance_loss_mlp": 1.00052667, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 3.1088531367821433, + "language_loss": 0.92272103, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94488573, + "num_input_tokens_seen": 278830610, + "step": 12924, + "time_per_iteration": 2.6439208984375 + }, + { + "auxiliary_loss_clip": 0.01147937, + "auxiliary_loss_mlp": 0.01102075, + "balance_loss_clip": 1.00184095, + "balance_loss_mlp": 1.0004133, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 2.3912589719201414, + "language_loss": 0.65952259, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68202269, + "num_input_tokens_seen": 278849530, + "step": 12925, + "time_per_iteration": 2.5991904735565186 + }, + { + "auxiliary_loss_clip": 0.01114707, + "auxiliary_loss_mlp": 0.0110265, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00051141, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 1.802742151141632, + "language_loss": 0.71955401, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74172759, + "num_input_tokens_seen": 278869005, + "step": 12926, + "time_per_iteration": 4.170345783233643 + }, + { + "auxiliary_loss_clip": 0.01148301, + "auxiliary_loss_mlp": 0.01102827, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00040197, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.6776132973036977, + "language_loss": 0.65763432, + "learning_rate": 4.983390138757027e-07, + "loss": 0.68014562, + "num_input_tokens_seen": 278888790, + "step": 12927, + "time_per_iteration": 2.582171678543091 + }, + { + "auxiliary_loss_clip": 0.01131112, + "auxiliary_loss_mlp": 0.01103379, + "balance_loss_clip": 1.00175107, + "balance_loss_mlp": 1.0005734, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 1.767894855041738, + "language_loss": 0.72478724, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74713218, + "num_input_tokens_seen": 278908150, + "step": 12928, + "time_per_iteration": 2.608595132827759 + }, + { + "auxiliary_loss_clip": 0.0110121, + "auxiliary_loss_mlp": 0.01101981, + "balance_loss_clip": 1.00154352, + "balance_loss_mlp": 1.00046206, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.6272689329132577, + "language_loss": 0.74164832, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76368022, + "num_input_tokens_seen": 278927425, + "step": 12929, + "time_per_iteration": 2.699305295944214 + }, + { + "auxiliary_loss_clip": 0.01114463, + "auxiliary_loss_mlp": 0.01103215, + "balance_loss_clip": 1.0017271, + "balance_loss_mlp": 1.00040913, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 2.24510558495746, + "language_loss": 0.77937585, + "learning_rate": 4.975675577495377e-07, + "loss": 0.80155265, + "num_input_tokens_seen": 278946475, + "step": 12930, + "time_per_iteration": 2.6725897789001465 + }, + { + "auxiliary_loss_clip": 0.01164658, + "auxiliary_loss_mlp": 0.01102707, + "balance_loss_clip": 1.00195265, + "balance_loss_mlp": 1.0004735, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 2.030074445328326, + "language_loss": 0.79506099, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81773466, + "num_input_tokens_seen": 278964345, + "step": 12931, + "time_per_iteration": 2.8167874813079834 + }, + { + "auxiliary_loss_clip": 0.01110422, + "auxiliary_loss_mlp": 0.01077089, + "balance_loss_clip": 1.00143075, + "balance_loss_mlp": 1.00003242, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8071465662728222, + "language_loss": 0.59776461, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61963975, + "num_input_tokens_seen": 279022380, + "step": 12932, + "time_per_iteration": 3.321650266647339 + }, + { + "auxiliary_loss_clip": 0.01147948, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_clip": 1.00183868, + "balance_loss_mlp": 1.00049257, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.5771660958670712, + "language_loss": 0.76413584, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78663445, + "num_input_tokens_seen": 279044275, + "step": 12933, + "time_per_iteration": 2.946699857711792 + }, + { + "auxiliary_loss_clip": 0.01134316, + "auxiliary_loss_mlp": 0.01102571, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00062323, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 4.665215903826266, + "language_loss": 0.73557329, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75794214, + "num_input_tokens_seen": 279063375, + "step": 12934, + "time_per_iteration": 2.9521193504333496 + }, + { + "auxiliary_loss_clip": 0.01101521, + "auxiliary_loss_mlp": 0.01102956, + "balance_loss_clip": 1.00171888, + "balance_loss_mlp": 1.00043619, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 4.564621648428727, + "language_loss": 0.70673245, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72877723, + "num_input_tokens_seen": 279082680, + "step": 12935, + "time_per_iteration": 3.0155463218688965 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.00747375, + "balance_loss_clip": 1.00183356, + "balance_loss_mlp": 1.00053561, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.9920272198072844, + "language_loss": 0.83654964, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85535538, + "num_input_tokens_seen": 279099805, + "step": 12936, + "time_per_iteration": 2.936624765396118 + }, + { + "auxiliary_loss_clip": 0.01149993, + "auxiliary_loss_mlp": 0.01101714, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00033832, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 4.064471740365458, + "language_loss": 0.67365539, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69617242, + "num_input_tokens_seen": 279117975, + "step": 12937, + "time_per_iteration": 2.8103134632110596 + }, + { + "auxiliary_loss_clip": 0.01164526, + "auxiliary_loss_mlp": 0.01102791, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00055683, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.7559734241175555, + "language_loss": 0.8738755, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89654863, + "num_input_tokens_seen": 279137255, + "step": 12938, + "time_per_iteration": 2.8551013469696045 + }, + { + "auxiliary_loss_clip": 0.01147856, + "auxiliary_loss_mlp": 0.0110266, + "balance_loss_clip": 1.00188458, + "balance_loss_mlp": 1.0005213, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 2.64089393738461, + "language_loss": 0.85268795, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87519312, + "num_input_tokens_seen": 279154500, + "step": 12939, + "time_per_iteration": 2.8770627975463867 + }, + { + "auxiliary_loss_clip": 0.01164348, + "auxiliary_loss_mlp": 0.007473, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.00051391, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 2.0958003437590444, + "language_loss": 0.69106007, + "learning_rate": 4.949997420117915e-07, + "loss": 0.71017659, + "num_input_tokens_seen": 279173635, + "step": 12940, + "time_per_iteration": 2.826066493988037 + }, + { + "auxiliary_loss_clip": 0.01117374, + "auxiliary_loss_mlp": 0.01101661, + "balance_loss_clip": 1.0017184, + "balance_loss_mlp": 1.00047612, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.6248797047743566, + "language_loss": 0.77775788, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79994822, + "num_input_tokens_seen": 279194430, + "step": 12941, + "time_per_iteration": 2.9701738357543945 + }, + { + "auxiliary_loss_clip": 0.01149825, + "auxiliary_loss_mlp": 0.0110292, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00049591, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 3.435715660369444, + "language_loss": 0.73274863, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75527608, + "num_input_tokens_seen": 279212920, + "step": 12942, + "time_per_iteration": 2.8676085472106934 + }, + { + "auxiliary_loss_clip": 0.01099651, + "auxiliary_loss_mlp": 0.01101172, + "balance_loss_clip": 1.00163102, + "balance_loss_mlp": 1.000512, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 4.079187719943614, + "language_loss": 0.67513686, + "learning_rate": 4.942305097079751e-07, + "loss": 0.6971451, + "num_input_tokens_seen": 279232310, + "step": 12943, + "time_per_iteration": 2.8302414417266846 + }, + { + "auxiliary_loss_clip": 0.01128451, + "auxiliary_loss_mlp": 0.01075937, + "balance_loss_clip": 1.00081611, + "balance_loss_mlp": 1.0000248, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7702279951775355, + "language_loss": 0.58493501, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60697889, + "num_input_tokens_seen": 279295375, + "step": 12944, + "time_per_iteration": 3.3277761936187744 + }, + { + "auxiliary_loss_clip": 0.01164699, + "auxiliary_loss_mlp": 0.0110304, + "balance_loss_clip": 1.00189459, + "balance_loss_mlp": 1.00052047, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 1.9926421414751436, + "language_loss": 0.6770097, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69968712, + "num_input_tokens_seen": 279313660, + "step": 12945, + "time_per_iteration": 2.5209133625030518 + }, + { + "auxiliary_loss_clip": 0.01149855, + "auxiliary_loss_mlp": 0.01102639, + "balance_loss_clip": 1.00196993, + "balance_loss_mlp": 1.00050068, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 1.840900233392222, + "language_loss": 0.69785064, + "learning_rate": 4.93461791294516e-07, + "loss": 0.72037554, + "num_input_tokens_seen": 279334495, + "step": 12946, + "time_per_iteration": 4.025217294692993 + }, + { + "auxiliary_loss_clip": 0.01164599, + "auxiliary_loss_mlp": 0.01102523, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.00047946, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 1.9578793781045574, + "language_loss": 0.65673602, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67940718, + "num_input_tokens_seen": 279352985, + "step": 12947, + "time_per_iteration": 2.536729097366333 + }, + { + "auxiliary_loss_clip": 0.01086729, + "auxiliary_loss_mlp": 0.01102378, + "balance_loss_clip": 1.00177622, + "balance_loss_mlp": 1.00052583, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 1.8476850505528366, + "language_loss": 0.65191817, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67380935, + "num_input_tokens_seen": 279371360, + "step": 12948, + "time_per_iteration": 2.751528024673462 + }, + { + "auxiliary_loss_clip": 0.01164507, + "auxiliary_loss_mlp": 0.01101886, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00051033, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.9818735988187957, + "language_loss": 0.75077325, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77343714, + "num_input_tokens_seen": 279389400, + "step": 12949, + "time_per_iteration": 4.155183792114258 + }, + { + "auxiliary_loss_clip": 0.01164673, + "auxiliary_loss_mlp": 0.01103884, + "balance_loss_clip": 1.0019877, + "balance_loss_mlp": 1.00050545, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.4197750042688109, + "language_loss": 0.69331193, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71599746, + "num_input_tokens_seen": 279409715, + "step": 12950, + "time_per_iteration": 2.6352553367614746 + }, + { + "auxiliary_loss_clip": 0.0114818, + "auxiliary_loss_mlp": 0.01102976, + "balance_loss_clip": 1.00178587, + "balance_loss_mlp": 1.00045574, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 2.4020408035925636, + "language_loss": 0.71962082, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74213243, + "num_input_tokens_seen": 279427705, + "step": 12951, + "time_per_iteration": 2.595184087753296 + }, + { + "auxiliary_loss_clip": 0.01133155, + "auxiliary_loss_mlp": 0.01102279, + "balance_loss_clip": 1.00173056, + "balance_loss_mlp": 1.00061715, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 1.9767982999164184, + "language_loss": 0.65947545, + "learning_rate": 4.919258971878877e-07, + "loss": 0.68182981, + "num_input_tokens_seen": 279448215, + "step": 12952, + "time_per_iteration": 2.6214606761932373 + }, + { + "auxiliary_loss_clip": 0.0113075, + "auxiliary_loss_mlp": 0.01100555, + "balance_loss_clip": 1.00165606, + "balance_loss_mlp": 1.00046718, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.6968529721415904, + "language_loss": 0.81451547, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83682859, + "num_input_tokens_seen": 279466260, + "step": 12953, + "time_per_iteration": 2.613297939300537 + }, + { + "auxiliary_loss_clip": 0.01164698, + "auxiliary_loss_mlp": 0.0110173, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.0004499, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 2.5131096066946976, + "language_loss": 0.77086484, + "learning_rate": 4.91414389872737e-07, + "loss": 0.79352909, + "num_input_tokens_seen": 279484520, + "step": 12954, + "time_per_iteration": 2.5114331245422363 + }, + { + "auxiliary_loss_clip": 0.01149902, + "auxiliary_loss_mlp": 0.0110183, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.00045455, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.910698646343437, + "language_loss": 0.73165619, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7541734, + "num_input_tokens_seen": 279503130, + "step": 12955, + "time_per_iteration": 2.6026999950408936 + }, + { + "auxiliary_loss_clip": 0.01133721, + "auxiliary_loss_mlp": 0.011027, + "balance_loss_clip": 1.00177813, + "balance_loss_mlp": 1.00056159, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.4555583239765961, + "language_loss": 0.68614417, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70850837, + "num_input_tokens_seen": 279521930, + "step": 12956, + "time_per_iteration": 2.599909782409668 + }, + { + "auxiliary_loss_clip": 0.01115862, + "auxiliary_loss_mlp": 0.01101509, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00051498, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.6784302843710788, + "language_loss": 0.75939262, + "learning_rate": 4.906475579671252e-07, + "loss": 0.78156638, + "num_input_tokens_seen": 279542375, + "step": 12957, + "time_per_iteration": 2.6988561153411865 + }, + { + "auxiliary_loss_clip": 0.0106786, + "auxiliary_loss_mlp": 0.01101286, + "balance_loss_clip": 1.00153029, + "balance_loss_mlp": 1.00048268, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 2.2314190788541133, + "language_loss": 0.77445501, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79614645, + "num_input_tokens_seen": 279561885, + "step": 12958, + "time_per_iteration": 2.775609016418457 + }, + { + "auxiliary_loss_clip": 0.01149623, + "auxiliary_loss_mlp": 0.01102694, + "balance_loss_clip": 1.00189006, + "balance_loss_mlp": 1.00055575, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 2.0350832768483564, + "language_loss": 0.71875495, + "learning_rate": 4.901366228545418e-07, + "loss": 0.74127817, + "num_input_tokens_seen": 279579965, + "step": 12959, + "time_per_iteration": 2.543565273284912 + }, + { + "auxiliary_loss_clip": 0.01148563, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.00203168, + "balance_loss_mlp": 1.00055575, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 1.648184183182767, + "language_loss": 0.77713108, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79609025, + "num_input_tokens_seen": 279599030, + "step": 12960, + "time_per_iteration": 4.05276346206665 + }, + { + "auxiliary_loss_clip": 0.01149894, + "auxiliary_loss_mlp": 0.01102927, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00059748, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 1.9241026431791264, + "language_loss": 0.75132251, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77385068, + "num_input_tokens_seen": 279614400, + "step": 12961, + "time_per_iteration": 2.5813097953796387 + }, + { + "auxiliary_loss_clip": 0.01133147, + "auxiliary_loss_mlp": 0.01101332, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00052857, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 2.20399541674496, + "language_loss": 0.73621392, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75855869, + "num_input_tokens_seen": 279633745, + "step": 12962, + "time_per_iteration": 2.6278326511383057 + }, + { + "auxiliary_loss_clip": 0.01148854, + "auxiliary_loss_mlp": 0.01100991, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00047338, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 1.9120928455268942, + "language_loss": 0.70026529, + "learning_rate": 4.891154397568795e-07, + "loss": 0.72276378, + "num_input_tokens_seen": 279651165, + "step": 12963, + "time_per_iteration": 2.5525765419006348 + }, + { + "auxiliary_loss_clip": 0.01147783, + "auxiliary_loss_mlp": 0.0074729, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00051951, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 3.2012657990514826, + "language_loss": 0.63725591, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65620661, + "num_input_tokens_seen": 279671175, + "step": 12964, + "time_per_iteration": 4.076018333435059 + }, + { + "auxiliary_loss_clip": 0.01132268, + "auxiliary_loss_mlp": 0.01101913, + "balance_loss_clip": 1.0017947, + "balance_loss_mlp": 1.00044179, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.6923050622875977, + "language_loss": 0.76714474, + "learning_rate": 4.88605191926694e-07, + "loss": 0.78948647, + "num_input_tokens_seen": 279688675, + "step": 12965, + "time_per_iteration": 2.6727850437164307 + }, + { + "auxiliary_loss_clip": 0.01149331, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_clip": 1.00183964, + "balance_loss_mlp": 1.00050235, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 2.255952451907541, + "language_loss": 0.7295351, + "learning_rate": 4.883501539751289e-07, + "loss": 0.7520377, + "num_input_tokens_seen": 279710245, + "step": 12966, + "time_per_iteration": 2.61790132522583 + }, + { + "auxiliary_loss_clip": 0.01132202, + "auxiliary_loss_mlp": 0.00747293, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00059199, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.5034621414823457, + "language_loss": 0.74177223, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76056719, + "num_input_tokens_seen": 279729045, + "step": 12967, + "time_per_iteration": 2.619568109512329 + }, + { + "auxiliary_loss_clip": 0.01164562, + "auxiliary_loss_mlp": 0.01101623, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00053382, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 2.2295705659922707, + "language_loss": 0.72061121, + "learning_rate": 4.878402500474073e-07, + "loss": 0.74327302, + "num_input_tokens_seen": 279748350, + "step": 12968, + "time_per_iteration": 2.5481584072113037 + }, + { + "auxiliary_loss_clip": 0.01133153, + "auxiliary_loss_mlp": 0.01101601, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.000512, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 2.224619733871103, + "language_loss": 0.60761714, + "learning_rate": 4.875853840905874e-07, + "loss": 0.62996471, + "num_input_tokens_seen": 279765620, + "step": 12969, + "time_per_iteration": 2.598975896835327 + }, + { + "auxiliary_loss_clip": 0.01147889, + "auxiliary_loss_mlp": 0.01100336, + "balance_loss_clip": 1.00178695, + "balance_loss_mlp": 1.00062943, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.7737846076549684, + "language_loss": 0.70183033, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72431254, + "num_input_tokens_seen": 279782485, + "step": 12970, + "time_per_iteration": 2.5993356704711914 + }, + { + "auxiliary_loss_clip": 0.01115582, + "auxiliary_loss_mlp": 0.007474, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.00054049, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.6954623131622466, + "language_loss": 0.72314554, + "learning_rate": 4.870758242393507e-07, + "loss": 0.74177539, + "num_input_tokens_seen": 279804170, + "step": 12971, + "time_per_iteration": 2.792646884918213 + }, + { + "auxiliary_loss_clip": 0.01119687, + "auxiliary_loss_mlp": 0.01102652, + "balance_loss_clip": 1.00199211, + "balance_loss_mlp": 1.00060844, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.7650605466119433, + "language_loss": 0.74330807, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76553142, + "num_input_tokens_seen": 279823730, + "step": 12972, + "time_per_iteration": 2.635892391204834 + }, + { + "auxiliary_loss_clip": 0.01164634, + "auxiliary_loss_mlp": 0.01102589, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00045061, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 1.7311073358630293, + "language_loss": 0.71210951, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73478174, + "num_input_tokens_seen": 279843035, + "step": 12973, + "time_per_iteration": 2.534050703048706 + }, + { + "auxiliary_loss_clip": 0.01147809, + "auxiliary_loss_mlp": 0.01101685, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00045204, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 2.4385999518012498, + "language_loss": 0.77290386, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79539883, + "num_input_tokens_seen": 279861450, + "step": 12974, + "time_per_iteration": 2.561988592147827 + }, + { + "auxiliary_loss_clip": 0.01116096, + "auxiliary_loss_mlp": 0.0110166, + "balance_loss_clip": 1.00179327, + "balance_loss_mlp": 1.00047529, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.6271360437693039, + "language_loss": 0.6922425, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71442002, + "num_input_tokens_seen": 279878660, + "step": 12975, + "time_per_iteration": 2.636906147003174 + }, + { + "auxiliary_loss_clip": 0.01115721, + "auxiliary_loss_mlp": 0.01100913, + "balance_loss_clip": 1.00174224, + "balance_loss_mlp": 1.00039577, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 1.958043871334261, + "language_loss": 0.81841671, + "learning_rate": 4.858029287593739e-07, + "loss": 0.84058309, + "num_input_tokens_seen": 279895685, + "step": 12976, + "time_per_iteration": 2.599717855453491 + }, + { + "auxiliary_loss_clip": 0.01132686, + "auxiliary_loss_mlp": 0.00747453, + "balance_loss_clip": 1.0017494, + "balance_loss_mlp": 1.00052238, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.413325587122751, + "language_loss": 0.66023564, + "learning_rate": 4.85548521880289e-07, + "loss": 0.67903697, + "num_input_tokens_seen": 279917240, + "step": 12977, + "time_per_iteration": 2.6673338413238525 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01101629, + "balance_loss_clip": 1.00179541, + "balance_loss_mlp": 1.00044417, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 1.503186859137074, + "language_loss": 0.75117642, + "learning_rate": 4.852941724293554e-07, + "loss": 0.77351326, + "num_input_tokens_seen": 279938665, + "step": 12978, + "time_per_iteration": 2.6685843467712402 + }, + { + "auxiliary_loss_clip": 0.0113318, + "auxiliary_loss_mlp": 0.01103603, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.0004158, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 1.8248054527042619, + "language_loss": 0.62139285, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64376068, + "num_input_tokens_seen": 279957965, + "step": 12979, + "time_per_iteration": 2.6272895336151123 + }, + { + "auxiliary_loss_clip": 0.01164384, + "auxiliary_loss_mlp": 0.01101871, + "balance_loss_clip": 1.00187922, + "balance_loss_mlp": 1.00049567, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 1.8388470498548368, + "language_loss": 0.76750571, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79016829, + "num_input_tokens_seen": 279977490, + "step": 12980, + "time_per_iteration": 2.592773199081421 + }, + { + "auxiliary_loss_clip": 0.01164631, + "auxiliary_loss_mlp": 0.01102893, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00065887, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 1.9734134515673305, + "language_loss": 0.77880919, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80148447, + "num_input_tokens_seen": 279994220, + "step": 12981, + "time_per_iteration": 2.502683639526367 + }, + { + "auxiliary_loss_clip": 0.01117636, + "auxiliary_loss_mlp": 0.01101669, + "balance_loss_clip": 1.00208616, + "balance_loss_mlp": 1.00058007, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 1.86216764325233, + "language_loss": 0.72877973, + "learning_rate": 4.842773491000067e-07, + "loss": 0.75097275, + "num_input_tokens_seen": 280012590, + "step": 12982, + "time_per_iteration": 2.6719303131103516 + }, + { + "auxiliary_loss_clip": 0.01133518, + "auxiliary_loss_mlp": 0.01101998, + "balance_loss_clip": 1.00171328, + "balance_loss_mlp": 1.00043154, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.465286711120438, + "language_loss": 0.73300022, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75535536, + "num_input_tokens_seen": 280033700, + "step": 12983, + "time_per_iteration": 2.637035369873047 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.01102592, + "balance_loss_clip": 1.00164008, + "balance_loss_mlp": 1.00045371, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 2.262795996850527, + "language_loss": 0.74695504, + "learning_rate": 4.837692822549086e-07, + "loss": 0.76929343, + "num_input_tokens_seen": 280052215, + "step": 12984, + "time_per_iteration": 3.949150800704956 + }, + { + "auxiliary_loss_clip": 0.01135248, + "auxiliary_loss_mlp": 0.01101994, + "balance_loss_clip": 1.00206614, + "balance_loss_mlp": 1.00061846, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 1.8268822443873567, + "language_loss": 0.81570423, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83807665, + "num_input_tokens_seen": 280070525, + "step": 12985, + "time_per_iteration": 2.618274450302124 + }, + { + "auxiliary_loss_clip": 0.01132535, + "auxiliary_loss_mlp": 0.01101888, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.00051212, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.7516230383976807, + "language_loss": 0.7705375, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79288173, + "num_input_tokens_seen": 280089855, + "step": 12986, + "time_per_iteration": 2.6088035106658936 + }, + { + "auxiliary_loss_clip": 0.01149459, + "auxiliary_loss_mlp": 0.0110267, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00053144, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 1.742838245528691, + "language_loss": 0.73960936, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76213062, + "num_input_tokens_seen": 280109960, + "step": 12987, + "time_per_iteration": 4.069766998291016 + }, + { + "auxiliary_loss_clip": 0.01141848, + "auxiliary_loss_mlp": 0.01076264, + "balance_loss_clip": 1.0008502, + "balance_loss_mlp": 0.99997061, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7308778605529533, + "language_loss": 0.55021608, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57239723, + "num_input_tokens_seen": 280169805, + "step": 12988, + "time_per_iteration": 3.212707757949829 + }, + { + "auxiliary_loss_clip": 0.01133796, + "auxiliary_loss_mlp": 0.01101507, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00051284, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 3.820979133017951, + "language_loss": 0.80995655, + "learning_rate": 4.82500121484009e-07, + "loss": 0.83230966, + "num_input_tokens_seen": 280184630, + "step": 12989, + "time_per_iteration": 2.6085193157196045 + }, + { + "auxiliary_loss_clip": 0.0111581, + "auxiliary_loss_mlp": 0.01101229, + "balance_loss_clip": 1.00166011, + "balance_loss_mlp": 1.00042593, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 2.236847796070149, + "language_loss": 0.70524728, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72741771, + "num_input_tokens_seen": 280203880, + "step": 12990, + "time_per_iteration": 2.7399966716766357 + }, + { + "auxiliary_loss_clip": 0.01132996, + "auxiliary_loss_mlp": 0.01101678, + "balance_loss_clip": 1.00181115, + "balance_loss_mlp": 1.00030231, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 2.4819571219027248, + "language_loss": 0.77729851, + "learning_rate": 4.819928599145184e-07, + "loss": 0.7996453, + "num_input_tokens_seen": 280220460, + "step": 12991, + "time_per_iteration": 2.58500599861145 + }, + { + "auxiliary_loss_clip": 0.01117132, + "auxiliary_loss_mlp": 0.01102255, + "balance_loss_clip": 1.00160694, + "balance_loss_mlp": 1.00049806, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.5165840661932994, + "language_loss": 0.65806007, + "learning_rate": 4.817393154694398e-07, + "loss": 0.68025398, + "num_input_tokens_seen": 280242680, + "step": 12992, + "time_per_iteration": 2.8611361980438232 + }, + { + "auxiliary_loss_clip": 0.01164637, + "auxiliary_loss_mlp": 0.01102158, + "balance_loss_clip": 1.00198948, + "balance_loss_mlp": 1.00040078, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.7565026755426483, + "language_loss": 0.6210345, + "learning_rate": 4.814858285969578e-07, + "loss": 0.64370239, + "num_input_tokens_seen": 280260655, + "step": 12993, + "time_per_iteration": 2.5190658569335938 + }, + { + "auxiliary_loss_clip": 0.01132998, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_clip": 1.00175941, + "balance_loss_mlp": 1.00043941, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.5830238268304682, + "language_loss": 0.68400013, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70635116, + "num_input_tokens_seen": 280281185, + "step": 12994, + "time_per_iteration": 2.639277935028076 + }, + { + "auxiliary_loss_clip": 0.01164607, + "auxiliary_loss_mlp": 0.01101418, + "balance_loss_clip": 1.0019865, + "balance_loss_mlp": 1.00042415, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 2.309376328571195, + "language_loss": 0.69305706, + "learning_rate": 4.809790276082335e-07, + "loss": 0.71571732, + "num_input_tokens_seen": 280298255, + "step": 12995, + "time_per_iteration": 2.513885259628296 + }, + { + "auxiliary_loss_clip": 0.01114978, + "auxiliary_loss_mlp": 0.01100507, + "balance_loss_clip": 1.00180626, + "balance_loss_mlp": 1.00037098, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 3.9662763535101586, + "language_loss": 0.75083065, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77298558, + "num_input_tokens_seen": 280319000, + "step": 12996, + "time_per_iteration": 2.6656529903411865 + }, + { + "auxiliary_loss_clip": 0.01164719, + "auxiliary_loss_mlp": 0.01103399, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00049722, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 3.0730953776048446, + "language_loss": 0.68282521, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70550644, + "num_input_tokens_seen": 280336375, + "step": 12997, + "time_per_iteration": 2.505833864212036 + }, + { + "auxiliary_loss_clip": 0.01164599, + "auxiliary_loss_mlp": 0.01102906, + "balance_loss_clip": 1.00189221, + "balance_loss_mlp": 1.00048089, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.6511047890261235, + "language_loss": 0.82397032, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84664536, + "num_input_tokens_seen": 280358760, + "step": 12998, + "time_per_iteration": 4.013528347015381 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01102536, + "balance_loss_clip": 1.00185609, + "balance_loss_mlp": 1.00049329, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 1.8781610226050753, + "language_loss": 0.7451129, + "learning_rate": 4.799661169247453e-07, + "loss": 0.7674911, + "num_input_tokens_seen": 280377085, + "step": 12999, + "time_per_iteration": 2.6034536361694336 + }, + { + "auxiliary_loss_clip": 0.01149774, + "auxiliary_loss_mlp": 0.01102612, + "balance_loss_clip": 1.00199723, + "balance_loss_mlp": 1.00047326, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.453482960072705, + "language_loss": 0.84299219, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86551607, + "num_input_tokens_seen": 280395465, + "step": 13000, + "time_per_iteration": 2.5725691318511963 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01102448, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00040531, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 1.7985823280304711, + "language_loss": 0.65795165, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68046004, + "num_input_tokens_seen": 280412775, + "step": 13001, + "time_per_iteration": 2.5775721073150635 + }, + { + "auxiliary_loss_clip": 0.01118761, + "auxiliary_loss_mlp": 0.0110248, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00053239, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.5696709775919522, + "language_loss": 0.67037129, + "learning_rate": 4.792070390968027e-07, + "loss": 0.69258374, + "num_input_tokens_seen": 280432905, + "step": 13002, + "time_per_iteration": 4.119348526000977 + }, + { + "auxiliary_loss_clip": 0.01149471, + "auxiliary_loss_mlp": 0.01104173, + "balance_loss_clip": 1.00196838, + "balance_loss_mlp": 1.00050807, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.373795089195399, + "language_loss": 0.73380691, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75634336, + "num_input_tokens_seen": 280450785, + "step": 13003, + "time_per_iteration": 2.5716617107391357 + }, + { + "auxiliary_loss_clip": 0.01149934, + "auxiliary_loss_mlp": 0.01101601, + "balance_loss_clip": 1.00199842, + "balance_loss_mlp": 1.00046432, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.6554652150212863, + "language_loss": 0.62014514, + "learning_rate": 4.787012755386233e-07, + "loss": 0.6426605, + "num_input_tokens_seen": 280468400, + "step": 13004, + "time_per_iteration": 2.566174030303955 + }, + { + "auxiliary_loss_clip": 0.01164236, + "auxiliary_loss_mlp": 0.01100762, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00053048, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 2.149057327024703, + "language_loss": 0.82995844, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85260844, + "num_input_tokens_seen": 280483930, + "step": 13005, + "time_per_iteration": 2.512258291244507 + }, + { + "auxiliary_loss_clip": 0.01116468, + "auxiliary_loss_mlp": 0.00747423, + "balance_loss_clip": 1.00171101, + "balance_loss_mlp": 1.00050831, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.700972701813179, + "language_loss": 0.72391105, + "learning_rate": 4.781957427316432e-07, + "loss": 0.7425499, + "num_input_tokens_seen": 280503465, + "step": 13006, + "time_per_iteration": 2.732542037963867 + }, + { + "auxiliary_loss_clip": 0.01147721, + "auxiliary_loss_mlp": 0.00747499, + "balance_loss_clip": 1.00180471, + "balance_loss_mlp": 1.00052166, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.7451003513802192, + "language_loss": 0.72037148, + "learning_rate": 4.779430628838157e-07, + "loss": 0.73932374, + "num_input_tokens_seen": 280523375, + "step": 13007, + "time_per_iteration": 2.5965404510498047 + }, + { + "auxiliary_loss_clip": 0.01164631, + "auxiliary_loss_mlp": 0.01102743, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00050867, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 2.0750208192326967, + "language_loss": 0.69140875, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71408248, + "num_input_tokens_seen": 280542920, + "step": 13008, + "time_per_iteration": 2.581439971923828 + }, + { + "auxiliary_loss_clip": 0.01133558, + "auxiliary_loss_mlp": 0.01102952, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00033641, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.7141493118462832, + "language_loss": 0.69738436, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71974951, + "num_input_tokens_seen": 280561700, + "step": 13009, + "time_per_iteration": 2.6818196773529053 + }, + { + "auxiliary_loss_clip": 0.01116587, + "auxiliary_loss_mlp": 0.01101996, + "balance_loss_clip": 1.00175381, + "balance_loss_mlp": 1.00052476, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 2.073873510423325, + "language_loss": 0.81974965, + "learning_rate": 4.771853696779586e-07, + "loss": 0.84193552, + "num_input_tokens_seen": 280580605, + "step": 13010, + "time_per_iteration": 2.747640609741211 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.0110213, + "balance_loss_clip": 1.00182486, + "balance_loss_mlp": 1.00046802, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.7502176225201391, + "language_loss": 0.62399364, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64650697, + "num_input_tokens_seen": 280601495, + "step": 13011, + "time_per_iteration": 2.6835460662841797 + }, + { + "auxiliary_loss_clip": 0.01147861, + "auxiliary_loss_mlp": 0.01101307, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00036049, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.9932887262220444, + "language_loss": 0.7065028, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72899449, + "num_input_tokens_seen": 280622760, + "step": 13012, + "time_per_iteration": 2.7075557708740234 + }, + { + "auxiliary_loss_clip": 0.01158446, + "auxiliary_loss_mlp": 0.01075868, + "balance_loss_clip": 1.00073159, + "balance_loss_mlp": 0.99995595, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7070107349582009, + "language_loss": 0.55019987, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57254303, + "num_input_tokens_seen": 280687115, + "step": 13013, + "time_per_iteration": 3.1816251277923584 + }, + { + "auxiliary_loss_clip": 0.0113363, + "auxiliary_loss_mlp": 0.01104106, + "balance_loss_clip": 1.00195944, + "balance_loss_mlp": 1.00063229, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 2.1699179285941073, + "language_loss": 0.65725935, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67963666, + "num_input_tokens_seen": 280705000, + "step": 13014, + "time_per_iteration": 2.603994369506836 + }, + { + "auxiliary_loss_clip": 0.01125164, + "auxiliary_loss_mlp": 0.01075519, + "balance_loss_clip": 1.00065875, + "balance_loss_mlp": 0.99998826, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.7220990806353721, + "language_loss": 0.58453053, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60653734, + "num_input_tokens_seen": 280773525, + "step": 13015, + "time_per_iteration": 3.233450412750244 + }, + { + "auxiliary_loss_clip": 0.01133772, + "auxiliary_loss_mlp": 0.01101697, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.00041699, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 2.060000658237004, + "language_loss": 0.7475096, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76986426, + "num_input_tokens_seen": 280791915, + "step": 13016, + "time_per_iteration": 2.6272292137145996 + }, + { + "auxiliary_loss_clip": 0.01164495, + "auxiliary_loss_mlp": 0.01103153, + "balance_loss_clip": 1.00189161, + "balance_loss_mlp": 1.00044227, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.6347426504696227, + "language_loss": 0.75348306, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77615952, + "num_input_tokens_seen": 280811460, + "step": 13017, + "time_per_iteration": 2.5127763748168945 + }, + { + "auxiliary_loss_clip": 0.01130586, + "auxiliary_loss_mlp": 0.0110297, + "balance_loss_clip": 1.00172532, + "balance_loss_mlp": 1.00054538, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 2.4349960790265675, + "language_loss": 0.7549423, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77727783, + "num_input_tokens_seen": 280825415, + "step": 13018, + "time_per_iteration": 2.5792317390441895 + }, + { + "auxiliary_loss_clip": 0.01164508, + "auxiliary_loss_mlp": 0.01102228, + "balance_loss_clip": 1.00189364, + "balance_loss_mlp": 1.00047064, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.508645476109364, + "language_loss": 0.77243686, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79510427, + "num_input_tokens_seen": 280845335, + "step": 13019, + "time_per_iteration": 2.5522220134735107 + }, + { + "auxiliary_loss_clip": 0.0109976, + "auxiliary_loss_mlp": 0.01101574, + "balance_loss_clip": 1.00159907, + "balance_loss_mlp": 1.00029397, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.684415725104357, + "language_loss": 0.67656219, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69857556, + "num_input_tokens_seen": 280867145, + "step": 13020, + "time_per_iteration": 2.745633363723755 + }, + { + "auxiliary_loss_clip": 0.01147928, + "auxiliary_loss_mlp": 0.01102688, + "balance_loss_clip": 1.00194502, + "balance_loss_mlp": 1.00054979, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 1.8064906450461602, + "language_loss": 0.62344134, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64594746, + "num_input_tokens_seen": 280886185, + "step": 13021, + "time_per_iteration": 2.570589780807495 + }, + { + "auxiliary_loss_clip": 0.01164415, + "auxiliary_loss_mlp": 0.0110171, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00062048, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.8671009239173246, + "language_loss": 0.68758476, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71024597, + "num_input_tokens_seen": 280907665, + "step": 13022, + "time_per_iteration": 3.9474470615386963 + }, + { + "auxiliary_loss_clip": 0.01094616, + "auxiliary_loss_mlp": 0.01076404, + "balance_loss_clip": 1.00107741, + "balance_loss_mlp": 1.00010991, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6375533956638539, + "language_loss": 0.56220275, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58391291, + "num_input_tokens_seen": 280971405, + "step": 13023, + "time_per_iteration": 3.445141077041626 + }, + { + "auxiliary_loss_clip": 0.01134874, + "auxiliary_loss_mlp": 0.01101289, + "balance_loss_clip": 1.00186586, + "balance_loss_mlp": 1.00048542, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.7714975235604777, + "language_loss": 0.6718486, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69421017, + "num_input_tokens_seen": 280989615, + "step": 13024, + "time_per_iteration": 2.7230184078216553 + }, + { + "auxiliary_loss_clip": 0.01164626, + "auxiliary_loss_mlp": 0.01103291, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00038934, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.653509189292984, + "language_loss": 0.77974093, + "learning_rate": 4.734047044272498e-07, + "loss": 0.80242014, + "num_input_tokens_seen": 281009450, + "step": 13025, + "time_per_iteration": 3.987398624420166 + }, + { + "auxiliary_loss_clip": 0.01133172, + "auxiliary_loss_mlp": 0.01101571, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.00057685, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 2.139680943394627, + "language_loss": 0.78558189, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80792928, + "num_input_tokens_seen": 281028120, + "step": 13026, + "time_per_iteration": 2.657722234725952 + }, + { + "auxiliary_loss_clip": 0.0114775, + "auxiliary_loss_mlp": 0.01101104, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00039601, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 1.9970564818376493, + "language_loss": 0.75563323, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77812177, + "num_input_tokens_seen": 281042130, + "step": 13027, + "time_per_iteration": 2.5736746788024902 + }, + { + "auxiliary_loss_clip": 0.01149315, + "auxiliary_loss_mlp": 0.01101479, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00048494, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.7383748607797729, + "language_loss": 0.70518541, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72769332, + "num_input_tokens_seen": 281060945, + "step": 13028, + "time_per_iteration": 2.589991807937622 + }, + { + "auxiliary_loss_clip": 0.01101381, + "auxiliary_loss_mlp": 0.0110233, + "balance_loss_clip": 1.00175464, + "balance_loss_mlp": 1.00057316, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 1.9939827827873955, + "language_loss": 0.68899679, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71103394, + "num_input_tokens_seen": 281079270, + "step": 13029, + "time_per_iteration": 2.6959221363067627 + }, + { + "auxiliary_loss_clip": 0.01118299, + "auxiliary_loss_mlp": 0.01103793, + "balance_loss_clip": 1.00168514, + "balance_loss_mlp": 1.00050998, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.7217436644506479, + "language_loss": 0.80837286, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83059382, + "num_input_tokens_seen": 281099500, + "step": 13030, + "time_per_iteration": 2.8359715938568115 + }, + { + "auxiliary_loss_clip": 0.01149853, + "auxiliary_loss_mlp": 0.01102285, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.00043213, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 2.1155543197842746, + "language_loss": 0.70452416, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72704554, + "num_input_tokens_seen": 281121250, + "step": 13031, + "time_per_iteration": 2.8111770153045654 + }, + { + "auxiliary_loss_clip": 0.01116198, + "auxiliary_loss_mlp": 0.01101583, + "balance_loss_clip": 1.00179195, + "balance_loss_mlp": 1.00039864, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 3.091414443729803, + "language_loss": 0.78535801, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80753583, + "num_input_tokens_seen": 281138760, + "step": 13032, + "time_per_iteration": 2.752291202545166 + }, + { + "auxiliary_loss_clip": 0.01149856, + "auxiliary_loss_mlp": 0.01102931, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00060177, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.8323567676586863, + "language_loss": 0.62969053, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.6522184, + "num_input_tokens_seen": 281157420, + "step": 13033, + "time_per_iteration": 2.746814250946045 + }, + { + "auxiliary_loss_clip": 0.01147741, + "auxiliary_loss_mlp": 0.01102131, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00046945, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.7354060370236677, + "language_loss": 0.72074938, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74324811, + "num_input_tokens_seen": 281174620, + "step": 13034, + "time_per_iteration": 2.632880449295044 + }, + { + "auxiliary_loss_clip": 0.01164744, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.00200415, + "balance_loss_mlp": 1.00049388, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 2.398645953364429, + "language_loss": 0.71799743, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.73711991, + "num_input_tokens_seen": 281193865, + "step": 13035, + "time_per_iteration": 4.171926259994507 + }, + { + "auxiliary_loss_clip": 0.01164558, + "auxiliary_loss_mlp": 0.01102908, + "balance_loss_clip": 1.00191057, + "balance_loss_mlp": 1.00057888, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 3.023496399789661, + "language_loss": 0.66277325, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68544793, + "num_input_tokens_seen": 281212250, + "step": 13036, + "time_per_iteration": 2.7411396503448486 + }, + { + "auxiliary_loss_clip": 0.01148021, + "auxiliary_loss_mlp": 0.01103462, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00046515, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 3.6753818579386106, + "language_loss": 0.72770077, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75021559, + "num_input_tokens_seen": 281230850, + "step": 13037, + "time_per_iteration": 2.6315813064575195 + }, + { + "auxiliary_loss_clip": 0.01117914, + "auxiliary_loss_mlp": 0.01101855, + "balance_loss_clip": 1.00164044, + "balance_loss_mlp": 1.00047994, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 3.1895063238266914, + "language_loss": 0.60508287, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62728053, + "num_input_tokens_seen": 281249810, + "step": 13038, + "time_per_iteration": 2.7859725952148438 + }, + { + "auxiliary_loss_clip": 0.01133047, + "auxiliary_loss_mlp": 0.01101979, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00050783, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.7227980152318612, + "language_loss": 0.67802411, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70037436, + "num_input_tokens_seen": 281273730, + "step": 13039, + "time_per_iteration": 2.8011062145233154 + }, + { + "auxiliary_loss_clip": 0.0111416, + "auxiliary_loss_mlp": 0.01100895, + "balance_loss_clip": 1.00160718, + "balance_loss_mlp": 1.00037789, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 3.681058011997338, + "language_loss": 0.69520342, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71735394, + "num_input_tokens_seen": 281293670, + "step": 13040, + "time_per_iteration": 4.227052688598633 + }, + { + "auxiliary_loss_clip": 0.01101213, + "auxiliary_loss_mlp": 0.01101653, + "balance_loss_clip": 1.00163817, + "balance_loss_mlp": 1.00046802, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.5239062829421453, + "language_loss": 0.67496753, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69699621, + "num_input_tokens_seen": 281313070, + "step": 13041, + "time_per_iteration": 2.7990095615386963 + }, + { + "auxiliary_loss_clip": 0.01142695, + "auxiliary_loss_mlp": 0.00745482, + "balance_loss_clip": 1.00086117, + "balance_loss_mlp": 1.00024819, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6586157439314383, + "language_loss": 0.57369214, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59257388, + "num_input_tokens_seen": 281374880, + "step": 13042, + "time_per_iteration": 3.202361822128296 + }, + { + "auxiliary_loss_clip": 0.01133132, + "auxiliary_loss_mlp": 0.01101909, + "balance_loss_clip": 1.0019697, + "balance_loss_mlp": 1.00043821, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 1.972955631711505, + "language_loss": 0.8400557, + "learning_rate": 4.688851018730369e-07, + "loss": 0.86240613, + "num_input_tokens_seen": 281392620, + "step": 13043, + "time_per_iteration": 2.756932020187378 + }, + { + "auxiliary_loss_clip": 0.01147525, + "auxiliary_loss_mlp": 0.01101904, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00043309, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.550736762106087, + "language_loss": 0.88457942, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90707368, + "num_input_tokens_seen": 281413140, + "step": 13044, + "time_per_iteration": 2.740086078643799 + }, + { + "auxiliary_loss_clip": 0.01132904, + "auxiliary_loss_mlp": 0.01103526, + "balance_loss_clip": 1.00184762, + "balance_loss_mlp": 1.00052941, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 1.6872371261222465, + "language_loss": 0.78877985, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81114417, + "num_input_tokens_seen": 281430860, + "step": 13045, + "time_per_iteration": 2.7601778507232666 + }, + { + "auxiliary_loss_clip": 0.01131058, + "auxiliary_loss_mlp": 0.01102154, + "balance_loss_clip": 1.00181913, + "balance_loss_mlp": 1.00039744, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.6015798810372905, + "language_loss": 0.72305644, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74538863, + "num_input_tokens_seen": 281451385, + "step": 13046, + "time_per_iteration": 2.763957977294922 + }, + { + "auxiliary_loss_clip": 0.01103105, + "auxiliary_loss_mlp": 0.01101975, + "balance_loss_clip": 1.00176895, + "balance_loss_mlp": 1.00050402, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.5366358275952057, + "language_loss": 0.63001621, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65206707, + "num_input_tokens_seen": 281472255, + "step": 13047, + "time_per_iteration": 2.9065864086151123 + }, + { + "auxiliary_loss_clip": 0.01149221, + "auxiliary_loss_mlp": 0.01102047, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00048077, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.7299689342858582, + "language_loss": 0.7300328, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75254554, + "num_input_tokens_seen": 281492860, + "step": 13048, + "time_per_iteration": 2.7408969402313232 + }, + { + "auxiliary_loss_clip": 0.01131578, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00052619, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 2.0405338638489776, + "language_loss": 0.7423861, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.7647295, + "num_input_tokens_seen": 281511815, + "step": 13049, + "time_per_iteration": 2.7223103046417236 + }, + { + "auxiliary_loss_clip": 0.01164739, + "auxiliary_loss_mlp": 0.01103705, + "balance_loss_clip": 1.00192654, + "balance_loss_mlp": 1.00042152, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 2.1796037028248714, + "language_loss": 0.72746569, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.75015008, + "num_input_tokens_seen": 281530090, + "step": 13050, + "time_per_iteration": 2.616609573364258 + }, + { + "auxiliary_loss_clip": 0.01149984, + "auxiliary_loss_mlp": 0.01101583, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.00049353, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.1444864999928215, + "language_loss": 0.73489749, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75741315, + "num_input_tokens_seen": 281547075, + "step": 13051, + "time_per_iteration": 2.638481378555298 + }, + { + "auxiliary_loss_clip": 0.01164588, + "auxiliary_loss_mlp": 0.01102728, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00058937, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 2.208322115928983, + "language_loss": 0.72740829, + "learning_rate": 4.666323514209227e-07, + "loss": 0.75008154, + "num_input_tokens_seen": 281568080, + "step": 13052, + "time_per_iteration": 2.785404682159424 + }, + { + "auxiliary_loss_clip": 0.01130855, + "auxiliary_loss_mlp": 0.01101084, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00047183, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 2.05109858653438, + "language_loss": 0.69396079, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71628022, + "num_input_tokens_seen": 281586925, + "step": 13053, + "time_per_iteration": 2.6515657901763916 + }, + { + "auxiliary_loss_clip": 0.01149407, + "auxiliary_loss_mlp": 0.01100947, + "balance_loss_clip": 1.00208628, + "balance_loss_mlp": 1.00042939, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 1.9913494784106454, + "language_loss": 0.70492828, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72743177, + "num_input_tokens_seen": 281603915, + "step": 13054, + "time_per_iteration": 2.691438674926758 + }, + { + "auxiliary_loss_clip": 0.01148018, + "auxiliary_loss_mlp": 0.01102319, + "balance_loss_clip": 1.00185502, + "balance_loss_mlp": 1.00056219, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.5977681782800408, + "language_loss": 0.75736564, + "learning_rate": 4.658824808801938e-07, + "loss": 0.77986896, + "num_input_tokens_seen": 281624220, + "step": 13055, + "time_per_iteration": 2.6610159873962402 + }, + { + "auxiliary_loss_clip": 0.01164654, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.00196266, + "balance_loss_mlp": 1.00040126, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 1.851585961105205, + "language_loss": 0.74855566, + "learning_rate": 4.656326403684283e-07, + "loss": 0.77123046, + "num_input_tokens_seen": 281642325, + "step": 13056, + "time_per_iteration": 2.6306300163269043 + }, + { + "auxiliary_loss_clip": 0.01084123, + "auxiliary_loss_mlp": 0.01101798, + "balance_loss_clip": 1.00169158, + "balance_loss_mlp": 1.00042295, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.6374685357687087, + "language_loss": 0.70287013, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.7247293, + "num_input_tokens_seen": 281663065, + "step": 13057, + "time_per_iteration": 2.8840441703796387 + }, + { + "auxiliary_loss_clip": 0.01116532, + "auxiliary_loss_mlp": 0.01102397, + "balance_loss_clip": 1.00176179, + "balance_loss_mlp": 1.00044978, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 2.2848950193377635, + "language_loss": 0.76379633, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78598559, + "num_input_tokens_seen": 281681005, + "step": 13058, + "time_per_iteration": 2.7788939476013184 + }, + { + "auxiliary_loss_clip": 0.01149709, + "auxiliary_loss_mlp": 0.01102279, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00042641, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 2.001287144985841, + "language_loss": 0.70941699, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.73193693, + "num_input_tokens_seen": 281697965, + "step": 13059, + "time_per_iteration": 4.0555572509765625 + }, + { + "auxiliary_loss_clip": 0.01116793, + "auxiliary_loss_mlp": 0.01102945, + "balance_loss_clip": 1.00165772, + "balance_loss_mlp": 1.00061631, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 1.910959301958986, + "language_loss": 0.76766443, + "learning_rate": 4.646338602497144e-07, + "loss": 0.7898618, + "num_input_tokens_seen": 281716035, + "step": 13060, + "time_per_iteration": 2.7274181842803955 + }, + { + "auxiliary_loss_clip": 0.01132501, + "auxiliary_loss_mlp": 0.01101742, + "balance_loss_clip": 1.00200677, + "balance_loss_mlp": 1.00046206, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 1.8590556470791055, + "language_loss": 0.77278471, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79512709, + "num_input_tokens_seen": 281732815, + "step": 13061, + "time_per_iteration": 2.736858367919922 + }, + { + "auxiliary_loss_clip": 0.01116234, + "auxiliary_loss_mlp": 0.01101856, + "balance_loss_clip": 1.0016067, + "balance_loss_mlp": 1.00038552, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 2.1348917923757145, + "language_loss": 0.73811579, + "learning_rate": 4.641348194799164e-07, + "loss": 0.7602967, + "num_input_tokens_seen": 281751980, + "step": 13062, + "time_per_iteration": 2.8313076496124268 + }, + { + "auxiliary_loss_clip": 0.0114963, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00036311, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.7519146872478806, + "language_loss": 0.68708813, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70959991, + "num_input_tokens_seen": 281772670, + "step": 13063, + "time_per_iteration": 4.072641134262085 + }, + { + "auxiliary_loss_clip": 0.01149926, + "auxiliary_loss_mlp": 0.01101412, + "balance_loss_clip": 1.00202775, + "balance_loss_mlp": 1.00041807, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 1.7335947610705293, + "language_loss": 0.72763491, + "learning_rate": 4.636360116707625e-07, + "loss": 0.7501483, + "num_input_tokens_seen": 281792930, + "step": 13064, + "time_per_iteration": 2.7922427654266357 + }, + { + "auxiliary_loss_clip": 0.01117998, + "auxiliary_loss_mlp": 0.01101342, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.0004431, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 3.4715436685565986, + "language_loss": 0.67924726, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70144057, + "num_input_tokens_seen": 281811805, + "step": 13065, + "time_per_iteration": 2.7926430702209473 + }, + { + "auxiliary_loss_clip": 0.0114817, + "auxiliary_loss_mlp": 0.01101952, + "balance_loss_clip": 1.00196087, + "balance_loss_mlp": 1.00057638, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.909728976128478, + "language_loss": 0.76562852, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78812969, + "num_input_tokens_seen": 281831885, + "step": 13066, + "time_per_iteration": 2.678917407989502 + }, + { + "auxiliary_loss_clip": 0.01158525, + "auxiliary_loss_mlp": 0.01075862, + "balance_loss_clip": 1.00081158, + "balance_loss_mlp": 0.99994963, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7015842074242387, + "language_loss": 0.53411204, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55645585, + "num_input_tokens_seen": 281900310, + "step": 13067, + "time_per_iteration": 3.2622299194335938 + }, + { + "auxiliary_loss_clip": 0.01098169, + "auxiliary_loss_mlp": 0.01101745, + "balance_loss_clip": 1.00155437, + "balance_loss_mlp": 1.00036979, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.5490173786159864, + "language_loss": 0.67850548, + "learning_rate": 4.62639095236989e-07, + "loss": 0.7005046, + "num_input_tokens_seen": 281918870, + "step": 13068, + "time_per_iteration": 2.785585641860962 + }, + { + "auxiliary_loss_clip": 0.01115183, + "auxiliary_loss_mlp": 0.01101288, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.0003891, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 2.1458075794394675, + "language_loss": 0.68221325, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70437795, + "num_input_tokens_seen": 281936905, + "step": 13069, + "time_per_iteration": 2.8522074222564697 + }, + { + "auxiliary_loss_clip": 0.01149068, + "auxiliary_loss_mlp": 0.01102202, + "balance_loss_clip": 1.00192297, + "balance_loss_mlp": 1.00054026, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.6261748479254188, + "language_loss": 0.76988345, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79239619, + "num_input_tokens_seen": 281955625, + "step": 13070, + "time_per_iteration": 2.6722497940063477 + }, + { + "auxiliary_loss_clip": 0.01101463, + "auxiliary_loss_mlp": 0.01101657, + "balance_loss_clip": 1.0016849, + "balance_loss_mlp": 1.00037706, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.5843418208970623, + "language_loss": 0.66090798, + "learning_rate": 4.618920199958083e-07, + "loss": 0.68293923, + "num_input_tokens_seen": 281973285, + "step": 13071, + "time_per_iteration": 2.790945053100586 + }, + { + "auxiliary_loss_clip": 0.01101518, + "auxiliary_loss_mlp": 0.01102632, + "balance_loss_clip": 1.0016712, + "balance_loss_mlp": 1.00049329, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.7169600741358821, + "language_loss": 0.73957598, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76161748, + "num_input_tokens_seen": 281991410, + "step": 13072, + "time_per_iteration": 2.8132355213165283 + }, + { + "auxiliary_loss_clip": 0.0114814, + "auxiliary_loss_mlp": 0.01102404, + "balance_loss_clip": 1.00188601, + "balance_loss_mlp": 1.00036085, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 2.0104878370413712, + "language_loss": 0.71240389, + "learning_rate": 4.613942614453268e-07, + "loss": 0.7349093, + "num_input_tokens_seen": 282010845, + "step": 13073, + "time_per_iteration": 4.178360462188721 + }, + { + "auxiliary_loss_clip": 0.01131536, + "auxiliary_loss_mlp": 0.01102564, + "balance_loss_clip": 1.00189412, + "balance_loss_mlp": 1.00061655, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.5882365453130343, + "language_loss": 0.76282227, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78516322, + "num_input_tokens_seen": 282029635, + "step": 13074, + "time_per_iteration": 2.7689597606658936 + }, + { + "auxiliary_loss_clip": 0.01117456, + "auxiliary_loss_mlp": 0.01101516, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.0005219, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.6190899363501816, + "language_loss": 0.74950296, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77169269, + "num_input_tokens_seen": 282050285, + "step": 13075, + "time_per_iteration": 2.778965711593628 + }, + { + "auxiliary_loss_clip": 0.01116022, + "auxiliary_loss_mlp": 0.01102079, + "balance_loss_clip": 1.00166011, + "balance_loss_mlp": 1.00032258, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.6348122312846858, + "language_loss": 0.68607253, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.7082535, + "num_input_tokens_seen": 282071040, + "step": 13076, + "time_per_iteration": 2.7963430881500244 + }, + { + "auxiliary_loss_clip": 0.01149687, + "auxiliary_loss_mlp": 0.01102533, + "balance_loss_clip": 1.00186813, + "balance_loss_mlp": 1.00048947, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 2.835402821226179, + "language_loss": 0.80344838, + "learning_rate": 4.603994445488282e-07, + "loss": 0.82597053, + "num_input_tokens_seen": 282086610, + "step": 13077, + "time_per_iteration": 2.6425135135650635 + }, + { + "auxiliary_loss_clip": 0.01147695, + "auxiliary_loss_mlp": 0.01102531, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00053596, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.5932868087940713, + "language_loss": 0.70815825, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.7306605, + "num_input_tokens_seen": 282107440, + "step": 13078, + "time_per_iteration": 4.29844856262207 + }, + { + "auxiliary_loss_clip": 0.01149656, + "auxiliary_loss_mlp": 0.01101631, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00054169, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.665595024133801, + "language_loss": 0.81242979, + "learning_rate": 4.599023863537039e-07, + "loss": 0.8349427, + "num_input_tokens_seen": 282127290, + "step": 13079, + "time_per_iteration": 2.798039674758911 + }, + { + "auxiliary_loss_clip": 0.01119395, + "auxiliary_loss_mlp": 0.01101376, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00038218, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.5164610771457319, + "language_loss": 0.68402588, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70623362, + "num_input_tokens_seen": 282147505, + "step": 13080, + "time_per_iteration": 2.9422388076782227 + }, + { + "auxiliary_loss_clip": 0.01147833, + "auxiliary_loss_mlp": 0.01102766, + "balance_loss_clip": 1.00189257, + "balance_loss_mlp": 1.00053191, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 2.239467727559362, + "language_loss": 0.69522882, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71773481, + "num_input_tokens_seen": 282166450, + "step": 13081, + "time_per_iteration": 2.7471001148223877 + }, + { + "auxiliary_loss_clip": 0.01131223, + "auxiliary_loss_mlp": 0.01102426, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.00057387, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.775474471223732, + "language_loss": 0.68467569, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70701218, + "num_input_tokens_seen": 282186465, + "step": 13082, + "time_per_iteration": 2.7829129695892334 + }, + { + "auxiliary_loss_clip": 0.01133343, + "auxiliary_loss_mlp": 0.01102246, + "balance_loss_clip": 1.00173235, + "balance_loss_mlp": 1.00048923, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 2.003793844654295, + "language_loss": 0.66222095, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68457687, + "num_input_tokens_seen": 282207180, + "step": 13083, + "time_per_iteration": 2.791179895401001 + }, + { + "auxiliary_loss_clip": 0.01130983, + "auxiliary_loss_mlp": 0.01103133, + "balance_loss_clip": 1.00176156, + "balance_loss_mlp": 1.00051737, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 4.07946089768423, + "language_loss": 0.74835396, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.77069509, + "num_input_tokens_seen": 282225865, + "step": 13084, + "time_per_iteration": 2.809587240219116 + }, + { + "auxiliary_loss_clip": 0.01132522, + "auxiliary_loss_mlp": 0.01101296, + "balance_loss_clip": 1.00176954, + "balance_loss_mlp": 1.00049305, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 1.8373489968389425, + "language_loss": 0.70214927, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72448748, + "num_input_tokens_seen": 282242895, + "step": 13085, + "time_per_iteration": 2.7547261714935303 + }, + { + "auxiliary_loss_clip": 0.01133001, + "auxiliary_loss_mlp": 0.01102894, + "balance_loss_clip": 1.00172877, + "balance_loss_mlp": 1.00046921, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 1.7801782114552318, + "language_loss": 0.72203654, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74439549, + "num_input_tokens_seen": 282260425, + "step": 13086, + "time_per_iteration": 2.7347583770751953 + }, + { + "auxiliary_loss_clip": 0.01164359, + "auxiliary_loss_mlp": 0.01102016, + "balance_loss_clip": 1.00182152, + "balance_loss_mlp": 1.00045013, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.778526551690119, + "language_loss": 0.74758375, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.77024746, + "num_input_tokens_seen": 282279335, + "step": 13087, + "time_per_iteration": 2.607964038848877 + }, + { + "auxiliary_loss_clip": 0.01132682, + "auxiliary_loss_mlp": 0.01100714, + "balance_loss_clip": 1.00182974, + "balance_loss_mlp": 1.0005784, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 1.628168641304645, + "language_loss": 0.71711093, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73944491, + "num_input_tokens_seen": 282299905, + "step": 13088, + "time_per_iteration": 2.773458957672119 + }, + { + "auxiliary_loss_clip": 0.01158495, + "auxiliary_loss_mlp": 0.01075487, + "balance_loss_clip": 1.00076628, + "balance_loss_mlp": 0.99995655, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6661966478923141, + "language_loss": 0.55488652, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57722634, + "num_input_tokens_seen": 282367620, + "step": 13089, + "time_per_iteration": 3.225831985473633 + }, + { + "auxiliary_loss_clip": 0.01141379, + "auxiliary_loss_mlp": 0.01075873, + "balance_loss_clip": 1.00064445, + "balance_loss_mlp": 0.9999606, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7220432946807488, + "language_loss": 0.5001905, + "learning_rate": 4.571727439470976e-07, + "loss": 0.52236307, + "num_input_tokens_seen": 282435695, + "step": 13090, + "time_per_iteration": 3.3168606758117676 + }, + { + "auxiliary_loss_clip": 0.01148967, + "auxiliary_loss_mlp": 0.01100938, + "balance_loss_clip": 1.00190675, + "balance_loss_mlp": 1.00042129, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 1.5734133179064058, + "language_loss": 0.83733177, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.8598308, + "num_input_tokens_seen": 282456025, + "step": 13091, + "time_per_iteration": 2.692439556121826 + }, + { + "auxiliary_loss_clip": 0.01142167, + "auxiliary_loss_mlp": 0.01075858, + "balance_loss_clip": 1.0007925, + "balance_loss_mlp": 0.99994582, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.719628576499681, + "language_loss": 0.63961411, + "learning_rate": 4.566772055150947e-07, + "loss": 0.6617943, + "num_input_tokens_seen": 282520995, + "step": 13092, + "time_per_iteration": 3.1529479026794434 + }, + { + "auxiliary_loss_clip": 0.01133396, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_clip": 1.00180817, + "balance_loss_mlp": 1.00049043, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 2.0964131810447326, + "language_loss": 0.79001755, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81237018, + "num_input_tokens_seen": 282539355, + "step": 13093, + "time_per_iteration": 2.6152496337890625 + }, + { + "auxiliary_loss_clip": 0.01131339, + "auxiliary_loss_mlp": 0.01101188, + "balance_loss_clip": 1.00185859, + "balance_loss_mlp": 1.00047982, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 3.2241810817160785, + "language_loss": 0.75597316, + "learning_rate": 4.561819011749106e-07, + "loss": 0.7782985, + "num_input_tokens_seen": 282555735, + "step": 13094, + "time_per_iteration": 2.694915294647217 + }, + { + "auxiliary_loss_clip": 0.01103726, + "auxiliary_loss_mlp": 0.01102188, + "balance_loss_clip": 1.00187194, + "balance_loss_mlp": 1.0004307, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.6523568160855364, + "language_loss": 0.79594219, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81800133, + "num_input_tokens_seen": 282574550, + "step": 13095, + "time_per_iteration": 2.735318660736084 + }, + { + "auxiliary_loss_clip": 0.01147389, + "auxiliary_loss_mlp": 0.01103034, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00041866, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 2.2230900336918844, + "language_loss": 0.67994589, + "learning_rate": 4.556868310016715e-07, + "loss": 0.70245016, + "num_input_tokens_seen": 282596520, + "step": 13096, + "time_per_iteration": 2.685102939605713 + }, + { + "auxiliary_loss_clip": 0.01133348, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_clip": 1.00170183, + "balance_loss_mlp": 1.00048566, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.4341988722422783, + "language_loss": 0.7034831, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72582281, + "num_input_tokens_seen": 282620560, + "step": 13097, + "time_per_iteration": 4.18690299987793 + }, + { + "auxiliary_loss_clip": 0.01133287, + "auxiliary_loss_mlp": 0.01103663, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00057101, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.7185880479227522, + "language_loss": 0.80703568, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82940525, + "num_input_tokens_seen": 282639830, + "step": 13098, + "time_per_iteration": 2.6951217651367188 + }, + { + "auxiliary_loss_clip": 0.01116207, + "auxiliary_loss_mlp": 0.01101278, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00047457, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.8856606720277604, + "language_loss": 0.7404722, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76264703, + "num_input_tokens_seen": 282660130, + "step": 13099, + "time_per_iteration": 2.678342580795288 + }, + { + "auxiliary_loss_clip": 0.01133017, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_clip": 1.0018512, + "balance_loss_mlp": 1.00032735, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.6894946926705543, + "language_loss": 0.78108227, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80343413, + "num_input_tokens_seen": 282681125, + "step": 13100, + "time_per_iteration": 4.070339679718018 + }, + { + "auxiliary_loss_clip": 0.01150042, + "auxiliary_loss_mlp": 0.00747428, + "balance_loss_clip": 1.0020051, + "balance_loss_mlp": 1.00049484, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.632878972967601, + "language_loss": 0.6670022, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68597686, + "num_input_tokens_seen": 282696690, + "step": 13101, + "time_per_iteration": 2.619901657104492 + }, + { + "auxiliary_loss_clip": 0.01131052, + "auxiliary_loss_mlp": 0.01102016, + "balance_loss_clip": 1.00174236, + "balance_loss_mlp": 1.00054514, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.5522980593089475, + "language_loss": 0.77790201, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.80023271, + "num_input_tokens_seen": 282721210, + "step": 13102, + "time_per_iteration": 2.7700040340423584 + }, + { + "auxiliary_loss_clip": 0.0114994, + "auxiliary_loss_mlp": 0.01101853, + "balance_loss_clip": 1.00185573, + "balance_loss_mlp": 1.00057292, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 2.8042693563380703, + "language_loss": 0.82375503, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84627301, + "num_input_tokens_seen": 282738505, + "step": 13103, + "time_per_iteration": 2.5419423580169678 + }, + { + "auxiliary_loss_clip": 0.0114797, + "auxiliary_loss_mlp": 0.01103405, + "balance_loss_clip": 1.0019269, + "balance_loss_mlp": 1.0004077, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 2.163809338395572, + "language_loss": 0.80565274, + "learning_rate": 4.537088934794913e-07, + "loss": 0.82816648, + "num_input_tokens_seen": 282756895, + "step": 13104, + "time_per_iteration": 2.6117963790893555 + }, + { + "auxiliary_loss_clip": 0.01164571, + "auxiliary_loss_mlp": 0.01102558, + "balance_loss_clip": 1.00196099, + "balance_loss_mlp": 1.00051439, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.9688982247849245, + "language_loss": 0.74281281, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76548409, + "num_input_tokens_seen": 282774955, + "step": 13105, + "time_per_iteration": 2.534292221069336 + }, + { + "auxiliary_loss_clip": 0.01083296, + "auxiliary_loss_mlp": 0.01102829, + "balance_loss_clip": 1.00150239, + "balance_loss_mlp": 1.00059509, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.54262492549223, + "language_loss": 0.75792545, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.77978671, + "num_input_tokens_seen": 282793165, + "step": 13106, + "time_per_iteration": 2.758423328399658 + }, + { + "auxiliary_loss_clip": 0.01097858, + "auxiliary_loss_mlp": 0.01102024, + "balance_loss_clip": 1.00157666, + "balance_loss_mlp": 1.00045776, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 38.05089386518054, + "language_loss": 0.73443156, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75643039, + "num_input_tokens_seen": 282809820, + "step": 13107, + "time_per_iteration": 2.77961802482605 + }, + { + "auxiliary_loss_clip": 0.01164509, + "auxiliary_loss_mlp": 0.0110266, + "balance_loss_clip": 1.00199938, + "balance_loss_mlp": 1.00052154, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.5592699774265424, + "language_loss": 0.73215359, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75482529, + "num_input_tokens_seen": 282828600, + "step": 13108, + "time_per_iteration": 2.5226361751556396 + }, + { + "auxiliary_loss_clip": 0.01158458, + "auxiliary_loss_mlp": 0.01075857, + "balance_loss_clip": 1.00077367, + "balance_loss_mlp": 0.99994415, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.8858906523716616, + "language_loss": 0.60314852, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62549168, + "num_input_tokens_seen": 282882775, + "step": 13109, + "time_per_iteration": 3.056122064590454 + }, + { + "auxiliary_loss_clip": 0.01119333, + "auxiliary_loss_mlp": 0.01102211, + "balance_loss_clip": 1.00209379, + "balance_loss_mlp": 1.00050163, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.647958436847076, + "language_loss": 0.72528303, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74749851, + "num_input_tokens_seen": 282902680, + "step": 13110, + "time_per_iteration": 2.6916134357452393 + }, + { + "auxiliary_loss_clip": 0.01099346, + "auxiliary_loss_mlp": 0.01101359, + "balance_loss_clip": 1.00167418, + "balance_loss_mlp": 1.00036478, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.8975866919072473, + "language_loss": 0.75113475, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.7731418, + "num_input_tokens_seen": 282923625, + "step": 13111, + "time_per_iteration": 4.261708974838257 + }, + { + "auxiliary_loss_clip": 0.01150037, + "auxiliary_loss_mlp": 0.0110146, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00046623, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 2.262038432363754, + "language_loss": 0.61408281, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63659775, + "num_input_tokens_seen": 282941955, + "step": 13112, + "time_per_iteration": 2.5729455947875977 + }, + { + "auxiliary_loss_clip": 0.01131755, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_clip": 1.00183189, + "balance_loss_mlp": 1.0004878, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 2.057051694143309, + "language_loss": 0.67512965, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69747061, + "num_input_tokens_seen": 282961280, + "step": 13113, + "time_per_iteration": 2.611882448196411 + }, + { + "auxiliary_loss_clip": 0.01115591, + "auxiliary_loss_mlp": 0.01101854, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00047803, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 5.504586424954256, + "language_loss": 0.58018351, + "learning_rate": 4.5124174933361e-07, + "loss": 0.60235792, + "num_input_tokens_seen": 282978210, + "step": 13114, + "time_per_iteration": 2.625703811645508 + }, + { + "auxiliary_loss_clip": 0.01098125, + "auxiliary_loss_mlp": 0.01102512, + "balance_loss_clip": 1.00171089, + "balance_loss_mlp": 1.00046873, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.5629574696973458, + "language_loss": 0.66592401, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.68793035, + "num_input_tokens_seen": 282998845, + "step": 13115, + "time_per_iteration": 2.726902961730957 + }, + { + "auxiliary_loss_clip": 0.01133954, + "auxiliary_loss_mlp": 0.01102545, + "balance_loss_clip": 1.00177264, + "balance_loss_mlp": 1.00040674, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 2.1176427639445508, + "language_loss": 0.88771921, + "learning_rate": 4.50749024954048e-07, + "loss": 0.91008425, + "num_input_tokens_seen": 283015200, + "step": 13116, + "time_per_iteration": 3.972418785095215 + }, + { + "auxiliary_loss_clip": 0.01131466, + "auxiliary_loss_mlp": 0.01104143, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00057435, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 5.426842874368886, + "language_loss": 0.72926795, + "learning_rate": 4.505027508812245e-07, + "loss": 0.75162399, + "num_input_tokens_seen": 283033680, + "step": 13117, + "time_per_iteration": 2.633838176727295 + }, + { + "auxiliary_loss_clip": 0.01147555, + "auxiliary_loss_mlp": 0.01100891, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00046968, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.7389659044640708, + "language_loss": 0.8017723, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82425678, + "num_input_tokens_seen": 283050620, + "step": 13118, + "time_per_iteration": 2.542447090148926 + }, + { + "auxiliary_loss_clip": 0.01147692, + "auxiliary_loss_mlp": 0.01102542, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00040364, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.633970215075471, + "language_loss": 0.72945249, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75195479, + "num_input_tokens_seen": 283070215, + "step": 13119, + "time_per_iteration": 2.5936901569366455 + }, + { + "auxiliary_loss_clip": 0.01149689, + "auxiliary_loss_mlp": 0.0110223, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.0004729, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 2.1821232222955746, + "language_loss": 0.71830374, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.74082291, + "num_input_tokens_seen": 283091485, + "step": 13120, + "time_per_iteration": 2.6130993366241455 + }, + { + "auxiliary_loss_clip": 0.01132676, + "auxiliary_loss_mlp": 0.00747292, + "balance_loss_clip": 1.00185311, + "balance_loss_mlp": 1.00049114, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.5649082218364438, + "language_loss": 0.7891227, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.80792236, + "num_input_tokens_seen": 283115040, + "step": 13121, + "time_per_iteration": 2.766710042953491 + }, + { + "auxiliary_loss_clip": 0.01149697, + "auxiliary_loss_mlp": 0.01101509, + "balance_loss_clip": 1.00185311, + "balance_loss_mlp": 1.00051546, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.4211550977967462, + "language_loss": 0.80515218, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82766426, + "num_input_tokens_seen": 283136925, + "step": 13122, + "time_per_iteration": 2.6492371559143066 + }, + { + "auxiliary_loss_clip": 0.01132298, + "auxiliary_loss_mlp": 0.01102018, + "balance_loss_clip": 1.00193119, + "balance_loss_mlp": 1.00045133, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 2.2753884397475375, + "language_loss": 0.78120047, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80354369, + "num_input_tokens_seen": 283155725, + "step": 13123, + "time_per_iteration": 2.6065139770507812 + }, + { + "auxiliary_loss_clip": 0.01119278, + "auxiliary_loss_mlp": 0.01102008, + "balance_loss_clip": 1.00177193, + "balance_loss_mlp": 1.00044167, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 1.7819496655375104, + "language_loss": 0.67214692, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69435978, + "num_input_tokens_seen": 283173845, + "step": 13124, + "time_per_iteration": 2.672260284423828 + }, + { + "auxiliary_loss_clip": 0.0113353, + "auxiliary_loss_mlp": 0.01102672, + "balance_loss_clip": 1.00202751, + "balance_loss_mlp": 1.00043774, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 3.7512060781342433, + "language_loss": 0.72993326, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.75229526, + "num_input_tokens_seen": 283191985, + "step": 13125, + "time_per_iteration": 2.6624441146850586 + }, + { + "auxiliary_loss_clip": 0.01135131, + "auxiliary_loss_mlp": 0.01102708, + "balance_loss_clip": 1.00165522, + "balance_loss_mlp": 1.00047469, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 2.5951322612139816, + "language_loss": 0.7280435, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.75042188, + "num_input_tokens_seen": 283210855, + "step": 13126, + "time_per_iteration": 2.6695141792297363 + }, + { + "auxiliary_loss_clip": 0.01134579, + "auxiliary_loss_mlp": 0.0110285, + "balance_loss_clip": 1.00175965, + "balance_loss_mlp": 1.00042558, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 2.8188077436076706, + "language_loss": 0.76604235, + "learning_rate": 4.480432433327845e-07, + "loss": 0.78841662, + "num_input_tokens_seen": 283229665, + "step": 13127, + "time_per_iteration": 2.583575963973999 + }, + { + "auxiliary_loss_clip": 0.01149507, + "auxiliary_loss_mlp": 0.01102223, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00056159, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.9660326014895741, + "language_loss": 0.85955483, + "learning_rate": 4.47797616101103e-07, + "loss": 0.88207209, + "num_input_tokens_seen": 283248615, + "step": 13128, + "time_per_iteration": 2.6101109981536865 + }, + { + "auxiliary_loss_clip": 0.01149756, + "auxiliary_loss_mlp": 0.01101767, + "balance_loss_clip": 1.00189316, + "balance_loss_mlp": 1.00058186, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.0179679440922063, + "language_loss": 0.68532193, + "learning_rate": 4.475520477290904e-07, + "loss": 0.70783716, + "num_input_tokens_seen": 283267135, + "step": 13129, + "time_per_iteration": 2.6074209213256836 + }, + { + "auxiliary_loss_clip": 0.01143923, + "auxiliary_loss_mlp": 0.0107589, + "balance_loss_clip": 1.00077128, + "balance_loss_mlp": 0.99997777, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7091284226709548, + "language_loss": 0.61514032, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63733846, + "num_input_tokens_seen": 283328940, + "step": 13130, + "time_per_iteration": 3.1244192123413086 + }, + { + "auxiliary_loss_clip": 0.01147482, + "auxiliary_loss_mlp": 0.0110191, + "balance_loss_clip": 1.00195086, + "balance_loss_mlp": 1.00043893, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 2.101225375534961, + "language_loss": 0.73578858, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.75828254, + "num_input_tokens_seen": 283350000, + "step": 13131, + "time_per_iteration": 2.6076557636260986 + }, + { + "auxiliary_loss_clip": 0.01132568, + "auxiliary_loss_mlp": 0.01104406, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00035977, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.777983012745221, + "language_loss": 0.69145596, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.7138257, + "num_input_tokens_seen": 283368020, + "step": 13132, + "time_per_iteration": 2.611736297607422 + }, + { + "auxiliary_loss_clip": 0.01147868, + "auxiliary_loss_mlp": 0.01101714, + "balance_loss_clip": 1.00183141, + "balance_loss_mlp": 1.00052941, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 3.5061014248094047, + "language_loss": 0.6204282, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64292407, + "num_input_tokens_seen": 283387030, + "step": 13133, + "time_per_iteration": 2.5785977840423584 + }, + { + "auxiliary_loss_clip": 0.01115552, + "auxiliary_loss_mlp": 0.01103597, + "balance_loss_clip": 1.00181139, + "balance_loss_mlp": 1.00050521, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.7588412786560452, + "language_loss": 0.79797947, + "learning_rate": 4.463250890899195e-07, + "loss": 0.820171, + "num_input_tokens_seen": 283402090, + "step": 13134, + "time_per_iteration": 4.037726402282715 + }, + { + "auxiliary_loss_clip": 0.01149965, + "auxiliary_loss_mlp": 0.01101736, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.00055087, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 2.048224827745834, + "language_loss": 0.80023509, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82275212, + "num_input_tokens_seen": 283421035, + "step": 13135, + "time_per_iteration": 2.5407328605651855 + }, + { + "auxiliary_loss_clip": 0.01147206, + "auxiliary_loss_mlp": 0.01101709, + "balance_loss_clip": 1.00190639, + "balance_loss_mlp": 1.00042915, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.6542464224289837, + "language_loss": 0.72501099, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74750018, + "num_input_tokens_seen": 283441830, + "step": 13136, + "time_per_iteration": 2.5776777267456055 + }, + { + "auxiliary_loss_clip": 0.01164752, + "auxiliary_loss_mlp": 0.01103405, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00059867, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 2.117945296846739, + "language_loss": 0.71442437, + "learning_rate": 4.455896208180778e-07, + "loss": 0.73710591, + "num_input_tokens_seen": 283459540, + "step": 13137, + "time_per_iteration": 2.5077333450317383 + }, + { + "auxiliary_loss_clip": 0.01164407, + "auxiliary_loss_mlp": 0.0110181, + "balance_loss_clip": 1.00193882, + "balance_loss_mlp": 1.00053, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.908860262094774, + "language_loss": 0.74172425, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.76438642, + "num_input_tokens_seen": 283478790, + "step": 13138, + "time_per_iteration": 3.9300858974456787 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01102487, + "balance_loss_clip": 1.00172329, + "balance_loss_mlp": 1.00053954, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 2.089932376753255, + "language_loss": 0.68168867, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70386004, + "num_input_tokens_seen": 283495720, + "step": 13139, + "time_per_iteration": 2.623852014541626 + }, + { + "auxiliary_loss_clip": 0.01144054, + "auxiliary_loss_mlp": 0.01076287, + "balance_loss_clip": 1.00075316, + "balance_loss_mlp": 0.99999315, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8630576544783654, + "language_loss": 0.60176319, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62396657, + "num_input_tokens_seen": 283558795, + "step": 13140, + "time_per_iteration": 3.23620867729187 + }, + { + "auxiliary_loss_clip": 0.01164684, + "auxiliary_loss_mlp": 0.01101702, + "balance_loss_clip": 1.00200343, + "balance_loss_mlp": 1.00051701, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.7985492650404034, + "language_loss": 0.7620253, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78468919, + "num_input_tokens_seen": 283579305, + "step": 13141, + "time_per_iteration": 2.623871326446533 + }, + { + "auxiliary_loss_clip": 0.01148235, + "auxiliary_loss_mlp": 0.01102632, + "balance_loss_clip": 1.00185204, + "balance_loss_mlp": 1.00049353, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 1.9629294003316982, + "language_loss": 0.68615949, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70866811, + "num_input_tokens_seen": 283597840, + "step": 13142, + "time_per_iteration": 2.5655434131622314 + }, + { + "auxiliary_loss_clip": 0.01082442, + "auxiliary_loss_mlp": 0.01076313, + "balance_loss_clip": 1.00090146, + "balance_loss_mlp": 1.00001895, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8026745412381601, + "language_loss": 0.59993225, + "learning_rate": 4.441202759969049e-07, + "loss": 0.6215198, + "num_input_tokens_seen": 283647950, + "step": 13143, + "time_per_iteration": 3.270463705062866 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01102106, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00053918, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.731556092965511, + "language_loss": 0.74623972, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76841903, + "num_input_tokens_seen": 283670645, + "step": 13144, + "time_per_iteration": 3.0519325733184814 + }, + { + "auxiliary_loss_clip": 0.01149521, + "auxiliary_loss_mlp": 0.01102287, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00043452, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 2.4641699635189718, + "language_loss": 0.83273304, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85525107, + "num_input_tokens_seen": 283688830, + "step": 13145, + "time_per_iteration": 2.6868042945861816 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.01101314, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00041485, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.780499139230545, + "language_loss": 0.72851217, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75100577, + "num_input_tokens_seen": 283708625, + "step": 13146, + "time_per_iteration": 2.593125104904175 + }, + { + "auxiliary_loss_clip": 0.01164518, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_clip": 1.0018723, + "balance_loss_mlp": 1.00039268, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 2.6466690097969936, + "language_loss": 0.75810599, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.78077459, + "num_input_tokens_seen": 283725710, + "step": 13147, + "time_per_iteration": 2.540329933166504 + }, + { + "auxiliary_loss_clip": 0.01148004, + "auxiliary_loss_mlp": 0.01101904, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00062358, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.778646680934502, + "language_loss": 0.72521615, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74771523, + "num_input_tokens_seen": 283744150, + "step": 13148, + "time_per_iteration": 2.5732929706573486 + }, + { + "auxiliary_loss_clip": 0.01149922, + "auxiliary_loss_mlp": 0.01103338, + "balance_loss_clip": 1.00198126, + "balance_loss_mlp": 1.00053191, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 1.9332838131112744, + "language_loss": 0.71646506, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.7389977, + "num_input_tokens_seen": 283764170, + "step": 13149, + "time_per_iteration": 2.6171815395355225 + }, + { + "auxiliary_loss_clip": 0.01120625, + "auxiliary_loss_mlp": 0.01102544, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00050068, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 2.310285730365275, + "language_loss": 0.6600275, + "learning_rate": 4.424087249723225e-07, + "loss": 0.6822592, + "num_input_tokens_seen": 283784305, + "step": 13150, + "time_per_iteration": 4.637053728103638 + }, + { + "auxiliary_loss_clip": 0.01164568, + "auxiliary_loss_mlp": 0.01101219, + "balance_loss_clip": 1.00190794, + "balance_loss_mlp": 1.0004158, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 2.001828381912437, + "language_loss": 0.69858056, + "learning_rate": 4.421644538650231e-07, + "loss": 0.72123843, + "num_input_tokens_seen": 283804040, + "step": 13151, + "time_per_iteration": 2.530289888381958 + }, + { + "auxiliary_loss_clip": 0.01132687, + "auxiliary_loss_mlp": 0.01103202, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00058627, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.6051799620376703, + "language_loss": 0.70053303, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72289199, + "num_input_tokens_seen": 283827120, + "step": 13152, + "time_per_iteration": 2.7796032428741455 + }, + { + "auxiliary_loss_clip": 0.01115934, + "auxiliary_loss_mlp": 0.00747284, + "balance_loss_clip": 1.00181663, + "balance_loss_mlp": 1.00058246, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 2.2265044643396057, + "language_loss": 0.72882825, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74746042, + "num_input_tokens_seen": 283844820, + "step": 13153, + "time_per_iteration": 2.6305811405181885 + }, + { + "auxiliary_loss_clip": 0.01164459, + "auxiliary_loss_mlp": 0.01102777, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.00044775, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.5407902750799394, + "language_loss": 0.78631914, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.80899155, + "num_input_tokens_seen": 283862870, + "step": 13154, + "time_per_iteration": 3.895953416824341 + }, + { + "auxiliary_loss_clip": 0.01148162, + "auxiliary_loss_mlp": 0.0110382, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00034618, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 3.1196230471734703, + "language_loss": 0.70237547, + "learning_rate": 4.411879602612185e-07, + "loss": 0.7248953, + "num_input_tokens_seen": 283882405, + "step": 13155, + "time_per_iteration": 2.5458154678344727 + }, + { + "auxiliary_loss_clip": 0.01164675, + "auxiliary_loss_mlp": 0.01102894, + "balance_loss_clip": 1.00199771, + "balance_loss_mlp": 1.00046968, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 1.6530582910829166, + "language_loss": 0.77032483, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.79300046, + "num_input_tokens_seen": 283902070, + "step": 13156, + "time_per_iteration": 2.5682718753814697 + }, + { + "auxiliary_loss_clip": 0.01119748, + "auxiliary_loss_mlp": 0.01101898, + "balance_loss_clip": 1.00166178, + "balance_loss_mlp": 1.00042748, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.5629958182884147, + "language_loss": 0.65492427, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67714071, + "num_input_tokens_seen": 283924100, + "step": 13157, + "time_per_iteration": 2.734570264816284 + }, + { + "auxiliary_loss_clip": 0.0114998, + "auxiliary_loss_mlp": 0.01102581, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00053823, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 2.3697917856346953, + "language_loss": 0.74011791, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76264346, + "num_input_tokens_seen": 283944955, + "step": 13158, + "time_per_iteration": 2.618440866470337 + }, + { + "auxiliary_loss_clip": 0.01147788, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_clip": 1.00184035, + "balance_loss_mlp": 1.00063014, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 2.32402993470019, + "language_loss": 0.67240858, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69489795, + "num_input_tokens_seen": 283963125, + "step": 13159, + "time_per_iteration": 2.535630226135254 + }, + { + "auxiliary_loss_clip": 0.01147724, + "auxiliary_loss_mlp": 0.01101646, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00046134, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 1.7815810299872101, + "language_loss": 0.66976547, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69225913, + "num_input_tokens_seen": 283982850, + "step": 13160, + "time_per_iteration": 2.567885398864746 + }, + { + "auxiliary_loss_clip": 0.01132533, + "auxiliary_loss_mlp": 0.01100473, + "balance_loss_clip": 1.00167251, + "balance_loss_mlp": 1.00043225, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 2.0327648325035583, + "language_loss": 0.72660112, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.74893117, + "num_input_tokens_seen": 283998275, + "step": 13161, + "time_per_iteration": 2.601494312286377 + }, + { + "auxiliary_loss_clip": 0.01132818, + "auxiliary_loss_mlp": 0.01102479, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00043583, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 1.757565957806689, + "language_loss": 0.73689818, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75925118, + "num_input_tokens_seen": 284018750, + "step": 13162, + "time_per_iteration": 2.663170099258423 + }, + { + "auxiliary_loss_clip": 0.01131183, + "auxiliary_loss_mlp": 0.0110251, + "balance_loss_clip": 1.00187373, + "balance_loss_mlp": 1.00046682, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.8739693458636957, + "language_loss": 0.72141504, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74375194, + "num_input_tokens_seen": 284037850, + "step": 13163, + "time_per_iteration": 2.6075222492218018 + }, + { + "auxiliary_loss_clip": 0.01117792, + "auxiliary_loss_mlp": 0.01102188, + "balance_loss_clip": 1.00179267, + "balance_loss_mlp": 1.00052655, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 2.308196316616035, + "language_loss": 0.69711888, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.71931875, + "num_input_tokens_seen": 284056380, + "step": 13164, + "time_per_iteration": 2.65451979637146 + }, + { + "auxiliary_loss_clip": 0.01115129, + "auxiliary_loss_mlp": 0.01101732, + "balance_loss_clip": 1.00170696, + "balance_loss_mlp": 1.00045145, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 2.2189780035806446, + "language_loss": 0.66390896, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68607754, + "num_input_tokens_seen": 284074945, + "step": 13165, + "time_per_iteration": 2.714221954345703 + }, + { + "auxiliary_loss_clip": 0.01101259, + "auxiliary_loss_mlp": 0.01100566, + "balance_loss_clip": 1.0015614, + "balance_loss_mlp": 1.0003823, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 1.8716205731280213, + "language_loss": 0.72153372, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74355197, + "num_input_tokens_seen": 284092070, + "step": 13166, + "time_per_iteration": 2.6571481227874756 + }, + { + "auxiliary_loss_clip": 0.01164393, + "auxiliary_loss_mlp": 0.01102037, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00051868, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 2.048210017524092, + "language_loss": 0.77278018, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79544449, + "num_input_tokens_seen": 284112255, + "step": 13167, + "time_per_iteration": 2.5633492469787598 + }, + { + "auxiliary_loss_clip": 0.01113539, + "auxiliary_loss_mlp": 0.01101374, + "balance_loss_clip": 1.00166368, + "balance_loss_mlp": 1.00047481, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.6358860027112907, + "language_loss": 0.84218216, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86433125, + "num_input_tokens_seen": 284132330, + "step": 13168, + "time_per_iteration": 2.6527504920959473 + }, + { + "auxiliary_loss_clip": 0.01164577, + "auxiliary_loss_mlp": 0.01101996, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00042987, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.866430123490195, + "language_loss": 0.72658992, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74925566, + "num_input_tokens_seen": 284150640, + "step": 13169, + "time_per_iteration": 2.5300915241241455 + }, + { + "auxiliary_loss_clip": 0.0116462, + "auxiliary_loss_mlp": 0.01102847, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00051761, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 1.7235087682019739, + "language_loss": 0.67162716, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69430184, + "num_input_tokens_seen": 284171910, + "step": 13170, + "time_per_iteration": 2.674300193786621 + }, + { + "auxiliary_loss_clip": 0.01149285, + "auxiliary_loss_mlp": 0.01102389, + "balance_loss_clip": 1.00179613, + "balance_loss_mlp": 1.00044143, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 1.8794243077280084, + "language_loss": 0.70915663, + "learning_rate": 4.372914494109412e-07, + "loss": 0.73167336, + "num_input_tokens_seen": 284191340, + "step": 13171, + "time_per_iteration": 2.571497678756714 + }, + { + "auxiliary_loss_clip": 0.01147774, + "auxiliary_loss_mlp": 0.01101792, + "balance_loss_clip": 1.00181174, + "balance_loss_mlp": 1.00041616, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 2.0070765505255572, + "language_loss": 0.67043817, + "learning_rate": 4.370484207842553e-07, + "loss": 0.6929338, + "num_input_tokens_seen": 284212495, + "step": 13172, + "time_per_iteration": 4.020458221435547 + }, + { + "auxiliary_loss_clip": 0.01132986, + "auxiliary_loss_mlp": 0.01101663, + "balance_loss_clip": 1.00190341, + "balance_loss_mlp": 1.00047827, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 1.7278694548329951, + "language_loss": 0.79470938, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81705588, + "num_input_tokens_seen": 284230825, + "step": 13173, + "time_per_iteration": 2.6314849853515625 + }, + { + "auxiliary_loss_clip": 0.01119444, + "auxiliary_loss_mlp": 0.01101391, + "balance_loss_clip": 1.00173593, + "balance_loss_mlp": 1.00039673, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.8295532257115286, + "language_loss": 0.76646936, + "learning_rate": 4.365625413419365e-07, + "loss": 0.78867769, + "num_input_tokens_seen": 284250365, + "step": 13174, + "time_per_iteration": 2.665126323699951 + }, + { + "auxiliary_loss_clip": 0.01134347, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_clip": 1.00165701, + "balance_loss_mlp": 1.00050378, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.5176060372318776, + "language_loss": 0.71897316, + "learning_rate": 4.363196905447297e-07, + "loss": 0.74132878, + "num_input_tokens_seen": 284269635, + "step": 13175, + "time_per_iteration": 2.644775867462158 + }, + { + "auxiliary_loss_clip": 0.01149714, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_clip": 1.00184584, + "balance_loss_mlp": 1.00047565, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 2.4526499067918794, + "language_loss": 0.59662741, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61914778, + "num_input_tokens_seen": 284288380, + "step": 13176, + "time_per_iteration": 3.9507713317871094 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01101678, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00049341, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.7807975994384466, + "language_loss": 0.73465276, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75731289, + "num_input_tokens_seen": 284306920, + "step": 13177, + "time_per_iteration": 2.4935615062713623 + }, + { + "auxiliary_loss_clip": 0.01150024, + "auxiliary_loss_mlp": 0.01101696, + "balance_loss_clip": 1.00194931, + "balance_loss_mlp": 1.00051129, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 2.4450174834891114, + "language_loss": 0.64051843, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66303563, + "num_input_tokens_seen": 284324700, + "step": 13178, + "time_per_iteration": 2.5414862632751465 + }, + { + "auxiliary_loss_clip": 0.01130909, + "auxiliary_loss_mlp": 0.0110114, + "balance_loss_clip": 1.00170183, + "balance_loss_mlp": 1.00043225, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.6048242482555166, + "language_loss": 0.68269104, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70501155, + "num_input_tokens_seen": 284345985, + "step": 13179, + "time_per_iteration": 2.682995319366455 + }, + { + "auxiliary_loss_clip": 0.01164263, + "auxiliary_loss_mlp": 0.01101913, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00044191, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 1.9555014752790492, + "language_loss": 0.74130827, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76397002, + "num_input_tokens_seen": 284364475, + "step": 13180, + "time_per_iteration": 2.533351421356201 + }, + { + "auxiliary_loss_clip": 0.01148243, + "auxiliary_loss_mlp": 0.01103693, + "balance_loss_clip": 1.00193787, + "balance_loss_mlp": 1.00060105, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 1.8175390566615848, + "language_loss": 0.8141433, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.83666265, + "num_input_tokens_seen": 284382125, + "step": 13181, + "time_per_iteration": 2.5345234870910645 + }, + { + "auxiliary_loss_clip": 0.01133346, + "auxiliary_loss_mlp": 0.01101761, + "balance_loss_clip": 1.00174117, + "balance_loss_mlp": 1.0005759, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.7442188664932639, + "language_loss": 0.77163947, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79399049, + "num_input_tokens_seen": 284401585, + "step": 13182, + "time_per_iteration": 2.6677865982055664 + }, + { + "auxiliary_loss_clip": 0.0114927, + "auxiliary_loss_mlp": 0.01103434, + "balance_loss_clip": 1.00204873, + "balance_loss_mlp": 1.00053287, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 2.2021796249747956, + "language_loss": 0.73580861, + "learning_rate": 4.34379019557056e-07, + "loss": 0.75833565, + "num_input_tokens_seen": 284419125, + "step": 13183, + "time_per_iteration": 2.612184524536133 + }, + { + "auxiliary_loss_clip": 0.01133174, + "auxiliary_loss_mlp": 0.011021, + "balance_loss_clip": 1.00186777, + "balance_loss_mlp": 1.00043869, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.6428453648194095, + "language_loss": 0.68255019, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70490289, + "num_input_tokens_seen": 284440445, + "step": 13184, + "time_per_iteration": 2.7856125831604004 + }, + { + "auxiliary_loss_clip": 0.01114919, + "auxiliary_loss_mlp": 0.01102939, + "balance_loss_clip": 1.00172901, + "balance_loss_mlp": 1.0006094, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 2.675690765300163, + "language_loss": 0.70858544, + "learning_rate": 4.338944453112907e-07, + "loss": 0.73076403, + "num_input_tokens_seen": 284459370, + "step": 13185, + "time_per_iteration": 2.6831228733062744 + }, + { + "auxiliary_loss_clip": 0.01148253, + "auxiliary_loss_mlp": 0.01102014, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00049496, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 5.892757776601594, + "language_loss": 0.64752412, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67002678, + "num_input_tokens_seen": 284477525, + "step": 13186, + "time_per_iteration": 2.5392656326293945 + }, + { + "auxiliary_loss_clip": 0.01149909, + "auxiliary_loss_mlp": 0.0110169, + "balance_loss_clip": 1.00188327, + "balance_loss_mlp": 1.00050545, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 1.5961831613388895, + "language_loss": 0.77099848, + "learning_rate": 4.334101086130408e-07, + "loss": 0.79351449, + "num_input_tokens_seen": 284496590, + "step": 13187, + "time_per_iteration": 2.5945699214935303 + }, + { + "auxiliary_loss_clip": 0.01134658, + "auxiliary_loss_mlp": 0.01101203, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00049531, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 2.3714312535036592, + "language_loss": 0.72683579, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74919438, + "num_input_tokens_seen": 284511470, + "step": 13188, + "time_per_iteration": 4.144640207290649 + }, + { + "auxiliary_loss_clip": 0.01164426, + "auxiliary_loss_mlp": 0.00747554, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00051916, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 6.2988774778065695, + "language_loss": 0.62848157, + "learning_rate": 4.329260095357725e-07, + "loss": 0.64760137, + "num_input_tokens_seen": 284531125, + "step": 13189, + "time_per_iteration": 2.549309730529785 + }, + { + "auxiliary_loss_clip": 0.01097741, + "auxiliary_loss_mlp": 0.01100797, + "balance_loss_clip": 1.00160265, + "balance_loss_mlp": 1.00056589, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 2.4054120355222754, + "language_loss": 0.72699851, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74898392, + "num_input_tokens_seen": 284549340, + "step": 13190, + "time_per_iteration": 2.858771324157715 + }, + { + "auxiliary_loss_clip": 0.011477, + "auxiliary_loss_mlp": 0.01101369, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00047064, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 2.4561220645220505, + "language_loss": 0.73331803, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75580871, + "num_input_tokens_seen": 284567060, + "step": 13191, + "time_per_iteration": 2.7998814582824707 + }, + { + "auxiliary_loss_clip": 0.01147646, + "auxiliary_loss_mlp": 0.01101776, + "balance_loss_clip": 1.00178719, + "balance_loss_mlp": 1.00049531, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 2.118897605913186, + "language_loss": 0.68781722, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71031141, + "num_input_tokens_seen": 284586600, + "step": 13192, + "time_per_iteration": 4.246841907501221 + }, + { + "auxiliary_loss_clip": 0.01119494, + "auxiliary_loss_mlp": 0.01102465, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00051737, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 6.911481012588073, + "language_loss": 0.75037998, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77259958, + "num_input_tokens_seen": 284605715, + "step": 13193, + "time_per_iteration": 2.893897294998169 + }, + { + "auxiliary_loss_clip": 0.01147792, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_clip": 1.00193894, + "balance_loss_mlp": 1.00046873, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 1.608993254502581, + "language_loss": 0.72044122, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74294233, + "num_input_tokens_seen": 284628540, + "step": 13194, + "time_per_iteration": 2.878735065460205 + }, + { + "auxiliary_loss_clip": 0.01164603, + "auxiliary_loss_mlp": 0.0110311, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00049496, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 2.04683900169901, + "language_loss": 0.69871199, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72138917, + "num_input_tokens_seen": 284646040, + "step": 13195, + "time_per_iteration": 2.7467684745788574 + }, + { + "auxiliary_loss_clip": 0.01099789, + "auxiliary_loss_mlp": 0.01102117, + "balance_loss_clip": 1.0017513, + "balance_loss_mlp": 1.00055039, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 1.6140155270711325, + "language_loss": 0.77417296, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79619199, + "num_input_tokens_seen": 284665110, + "step": 13196, + "time_per_iteration": 3.0463829040527344 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01102375, + "balance_loss_clip": 1.00173414, + "balance_loss_mlp": 1.00047457, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.724770143363832, + "language_loss": 0.68861306, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71077931, + "num_input_tokens_seen": 284686515, + "step": 13197, + "time_per_iteration": 2.992372989654541 + }, + { + "auxiliary_loss_clip": 0.01147537, + "auxiliary_loss_mlp": 0.01101534, + "balance_loss_clip": 1.00179422, + "balance_loss_mlp": 1.00034869, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 1.8001545976326823, + "language_loss": 0.65180123, + "learning_rate": 4.30750506215646e-07, + "loss": 0.67429191, + "num_input_tokens_seen": 284707300, + "step": 13198, + "time_per_iteration": 2.6969051361083984 + }, + { + "auxiliary_loss_clip": 0.01099885, + "auxiliary_loss_mlp": 0.01101375, + "balance_loss_clip": 1.00151873, + "balance_loss_mlp": 1.00047636, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 2.668227934670117, + "language_loss": 0.72745883, + "learning_rate": 4.30509081032864e-07, + "loss": 0.74947143, + "num_input_tokens_seen": 284723545, + "step": 13199, + "time_per_iteration": 2.6979689598083496 + }, + { + "auxiliary_loss_clip": 0.01135384, + "auxiliary_loss_mlp": 0.01101247, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.00044358, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 1.7932174955585045, + "language_loss": 0.80455273, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82691902, + "num_input_tokens_seen": 284742650, + "step": 13200, + "time_per_iteration": 2.6460378170013428 + }, + { + "auxiliary_loss_clip": 0.0114759, + "auxiliary_loss_mlp": 0.01101331, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00057578, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.9137116712553421, + "language_loss": 0.77148467, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.79397392, + "num_input_tokens_seen": 284760955, + "step": 13201, + "time_per_iteration": 2.5794854164123535 + }, + { + "auxiliary_loss_clip": 0.01164426, + "auxiliary_loss_mlp": 0.01102085, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00051892, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 3.5702605503479767, + "language_loss": 0.67137361, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69403875, + "num_input_tokens_seen": 284780745, + "step": 13202, + "time_per_iteration": 2.6004977226257324 + }, + { + "auxiliary_loss_clip": 0.01147318, + "auxiliary_loss_mlp": 0.01102467, + "balance_loss_clip": 1.00182629, + "balance_loss_mlp": 1.00042367, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 2.1683993266083785, + "language_loss": 0.75007123, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.77256906, + "num_input_tokens_seen": 284799000, + "step": 13203, + "time_per_iteration": 2.584181070327759 + }, + { + "auxiliary_loss_clip": 0.01084773, + "auxiliary_loss_mlp": 0.01102045, + "balance_loss_clip": 1.00156617, + "balance_loss_mlp": 1.00038373, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 1.909194972779855, + "language_loss": 0.66283059, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68469876, + "num_input_tokens_seen": 284817450, + "step": 13204, + "time_per_iteration": 2.764758348464966 + }, + { + "auxiliary_loss_clip": 0.0108343, + "auxiliary_loss_mlp": 0.0110032, + "balance_loss_clip": 1.00155175, + "balance_loss_mlp": 1.00037456, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.351382838940114, + "language_loss": 0.79302204, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81485951, + "num_input_tokens_seen": 284838865, + "step": 13205, + "time_per_iteration": 2.797518730163574 + }, + { + "auxiliary_loss_clip": 0.01120347, + "auxiliary_loss_mlp": 0.01101972, + "balance_loss_clip": 1.00180769, + "balance_loss_mlp": 1.00050092, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 2.0556705868768828, + "language_loss": 0.77790332, + "learning_rate": 4.28820771692858e-07, + "loss": 0.80012643, + "num_input_tokens_seen": 284857975, + "step": 13206, + "time_per_iteration": 2.688328504562378 + }, + { + "auxiliary_loss_clip": 0.01132728, + "auxiliary_loss_mlp": 0.01102679, + "balance_loss_clip": 1.00182164, + "balance_loss_mlp": 1.00063562, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 1.9166163969545214, + "language_loss": 0.79354066, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81589472, + "num_input_tokens_seen": 284877145, + "step": 13207, + "time_per_iteration": 2.6248090267181396 + }, + { + "auxiliary_loss_clip": 0.01115641, + "auxiliary_loss_mlp": 0.01102023, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00045681, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 2.1993520708140393, + "language_loss": 0.84127462, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86345124, + "num_input_tokens_seen": 284895560, + "step": 13208, + "time_per_iteration": 2.7335925102233887 + }, + { + "auxiliary_loss_clip": 0.01097008, + "auxiliary_loss_mlp": 0.01076414, + "balance_loss_clip": 1.00079703, + "balance_loss_mlp": 1.0001204, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7236528039868071, + "language_loss": 0.58319616, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60493028, + "num_input_tokens_seen": 284963135, + "step": 13209, + "time_per_iteration": 4.963472604751587 + }, + { + "auxiliary_loss_clip": 0.01118392, + "auxiliary_loss_mlp": 0.01103102, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.0005815, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 2.48054142789611, + "language_loss": 0.63164747, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.65386236, + "num_input_tokens_seen": 284981755, + "step": 13210, + "time_per_iteration": 3.067897081375122 + }, + { + "auxiliary_loss_clip": 0.01148019, + "auxiliary_loss_mlp": 0.01102007, + "balance_loss_clip": 1.00183678, + "balance_loss_mlp": 1.00053608, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.5054270005686512, + "language_loss": 0.68809241, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71059263, + "num_input_tokens_seen": 285003060, + "step": 13211, + "time_per_iteration": 2.7513418197631836 + }, + { + "auxiliary_loss_clip": 0.0114785, + "auxiliary_loss_mlp": 0.01102153, + "balance_loss_clip": 1.00183856, + "balance_loss_mlp": 1.00058663, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.6211850879277434, + "language_loss": 0.72333509, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.74583519, + "num_input_tokens_seen": 285021640, + "step": 13212, + "time_per_iteration": 2.6258275508880615 + }, + { + "auxiliary_loss_clip": 0.01149243, + "auxiliary_loss_mlp": 0.0110068, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00044858, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.8533816764275184, + "language_loss": 0.80994433, + "learning_rate": 4.271353817368246e-07, + "loss": 0.83244359, + "num_input_tokens_seen": 285040490, + "step": 13213, + "time_per_iteration": 2.6425323486328125 + }, + { + "auxiliary_loss_clip": 0.01147401, + "auxiliary_loss_mlp": 0.01103037, + "balance_loss_clip": 1.00191844, + "balance_loss_mlp": 1.00051665, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 3.0486147779489174, + "language_loss": 0.6763851, + "learning_rate": 4.268948502428327e-07, + "loss": 0.69888949, + "num_input_tokens_seen": 285059270, + "step": 13214, + "time_per_iteration": 4.027361869812012 + }, + { + "auxiliary_loss_clip": 0.01164415, + "auxiliary_loss_mlp": 0.01101271, + "balance_loss_clip": 1.00195098, + "balance_loss_mlp": 1.00056338, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 2.944880935855826, + "language_loss": 0.7224766, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74513346, + "num_input_tokens_seen": 285075390, + "step": 13215, + "time_per_iteration": 2.5852856636047363 + }, + { + "auxiliary_loss_clip": 0.01101003, + "auxiliary_loss_mlp": 0.01101599, + "balance_loss_clip": 1.00166023, + "balance_loss_mlp": 1.00041413, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.7234646988038906, + "language_loss": 0.7899465, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.8119725, + "num_input_tokens_seen": 285096290, + "step": 13216, + "time_per_iteration": 2.8834495544433594 + }, + { + "auxiliary_loss_clip": 0.01149878, + "auxiliary_loss_mlp": 0.01101653, + "balance_loss_clip": 1.00203967, + "balance_loss_mlp": 1.0004679, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.6119380022472232, + "language_loss": 0.73639822, + "learning_rate": 4.261736137111598e-07, + "loss": 0.75891352, + "num_input_tokens_seen": 285116020, + "step": 13217, + "time_per_iteration": 2.6355786323547363 + }, + { + "auxiliary_loss_clip": 0.0113289, + "auxiliary_loss_mlp": 0.01100259, + "balance_loss_clip": 1.00194418, + "balance_loss_mlp": 1.00050449, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.763759439224047, + "language_loss": 0.74156582, + "learning_rate": 4.259333208810907e-07, + "loss": 0.7638973, + "num_input_tokens_seen": 285133510, + "step": 13218, + "time_per_iteration": 2.6449296474456787 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01102278, + "balance_loss_clip": 1.00182247, + "balance_loss_mlp": 1.00042558, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 3.08479144983545, + "language_loss": 0.83342057, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85591716, + "num_input_tokens_seen": 285151690, + "step": 13219, + "time_per_iteration": 2.594224214553833 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.0110322, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.0005095, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 1.8587834748741163, + "language_loss": 0.75534308, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.77787256, + "num_input_tokens_seen": 285170485, + "step": 13220, + "time_per_iteration": 2.6362853050231934 + }, + { + "auxiliary_loss_clip": 0.01133843, + "auxiliary_loss_mlp": 0.01102564, + "balance_loss_clip": 1.00198674, + "balance_loss_mlp": 1.00061572, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 1.8338067572744887, + "language_loss": 0.72289419, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74525821, + "num_input_tokens_seen": 285191050, + "step": 13221, + "time_per_iteration": 2.762274742126465 + }, + { + "auxiliary_loss_clip": 0.01148957, + "auxiliary_loss_mlp": 0.01101749, + "balance_loss_clip": 1.00194514, + "balance_loss_mlp": 1.00046897, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 1.9633666197190827, + "language_loss": 0.74937642, + "learning_rate": 4.249727465395634e-07, + "loss": 0.77188343, + "num_input_tokens_seen": 285208750, + "step": 13222, + "time_per_iteration": 2.554828643798828 + }, + { + "auxiliary_loss_clip": 0.01125712, + "auxiliary_loss_mlp": 0.01075959, + "balance_loss_clip": 1.000916, + "balance_loss_mlp": 1.00004673, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7637038948985144, + "language_loss": 0.67039967, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69241643, + "num_input_tokens_seen": 285264605, + "step": 13223, + "time_per_iteration": 3.094369888305664 + }, + { + "auxiliary_loss_clip": 0.01147763, + "auxiliary_loss_mlp": 0.01101151, + "balance_loss_clip": 1.00181174, + "balance_loss_mlp": 1.00044346, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 1.5203969901759518, + "language_loss": 0.70618904, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.72867811, + "num_input_tokens_seen": 285283940, + "step": 13224, + "time_per_iteration": 2.640294313430786 + }, + { + "auxiliary_loss_clip": 0.01158389, + "auxiliary_loss_mlp": 0.01075474, + "balance_loss_clip": 1.00074673, + "balance_loss_mlp": 0.9999426, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.687103106709534, + "language_loss": 0.5501557, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57249433, + "num_input_tokens_seen": 285349525, + "step": 13225, + "time_per_iteration": 4.72002387046814 + }, + { + "auxiliary_loss_clip": 0.01132332, + "auxiliary_loss_mlp": 0.01100891, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00037372, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 2.19323983305608, + "language_loss": 0.64459789, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.66693008, + "num_input_tokens_seen": 285367355, + "step": 13226, + "time_per_iteration": 2.657498836517334 + }, + { + "auxiliary_loss_clip": 0.01097477, + "auxiliary_loss_mlp": 0.01102794, + "balance_loss_clip": 1.00168884, + "balance_loss_mlp": 1.00056028, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 2.4634393489239574, + "language_loss": 0.70296156, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72496426, + "num_input_tokens_seen": 285386190, + "step": 13227, + "time_per_iteration": 2.809317111968994 + }, + { + "auxiliary_loss_clip": 0.01117378, + "auxiliary_loss_mlp": 0.011011, + "balance_loss_clip": 1.00193298, + "balance_loss_mlp": 1.0003916, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.744384748072851, + "language_loss": 0.69476843, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71695316, + "num_input_tokens_seen": 285406150, + "step": 13228, + "time_per_iteration": 2.6994481086730957 + }, + { + "auxiliary_loss_clip": 0.01101813, + "auxiliary_loss_mlp": 0.0110185, + "balance_loss_clip": 1.00166821, + "balance_loss_mlp": 1.00056934, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.504851432360716, + "language_loss": 0.70720726, + "learning_rate": 4.232940412119095e-07, + "loss": 0.72924387, + "num_input_tokens_seen": 285429900, + "step": 13229, + "time_per_iteration": 4.323679208755493 + }, + { + "auxiliary_loss_clip": 0.01147452, + "auxiliary_loss_mlp": 0.01103877, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00049901, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 2.3012522729302174, + "language_loss": 0.71553016, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.73804343, + "num_input_tokens_seen": 285452555, + "step": 13230, + "time_per_iteration": 2.6485486030578613 + }, + { + "auxiliary_loss_clip": 0.01125058, + "auxiliary_loss_mlp": 0.01075475, + "balance_loss_clip": 1.00081062, + "balance_loss_mlp": 0.99994433, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.8955766726399588, + "language_loss": 0.63554543, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65755075, + "num_input_tokens_seen": 285515700, + "step": 13231, + "time_per_iteration": 3.2588648796081543 + }, + { + "auxiliary_loss_clip": 0.01132482, + "auxiliary_loss_mlp": 0.01101157, + "balance_loss_clip": 1.00171685, + "balance_loss_mlp": 1.0004487, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.8957674260086967, + "language_loss": 0.70046431, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.72280067, + "num_input_tokens_seen": 285533910, + "step": 13232, + "time_per_iteration": 2.6153621673583984 + }, + { + "auxiliary_loss_clip": 0.01149832, + "auxiliary_loss_mlp": 0.0110115, + "balance_loss_clip": 1.00183058, + "balance_loss_mlp": 1.00034642, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 1.63914620548202, + "language_loss": 0.78093696, + "learning_rate": 4.223360961792952e-07, + "loss": 0.80344677, + "num_input_tokens_seen": 285554080, + "step": 13233, + "time_per_iteration": 2.666821241378784 + }, + { + "auxiliary_loss_clip": 0.01147638, + "auxiliary_loss_mlp": 0.01101821, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00044584, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 3.084813486411632, + "language_loss": 0.78551733, + "learning_rate": 4.220967594613769e-07, + "loss": 0.80801189, + "num_input_tokens_seen": 285572325, + "step": 13234, + "time_per_iteration": 2.625424385070801 + }, + { + "auxiliary_loss_clip": 0.01132354, + "auxiliary_loss_mlp": 0.00747349, + "balance_loss_clip": 1.00174201, + "balance_loss_mlp": 1.00046313, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 2.242508352378773, + "language_loss": 0.70024908, + "learning_rate": 4.218574825777077e-07, + "loss": 0.71904606, + "num_input_tokens_seen": 285589770, + "step": 13235, + "time_per_iteration": 2.668041944503784 + }, + { + "auxiliary_loss_clip": 0.01116853, + "auxiliary_loss_mlp": 0.01102048, + "balance_loss_clip": 1.00180209, + "balance_loss_mlp": 1.0003866, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 2.6253547262045016, + "language_loss": 0.68028796, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.70247698, + "num_input_tokens_seen": 285610065, + "step": 13236, + "time_per_iteration": 2.6738486289978027 + }, + { + "auxiliary_loss_clip": 0.01100485, + "auxiliary_loss_mlp": 0.01100436, + "balance_loss_clip": 1.00171399, + "balance_loss_mlp": 1.00044322, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.6114794868932143, + "language_loss": 0.74929142, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77130061, + "num_input_tokens_seen": 285628480, + "step": 13237, + "time_per_iteration": 2.765493392944336 + }, + { + "auxiliary_loss_clip": 0.01147725, + "auxiliary_loss_mlp": 0.01102481, + "balance_loss_clip": 1.00193012, + "balance_loss_mlp": 1.00053322, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 1.8290887726324658, + "language_loss": 0.70958173, + "learning_rate": 4.211400110229175e-07, + "loss": 0.7320838, + "num_input_tokens_seen": 285647805, + "step": 13238, + "time_per_iteration": 2.59208607673645 + }, + { + "auxiliary_loss_clip": 0.01148036, + "auxiliary_loss_mlp": 0.01102059, + "balance_loss_clip": 1.00171471, + "balance_loss_mlp": 1.00039756, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 2.9322740813178556, + "language_loss": 0.73686004, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.75936097, + "num_input_tokens_seen": 285665505, + "step": 13239, + "time_per_iteration": 2.588419198989868 + }, + { + "auxiliary_loss_clip": 0.01164579, + "auxiliary_loss_mlp": 0.01101988, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.00051689, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.8938178413872861, + "language_loss": 0.68995738, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71262306, + "num_input_tokens_seen": 285685855, + "step": 13240, + "time_per_iteration": 2.6172127723693848 + }, + { + "auxiliary_loss_clip": 0.01143905, + "auxiliary_loss_mlp": 0.01075467, + "balance_loss_clip": 1.00074172, + "balance_loss_mlp": 0.99993563, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8899003342461881, + "language_loss": 0.58689439, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60908806, + "num_input_tokens_seen": 285735710, + "step": 13241, + "time_per_iteration": 2.960085391998291 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01102049, + "balance_loss_clip": 1.00182164, + "balance_loss_mlp": 1.0004828, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 2.2319540967701683, + "language_loss": 0.6483984, + "learning_rate": 4.201842205128772e-07, + "loss": 0.67058074, + "num_input_tokens_seen": 285757045, + "step": 13242, + "time_per_iteration": 2.8411781787872314 + }, + { + "auxiliary_loss_clip": 0.01164497, + "auxiliary_loss_mlp": 0.01102186, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00061941, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 1.9103115843811775, + "language_loss": 0.76034766, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78301442, + "num_input_tokens_seen": 285776050, + "step": 13243, + "time_per_iteration": 2.603795289993286 + }, + { + "auxiliary_loss_clip": 0.01117883, + "auxiliary_loss_mlp": 0.01101631, + "balance_loss_clip": 1.00167072, + "balance_loss_mlp": 1.00044644, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.7101198171960477, + "language_loss": 0.79629999, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.81849509, + "num_input_tokens_seen": 285796830, + "step": 13244, + "time_per_iteration": 2.7801294326782227 + }, + { + "auxiliary_loss_clip": 0.01149904, + "auxiliary_loss_mlp": 0.01102027, + "balance_loss_clip": 1.00174105, + "balance_loss_mlp": 1.00046074, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 2.1588432885505364, + "language_loss": 0.68545341, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70797271, + "num_input_tokens_seen": 285814755, + "step": 13245, + "time_per_iteration": 2.5626115798950195 + }, + { + "auxiliary_loss_clip": 0.01133382, + "auxiliary_loss_mlp": 0.01101634, + "balance_loss_clip": 1.00178766, + "balance_loss_mlp": 1.00054455, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.7251545624661093, + "language_loss": 0.79043674, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81278694, + "num_input_tokens_seen": 285834255, + "step": 13246, + "time_per_iteration": 2.6260664463043213 + }, + { + "auxiliary_loss_clip": 0.01132267, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_clip": 1.00176728, + "balance_loss_mlp": 1.00048971, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 4.057292546187352, + "language_loss": 0.66401505, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68636012, + "num_input_tokens_seen": 285853540, + "step": 13247, + "time_per_iteration": 4.045266628265381 + }, + { + "auxiliary_loss_clip": 0.01131304, + "auxiliary_loss_mlp": 0.01101187, + "balance_loss_clip": 1.00173414, + "balance_loss_mlp": 1.00047851, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 2.1592816030901805, + "language_loss": 0.71816921, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.74049407, + "num_input_tokens_seen": 285872705, + "step": 13248, + "time_per_iteration": 2.7148425579071045 + }, + { + "auxiliary_loss_clip": 0.01133693, + "auxiliary_loss_mlp": 0.0110255, + "balance_loss_clip": 1.0016582, + "balance_loss_mlp": 1.0005064, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 2.900389678829253, + "language_loss": 0.76265001, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78501236, + "num_input_tokens_seen": 285890290, + "step": 13249, + "time_per_iteration": 2.7054107189178467 + }, + { + "auxiliary_loss_clip": 0.01131467, + "auxiliary_loss_mlp": 0.01101851, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00038028, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.2483131489835997, + "language_loss": 0.61424822, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63658142, + "num_input_tokens_seen": 285909190, + "step": 13250, + "time_per_iteration": 2.6334636211395264 + }, + { + "auxiliary_loss_clip": 0.01134192, + "auxiliary_loss_mlp": 0.01100652, + "balance_loss_clip": 1.00175989, + "balance_loss_mlp": 1.00042033, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 1.9590079533732436, + "language_loss": 0.72216344, + "learning_rate": 4.180371972938206e-07, + "loss": 0.74451184, + "num_input_tokens_seen": 285927570, + "step": 13251, + "time_per_iteration": 4.124300241470337 + }, + { + "auxiliary_loss_clip": 0.01164621, + "auxiliary_loss_mlp": 0.01102376, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00052333, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 1.916296310509508, + "language_loss": 0.73174328, + "learning_rate": 4.177989389787624e-07, + "loss": 0.75441325, + "num_input_tokens_seen": 285945810, + "step": 13252, + "time_per_iteration": 2.5433928966522217 + }, + { + "auxiliary_loss_clip": 0.01164257, + "auxiliary_loss_mlp": 0.01100553, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00046539, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 2.556603595269762, + "language_loss": 0.66054547, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68319362, + "num_input_tokens_seen": 285964235, + "step": 13253, + "time_per_iteration": 2.6215717792510986 + }, + { + "auxiliary_loss_clip": 0.01117598, + "auxiliary_loss_mlp": 0.0110144, + "balance_loss_clip": 1.00178564, + "balance_loss_mlp": 1.00054169, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.495891489131677, + "language_loss": 0.68019211, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.70238256, + "num_input_tokens_seen": 285983710, + "step": 13254, + "time_per_iteration": 2.6878535747528076 + }, + { + "auxiliary_loss_clip": 0.01149249, + "auxiliary_loss_mlp": 0.01101553, + "balance_loss_clip": 1.00177252, + "balance_loss_mlp": 1.00051093, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.9273550220764637, + "language_loss": 0.69706112, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71956909, + "num_input_tokens_seen": 286003425, + "step": 13255, + "time_per_iteration": 2.680652379989624 + }, + { + "auxiliary_loss_clip": 0.01164317, + "auxiliary_loss_mlp": 0.01101432, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.0004375, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 2.0598601315114964, + "language_loss": 0.79376149, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81641901, + "num_input_tokens_seen": 286020130, + "step": 13256, + "time_per_iteration": 2.599698305130005 + }, + { + "auxiliary_loss_clip": 0.0114764, + "auxiliary_loss_mlp": 0.01102009, + "balance_loss_clip": 1.00181198, + "balance_loss_mlp": 1.00053787, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.8737939481332406, + "language_loss": 0.6597476, + "learning_rate": 4.166085475424315e-07, + "loss": 0.68224406, + "num_input_tokens_seen": 286040230, + "step": 13257, + "time_per_iteration": 2.6487619876861572 + }, + { + "auxiliary_loss_clip": 0.01133289, + "auxiliary_loss_mlp": 0.01102978, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00050557, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 3.6125099801208975, + "language_loss": 0.71845794, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74082065, + "num_input_tokens_seen": 286059475, + "step": 13258, + "time_per_iteration": 2.6529541015625 + }, + { + "auxiliary_loss_clip": 0.01149827, + "auxiliary_loss_mlp": 0.01102454, + "balance_loss_clip": 1.0018878, + "balance_loss_mlp": 1.0004108, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.7953564333661598, + "language_loss": 0.68863237, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.71115524, + "num_input_tokens_seen": 286077820, + "step": 13259, + "time_per_iteration": 2.6770081520080566 + }, + { + "auxiliary_loss_clip": 0.01147901, + "auxiliary_loss_mlp": 0.01100567, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00052667, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.734895212403024, + "language_loss": 0.73759472, + "learning_rate": 4.158950331167641e-07, + "loss": 0.76007944, + "num_input_tokens_seen": 286097285, + "step": 13260, + "time_per_iteration": 2.648895740509033 + }, + { + "auxiliary_loss_clip": 0.01133069, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_clip": 1.00170243, + "balance_loss_mlp": 1.00048459, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 2.216147185778759, + "language_loss": 0.78583521, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80817783, + "num_input_tokens_seen": 286116000, + "step": 13261, + "time_per_iteration": 2.680698871612549 + }, + { + "auxiliary_loss_clip": 0.0114943, + "auxiliary_loss_mlp": 0.01099692, + "balance_loss_clip": 1.00184774, + "balance_loss_mlp": 1.00050974, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.6627301075227272, + "language_loss": 0.76426321, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78675443, + "num_input_tokens_seen": 286135110, + "step": 13262, + "time_per_iteration": 2.69518780708313 + }, + { + "auxiliary_loss_clip": 0.01115877, + "auxiliary_loss_mlp": 0.01102968, + "balance_loss_clip": 1.00176084, + "balance_loss_mlp": 1.00044847, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.2422635554779737, + "language_loss": 0.70266747, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72485596, + "num_input_tokens_seen": 286152835, + "step": 13263, + "time_per_iteration": 4.153708219528198 + }, + { + "auxiliary_loss_clip": 0.01148244, + "auxiliary_loss_mlp": 0.01102405, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.0006001, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 2.2676881904469366, + "language_loss": 0.71114415, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73365062, + "num_input_tokens_seen": 286171785, + "step": 13264, + "time_per_iteration": 2.6368119716644287 + }, + { + "auxiliary_loss_clip": 0.01164333, + "auxiliary_loss_mlp": 0.01100888, + "balance_loss_clip": 1.00186872, + "balance_loss_mlp": 1.00056124, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.81203961791007, + "language_loss": 0.77307415, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.7957263, + "num_input_tokens_seen": 286190420, + "step": 13265, + "time_per_iteration": 2.560006618499756 + }, + { + "auxiliary_loss_clip": 0.01116499, + "auxiliary_loss_mlp": 0.01101487, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.0004456, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 1.8014657965472134, + "language_loss": 0.75527453, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77745444, + "num_input_tokens_seen": 286210105, + "step": 13266, + "time_per_iteration": 2.701756000518799 + }, + { + "auxiliary_loss_clip": 0.01131039, + "auxiliary_loss_mlp": 0.0110118, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.00037634, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 10.817951957458359, + "language_loss": 0.83999521, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86231738, + "num_input_tokens_seen": 286228180, + "step": 13267, + "time_per_iteration": 4.162346363067627 + }, + { + "auxiliary_loss_clip": 0.01147692, + "auxiliary_loss_mlp": 0.01102394, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00054145, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.6926489954021275, + "language_loss": 0.76107407, + "learning_rate": 4.139949716968223e-07, + "loss": 0.78357494, + "num_input_tokens_seen": 286247305, + "step": 13268, + "time_per_iteration": 2.6278462409973145 + }, + { + "auxiliary_loss_clip": 0.01164492, + "auxiliary_loss_mlp": 0.01101812, + "balance_loss_clip": 1.00195765, + "balance_loss_mlp": 1.00043678, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.5774942402601215, + "language_loss": 0.77700227, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.79966533, + "num_input_tokens_seen": 286268145, + "step": 13269, + "time_per_iteration": 2.5963964462280273 + }, + { + "auxiliary_loss_clip": 0.01149331, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_clip": 1.00177073, + "balance_loss_mlp": 1.00048006, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.7949289506079673, + "language_loss": 0.82224768, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84475857, + "num_input_tokens_seen": 286286775, + "step": 13270, + "time_per_iteration": 2.823089122772217 + }, + { + "auxiliary_loss_clip": 0.01116763, + "auxiliary_loss_mlp": 0.01101639, + "balance_loss_clip": 1.00162852, + "balance_loss_mlp": 1.00035906, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 1.8599574908388106, + "language_loss": 0.59582829, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61801231, + "num_input_tokens_seen": 286305590, + "step": 13271, + "time_per_iteration": 3.136228561401367 + }, + { + "auxiliary_loss_clip": 0.01132794, + "auxiliary_loss_mlp": 0.01102458, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00051045, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 2.2360733629553433, + "language_loss": 0.73143566, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75378817, + "num_input_tokens_seen": 286328050, + "step": 13272, + "time_per_iteration": 2.681318759918213 + }, + { + "auxiliary_loss_clip": 0.01086032, + "auxiliary_loss_mlp": 0.01101896, + "balance_loss_clip": 1.00167704, + "balance_loss_mlp": 1.00042486, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 2.56743491484712, + "language_loss": 0.71906573, + "learning_rate": 4.128093876144161e-07, + "loss": 0.74094498, + "num_input_tokens_seen": 286345265, + "step": 13273, + "time_per_iteration": 2.71527361869812 + }, + { + "auxiliary_loss_clip": 0.01133643, + "auxiliary_loss_mlp": 0.01101816, + "balance_loss_clip": 1.00196278, + "balance_loss_mlp": 1.0004406, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 1.7119584575959705, + "language_loss": 0.75557739, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77793199, + "num_input_tokens_seen": 286364465, + "step": 13274, + "time_per_iteration": 2.6490800380706787 + }, + { + "auxiliary_loss_clip": 0.01099377, + "auxiliary_loss_mlp": 0.01100852, + "balance_loss_clip": 1.00161541, + "balance_loss_mlp": 1.00043046, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.3109076822054229, + "language_loss": 0.77718997, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79919231, + "num_input_tokens_seen": 286385565, + "step": 13275, + "time_per_iteration": 2.760261058807373 + }, + { + "auxiliary_loss_clip": 0.01147643, + "auxiliary_loss_mlp": 0.0110215, + "balance_loss_clip": 1.00170803, + "balance_loss_mlp": 1.0005362, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 2.2519847551937966, + "language_loss": 0.64105713, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66355503, + "num_input_tokens_seen": 286403950, + "step": 13276, + "time_per_iteration": 2.7025890350341797 + }, + { + "auxiliary_loss_clip": 0.01116504, + "auxiliary_loss_mlp": 0.01101166, + "balance_loss_clip": 1.00170994, + "balance_loss_mlp": 1.00055361, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 2.0838446103752686, + "language_loss": 0.61134326, + "learning_rate": 4.118620036501945e-07, + "loss": 0.63352001, + "num_input_tokens_seen": 286426160, + "step": 13277, + "time_per_iteration": 2.7256875038146973 + }, + { + "auxiliary_loss_clip": 0.01133393, + "auxiliary_loss_mlp": 0.01102452, + "balance_loss_clip": 1.00193167, + "balance_loss_mlp": 1.00050378, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 2.7384368761030644, + "language_loss": 0.79754311, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81990153, + "num_input_tokens_seen": 286446610, + "step": 13278, + "time_per_iteration": 2.6758456230163574 + }, + { + "auxiliary_loss_clip": 0.01131257, + "auxiliary_loss_mlp": 0.01102068, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.00064516, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.8953308779428706, + "language_loss": 0.63632751, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65866077, + "num_input_tokens_seen": 286465460, + "step": 13279, + "time_per_iteration": 2.633312702178955 + }, + { + "auxiliary_loss_clip": 0.01149616, + "auxiliary_loss_mlp": 0.01099414, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00042295, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.7168369635493745, + "language_loss": 0.7058391, + "learning_rate": 4.111520979802825e-07, + "loss": 0.72832942, + "num_input_tokens_seen": 286485720, + "step": 13280, + "time_per_iteration": 2.714466094970703 + }, + { + "auxiliary_loss_clip": 0.01118168, + "auxiliary_loss_mlp": 0.01102041, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.00047517, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.8598561957383466, + "language_loss": 0.62620044, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64840251, + "num_input_tokens_seen": 286507465, + "step": 13281, + "time_per_iteration": 2.917008876800537 + }, + { + "auxiliary_loss_clip": 0.01149906, + "auxiliary_loss_mlp": 0.01102964, + "balance_loss_clip": 1.0018965, + "balance_loss_mlp": 1.0005393, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 1.8367193738437086, + "language_loss": 0.80117166, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82370031, + "num_input_tokens_seen": 286526345, + "step": 13282, + "time_per_iteration": 2.6593730449676514 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.00747377, + "balance_loss_clip": 1.00175142, + "balance_loss_mlp": 1.00044143, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 1.8637368622304913, + "language_loss": 0.71826136, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73691529, + "num_input_tokens_seen": 286544095, + "step": 13283, + "time_per_iteration": 2.774988889694214 + }, + { + "auxiliary_loss_clip": 0.01147469, + "auxiliary_loss_mlp": 0.0110179, + "balance_loss_clip": 1.00174665, + "balance_loss_mlp": 1.0005095, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 1.8842309518242226, + "language_loss": 0.73432434, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75681698, + "num_input_tokens_seen": 286560960, + "step": 13284, + "time_per_iteration": 2.554175615310669 + }, + { + "auxiliary_loss_clip": 0.01135306, + "auxiliary_loss_mlp": 0.01101313, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00050998, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 2.400482886089255, + "language_loss": 0.70031404, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72268021, + "num_input_tokens_seen": 286579865, + "step": 13285, + "time_per_iteration": 4.015994071960449 + }, + { + "auxiliary_loss_clip": 0.0113365, + "auxiliary_loss_mlp": 0.0110117, + "balance_loss_clip": 1.0019145, + "balance_loss_mlp": 1.00036705, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.7918800495095484, + "language_loss": 0.73521745, + "learning_rate": 4.097339136128437e-07, + "loss": 0.75756562, + "num_input_tokens_seen": 286597295, + "step": 13286, + "time_per_iteration": 2.595148801803589 + }, + { + "auxiliary_loss_clip": 0.01131039, + "auxiliary_loss_mlp": 0.01102294, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00044119, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 2.1885450680802405, + "language_loss": 0.74957967, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.77191293, + "num_input_tokens_seen": 286616270, + "step": 13287, + "time_per_iteration": 2.6208994388580322 + }, + { + "auxiliary_loss_clip": 0.01132931, + "auxiliary_loss_mlp": 0.01101577, + "balance_loss_clip": 1.0018059, + "balance_loss_mlp": 1.00039244, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 3.64045325537024, + "language_loss": 0.61771303, + "learning_rate": 4.092616678191863e-07, + "loss": 0.6400581, + "num_input_tokens_seen": 286638315, + "step": 13288, + "time_per_iteration": 2.7000980377197266 + }, + { + "auxiliary_loss_clip": 0.01147773, + "auxiliary_loss_mlp": 0.01101616, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.0004313, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 2.2810474131904948, + "language_loss": 0.70017183, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72266567, + "num_input_tokens_seen": 286658630, + "step": 13289, + "time_per_iteration": 4.1130194664001465 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.01100679, + "balance_loss_clip": 1.00178826, + "balance_loss_mlp": 1.00054276, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 2.2482551265478716, + "language_loss": 0.62482715, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64698076, + "num_input_tokens_seen": 286676870, + "step": 13290, + "time_per_iteration": 2.6158130168914795 + }, + { + "auxiliary_loss_clip": 0.01149316, + "auxiliary_loss_mlp": 0.01101985, + "balance_loss_clip": 1.00190699, + "balance_loss_mlp": 1.00041914, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 1.783690783379781, + "language_loss": 0.71046269, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73297572, + "num_input_tokens_seen": 286694300, + "step": 13291, + "time_per_iteration": 2.654195547103882 + }, + { + "auxiliary_loss_clip": 0.01114773, + "auxiliary_loss_mlp": 0.0110038, + "balance_loss_clip": 1.00163341, + "balance_loss_mlp": 1.00033927, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.6943303917862857, + "language_loss": 0.63552618, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65767771, + "num_input_tokens_seen": 286714545, + "step": 13292, + "time_per_iteration": 2.6859569549560547 + }, + { + "auxiliary_loss_clip": 0.01147842, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_clip": 1.00183165, + "balance_loss_mlp": 1.00046778, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.7767647976582557, + "language_loss": 0.56282598, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58531809, + "num_input_tokens_seen": 286734525, + "step": 13293, + "time_per_iteration": 2.809255361557007 + }, + { + "auxiliary_loss_clip": 0.01131262, + "auxiliary_loss_mlp": 0.01102081, + "balance_loss_clip": 1.00176513, + "balance_loss_mlp": 1.00061059, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.5495397080572064, + "language_loss": 0.71844685, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.74078029, + "num_input_tokens_seen": 286753430, + "step": 13294, + "time_per_iteration": 2.8608312606811523 + }, + { + "auxiliary_loss_clip": 0.01118407, + "auxiliary_loss_mlp": 0.01102518, + "balance_loss_clip": 1.00195086, + "balance_loss_mlp": 1.00047469, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.879977872431191, + "language_loss": 0.72837925, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.75058848, + "num_input_tokens_seen": 286771915, + "step": 13295, + "time_per_iteration": 2.6580896377563477 + }, + { + "auxiliary_loss_clip": 0.01120277, + "auxiliary_loss_mlp": 0.01101196, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.00063074, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 1.8952117893154956, + "language_loss": 0.76591229, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78812695, + "num_input_tokens_seen": 286789835, + "step": 13296, + "time_per_iteration": 2.6302924156188965 + }, + { + "auxiliary_loss_clip": 0.01111561, + "auxiliary_loss_mlp": 0.01076621, + "balance_loss_clip": 1.00094938, + "balance_loss_mlp": 1.00032675, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.7026996077477301, + "language_loss": 0.60815132, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.63003314, + "num_input_tokens_seen": 286855580, + "step": 13297, + "time_per_iteration": 3.3080945014953613 + }, + { + "auxiliary_loss_clip": 0.01131111, + "auxiliary_loss_mlp": 0.0110194, + "balance_loss_clip": 1.00181663, + "balance_loss_mlp": 1.00046957, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 2.6719982803368283, + "language_loss": 0.70460922, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72693974, + "num_input_tokens_seen": 286874360, + "step": 13298, + "time_per_iteration": 2.628392219543457 + }, + { + "auxiliary_loss_clip": 0.01118202, + "auxiliary_loss_mlp": 0.01102234, + "balance_loss_clip": 1.00180435, + "balance_loss_mlp": 1.00057256, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 2.7188472223850275, + "language_loss": 0.75807977, + "learning_rate": 4.066686308212037e-07, + "loss": 0.78028411, + "num_input_tokens_seen": 286891950, + "step": 13299, + "time_per_iteration": 2.674118757247925 + }, + { + "auxiliary_loss_clip": 0.01133009, + "auxiliary_loss_mlp": 0.01101497, + "balance_loss_clip": 1.00187254, + "balance_loss_mlp": 1.00059819, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.8355172595444558, + "language_loss": 0.77692723, + "learning_rate": 4.064332625220828e-07, + "loss": 0.7992723, + "num_input_tokens_seen": 286911725, + "step": 13300, + "time_per_iteration": 2.7136166095733643 + }, + { + "auxiliary_loss_clip": 0.01119915, + "auxiliary_loss_mlp": 0.01103116, + "balance_loss_clip": 1.00177026, + "balance_loss_mlp": 1.00050056, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 1.5927782782523254, + "language_loss": 0.6356647, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65789497, + "num_input_tokens_seen": 286931400, + "step": 13301, + "time_per_iteration": 4.140388250350952 + }, + { + "auxiliary_loss_clip": 0.01149709, + "auxiliary_loss_mlp": 0.01101691, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00050616, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.6866496871276517, + "language_loss": 0.71760035, + "learning_rate": 4.059627072173928e-07, + "loss": 0.74011433, + "num_input_tokens_seen": 286949795, + "step": 13302, + "time_per_iteration": 2.5957798957824707 + }, + { + "auxiliary_loss_clip": 0.01164523, + "auxiliary_loss_mlp": 0.00747531, + "balance_loss_clip": 1.00188053, + "balance_loss_mlp": 1.00060844, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 1.9534418034683665, + "language_loss": 0.83617747, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85529798, + "num_input_tokens_seen": 286968805, + "step": 13303, + "time_per_iteration": 2.6165950298309326 + }, + { + "auxiliary_loss_clip": 0.01164406, + "auxiliary_loss_mlp": 0.0110133, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00052619, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.6712804764981908, + "language_loss": 0.58977044, + "learning_rate": 4.054923936969166e-07, + "loss": 0.61242783, + "num_input_tokens_seen": 286990235, + "step": 13304, + "time_per_iteration": 2.6281979084014893 + }, + { + "auxiliary_loss_clip": 0.01164422, + "auxiliary_loss_mlp": 0.01101603, + "balance_loss_clip": 1.00172436, + "balance_loss_mlp": 1.00041771, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.5569315021640968, + "language_loss": 0.69239414, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71505427, + "num_input_tokens_seen": 287011060, + "step": 13305, + "time_per_iteration": 4.107693433761597 + }, + { + "auxiliary_loss_clip": 0.0111818, + "auxiliary_loss_mlp": 0.01101448, + "balance_loss_clip": 1.00172484, + "balance_loss_mlp": 1.00054932, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.6990046746623895, + "language_loss": 0.69924545, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.72144175, + "num_input_tokens_seen": 287029215, + "step": 13306, + "time_per_iteration": 2.6680431365966797 + }, + { + "auxiliary_loss_clip": 0.01147945, + "auxiliary_loss_mlp": 0.01101878, + "balance_loss_clip": 1.00195134, + "balance_loss_mlp": 1.00059807, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.671125151033622, + "language_loss": 0.69596434, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71846259, + "num_input_tokens_seen": 287050855, + "step": 13307, + "time_per_iteration": 2.7500765323638916 + }, + { + "auxiliary_loss_clip": 0.01134912, + "auxiliary_loss_mlp": 0.01101791, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00051129, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 3.1203242429993168, + "language_loss": 0.76313126, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.78549832, + "num_input_tokens_seen": 287069915, + "step": 13308, + "time_per_iteration": 2.633068799972534 + }, + { + "auxiliary_loss_clip": 0.01099801, + "auxiliary_loss_mlp": 0.01103013, + "balance_loss_clip": 1.0016458, + "balance_loss_mlp": 1.00039768, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.5114501145944166, + "language_loss": 0.78540248, + "learning_rate": 4.0431766816972e-07, + "loss": 0.80743062, + "num_input_tokens_seen": 287091450, + "step": 13309, + "time_per_iteration": 2.774336099624634 + }, + { + "auxiliary_loss_clip": 0.01158321, + "auxiliary_loss_mlp": 0.01075122, + "balance_loss_clip": 1.00074959, + "balance_loss_mlp": 0.99997282, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9082518425856367, + "language_loss": 0.64672709, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66906154, + "num_input_tokens_seen": 287148365, + "step": 13310, + "time_per_iteration": 3.111508369445801 + }, + { + "auxiliary_loss_clip": 0.01149316, + "auxiliary_loss_mlp": 0.01101585, + "balance_loss_clip": 1.00193715, + "balance_loss_mlp": 1.00059044, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 2.0806807404628027, + "language_loss": 0.83009994, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85260898, + "num_input_tokens_seen": 287168280, + "step": 13311, + "time_per_iteration": 2.6336843967437744 + }, + { + "auxiliary_loss_clip": 0.0114963, + "auxiliary_loss_mlp": 0.01101191, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.0004828, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 1.9083413443839292, + "language_loss": 0.66132474, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.683833, + "num_input_tokens_seen": 287185980, + "step": 13312, + "time_per_iteration": 2.657241106033325 + }, + { + "auxiliary_loss_clip": 0.01164523, + "auxiliary_loss_mlp": 0.01102091, + "balance_loss_clip": 1.00190413, + "balance_loss_mlp": 1.00052452, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.7695807796557426, + "language_loss": 0.7489574, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77162349, + "num_input_tokens_seen": 287203875, + "step": 13313, + "time_per_iteration": 2.767042398452759 + }, + { + "auxiliary_loss_clip": 0.01149695, + "auxiliary_loss_mlp": 0.01101665, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00057554, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.3499839839422827, + "language_loss": 0.75646758, + "learning_rate": 4.031444553532575e-07, + "loss": 0.77898121, + "num_input_tokens_seen": 287226445, + "step": 13314, + "time_per_iteration": 2.725587844848633 + }, + { + "auxiliary_loss_clip": 0.01093944, + "auxiliary_loss_mlp": 0.01076055, + "balance_loss_clip": 1.00068331, + "balance_loss_mlp": 1.00014234, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.8174755857150496, + "language_loss": 0.53726941, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55896944, + "num_input_tokens_seen": 287286240, + "step": 13315, + "time_per_iteration": 3.181408643722534 + }, + { + "auxiliary_loss_clip": 0.01131078, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_clip": 1.00173235, + "balance_loss_mlp": 1.00049937, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 1.7031525246023531, + "language_loss": 0.71235538, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73467106, + "num_input_tokens_seen": 287310265, + "step": 13316, + "time_per_iteration": 2.7641289234161377 + }, + { + "auxiliary_loss_clip": 0.01131142, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.00052941, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 4.327036351037348, + "language_loss": 0.6489439, + "learning_rate": 4.024412542272706e-07, + "loss": 0.67128015, + "num_input_tokens_seen": 287331610, + "step": 13317, + "time_per_iteration": 2.817340850830078 + }, + { + "auxiliary_loss_clip": 0.0115837, + "auxiliary_loss_mlp": 0.01075501, + "balance_loss_clip": 1.00074172, + "balance_loss_mlp": 0.99997014, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.764359458826703, + "language_loss": 0.5905109, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61284959, + "num_input_tokens_seen": 287394795, + "step": 13318, + "time_per_iteration": 3.2211380004882812 + }, + { + "auxiliary_loss_clip": 0.01116285, + "auxiliary_loss_mlp": 0.01101601, + "balance_loss_clip": 1.00181699, + "balance_loss_mlp": 1.00041652, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 1.7337343782556356, + "language_loss": 0.66302693, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68520576, + "num_input_tokens_seen": 287414595, + "step": 13319, + "time_per_iteration": 2.776564836502075 + }, + { + "auxiliary_loss_clip": 0.01164483, + "auxiliary_loss_mlp": 0.00747412, + "balance_loss_clip": 1.00198102, + "balance_loss_mlp": 1.00055504, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 1.7588534149171853, + "language_loss": 0.74124527, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76036417, + "num_input_tokens_seen": 287434395, + "step": 13320, + "time_per_iteration": 2.673759698867798 + }, + { + "auxiliary_loss_clip": 0.01147749, + "auxiliary_loss_mlp": 0.01101813, + "balance_loss_clip": 1.0018127, + "balance_loss_mlp": 1.00034177, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 1.9541468194770009, + "language_loss": 0.80066061, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82315624, + "num_input_tokens_seen": 287450590, + "step": 13321, + "time_per_iteration": 2.6698198318481445 + }, + { + "auxiliary_loss_clip": 0.01087864, + "auxiliary_loss_mlp": 0.01100676, + "balance_loss_clip": 1.0018003, + "balance_loss_mlp": 1.00053978, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 3.7303448188375956, + "language_loss": 0.66006631, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.6819517, + "num_input_tokens_seen": 287468455, + "step": 13322, + "time_per_iteration": 2.833056926727295 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01101106, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00039756, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 2.111796406035136, + "language_loss": 0.77915525, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80166364, + "num_input_tokens_seen": 287486485, + "step": 13323, + "time_per_iteration": 4.36149263381958 + }, + { + "auxiliary_loss_clip": 0.01164516, + "auxiliary_loss_mlp": 0.01102418, + "balance_loss_clip": 1.00191104, + "balance_loss_mlp": 1.00037527, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 2.69403908118121, + "language_loss": 0.71560633, + "learning_rate": 4.00802572299932e-07, + "loss": 0.73827565, + "num_input_tokens_seen": 287503940, + "step": 13324, + "time_per_iteration": 2.7258481979370117 + }, + { + "auxiliary_loss_clip": 0.01117726, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00038159, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 1.7979854222686544, + "language_loss": 0.76436079, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78656608, + "num_input_tokens_seen": 287521660, + "step": 13325, + "time_per_iteration": 2.752178430557251 + }, + { + "auxiliary_loss_clip": 0.01148722, + "auxiliary_loss_mlp": 0.01100093, + "balance_loss_clip": 1.00188696, + "balance_loss_mlp": 1.00048113, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.81089753240291, + "language_loss": 0.7975955, + "learning_rate": 4.003349231059898e-07, + "loss": 0.82008362, + "num_input_tokens_seen": 287541505, + "step": 13326, + "time_per_iteration": 2.973623037338257 + }, + { + "auxiliary_loss_clip": 0.01149116, + "auxiliary_loss_mlp": 0.01100834, + "balance_loss_clip": 1.00184476, + "balance_loss_mlp": 1.00041223, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 2.0939407584065233, + "language_loss": 0.65942347, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68192291, + "num_input_tokens_seen": 287560015, + "step": 13327, + "time_per_iteration": 4.947316408157349 + }, + { + "auxiliary_loss_clip": 0.01147433, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_clip": 1.00178075, + "balance_loss_mlp": 1.00059891, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 2.530581058111503, + "language_loss": 0.74022442, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.7627089, + "num_input_tokens_seen": 287579150, + "step": 13328, + "time_per_iteration": 2.813432455062866 + }, + { + "auxiliary_loss_clip": 0.01101153, + "auxiliary_loss_mlp": 0.01102112, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00045013, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 2.676558143030606, + "language_loss": 0.740924, + "learning_rate": 3.996339042831798e-07, + "loss": 0.76295662, + "num_input_tokens_seen": 287597420, + "step": 13329, + "time_per_iteration": 3.182967185974121 + }, + { + "auxiliary_loss_clip": 0.0114349, + "auxiliary_loss_mlp": 0.01075065, + "balance_loss_clip": 1.00073826, + "balance_loss_mlp": 0.9999159, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.7020120141911806, + "language_loss": 0.52892494, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.55111051, + "num_input_tokens_seen": 287667280, + "step": 13330, + "time_per_iteration": 4.319533348083496 + }, + { + "auxiliary_loss_clip": 0.01131213, + "auxiliary_loss_mlp": 0.01102584, + "balance_loss_clip": 1.00167036, + "balance_loss_mlp": 1.00054073, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 2.171737596020219, + "language_loss": 0.72537506, + "learning_rate": 3.991668618167519e-07, + "loss": 0.74771297, + "num_input_tokens_seen": 287687375, + "step": 13331, + "time_per_iteration": 2.8831846714019775 + }, + { + "auxiliary_loss_clip": 0.0114794, + "auxiliary_loss_mlp": 0.01101294, + "balance_loss_clip": 1.00176668, + "balance_loss_mlp": 1.00039482, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 1.9852074156387447, + "language_loss": 0.77320713, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79569948, + "num_input_tokens_seen": 287707895, + "step": 13332, + "time_per_iteration": 2.722057819366455 + }, + { + "auxiliary_loss_clip": 0.0116443, + "auxiliary_loss_mlp": 0.01102051, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00048447, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 2.171125464437868, + "language_loss": 0.83713466, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85979944, + "num_input_tokens_seen": 287723990, + "step": 13333, + "time_per_iteration": 2.734543561935425 + }, + { + "auxiliary_loss_clip": 0.01132379, + "auxiliary_loss_mlp": 0.01102428, + "balance_loss_clip": 1.00178301, + "balance_loss_mlp": 1.00043297, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.5504157436852433, + "language_loss": 0.73517954, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75752759, + "num_input_tokens_seen": 287742380, + "step": 13334, + "time_per_iteration": 2.742008924484253 + }, + { + "auxiliary_loss_clip": 0.01117377, + "auxiliary_loss_mlp": 0.01102015, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00049651, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 2.0188610867714427, + "language_loss": 0.74462914, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76682305, + "num_input_tokens_seen": 287760130, + "step": 13335, + "time_per_iteration": 2.7725658416748047 + }, + { + "auxiliary_loss_clip": 0.01116473, + "auxiliary_loss_mlp": 0.0110161, + "balance_loss_clip": 1.00160563, + "balance_loss_mlp": 1.00042534, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 1.8078364520154027, + "language_loss": 0.75466609, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77684695, + "num_input_tokens_seen": 287777565, + "step": 13336, + "time_per_iteration": 2.927257776260376 + }, + { + "auxiliary_loss_clip": 0.01114665, + "auxiliary_loss_mlp": 0.0110357, + "balance_loss_clip": 1.00165057, + "balance_loss_mlp": 1.00047731, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 2.2597599992105053, + "language_loss": 0.75723451, + "learning_rate": 3.977671915907068e-07, + "loss": 0.77941686, + "num_input_tokens_seen": 287796310, + "step": 13337, + "time_per_iteration": 3.4255895614624023 + }, + { + "auxiliary_loss_clip": 0.01085885, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.00168562, + "balance_loss_mlp": 1.00051188, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 2.338351707985944, + "language_loss": 0.80190837, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.82024229, + "num_input_tokens_seen": 287817330, + "step": 13338, + "time_per_iteration": 2.9801106452941895 + }, + { + "auxiliary_loss_clip": 0.01116633, + "auxiliary_loss_mlp": 0.01101454, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.0004127, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 2.138585773488615, + "language_loss": 0.74354947, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.76573038, + "num_input_tokens_seen": 287835095, + "step": 13339, + "time_per_iteration": 5.8387017250061035 + }, + { + "auxiliary_loss_clip": 0.0114771, + "auxiliary_loss_mlp": 0.0110083, + "balance_loss_clip": 1.00174463, + "balance_loss_mlp": 1.00040853, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 2.0054904015269344, + "language_loss": 0.78945142, + "learning_rate": 3.970681765754775e-07, + "loss": 0.8119368, + "num_input_tokens_seen": 287854595, + "step": 13340, + "time_per_iteration": 4.516579866409302 + }, + { + "auxiliary_loss_clip": 0.01117306, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_clip": 1.00164723, + "balance_loss_mlp": 1.00040817, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 1.9183340678055407, + "language_loss": 0.68154198, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70372522, + "num_input_tokens_seen": 287876960, + "step": 13341, + "time_per_iteration": 3.6648693084716797 + }, + { + "auxiliary_loss_clip": 0.01129608, + "auxiliary_loss_mlp": 0.01075051, + "balance_loss_clip": 1.00073183, + "balance_loss_mlp": 0.99990124, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8070095071352109, + "language_loss": 0.61637294, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63841951, + "num_input_tokens_seen": 287936530, + "step": 13342, + "time_per_iteration": 5.201406240463257 + }, + { + "auxiliary_loss_clip": 0.01133873, + "auxiliary_loss_mlp": 0.01102666, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00062275, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 2.066093006143079, + "language_loss": 0.63680166, + "learning_rate": 3.963697086102522e-07, + "loss": 0.65916705, + "num_input_tokens_seen": 287954285, + "step": 13343, + "time_per_iteration": 2.8611185550689697 + }, + { + "auxiliary_loss_clip": 0.0113055, + "auxiliary_loss_mlp": 0.01100388, + "balance_loss_clip": 1.00170672, + "balance_loss_mlp": 1.00039554, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 1.927683812725378, + "language_loss": 0.68808496, + "learning_rate": 3.96137007563051e-07, + "loss": 0.71039426, + "num_input_tokens_seen": 287971595, + "step": 13344, + "time_per_iteration": 3.004026412963867 + }, + { + "auxiliary_loss_clip": 0.011479, + "auxiliary_loss_mlp": 0.01101112, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00040352, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.5329235611058745, + "language_loss": 0.69828475, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72077489, + "num_input_tokens_seen": 287992540, + "step": 13345, + "time_per_iteration": 2.864203929901123 + }, + { + "auxiliary_loss_clip": 0.01127548, + "auxiliary_loss_mlp": 0.0107558, + "balance_loss_clip": 1.00076103, + "balance_loss_mlp": 1.00004864, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8926131486360938, + "language_loss": 0.63120168, + "learning_rate": 3.956717879334059e-07, + "loss": 0.65323293, + "num_input_tokens_seen": 288052810, + "step": 13346, + "time_per_iteration": 3.3023769855499268 + }, + { + "auxiliary_loss_clip": 0.01132702, + "auxiliary_loss_mlp": 0.01101243, + "balance_loss_clip": 1.00180542, + "balance_loss_mlp": 1.00043941, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 1.8694599119559372, + "language_loss": 0.72517705, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74751645, + "num_input_tokens_seen": 288073045, + "step": 13347, + "time_per_iteration": 2.8016114234924316 + }, + { + "auxiliary_loss_clip": 0.01147675, + "auxiliary_loss_mlp": 0.01101461, + "balance_loss_clip": 1.00185084, + "balance_loss_mlp": 1.00037193, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 2.1053782966478316, + "language_loss": 0.72605908, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.74855047, + "num_input_tokens_seen": 288091165, + "step": 13348, + "time_per_iteration": 2.7067317962646484 + }, + { + "auxiliary_loss_clip": 0.01132494, + "auxiliary_loss_mlp": 0.01100989, + "balance_loss_clip": 1.00201082, + "balance_loss_mlp": 1.00042367, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 2.148047066490238, + "language_loss": 0.75707865, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.77941346, + "num_input_tokens_seen": 288110595, + "step": 13349, + "time_per_iteration": 2.783545732498169 + }, + { + "auxiliary_loss_clip": 0.01164495, + "auxiliary_loss_mlp": 0.01101813, + "balance_loss_clip": 1.00200903, + "balance_loss_mlp": 1.00053263, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 2.037850927859156, + "language_loss": 0.83324778, + "learning_rate": 3.947420787800755e-07, + "loss": 0.8559109, + "num_input_tokens_seen": 288128995, + "step": 13350, + "time_per_iteration": 2.6675596237182617 + }, + { + "auxiliary_loss_clip": 0.01147734, + "auxiliary_loss_mlp": 0.0110086, + "balance_loss_clip": 1.0018512, + "balance_loss_mlp": 1.00062871, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 1.60262874673936, + "language_loss": 0.70968211, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73216808, + "num_input_tokens_seen": 288149265, + "step": 13351, + "time_per_iteration": 2.7687957286834717 + }, + { + "auxiliary_loss_clip": 0.01118923, + "auxiliary_loss_mlp": 0.01101181, + "balance_loss_clip": 1.0019958, + "balance_loss_mlp": 1.00047255, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 1.581397055793518, + "language_loss": 0.61688095, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63908207, + "num_input_tokens_seen": 288170745, + "step": 13352, + "time_per_iteration": 2.929734230041504 + }, + { + "auxiliary_loss_clip": 0.01147307, + "auxiliary_loss_mlp": 0.01101794, + "balance_loss_clip": 1.00197995, + "balance_loss_mlp": 1.00056148, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 1.6668874511894398, + "language_loss": 0.77299953, + "learning_rate": 3.940454360354046e-07, + "loss": 0.79549056, + "num_input_tokens_seen": 288189415, + "step": 13353, + "time_per_iteration": 2.7033469676971436 + }, + { + "auxiliary_loss_clip": 0.01083447, + "auxiliary_loss_mlp": 0.01104063, + "balance_loss_clip": 1.00161088, + "balance_loss_mlp": 1.00049412, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 2.1818491921844427, + "language_loss": 0.73019028, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75206542, + "num_input_tokens_seen": 288206900, + "step": 13354, + "time_per_iteration": 2.8506572246551514 + }, + { + "auxiliary_loss_clip": 0.01116498, + "auxiliary_loss_mlp": 0.01102289, + "balance_loss_clip": 1.00168395, + "balance_loss_mlp": 1.00053155, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 1.7741313225757032, + "language_loss": 0.65970123, + "learning_rate": 3.935813120140714e-07, + "loss": 0.68188912, + "num_input_tokens_seen": 288224800, + "step": 13355, + "time_per_iteration": 6.123399019241333 + }, + { + "auxiliary_loss_clip": 0.01118675, + "auxiliary_loss_mlp": 0.01102956, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00043583, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 2.0023702447889464, + "language_loss": 0.68694943, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.70916575, + "num_input_tokens_seen": 288249400, + "step": 13356, + "time_per_iteration": 4.0967323780059814 + }, + { + "auxiliary_loss_clip": 0.01101436, + "auxiliary_loss_mlp": 0.01099951, + "balance_loss_clip": 1.00167656, + "balance_loss_mlp": 1.0004828, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.6305532398867728, + "language_loss": 0.77536309, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79737699, + "num_input_tokens_seen": 288268780, + "step": 13357, + "time_per_iteration": 2.8304269313812256 + }, + { + "auxiliary_loss_clip": 0.01116735, + "auxiliary_loss_mlp": 0.01101244, + "balance_loss_clip": 1.00170398, + "balance_loss_mlp": 1.00034499, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 2.1357992344685375, + "language_loss": 0.771155, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79333472, + "num_input_tokens_seen": 288290830, + "step": 13358, + "time_per_iteration": 2.8301632404327393 + }, + { + "auxiliary_loss_clip": 0.01149644, + "auxiliary_loss_mlp": 0.01101173, + "balance_loss_clip": 1.00181782, + "balance_loss_mlp": 1.00036943, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.7470394623138967, + "language_loss": 0.84907389, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.87158203, + "num_input_tokens_seen": 288308865, + "step": 13359, + "time_per_iteration": 2.6713814735412598 + }, + { + "auxiliary_loss_clip": 0.01131127, + "auxiliary_loss_mlp": 0.01100343, + "balance_loss_clip": 1.00174856, + "balance_loss_mlp": 1.00044584, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 1.9021302494227619, + "language_loss": 0.73443794, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75675261, + "num_input_tokens_seen": 288327325, + "step": 13360, + "time_per_iteration": 4.834981203079224 + }, + { + "auxiliary_loss_clip": 0.01164387, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_clip": 1.00182998, + "balance_loss_mlp": 1.00050282, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 2.42682374305666, + "language_loss": 0.69636631, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71902043, + "num_input_tokens_seen": 288347285, + "step": 13361, + "time_per_iteration": 4.1293418407440186 + }, + { + "auxiliary_loss_clip": 0.01164394, + "auxiliary_loss_mlp": 0.01102139, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00057256, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 2.055005103676703, + "language_loss": 0.70142114, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72408646, + "num_input_tokens_seen": 288367785, + "step": 13362, + "time_per_iteration": 2.8560924530029297 + }, + { + "auxiliary_loss_clip": 0.01164743, + "auxiliary_loss_mlp": 0.01104018, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.00054455, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 2.1090999398119283, + "language_loss": 0.78215933, + "learning_rate": 3.91727253254452e-07, + "loss": 0.804847, + "num_input_tokens_seen": 288384135, + "step": 13363, + "time_per_iteration": 2.6000137329101562 + }, + { + "auxiliary_loss_clip": 0.01147157, + "auxiliary_loss_mlp": 0.01101589, + "balance_loss_clip": 1.00177491, + "balance_loss_mlp": 1.0004518, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 2.2843326543763807, + "language_loss": 0.75083947, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.77332687, + "num_input_tokens_seen": 288403805, + "step": 13364, + "time_per_iteration": 4.257301092147827 + }, + { + "auxiliary_loss_clip": 0.01148058, + "auxiliary_loss_mlp": 0.01101491, + "balance_loss_clip": 1.00191343, + "balance_loss_mlp": 1.00040197, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 3.048627727703324, + "language_loss": 0.60864848, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.63114393, + "num_input_tokens_seen": 288424895, + "step": 13365, + "time_per_iteration": 4.295812606811523 + }, + { + "auxiliary_loss_clip": 0.01133675, + "auxiliary_loss_mlp": 0.01101898, + "balance_loss_clip": 1.00193453, + "balance_loss_mlp": 1.00042748, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 1.8243961676217637, + "language_loss": 0.66345072, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68580645, + "num_input_tokens_seen": 288443865, + "step": 13366, + "time_per_iteration": 2.68955135345459 + }, + { + "auxiliary_loss_clip": 0.01164481, + "auxiliary_loss_mlp": 0.01102043, + "balance_loss_clip": 1.00197351, + "balance_loss_mlp": 1.0004766, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.0805017647212356, + "language_loss": 0.74768174, + "learning_rate": 3.908016872542259e-07, + "loss": 0.770347, + "num_input_tokens_seen": 288461065, + "step": 13367, + "time_per_iteration": 2.6882503032684326 + }, + { + "auxiliary_loss_clip": 0.01164276, + "auxiliary_loss_mlp": 0.01100437, + "balance_loss_clip": 1.00179887, + "balance_loss_mlp": 1.00039673, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.4528816344984867, + "language_loss": 0.73947662, + "learning_rate": 3.905704482846428e-07, + "loss": 0.76212376, + "num_input_tokens_seen": 288481865, + "step": 13368, + "time_per_iteration": 2.7321455478668213 + }, + { + "auxiliary_loss_clip": 0.01164485, + "auxiliary_loss_mlp": 0.0110209, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.0004288, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 1.9823281567900506, + "language_loss": 0.70257962, + "learning_rate": 3.90339270344789e-07, + "loss": 0.72524536, + "num_input_tokens_seen": 288499345, + "step": 13369, + "time_per_iteration": 2.57553768157959 + }, + { + "auxiliary_loss_clip": 0.01134959, + "auxiliary_loss_mlp": 0.01100655, + "balance_loss_clip": 1.0017159, + "balance_loss_mlp": 1.00042367, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 3.4630250141436214, + "language_loss": 0.73890555, + "learning_rate": 3.901081534434312e-07, + "loss": 0.7612617, + "num_input_tokens_seen": 288517660, + "step": 13370, + "time_per_iteration": 2.712247848510742 + }, + { + "auxiliary_loss_clip": 0.01132981, + "auxiliary_loss_mlp": 0.01102803, + "balance_loss_clip": 1.00172305, + "balance_loss_mlp": 1.00056863, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 4.661099162024483, + "language_loss": 0.86458439, + "learning_rate": 3.898770975893342e-07, + "loss": 0.88694215, + "num_input_tokens_seen": 288534180, + "step": 13371, + "time_per_iteration": 4.473369836807251 + }, + { + "auxiliary_loss_clip": 0.01147903, + "auxiliary_loss_mlp": 0.01102087, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.00042486, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 3.7514587808270825, + "language_loss": 0.74529099, + "learning_rate": 3.89646102791259e-07, + "loss": 0.76779085, + "num_input_tokens_seen": 288553350, + "step": 13372, + "time_per_iteration": 2.751478672027588 + }, + { + "auxiliary_loss_clip": 0.01100897, + "auxiliary_loss_mlp": 0.01101791, + "balance_loss_clip": 1.00154853, + "balance_loss_mlp": 1.00041604, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 2.1783987524655988, + "language_loss": 0.7944119, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81643879, + "num_input_tokens_seen": 288571325, + "step": 13373, + "time_per_iteration": 2.7843945026397705 + }, + { + "auxiliary_loss_clip": 0.01134932, + "auxiliary_loss_mlp": 0.01100441, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.0004487, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 2.0650835830579792, + "language_loss": 0.74546838, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76782215, + "num_input_tokens_seen": 288592100, + "step": 13374, + "time_per_iteration": 2.6997177600860596 + }, + { + "auxiliary_loss_clip": 0.01100948, + "auxiliary_loss_mlp": 0.01103116, + "balance_loss_clip": 1.00164294, + "balance_loss_mlp": 1.0005008, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 2.5171171023472647, + "language_loss": 0.68318743, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70522809, + "num_input_tokens_seen": 288612305, + "step": 13375, + "time_per_iteration": 2.767235517501831 + }, + { + "auxiliary_loss_clip": 0.01111781, + "auxiliary_loss_mlp": 0.01075184, + "balance_loss_clip": 1.00073051, + "balance_loss_mlp": 1.00003397, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.7228927118707936, + "language_loss": 0.55668199, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57855165, + "num_input_tokens_seen": 288676015, + "step": 13376, + "time_per_iteration": 6.36173677444458 + }, + { + "auxiliary_loss_clip": 0.011022, + "auxiliary_loss_mlp": 0.01101832, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00045621, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 1.9045511621582942, + "language_loss": 0.7304213, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75246161, + "num_input_tokens_seen": 288696455, + "step": 13377, + "time_per_iteration": 2.8600997924804688 + }, + { + "auxiliary_loss_clip": 0.01147623, + "auxiliary_loss_mlp": 0.0110192, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00035357, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.674942043005587, + "language_loss": 0.70259738, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72509277, + "num_input_tokens_seen": 288715560, + "step": 13378, + "time_per_iteration": 2.766906499862671 + }, + { + "auxiliary_loss_clip": 0.01149916, + "auxiliary_loss_mlp": 0.01101168, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00045967, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.4137124948225765, + "language_loss": 0.69261062, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71512151, + "num_input_tokens_seen": 288739485, + "step": 13379, + "time_per_iteration": 4.290202379226685 + }, + { + "auxiliary_loss_clip": 0.01164517, + "auxiliary_loss_mlp": 0.01102293, + "balance_loss_clip": 1.00197327, + "balance_loss_mlp": 1.00053596, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.8804901167213366, + "language_loss": 0.76341796, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.78608608, + "num_input_tokens_seen": 288757420, + "step": 13380, + "time_per_iteration": 2.799522876739502 + }, + { + "auxiliary_loss_clip": 0.0110175, + "auxiliary_loss_mlp": 0.01100853, + "balance_loss_clip": 1.00181413, + "balance_loss_mlp": 1.00033593, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 1.97558024071987, + "language_loss": 0.69274849, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71477455, + "num_input_tokens_seen": 288775535, + "step": 13381, + "time_per_iteration": 2.861253023147583 + }, + { + "auxiliary_loss_clip": 0.01149361, + "auxiliary_loss_mlp": 0.01101954, + "balance_loss_clip": 1.00193799, + "balance_loss_mlp": 1.0004828, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 1.764799810263997, + "language_loss": 0.6396929, + "learning_rate": 3.873395148176135e-07, + "loss": 0.66220599, + "num_input_tokens_seen": 288795035, + "step": 13382, + "time_per_iteration": 2.728942394256592 + }, + { + "auxiliary_loss_clip": 0.01130834, + "auxiliary_loss_mlp": 0.01101518, + "balance_loss_clip": 1.00172877, + "balance_loss_mlp": 1.00061893, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 2.6333434068997024, + "language_loss": 0.76059663, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78292012, + "num_input_tokens_seen": 288816270, + "step": 13383, + "time_per_iteration": 2.6776041984558105 + }, + { + "auxiliary_loss_clip": 0.01149428, + "auxiliary_loss_mlp": 0.01101909, + "balance_loss_clip": 1.00188601, + "balance_loss_mlp": 1.00053334, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 1.7267165253094379, + "language_loss": 0.69657087, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71908426, + "num_input_tokens_seen": 288836050, + "step": 13384, + "time_per_iteration": 2.6666793823242188 + }, + { + "auxiliary_loss_clip": 0.01147301, + "auxiliary_loss_mlp": 0.01102283, + "balance_loss_clip": 1.00183499, + "balance_loss_mlp": 1.00043106, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 2.104654840934814, + "language_loss": 0.79435003, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81684589, + "num_input_tokens_seen": 288852900, + "step": 13385, + "time_per_iteration": 2.551802635192871 + }, + { + "auxiliary_loss_clip": 0.01164449, + "auxiliary_loss_mlp": 0.01102871, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00063729, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.5960496967594855, + "language_loss": 0.72399354, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74666673, + "num_input_tokens_seen": 288872625, + "step": 13386, + "time_per_iteration": 2.59224271774292 + }, + { + "auxiliary_loss_clip": 0.01125702, + "auxiliary_loss_mlp": 0.01075133, + "balance_loss_clip": 1.00094628, + "balance_loss_mlp": 0.99998343, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6782983296290311, + "language_loss": 0.51198244, + "learning_rate": 3.861885134935865e-07, + "loss": 0.5339908, + "num_input_tokens_seen": 288939180, + "step": 13387, + "time_per_iteration": 3.2541866302490234 + }, + { + "auxiliary_loss_clip": 0.01164326, + "auxiliary_loss_mlp": 0.01101917, + "balance_loss_clip": 1.00172734, + "balance_loss_mlp": 1.00054121, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 2.3207332305421877, + "language_loss": 0.74013382, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76279622, + "num_input_tokens_seen": 288958925, + "step": 13388, + "time_per_iteration": 2.5734124183654785 + }, + { + "auxiliary_loss_clip": 0.01116788, + "auxiliary_loss_mlp": 0.01101647, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00046253, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 1.9022538357506906, + "language_loss": 0.7164554, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73863971, + "num_input_tokens_seen": 288980935, + "step": 13389, + "time_per_iteration": 2.7564618587493896 + }, + { + "auxiliary_loss_clip": 0.01131308, + "auxiliary_loss_mlp": 0.01101863, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00048792, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 3.5912713438130557, + "language_loss": 0.83057815, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.85290986, + "num_input_tokens_seen": 288996780, + "step": 13390, + "time_per_iteration": 2.586784839630127 + }, + { + "auxiliary_loss_clip": 0.01144095, + "auxiliary_loss_mlp": 0.01075169, + "balance_loss_clip": 1.00078893, + "balance_loss_mlp": 1.00001931, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7880853893133482, + "language_loss": 0.55482906, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57702172, + "num_input_tokens_seen": 289057590, + "step": 13391, + "time_per_iteration": 3.139289379119873 + }, + { + "auxiliary_loss_clip": 0.01147433, + "auxiliary_loss_mlp": 0.01100558, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00042248, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.9163621636334465, + "language_loss": 0.84611666, + "learning_rate": 3.850390420667762e-07, + "loss": 0.86859655, + "num_input_tokens_seen": 289076285, + "step": 13392, + "time_per_iteration": 2.561366081237793 + }, + { + "auxiliary_loss_clip": 0.01115748, + "auxiliary_loss_mlp": 0.01101283, + "balance_loss_clip": 1.00162053, + "balance_loss_mlp": 1.00047946, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.4015890282056547, + "language_loss": 0.70445585, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72662616, + "num_input_tokens_seen": 289097585, + "step": 13393, + "time_per_iteration": 2.76208233833313 + }, + { + "auxiliary_loss_clip": 0.01147875, + "auxiliary_loss_mlp": 0.01101514, + "balance_loss_clip": 1.00185716, + "balance_loss_mlp": 1.00047195, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 2.3822628403160286, + "language_loss": 0.76177734, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78427124, + "num_input_tokens_seen": 289116890, + "step": 13394, + "time_per_iteration": 2.6392462253570557 + }, + { + "auxiliary_loss_clip": 0.01164388, + "auxiliary_loss_mlp": 0.0110166, + "balance_loss_clip": 1.00187719, + "balance_loss_mlp": 1.00066614, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.5584686420120444, + "language_loss": 0.64894396, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67160439, + "num_input_tokens_seen": 289136670, + "step": 13395, + "time_per_iteration": 2.6575968265533447 + }, + { + "auxiliary_loss_clip": 0.01143998, + "auxiliary_loss_mlp": 0.01075169, + "balance_loss_clip": 1.0007484, + "balance_loss_mlp": 1.00001907, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.763221172445279, + "language_loss": 0.57401335, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.596205, + "num_input_tokens_seen": 289200150, + "step": 13396, + "time_per_iteration": 3.271980047225952 + }, + { + "auxiliary_loss_clip": 0.01147691, + "auxiliary_loss_mlp": 0.01101702, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00051713, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 4.952620083394366, + "language_loss": 0.76921785, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79171181, + "num_input_tokens_seen": 289218125, + "step": 13397, + "time_per_iteration": 2.591320276260376 + }, + { + "auxiliary_loss_clip": 0.01148049, + "auxiliary_loss_mlp": 0.01101362, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.000368, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.594264752694857, + "language_loss": 0.70572942, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72822356, + "num_input_tokens_seen": 289237115, + "step": 13398, + "time_per_iteration": 4.163023948669434 + }, + { + "auxiliary_loss_clip": 0.01132654, + "auxiliary_loss_mlp": 0.01102093, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00043154, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 2.8909224369687103, + "language_loss": 0.6931932, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71554065, + "num_input_tokens_seen": 289253635, + "step": 13399, + "time_per_iteration": 2.6130969524383545 + }, + { + "auxiliary_loss_clip": 0.01164529, + "auxiliary_loss_mlp": 0.01101494, + "balance_loss_clip": 1.00202835, + "balance_loss_mlp": 1.00059509, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.2643379290681227, + "language_loss": 0.72762978, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.75029004, + "num_input_tokens_seen": 289270085, + "step": 13400, + "time_per_iteration": 2.6133878231048584 + }, + { + "auxiliary_loss_clip": 0.01149522, + "auxiliary_loss_mlp": 0.01100896, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00037909, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 2.1157280467210047, + "language_loss": 0.64417624, + "learning_rate": 3.829738523169037e-07, + "loss": 0.66668046, + "num_input_tokens_seen": 289289645, + "step": 13401, + "time_per_iteration": 2.6441280841827393 + }, + { + "auxiliary_loss_clip": 0.01149769, + "auxiliary_loss_mlp": 0.01102117, + "balance_loss_clip": 1.00185013, + "balance_loss_mlp": 1.00036001, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 3.1979934070571043, + "language_loss": 0.83943427, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.86195308, + "num_input_tokens_seen": 289306630, + "step": 13402, + "time_per_iteration": 4.040388822555542 + }, + { + "auxiliary_loss_clip": 0.01100456, + "auxiliary_loss_mlp": 0.01101721, + "balance_loss_clip": 1.00168443, + "balance_loss_mlp": 1.0004406, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 1.8091030276403521, + "language_loss": 0.678882, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.70090377, + "num_input_tokens_seen": 289324960, + "step": 13403, + "time_per_iteration": 2.7108805179595947 + }, + { + "auxiliary_loss_clip": 0.01117966, + "auxiliary_loss_mlp": 0.0074706, + "balance_loss_clip": 1.00189626, + "balance_loss_mlp": 1.00052118, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.9435154456429813, + "language_loss": 0.8513968, + "learning_rate": 3.822865591408084e-07, + "loss": 0.87004709, + "num_input_tokens_seen": 289344980, + "step": 13404, + "time_per_iteration": 2.716395616531372 + }, + { + "auxiliary_loss_clip": 0.01115514, + "auxiliary_loss_mlp": 0.01100255, + "balance_loss_clip": 1.001773, + "balance_loss_mlp": 1.00040483, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 1.8189700355417933, + "language_loss": 0.70228279, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72444046, + "num_input_tokens_seen": 289367500, + "step": 13405, + "time_per_iteration": 2.888580799102783 + }, + { + "auxiliary_loss_clip": 0.0114976, + "auxiliary_loss_mlp": 0.01101048, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.0003401, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 2.7231527295481786, + "language_loss": 0.75527322, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77778137, + "num_input_tokens_seen": 289385930, + "step": 13406, + "time_per_iteration": 2.68571138381958 + }, + { + "auxiliary_loss_clip": 0.01147626, + "auxiliary_loss_mlp": 0.01102035, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00046849, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.6206763301060485, + "language_loss": 0.76152861, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78402519, + "num_input_tokens_seen": 289408025, + "step": 13407, + "time_per_iteration": 2.6945557594299316 + }, + { + "auxiliary_loss_clip": 0.01133132, + "auxiliary_loss_mlp": 0.0074732, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00044644, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 4.658150751874491, + "language_loss": 0.73761344, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75641799, + "num_input_tokens_seen": 289426575, + "step": 13408, + "time_per_iteration": 2.768268346786499 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01101294, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.00049031, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 2.4988163744627148, + "language_loss": 0.70291704, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72525644, + "num_input_tokens_seen": 289447760, + "step": 13409, + "time_per_iteration": 2.697463274002075 + }, + { + "auxiliary_loss_clip": 0.0116439, + "auxiliary_loss_mlp": 0.01101068, + "balance_loss_clip": 1.0018909, + "balance_loss_mlp": 1.00036025, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 2.171722814907374, + "language_loss": 0.76627517, + "learning_rate": 3.809136293070545e-07, + "loss": 0.7889297, + "num_input_tokens_seen": 289463920, + "step": 13410, + "time_per_iteration": 2.5161361694335938 + }, + { + "auxiliary_loss_clip": 0.01147859, + "auxiliary_loss_mlp": 0.01101411, + "balance_loss_clip": 1.00189638, + "balance_loss_mlp": 1.00060785, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 2.1602075310570554, + "language_loss": 0.68598998, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70848268, + "num_input_tokens_seen": 289482635, + "step": 13411, + "time_per_iteration": 2.5797019004821777 + }, + { + "auxiliary_loss_clip": 0.01133474, + "auxiliary_loss_mlp": 0.01101478, + "balance_loss_clip": 1.00181603, + "balance_loss_mlp": 1.00057971, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.6739545744474493, + "language_loss": 0.6835078, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70585734, + "num_input_tokens_seen": 289502040, + "step": 13412, + "time_per_iteration": 2.691345453262329 + }, + { + "auxiliary_loss_clip": 0.01147885, + "auxiliary_loss_mlp": 0.01102004, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00053263, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.8584350379258288, + "language_loss": 0.81395501, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83645391, + "num_input_tokens_seen": 289520740, + "step": 13413, + "time_per_iteration": 2.643014430999756 + }, + { + "auxiliary_loss_clip": 0.01149531, + "auxiliary_loss_mlp": 0.01102278, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00052118, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 2.4196728321402277, + "language_loss": 0.84953415, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87205225, + "num_input_tokens_seen": 289535840, + "step": 13414, + "time_per_iteration": 4.707251071929932 + }, + { + "auxiliary_loss_clip": 0.01134192, + "auxiliary_loss_mlp": 0.01101091, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00047839, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 1.9135347654167125, + "language_loss": 0.66879749, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69115037, + "num_input_tokens_seen": 289555205, + "step": 13415, + "time_per_iteration": 2.791307210922241 + }, + { + "auxiliary_loss_clip": 0.01116024, + "auxiliary_loss_mlp": 0.01100135, + "balance_loss_clip": 1.00162077, + "balance_loss_mlp": 1.00042868, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.5978551727237054, + "language_loss": 0.76124161, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78340322, + "num_input_tokens_seen": 289573000, + "step": 13416, + "time_per_iteration": 2.7942681312561035 + }, + { + "auxiliary_loss_clip": 0.01150051, + "auxiliary_loss_mlp": 0.01102386, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00062871, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.630517292226601, + "language_loss": 0.65131557, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67383999, + "num_input_tokens_seen": 289592625, + "step": 13417, + "time_per_iteration": 4.220306873321533 + }, + { + "auxiliary_loss_clip": 0.01118132, + "auxiliary_loss_mlp": 0.01102067, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00064361, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.599755858283399, + "language_loss": 0.80494523, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82714719, + "num_input_tokens_seen": 289610780, + "step": 13418, + "time_per_iteration": 2.964689016342163 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01102352, + "balance_loss_clip": 1.00184238, + "balance_loss_mlp": 1.00040436, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 2.5241733682020158, + "language_loss": 0.84645039, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.86879575, + "num_input_tokens_seen": 289628890, + "step": 13419, + "time_per_iteration": 2.663529872894287 + }, + { + "auxiliary_loss_clip": 0.01117416, + "auxiliary_loss_mlp": 0.0074751, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.0005343, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.671420484784095, + "language_loss": 0.76254117, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.7811904, + "num_input_tokens_seen": 289647220, + "step": 13420, + "time_per_iteration": 2.9028754234313965 + }, + { + "auxiliary_loss_clip": 0.01149909, + "auxiliary_loss_mlp": 0.00747407, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00054145, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.9496200176797982, + "language_loss": 0.78630346, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80527663, + "num_input_tokens_seen": 289665800, + "step": 13421, + "time_per_iteration": 3.107465982437134 + }, + { + "auxiliary_loss_clip": 0.01114373, + "auxiliary_loss_mlp": 0.01101559, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.00042236, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1.936970886691989, + "language_loss": 0.79572564, + "learning_rate": 3.78174402269098e-07, + "loss": 0.81788498, + "num_input_tokens_seen": 289682705, + "step": 13422, + "time_per_iteration": 3.0902483463287354 + }, + { + "auxiliary_loss_clip": 0.01164272, + "auxiliary_loss_mlp": 0.01101073, + "balance_loss_clip": 1.0017916, + "balance_loss_mlp": 1.00046039, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.5697006932335549, + "language_loss": 0.67610878, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.69876224, + "num_input_tokens_seen": 289702920, + "step": 13423, + "time_per_iteration": 2.6230666637420654 + }, + { + "auxiliary_loss_clip": 0.01131909, + "auxiliary_loss_mlp": 0.01101418, + "balance_loss_clip": 1.00179911, + "balance_loss_mlp": 1.00051939, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 1.9199887354803986, + "language_loss": 0.80200565, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82433891, + "num_input_tokens_seen": 289723280, + "step": 13424, + "time_per_iteration": 2.857567310333252 + }, + { + "auxiliary_loss_clip": 0.0114774, + "auxiliary_loss_mlp": 0.01102825, + "balance_loss_clip": 1.00171793, + "balance_loss_mlp": 1.00049591, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 2.329433732867046, + "language_loss": 0.78715169, + "learning_rate": 3.774909786710232e-07, + "loss": 0.80965739, + "num_input_tokens_seen": 289743475, + "step": 13425, + "time_per_iteration": 3.5551748275756836 + }, + { + "auxiliary_loss_clip": 0.01133135, + "auxiliary_loss_mlp": 0.01101238, + "balance_loss_clip": 1.00178742, + "balance_loss_mlp": 1.00057793, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.782250324529096, + "language_loss": 0.751845, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77418876, + "num_input_tokens_seen": 289761400, + "step": 13426, + "time_per_iteration": 3.1119956970214844 + }, + { + "auxiliary_loss_clip": 0.01147707, + "auxiliary_loss_mlp": 0.01101392, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00039816, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 1.9433765774410539, + "language_loss": 0.73500752, + "learning_rate": 3.770356705530997e-07, + "loss": 0.7574985, + "num_input_tokens_seen": 289781025, + "step": 13427, + "time_per_iteration": 2.745934247970581 + }, + { + "auxiliary_loss_clip": 0.01085194, + "auxiliary_loss_mlp": 0.01101125, + "balance_loss_clip": 1.00167763, + "balance_loss_mlp": 1.00041747, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.6972653779886693, + "language_loss": 0.6964488, + "learning_rate": 3.768081088042774e-07, + "loss": 0.71831191, + "num_input_tokens_seen": 289798380, + "step": 13428, + "time_per_iteration": 2.931555986404419 + }, + { + "auxiliary_loss_clip": 0.01134388, + "auxiliary_loss_mlp": 0.01101164, + "balance_loss_clip": 1.00174201, + "balance_loss_mlp": 1.00036097, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 1.7810375827750597, + "language_loss": 0.75031406, + "learning_rate": 3.765806086070544e-07, + "loss": 0.77266955, + "num_input_tokens_seen": 289814515, + "step": 13429, + "time_per_iteration": 2.7726705074310303 + }, + { + "auxiliary_loss_clip": 0.01147613, + "auxiliary_loss_mlp": 0.01100387, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00053787, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 2.141677356392136, + "language_loss": 0.66836214, + "learning_rate": 3.763531699700568e-07, + "loss": 0.69084209, + "num_input_tokens_seen": 289834315, + "step": 13430, + "time_per_iteration": 2.7490525245666504 + }, + { + "auxiliary_loss_clip": 0.01118535, + "auxiliary_loss_mlp": 0.01101187, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00038385, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 4.160133690759055, + "language_loss": 0.80138427, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82358152, + "num_input_tokens_seen": 289853770, + "step": 13431, + "time_per_iteration": 2.7251715660095215 + }, + { + "auxiliary_loss_clip": 0.01130722, + "auxiliary_loss_mlp": 0.01101216, + "balance_loss_clip": 1.00171995, + "balance_loss_mlp": 1.00041294, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 2.349973803357236, + "language_loss": 0.80302799, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82534736, + "num_input_tokens_seen": 289870480, + "step": 13432, + "time_per_iteration": 3.2323219776153564 + }, + { + "auxiliary_loss_clip": 0.01116272, + "auxiliary_loss_mlp": 0.01102487, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.0005393, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 3.079692786499428, + "language_loss": 0.69944787, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72163546, + "num_input_tokens_seen": 289888275, + "step": 13433, + "time_per_iteration": 3.602397918701172 + }, + { + "auxiliary_loss_clip": 0.01131483, + "auxiliary_loss_mlp": 0.01100734, + "balance_loss_clip": 1.0016973, + "balance_loss_mlp": 1.00040698, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.6449851434970246, + "language_loss": 0.72711688, + "learning_rate": 3.754440311967828e-07, + "loss": 0.749439, + "num_input_tokens_seen": 289911495, + "step": 13434, + "time_per_iteration": 2.853257656097412 + }, + { + "auxiliary_loss_clip": 0.01117821, + "auxiliary_loss_mlp": 0.01101569, + "balance_loss_clip": 1.0019362, + "balance_loss_mlp": 1.00038457, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 1.8007211634004996, + "language_loss": 0.680255, + "learning_rate": 3.752169004902361e-07, + "loss": 0.7024489, + "num_input_tokens_seen": 289930045, + "step": 13435, + "time_per_iteration": 4.777345180511475 + }, + { + "auxiliary_loss_clip": 0.01098556, + "auxiliary_loss_mlp": 0.01102282, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00042999, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 1.7906745891502969, + "language_loss": 0.75017941, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77218783, + "num_input_tokens_seen": 289950815, + "step": 13436, + "time_per_iteration": 5.522803783416748 + }, + { + "auxiliary_loss_clip": 0.01164201, + "auxiliary_loss_mlp": 0.01101067, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00045395, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.8440548195970965, + "language_loss": 0.70158792, + "learning_rate": 3.747628239215674e-07, + "loss": 0.7242406, + "num_input_tokens_seen": 289971730, + "step": 13437, + "time_per_iteration": 2.784252882003784 + }, + { + "auxiliary_loss_clip": 0.01132111, + "auxiliary_loss_mlp": 0.01101138, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00047779, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 2.054829513032784, + "language_loss": 0.72566092, + "learning_rate": 3.745358780766636e-07, + "loss": 0.74799341, + "num_input_tokens_seen": 289992995, + "step": 13438, + "time_per_iteration": 3.043843984603882 + }, + { + "auxiliary_loss_clip": 0.01130803, + "auxiliary_loss_mlp": 0.01101192, + "balance_loss_clip": 1.00176263, + "balance_loss_mlp": 1.00057912, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 2.122052619596174, + "language_loss": 0.77441955, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79673946, + "num_input_tokens_seen": 290009405, + "step": 13439, + "time_per_iteration": 2.8136374950408936 + }, + { + "auxiliary_loss_clip": 0.0116432, + "auxiliary_loss_mlp": 0.01100892, + "balance_loss_clip": 1.00200629, + "balance_loss_mlp": 1.00037467, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 2.5312025453776767, + "language_loss": 0.78613079, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.80878294, + "num_input_tokens_seen": 290031085, + "step": 13440, + "time_per_iteration": 4.26666784286499 + }, + { + "auxiliary_loss_clip": 0.01131329, + "auxiliary_loss_mlp": 0.00747463, + "balance_loss_clip": 1.00163317, + "balance_loss_mlp": 1.00046325, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 2.48357344399507, + "language_loss": 0.59112537, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.60991329, + "num_input_tokens_seen": 290048670, + "step": 13441, + "time_per_iteration": 2.7375073432922363 + }, + { + "auxiliary_loss_clip": 0.01149577, + "auxiliary_loss_mlp": 0.01101005, + "balance_loss_clip": 1.00190604, + "balance_loss_mlp": 1.00048804, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 2.2375963503317355, + "language_loss": 0.76379287, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.78629869, + "num_input_tokens_seen": 290064085, + "step": 13442, + "time_per_iteration": 2.7747437953948975 + }, + { + "auxiliary_loss_clip": 0.01130903, + "auxiliary_loss_mlp": 0.01102055, + "balance_loss_clip": 1.00176835, + "balance_loss_mlp": 1.0004884, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 2.5101464290628264, + "language_loss": 0.70590985, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72823942, + "num_input_tokens_seen": 290086255, + "step": 13443, + "time_per_iteration": 2.7823703289031982 + }, + { + "auxiliary_loss_clip": 0.01116659, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00047982, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 2.373732228274663, + "language_loss": 0.82261896, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84479547, + "num_input_tokens_seen": 290103995, + "step": 13444, + "time_per_iteration": 2.7521865367889404 + }, + { + "auxiliary_loss_clip": 0.011121, + "auxiliary_loss_mlp": 0.00745271, + "balance_loss_clip": 1.00073791, + "balance_loss_mlp": 1.00009406, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.842184032118322, + "language_loss": 0.53593636, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55451006, + "num_input_tokens_seen": 290157245, + "step": 13445, + "time_per_iteration": 3.1286234855651855 + }, + { + "auxiliary_loss_clip": 0.01116232, + "auxiliary_loss_mlp": 0.01101356, + "balance_loss_clip": 1.00182903, + "balance_loss_mlp": 1.00045681, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 2.117225818354941, + "language_loss": 0.7227155, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.74489135, + "num_input_tokens_seen": 290174970, + "step": 13446, + "time_per_iteration": 2.7207109928131104 + }, + { + "auxiliary_loss_clip": 0.01133383, + "auxiliary_loss_mlp": 0.01102156, + "balance_loss_clip": 1.0018636, + "balance_loss_mlp": 1.00058937, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 2.880740478817904, + "language_loss": 0.71404886, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73640418, + "num_input_tokens_seen": 290194395, + "step": 13447, + "time_per_iteration": 2.7600674629211426 + }, + { + "auxiliary_loss_clip": 0.01084978, + "auxiliary_loss_mlp": 0.01102974, + "balance_loss_clip": 1.00159812, + "balance_loss_mlp": 1.00045371, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 2.1997096935764686, + "language_loss": 0.75191927, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.77379882, + "num_input_tokens_seen": 290209200, + "step": 13448, + "time_per_iteration": 2.8794162273406982 + }, + { + "auxiliary_loss_clip": 0.01158265, + "auxiliary_loss_mlp": 0.01075076, + "balance_loss_clip": 1.00072551, + "balance_loss_mlp": 0.99992603, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7300131845238605, + "language_loss": 0.6388799, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.6612134, + "num_input_tokens_seen": 290274565, + "step": 13449, + "time_per_iteration": 3.1808669567108154 + }, + { + "auxiliary_loss_clip": 0.01149742, + "auxiliary_loss_mlp": 0.01100705, + "balance_loss_clip": 1.00192571, + "balance_loss_mlp": 1.00037813, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 1.6953835671577793, + "language_loss": 0.74344355, + "learning_rate": 3.718173381422105e-07, + "loss": 0.76594806, + "num_input_tokens_seen": 290293630, + "step": 13450, + "time_per_iteration": 2.6566691398620605 + }, + { + "auxiliary_loss_clip": 0.01132309, + "auxiliary_loss_mlp": 0.00747451, + "balance_loss_clip": 1.00169766, + "balance_loss_mlp": 1.00054014, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 2.8449060589146153, + "language_loss": 0.73875785, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.75755543, + "num_input_tokens_seen": 290311450, + "step": 13451, + "time_per_iteration": 5.353319406509399 + }, + { + "auxiliary_loss_clip": 0.01135262, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_clip": 1.00180328, + "balance_loss_mlp": 1.00046849, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.911991794395632, + "language_loss": 0.79844135, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82081723, + "num_input_tokens_seen": 290330165, + "step": 13452, + "time_per_iteration": 5.56468939781189 + }, + { + "auxiliary_loss_clip": 0.01148004, + "auxiliary_loss_mlp": 0.01101692, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00050735, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 2.0549692214276685, + "language_loss": 0.78364962, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80614656, + "num_input_tokens_seen": 290350815, + "step": 13453, + "time_per_iteration": 2.759385108947754 + }, + { + "auxiliary_loss_clip": 0.01101427, + "auxiliary_loss_mlp": 0.01101299, + "balance_loss_clip": 1.00174546, + "balance_loss_mlp": 1.00039983, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 2.602857823157538, + "language_loss": 0.77136064, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79338789, + "num_input_tokens_seen": 290367380, + "step": 13454, + "time_per_iteration": 2.7922661304473877 + }, + { + "auxiliary_loss_clip": 0.01116127, + "auxiliary_loss_mlp": 0.01101127, + "balance_loss_clip": 1.00166154, + "balance_loss_mlp": 1.00041866, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 1.6704074682699503, + "language_loss": 0.7667824, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78895497, + "num_input_tokens_seen": 290387965, + "step": 13455, + "time_per_iteration": 4.361567497253418 + }, + { + "auxiliary_loss_clip": 0.01133182, + "auxiliary_loss_mlp": 0.01100942, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00042462, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 1.716271724722092, + "language_loss": 0.78553736, + "learning_rate": 3.70461401253471e-07, + "loss": 0.80787861, + "num_input_tokens_seen": 290404150, + "step": 13456, + "time_per_iteration": 3.0895063877105713 + }, + { + "auxiliary_loss_clip": 0.01164401, + "auxiliary_loss_mlp": 0.0110109, + "balance_loss_clip": 1.00201464, + "balance_loss_mlp": 1.00052476, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 1.9783176863776493, + "language_loss": 0.71385145, + "learning_rate": 3.702356279949801e-07, + "loss": 0.7365064, + "num_input_tokens_seen": 290422370, + "step": 13457, + "time_per_iteration": 4.0083277225494385 + }, + { + "auxiliary_loss_clip": 0.01132701, + "auxiliary_loss_mlp": 0.01100833, + "balance_loss_clip": 1.00178766, + "balance_loss_mlp": 1.00050688, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 1.9979069582239304, + "language_loss": 0.72510922, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74744451, + "num_input_tokens_seen": 290442645, + "step": 13458, + "time_per_iteration": 2.9126131534576416 + }, + { + "auxiliary_loss_clip": 0.01147566, + "auxiliary_loss_mlp": 0.01101371, + "balance_loss_clip": 1.00177848, + "balance_loss_mlp": 1.00056815, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 2.5084169092781554, + "language_loss": 0.79317462, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.81566405, + "num_input_tokens_seen": 290458520, + "step": 13459, + "time_per_iteration": 2.728288412094116 + }, + { + "auxiliary_loss_clip": 0.01130828, + "auxiliary_loss_mlp": 0.01101899, + "balance_loss_clip": 1.00175631, + "balance_loss_mlp": 1.00042772, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 2.4531285969408363, + "language_loss": 0.80068672, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82301402, + "num_input_tokens_seen": 290474465, + "step": 13460, + "time_per_iteration": 2.7476682662963867 + }, + { + "auxiliary_loss_clip": 0.01132464, + "auxiliary_loss_mlp": 0.01101854, + "balance_loss_clip": 1.00175488, + "balance_loss_mlp": 1.00047874, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.8737254627970368, + "language_loss": 0.84495497, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86729819, + "num_input_tokens_seen": 290492060, + "step": 13461, + "time_per_iteration": 2.7636048793792725 + }, + { + "auxiliary_loss_clip": 0.01147788, + "auxiliary_loss_mlp": 0.01102285, + "balance_loss_clip": 1.00194454, + "balance_loss_mlp": 1.00062346, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.9662948345579117, + "language_loss": 0.7629441, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78544486, + "num_input_tokens_seen": 290511510, + "step": 13462, + "time_per_iteration": 2.799628257751465 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01101685, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00050044, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 1.6697117752883885, + "language_loss": 0.83221132, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.85456026, + "num_input_tokens_seen": 290530035, + "step": 13463, + "time_per_iteration": 2.8186194896698 + }, + { + "auxiliary_loss_clip": 0.01164312, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.00050294, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 4.612942091067382, + "language_loss": 0.62192601, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64457929, + "num_input_tokens_seen": 290548245, + "step": 13464, + "time_per_iteration": 2.6423089504241943 + }, + { + "auxiliary_loss_clip": 0.01164213, + "auxiliary_loss_mlp": 0.01100338, + "balance_loss_clip": 1.00186336, + "balance_loss_mlp": 1.00048876, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.6051444278797484, + "language_loss": 0.62004733, + "learning_rate": 3.684316674755341e-07, + "loss": 0.64269286, + "num_input_tokens_seen": 290568625, + "step": 13465, + "time_per_iteration": 2.7529876232147217 + }, + { + "auxiliary_loss_clip": 0.01147697, + "auxiliary_loss_mlp": 0.01101339, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00063062, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 2.0784915527970393, + "language_loss": 0.81917787, + "learning_rate": 3.682064507324256e-07, + "loss": 0.84166825, + "num_input_tokens_seen": 290586575, + "step": 13466, + "time_per_iteration": 2.6569418907165527 + }, + { + "auxiliary_loss_clip": 0.01134136, + "auxiliary_loss_mlp": 0.00747422, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00048125, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 1.9978517493425925, + "language_loss": 0.75948262, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.7782982, + "num_input_tokens_seen": 290606790, + "step": 13467, + "time_per_iteration": 2.7253897190093994 + }, + { + "auxiliary_loss_clip": 0.01118005, + "auxiliary_loss_mlp": 0.01101448, + "balance_loss_clip": 1.00162101, + "balance_loss_mlp": 1.00045419, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 26.522755669269273, + "language_loss": 0.79226106, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81445563, + "num_input_tokens_seen": 290625525, + "step": 13468, + "time_per_iteration": 2.736180305480957 + }, + { + "auxiliary_loss_clip": 0.01149545, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_clip": 1.00176144, + "balance_loss_mlp": 1.00041807, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 1.8067359024178895, + "language_loss": 0.67862374, + "learning_rate": 3.675311718038978e-07, + "loss": 0.70111996, + "num_input_tokens_seen": 290644935, + "step": 13469, + "time_per_iteration": 2.6923294067382812 + }, + { + "auxiliary_loss_clip": 0.01110963, + "auxiliary_loss_mlp": 0.01074958, + "balance_loss_clip": 1.00079107, + "balance_loss_mlp": 1.00019026, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.688469476084047, + "language_loss": 0.54755801, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56941724, + "num_input_tokens_seen": 290710735, + "step": 13470, + "time_per_iteration": 3.398522138595581 + }, + { + "auxiliary_loss_clip": 0.01164317, + "auxiliary_loss_mlp": 0.01100875, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.00045252, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 1.7718198349843162, + "language_loss": 0.69313735, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71578926, + "num_input_tokens_seen": 290729565, + "step": 13471, + "time_per_iteration": 2.6504573822021484 + }, + { + "auxiliary_loss_clip": 0.01148802, + "auxiliary_loss_mlp": 0.01101343, + "balance_loss_clip": 1.00181472, + "balance_loss_mlp": 1.00034904, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.724400923716326, + "language_loss": 0.79588711, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.81838858, + "num_input_tokens_seen": 290749360, + "step": 13472, + "time_per_iteration": 2.718106985092163 + }, + { + "auxiliary_loss_clip": 0.01144016, + "auxiliary_loss_mlp": 0.01074799, + "balance_loss_clip": 1.00081587, + "balance_loss_mlp": 1.00003111, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.7456277447554714, + "language_loss": 0.57761484, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59980297, + "num_input_tokens_seen": 290812145, + "step": 13473, + "time_per_iteration": 5.520570278167725 + }, + { + "auxiliary_loss_clip": 0.01100279, + "auxiliary_loss_mlp": 0.01101834, + "balance_loss_clip": 1.0017724, + "balance_loss_mlp": 1.0004108, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 1.869064609493537, + "language_loss": 0.73913485, + "learning_rate": 3.664069451043399e-07, + "loss": 0.76115596, + "num_input_tokens_seen": 290829845, + "step": 13474, + "time_per_iteration": 2.6809144020080566 + }, + { + "auxiliary_loss_clip": 0.01148016, + "auxiliary_loss_mlp": 0.0110135, + "balance_loss_clip": 1.00177801, + "balance_loss_mlp": 1.00054657, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.6823341506113554, + "language_loss": 0.78469419, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80718791, + "num_input_tokens_seen": 290848815, + "step": 13475, + "time_per_iteration": 2.6290855407714844 + }, + { + "auxiliary_loss_clip": 0.01147549, + "auxiliary_loss_mlp": 0.01100883, + "balance_loss_clip": 1.00179911, + "balance_loss_mlp": 1.0004611, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.7006647784985163, + "language_loss": 0.7509523, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77343661, + "num_input_tokens_seen": 290868580, + "step": 13476, + "time_per_iteration": 2.6728789806365967 + }, + { + "auxiliary_loss_clip": 0.01133171, + "auxiliary_loss_mlp": 0.01102517, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00056911, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 2.178643512623324, + "language_loss": 0.73527002, + "learning_rate": 3.657331523685485e-07, + "loss": 0.75762689, + "num_input_tokens_seen": 290883540, + "step": 13477, + "time_per_iteration": 4.10218358039856 + }, + { + "auxiliary_loss_clip": 0.01133841, + "auxiliary_loss_mlp": 0.01101521, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00047874, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 1.9174543318570783, + "language_loss": 0.70026213, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.72261572, + "num_input_tokens_seen": 290901560, + "step": 13478, + "time_per_iteration": 2.69145131111145 + }, + { + "auxiliary_loss_clip": 0.0115824, + "auxiliary_loss_mlp": 0.01074705, + "balance_loss_clip": 1.00075626, + "balance_loss_mlp": 0.99993676, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.689609418309128, + "language_loss": 0.52169842, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54402786, + "num_input_tokens_seen": 290959185, + "step": 13479, + "time_per_iteration": 3.11555552482605 + }, + { + "auxiliary_loss_clip": 0.01102556, + "auxiliary_loss_mlp": 0.01102092, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00043011, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.8475171639320498, + "language_loss": 0.71593797, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73798454, + "num_input_tokens_seen": 290979585, + "step": 13480, + "time_per_iteration": 2.754538059234619 + }, + { + "auxiliary_loss_clip": 0.01164494, + "auxiliary_loss_mlp": 0.01101906, + "balance_loss_clip": 1.00195491, + "balance_loss_mlp": 1.00053048, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.7839040673888356, + "language_loss": 0.79591244, + "learning_rate": 3.648356296957327e-07, + "loss": 0.81857646, + "num_input_tokens_seen": 291000865, + "step": 13481, + "time_per_iteration": 2.628737449645996 + }, + { + "auxiliary_loss_clip": 0.01130948, + "auxiliary_loss_mlp": 0.01101221, + "balance_loss_clip": 1.00168025, + "balance_loss_mlp": 1.00051272, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 1.8224290924265936, + "language_loss": 0.72162569, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74394739, + "num_input_tokens_seen": 291018285, + "step": 13482, + "time_per_iteration": 2.729369878768921 + }, + { + "auxiliary_loss_clip": 0.01082322, + "auxiliary_loss_mlp": 0.01101117, + "balance_loss_clip": 1.00151408, + "balance_loss_mlp": 1.000314, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.554746032755817, + "language_loss": 0.65570676, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.67754114, + "num_input_tokens_seen": 291035745, + "step": 13483, + "time_per_iteration": 2.76619553565979 + }, + { + "auxiliary_loss_clip": 0.01134738, + "auxiliary_loss_mlp": 0.0110114, + "balance_loss_clip": 1.00189483, + "balance_loss_mlp": 1.0004319, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.6131532407087217, + "language_loss": 0.76343262, + "learning_rate": 3.641631387200992e-07, + "loss": 0.7857914, + "num_input_tokens_seen": 291053280, + "step": 13484, + "time_per_iteration": 2.7482337951660156 + }, + { + "auxiliary_loss_clip": 0.01133633, + "auxiliary_loss_mlp": 0.01102411, + "balance_loss_clip": 1.00181282, + "balance_loss_mlp": 1.00046325, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.7639428879451498, + "language_loss": 0.72114301, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74350345, + "num_input_tokens_seen": 291072855, + "step": 13485, + "time_per_iteration": 2.752894401550293 + }, + { + "auxiliary_loss_clip": 0.01120235, + "auxiliary_loss_mlp": 0.0110073, + "balance_loss_clip": 1.00187194, + "balance_loss_mlp": 1.00045097, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 2.04821908686906, + "language_loss": 0.75502223, + "learning_rate": 3.637151215443308e-07, + "loss": 0.77723187, + "num_input_tokens_seen": 291090285, + "step": 13486, + "time_per_iteration": 2.709512948989868 + }, + { + "auxiliary_loss_clip": 0.01131206, + "auxiliary_loss_mlp": 0.01102324, + "balance_loss_clip": 1.00176144, + "balance_loss_mlp": 1.00056732, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 3.2753541844758627, + "language_loss": 0.72157586, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74391115, + "num_input_tokens_seen": 291107675, + "step": 13487, + "time_per_iteration": 2.703970432281494 + }, + { + "auxiliary_loss_clip": 0.01084112, + "auxiliary_loss_mlp": 0.01099719, + "balance_loss_clip": 1.00163782, + "balance_loss_mlp": 1.00039434, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 2.262679894559799, + "language_loss": 0.84297359, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86481196, + "num_input_tokens_seen": 291126900, + "step": 13488, + "time_per_iteration": 2.8399693965911865 + }, + { + "auxiliary_loss_clip": 0.01164416, + "auxiliary_loss_mlp": 0.01101449, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00045514, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 2.729115284761457, + "language_loss": 0.74263054, + "learning_rate": 3.630435611625502e-07, + "loss": 0.76528925, + "num_input_tokens_seen": 291145285, + "step": 13489, + "time_per_iteration": 4.198072195053101 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.00747347, + "balance_loss_clip": 1.00160813, + "balance_loss_mlp": 1.00042474, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 1.595362296224618, + "language_loss": 0.71814603, + "learning_rate": 3.628198318377453e-07, + "loss": 0.73660529, + "num_input_tokens_seen": 291163485, + "step": 13490, + "time_per_iteration": 2.8109676837921143 + }, + { + "auxiliary_loss_clip": 0.01118277, + "auxiliary_loss_mlp": 0.01102063, + "balance_loss_clip": 1.00175083, + "balance_loss_mlp": 1.00059175, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 3.2307616035390563, + "language_loss": 0.71020329, + "learning_rate": 3.625961645949762e-07, + "loss": 0.73240674, + "num_input_tokens_seen": 291182215, + "step": 13491, + "time_per_iteration": 2.7438406944274902 + }, + { + "auxiliary_loss_clip": 0.01164294, + "auxiliary_loss_mlp": 0.01101027, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00041401, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.5106657612550447, + "language_loss": 0.67558777, + "learning_rate": 3.623725594427245e-07, + "loss": 0.698241, + "num_input_tokens_seen": 291203145, + "step": 13492, + "time_per_iteration": 4.076419115066528 + }, + { + "auxiliary_loss_clip": 0.01099726, + "auxiliary_loss_mlp": 0.01101585, + "balance_loss_clip": 1.00171888, + "balance_loss_mlp": 1.00049591, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 1.9357681391304384, + "language_loss": 0.72226787, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74428099, + "num_input_tokens_seen": 291220600, + "step": 13493, + "time_per_iteration": 2.780097484588623 + }, + { + "auxiliary_loss_clip": 0.01149883, + "auxiliary_loss_mlp": 0.0110199, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00056636, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.7737916703791754, + "language_loss": 0.70570165, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72822034, + "num_input_tokens_seen": 291241195, + "step": 13494, + "time_per_iteration": 2.6832168102264404 + }, + { + "auxiliary_loss_clip": 0.01147151, + "auxiliary_loss_mlp": 0.01102274, + "balance_loss_clip": 1.0018332, + "balance_loss_mlp": 1.00051737, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 4.03912006411238, + "language_loss": 0.7671687, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78966296, + "num_input_tokens_seen": 291258715, + "step": 13495, + "time_per_iteration": 2.653904914855957 + }, + { + "auxiliary_loss_clip": 0.0113298, + "auxiliary_loss_mlp": 0.01100909, + "balance_loss_clip": 1.00175881, + "balance_loss_mlp": 1.00043929, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 1.7882662996999494, + "language_loss": 0.79450279, + "learning_rate": 3.614787599084417e-07, + "loss": 0.8168416, + "num_input_tokens_seen": 291278030, + "step": 13496, + "time_per_iteration": 2.771211624145508 + }, + { + "auxiliary_loss_clip": 0.01149605, + "auxiliary_loss_mlp": 0.01101858, + "balance_loss_clip": 1.00187898, + "balance_loss_mlp": 1.00048196, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.5861789935613184, + "language_loss": 0.71249908, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73501372, + "num_input_tokens_seen": 291296740, + "step": 13497, + "time_per_iteration": 2.656743049621582 + }, + { + "auxiliary_loss_clip": 0.01117845, + "auxiliary_loss_mlp": 0.01101832, + "balance_loss_clip": 1.00184608, + "balance_loss_mlp": 1.00045645, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 1.9080296864578774, + "language_loss": 0.7658999, + "learning_rate": 3.610322329047508e-07, + "loss": 0.78809667, + "num_input_tokens_seen": 291318730, + "step": 13498, + "time_per_iteration": 2.819458484649658 + }, + { + "auxiliary_loss_clip": 0.01164288, + "auxiliary_loss_mlp": 0.01100556, + "balance_loss_clip": 1.00186825, + "balance_loss_mlp": 1.00051534, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 2.1851567052464675, + "language_loss": 0.83575988, + "learning_rate": 3.608090626234055e-07, + "loss": 0.85840833, + "num_input_tokens_seen": 291336755, + "step": 13499, + "time_per_iteration": 2.6074161529541016 + }, + { + "auxiliary_loss_clip": 0.0111817, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.00035667, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.5713801488455164, + "language_loss": 0.76059943, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.78279275, + "num_input_tokens_seen": 291356795, + "step": 13500, + "time_per_iteration": 2.764678478240967 + }, + { + "auxiliary_loss_clip": 0.01126861, + "auxiliary_loss_mlp": 0.01074924, + "balance_loss_clip": 1.00068784, + "balance_loss_mlp": 1.00015569, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8126203094522277, + "language_loss": 0.59977615, + "learning_rate": 3.603629085440303e-07, + "loss": 0.62179399, + "num_input_tokens_seen": 291416005, + "step": 13501, + "time_per_iteration": 3.308178424835205 + }, + { + "auxiliary_loss_clip": 0.01147273, + "auxiliary_loss_mlp": 0.01099888, + "balance_loss_clip": 1.00176454, + "balance_loss_mlp": 1.0004195, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.734331099243856, + "language_loss": 0.78908587, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81155753, + "num_input_tokens_seen": 291434870, + "step": 13502, + "time_per_iteration": 2.723360300064087 + }, + { + "auxiliary_loss_clip": 0.01134497, + "auxiliary_loss_mlp": 0.01101379, + "balance_loss_clip": 1.00174928, + "balance_loss_mlp": 1.00057554, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.8969906036382984, + "language_loss": 0.71229011, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73464888, + "num_input_tokens_seen": 291452230, + "step": 13503, + "time_per_iteration": 2.7994232177734375 + }, + { + "auxiliary_loss_clip": 0.01133222, + "auxiliary_loss_mlp": 0.01102011, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00044513, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 1.7274809378688418, + "language_loss": 0.67667937, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.69903171, + "num_input_tokens_seen": 291477425, + "step": 13504, + "time_per_iteration": 2.9723165035247803 + }, + { + "auxiliary_loss_clip": 0.01130636, + "auxiliary_loss_mlp": 0.01101701, + "balance_loss_clip": 1.00164771, + "balance_loss_mlp": 1.00042033, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 1.8837938513986205, + "language_loss": 0.74788398, + "learning_rate": 3.594713465553403e-07, + "loss": 0.77020741, + "num_input_tokens_seen": 291501070, + "step": 13505, + "time_per_iteration": 2.9570257663726807 + }, + { + "auxiliary_loss_clip": 0.01130319, + "auxiliary_loss_mlp": 0.01101592, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.0004077, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 2.103982857104081, + "language_loss": 0.72570622, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.7480253, + "num_input_tokens_seen": 291524945, + "step": 13506, + "time_per_iteration": 2.860478162765503 + }, + { + "auxiliary_loss_clip": 0.01164459, + "auxiliary_loss_mlp": 0.01101888, + "balance_loss_clip": 1.00181282, + "balance_loss_mlp": 1.00041676, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 2.1276228247182893, + "language_loss": 0.75828612, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78094947, + "num_input_tokens_seen": 291544605, + "step": 13507, + "time_per_iteration": 2.631883382797241 + }, + { + "auxiliary_loss_clip": 0.01164567, + "auxiliary_loss_mlp": 0.01102548, + "balance_loss_clip": 1.00182354, + "balance_loss_mlp": 1.00040936, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.6400905252362419, + "language_loss": 0.70021546, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72288668, + "num_input_tokens_seen": 291563850, + "step": 13508, + "time_per_iteration": 2.7324888706207275 + }, + { + "auxiliary_loss_clip": 0.01147612, + "auxiliary_loss_mlp": 0.01101356, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00045681, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 2.120560288185867, + "language_loss": 0.76196229, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78445196, + "num_input_tokens_seen": 291581730, + "step": 13509, + "time_per_iteration": 2.645249843597412 + }, + { + "auxiliary_loss_clip": 0.01164442, + "auxiliary_loss_mlp": 0.01101467, + "balance_loss_clip": 1.00189805, + "balance_loss_mlp": 1.00047278, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 1.7171302245767972, + "language_loss": 0.76815116, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79081023, + "num_input_tokens_seen": 291601225, + "step": 13510, + "time_per_iteration": 2.6864187717437744 + }, + { + "auxiliary_loss_clip": 0.01149714, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_clip": 1.00178266, + "balance_loss_mlp": 1.0004673, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 1.8295284357671853, + "language_loss": 0.69755614, + "learning_rate": 3.581358700114212e-07, + "loss": 0.7200774, + "num_input_tokens_seen": 291616995, + "step": 13511, + "time_per_iteration": 4.349576234817505 + }, + { + "auxiliary_loss_clip": 0.01133121, + "auxiliary_loss_mlp": 0.01102273, + "balance_loss_clip": 1.00182676, + "balance_loss_mlp": 1.0005157, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.6441910661467884, + "language_loss": 0.79586709, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81822103, + "num_input_tokens_seen": 291636145, + "step": 13512, + "time_per_iteration": 2.672750949859619 + }, + { + "auxiliary_loss_clip": 0.01147665, + "auxiliary_loss_mlp": 0.01101578, + "balance_loss_clip": 1.00176835, + "balance_loss_mlp": 1.0004884, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 2.461762101303134, + "language_loss": 0.63617885, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65867126, + "num_input_tokens_seen": 291662440, + "step": 13513, + "time_per_iteration": 3.132507801055908 + }, + { + "auxiliary_loss_clip": 0.01099443, + "auxiliary_loss_mlp": 0.01101578, + "balance_loss_clip": 1.00165606, + "balance_loss_mlp": 1.00048864, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 2.2554121529085482, + "language_loss": 0.71362484, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73563504, + "num_input_tokens_seen": 291680950, + "step": 13514, + "time_per_iteration": 2.8581976890563965 + }, + { + "auxiliary_loss_clip": 0.01114746, + "auxiliary_loss_mlp": 0.01100853, + "balance_loss_clip": 1.00162685, + "balance_loss_mlp": 1.00033522, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.5640133633411897, + "language_loss": 0.62708032, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.64923632, + "num_input_tokens_seen": 291702395, + "step": 13515, + "time_per_iteration": 4.054006099700928 + }, + { + "auxiliary_loss_clip": 0.01163944, + "auxiliary_loss_mlp": 0.00747239, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00039864, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 1.5568576722035905, + "language_loss": 0.74923098, + "learning_rate": 3.570246849544616e-07, + "loss": 0.76834273, + "num_input_tokens_seen": 291721135, + "step": 13516, + "time_per_iteration": 2.7133638858795166 + }, + { + "auxiliary_loss_clip": 0.01099908, + "auxiliary_loss_mlp": 0.0110169, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00050569, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 1.7111900847120023, + "language_loss": 0.91416144, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93617737, + "num_input_tokens_seen": 291741235, + "step": 13517, + "time_per_iteration": 2.7937355041503906 + }, + { + "auxiliary_loss_clip": 0.01147769, + "auxiliary_loss_mlp": 0.00747418, + "balance_loss_clip": 1.00189734, + "balance_loss_mlp": 1.00060141, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.462701198881606, + "language_loss": 0.78605479, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80500662, + "num_input_tokens_seen": 291761430, + "step": 13518, + "time_per_iteration": 2.7207796573638916 + }, + { + "auxiliary_loss_clip": 0.01149715, + "auxiliary_loss_mlp": 0.01101283, + "balance_loss_clip": 1.00192904, + "balance_loss_mlp": 1.00047934, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.5973293111908933, + "language_loss": 0.79205358, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81456351, + "num_input_tokens_seen": 291781755, + "step": 13519, + "time_per_iteration": 2.7457034587860107 + }, + { + "auxiliary_loss_clip": 0.01164282, + "auxiliary_loss_mlp": 0.01101108, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00049567, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.770354712938982, + "language_loss": 0.70488513, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72753894, + "num_input_tokens_seen": 291804410, + "step": 13520, + "time_per_iteration": 2.71176815032959 + }, + { + "auxiliary_loss_clip": 0.01134144, + "auxiliary_loss_mlp": 0.0110222, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00046349, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.661087486677517, + "language_loss": 0.72770238, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.75006604, + "num_input_tokens_seen": 291823285, + "step": 13521, + "time_per_iteration": 2.713230848312378 + }, + { + "auxiliary_loss_clip": 0.01148946, + "auxiliary_loss_mlp": 0.0110104, + "balance_loss_clip": 1.00170779, + "balance_loss_mlp": 1.00042737, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 2.0254952739618663, + "language_loss": 0.69937211, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.72187197, + "num_input_tokens_seen": 291845305, + "step": 13522, + "time_per_iteration": 2.7562925815582275 + }, + { + "auxiliary_loss_clip": 0.01147487, + "auxiliary_loss_mlp": 0.0110035, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00050044, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.4798448378567601, + "language_loss": 0.70375723, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72623563, + "num_input_tokens_seen": 291863715, + "step": 13523, + "time_per_iteration": 2.7204110622406006 + }, + { + "auxiliary_loss_clip": 0.01149156, + "auxiliary_loss_mlp": 0.01101104, + "balance_loss_clip": 1.0018177, + "balance_loss_mlp": 1.00039601, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 2.7247617084539937, + "language_loss": 0.71264684, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.73514944, + "num_input_tokens_seen": 291880735, + "step": 13524, + "time_per_iteration": 2.684549331665039 + }, + { + "auxiliary_loss_clip": 0.01147697, + "auxiliary_loss_mlp": 0.01100578, + "balance_loss_clip": 1.00182402, + "balance_loss_mlp": 1.00039482, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 5.164722440348009, + "language_loss": 0.62524688, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64772969, + "num_input_tokens_seen": 291900535, + "step": 13525, + "time_per_iteration": 2.7937588691711426 + }, + { + "auxiliary_loss_clip": 0.01131919, + "auxiliary_loss_mlp": 0.01100748, + "balance_loss_clip": 1.00183213, + "balance_loss_mlp": 1.00051677, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 4.048282352316087, + "language_loss": 0.65663874, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67896545, + "num_input_tokens_seen": 291919760, + "step": 13526, + "time_per_iteration": 2.8086140155792236 + }, + { + "auxiliary_loss_clip": 0.01130719, + "auxiliary_loss_mlp": 0.01100056, + "balance_loss_clip": 1.00172472, + "balance_loss_mlp": 1.00039673, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 1.6947967896111358, + "language_loss": 0.7533052, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77561295, + "num_input_tokens_seen": 291938915, + "step": 13527, + "time_per_iteration": 2.8006041049957275 + }, + { + "auxiliary_loss_clip": 0.01164299, + "auxiliary_loss_mlp": 0.01101879, + "balance_loss_clip": 1.00177026, + "balance_loss_mlp": 1.00040817, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 1.6711884478114543, + "language_loss": 0.70779097, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.73045278, + "num_input_tokens_seen": 291958145, + "step": 13528, + "time_per_iteration": 4.036668300628662 + }, + { + "auxiliary_loss_clip": 0.01164385, + "auxiliary_loss_mlp": 0.01100606, + "balance_loss_clip": 1.00187075, + "balance_loss_mlp": 1.00037527, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 35.27365821607101, + "language_loss": 0.68846792, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.71111774, + "num_input_tokens_seen": 291976860, + "step": 13529, + "time_per_iteration": 2.5308094024658203 + }, + { + "auxiliary_loss_clip": 0.01148588, + "auxiliary_loss_mlp": 0.01100648, + "balance_loss_clip": 1.00192225, + "balance_loss_mlp": 1.00032127, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.448701836562854, + "language_loss": 0.7743476, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79683995, + "num_input_tokens_seen": 291998085, + "step": 13530, + "time_per_iteration": 4.194007396697998 + }, + { + "auxiliary_loss_clip": 0.01147471, + "auxiliary_loss_mlp": 0.01101414, + "balance_loss_clip": 1.00180876, + "balance_loss_mlp": 1.00051522, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 2.0185003870891767, + "language_loss": 0.8229934, + "learning_rate": 3.537004792574052e-07, + "loss": 0.84548223, + "num_input_tokens_seen": 292016585, + "step": 13531, + "time_per_iteration": 2.6012914180755615 + }, + { + "auxiliary_loss_clip": 0.01134828, + "auxiliary_loss_mlp": 0.01101952, + "balance_loss_clip": 1.0017755, + "balance_loss_mlp": 1.00048113, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 3.481887730556663, + "language_loss": 0.71496499, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73733282, + "num_input_tokens_seen": 292033255, + "step": 13532, + "time_per_iteration": 2.620518684387207 + }, + { + "auxiliary_loss_clip": 0.01117718, + "auxiliary_loss_mlp": 0.01101128, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00051498, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 2.092614680689523, + "language_loss": 0.76205897, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.7842474, + "num_input_tokens_seen": 292051800, + "step": 13533, + "time_per_iteration": 2.7038252353668213 + }, + { + "auxiliary_loss_clip": 0.01164394, + "auxiliary_loss_mlp": 0.00747437, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00048947, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 2.066995264656293, + "language_loss": 0.76797223, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78709054, + "num_input_tokens_seen": 292072215, + "step": 13534, + "time_per_iteration": 2.603023052215576 + }, + { + "auxiliary_loss_clip": 0.01148497, + "auxiliary_loss_mlp": 0.01101228, + "balance_loss_clip": 1.00201416, + "balance_loss_mlp": 1.00047207, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.388926855258721, + "language_loss": 0.93400699, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95650423, + "num_input_tokens_seen": 292088830, + "step": 13535, + "time_per_iteration": 2.7938740253448486 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.0110062, + "balance_loss_clip": 1.0017345, + "balance_loss_mlp": 1.00043702, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.9398941996776078, + "language_loss": 0.70287716, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72503352, + "num_input_tokens_seen": 292109225, + "step": 13536, + "time_per_iteration": 2.77551531791687 + }, + { + "auxiliary_loss_clip": 0.01117172, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_clip": 1.00178885, + "balance_loss_mlp": 1.0004822, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.4176806048117236, + "language_loss": 0.75259244, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.7747761, + "num_input_tokens_seen": 292129660, + "step": 13537, + "time_per_iteration": 2.820038080215454 + }, + { + "auxiliary_loss_clip": 0.01134649, + "auxiliary_loss_mlp": 0.01100297, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00044703, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.7793600298344134, + "language_loss": 0.76458979, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78693926, + "num_input_tokens_seen": 292149090, + "step": 13538, + "time_per_iteration": 2.739067792892456 + }, + { + "auxiliary_loss_clip": 0.01147796, + "auxiliary_loss_mlp": 0.0110152, + "balance_loss_clip": 1.0017134, + "balance_loss_mlp": 1.00043058, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.5296130829299368, + "language_loss": 0.7779007, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80039388, + "num_input_tokens_seen": 292169260, + "step": 13539, + "time_per_iteration": 2.7004640102386475 + }, + { + "auxiliary_loss_clip": 0.0110333, + "auxiliary_loss_mlp": 0.01100148, + "balance_loss_clip": 1.00174797, + "balance_loss_mlp": 1.00044155, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 2.96526642516372, + "language_loss": 0.66190463, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.6839394, + "num_input_tokens_seen": 292188145, + "step": 13540, + "time_per_iteration": 2.9728477001190186 + }, + { + "auxiliary_loss_clip": 0.01149638, + "auxiliary_loss_mlp": 0.01100954, + "balance_loss_clip": 1.00186467, + "balance_loss_mlp": 1.00043726, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.4054769655346122, + "language_loss": 0.67372453, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69623047, + "num_input_tokens_seen": 292212135, + "step": 13541, + "time_per_iteration": 2.755842685699463 + }, + { + "auxiliary_loss_clip": 0.01164201, + "auxiliary_loss_mlp": 0.0110143, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00043607, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 1.961599511599324, + "language_loss": 0.69366872, + "learning_rate": 3.512716539904355e-07, + "loss": 0.71632504, + "num_input_tokens_seen": 292230645, + "step": 13542, + "time_per_iteration": 2.631840467453003 + }, + { + "auxiliary_loss_clip": 0.01164476, + "auxiliary_loss_mlp": 0.01102473, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00052524, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 3.0488311675802486, + "language_loss": 0.79987991, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.8225494, + "num_input_tokens_seen": 292243540, + "step": 13543, + "time_per_iteration": 2.5223119258880615 + }, + { + "auxiliary_loss_clip": 0.0111541, + "auxiliary_loss_mlp": 0.0110269, + "balance_loss_clip": 1.00169253, + "balance_loss_mlp": 1.00055194, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 2.072408281033914, + "language_loss": 0.78045321, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.80263418, + "num_input_tokens_seen": 292261715, + "step": 13544, + "time_per_iteration": 2.7464051246643066 + }, + { + "auxiliary_loss_clip": 0.01164716, + "auxiliary_loss_mlp": 0.01103239, + "balance_loss_clip": 1.00200391, + "balance_loss_mlp": 1.00043249, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 3.431667845863326, + "language_loss": 0.73516071, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75784028, + "num_input_tokens_seen": 292275080, + "step": 13545, + "time_per_iteration": 2.535295009613037 + }, + { + "auxiliary_loss_clip": 0.01149554, + "auxiliary_loss_mlp": 0.01099579, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00044417, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 1.6608356317917379, + "language_loss": 0.76741213, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.78990346, + "num_input_tokens_seen": 292294635, + "step": 13546, + "time_per_iteration": 2.7303683757781982 + }, + { + "auxiliary_loss_clip": 0.01147765, + "auxiliary_loss_mlp": 0.01102031, + "balance_loss_clip": 1.00189161, + "balance_loss_mlp": 1.00036919, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 2.7937119451707773, + "language_loss": 0.70537519, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72787315, + "num_input_tokens_seen": 292312695, + "step": 13547, + "time_per_iteration": 2.659045696258545 + }, + { + "auxiliary_loss_clip": 0.01164405, + "auxiliary_loss_mlp": 0.0110264, + "balance_loss_clip": 1.00190294, + "balance_loss_mlp": 1.00059676, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 3.2360362214340936, + "language_loss": 0.70658201, + "learning_rate": 3.49950028014111e-07, + "loss": 0.72925246, + "num_input_tokens_seen": 292332005, + "step": 13548, + "time_per_iteration": 2.6049046516418457 + }, + { + "auxiliary_loss_clip": 0.01147866, + "auxiliary_loss_mlp": 0.0110226, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.00040746, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.8267890444932737, + "language_loss": 0.76557934, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.78808057, + "num_input_tokens_seen": 292348365, + "step": 13549, + "time_per_iteration": 4.249311447143555 + }, + { + "auxiliary_loss_clip": 0.01164402, + "auxiliary_loss_mlp": 0.01101873, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.0004977, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 3.1604127172589904, + "language_loss": 0.71589971, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73856246, + "num_input_tokens_seen": 292368050, + "step": 13550, + "time_per_iteration": 2.5596375465393066 + }, + { + "auxiliary_loss_clip": 0.01147559, + "auxiliary_loss_mlp": 0.01100855, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00043273, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.822056278805171, + "language_loss": 0.71950221, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74198639, + "num_input_tokens_seen": 292385315, + "step": 13551, + "time_per_iteration": 2.626567840576172 + }, + { + "auxiliary_loss_clip": 0.01098384, + "auxiliary_loss_mlp": 0.01103061, + "balance_loss_clip": 1.00166297, + "balance_loss_mlp": 1.00044608, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 1.985516463680685, + "language_loss": 0.68456089, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70657539, + "num_input_tokens_seen": 292403375, + "step": 13552, + "time_per_iteration": 2.6732516288757324 + }, + { + "auxiliary_loss_clip": 0.01164476, + "auxiliary_loss_mlp": 0.01101514, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.0006156, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 1.8581876187832498, + "language_loss": 0.81934631, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84200621, + "num_input_tokens_seen": 292419260, + "step": 13553, + "time_per_iteration": 3.9193546772003174 + }, + { + "auxiliary_loss_clip": 0.01149598, + "auxiliary_loss_mlp": 0.01101708, + "balance_loss_clip": 1.00183094, + "balance_loss_mlp": 1.00042772, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 1.7427079146388156, + "language_loss": 0.68390512, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70641816, + "num_input_tokens_seen": 292436095, + "step": 13554, + "time_per_iteration": 2.612032413482666 + }, + { + "auxiliary_loss_clip": 0.01134775, + "auxiliary_loss_mlp": 0.0110185, + "balance_loss_clip": 1.00196588, + "balance_loss_mlp": 1.0004741, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.7257493368575527, + "language_loss": 0.66592866, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68829489, + "num_input_tokens_seen": 292457190, + "step": 13555, + "time_per_iteration": 2.703632354736328 + }, + { + "auxiliary_loss_clip": 0.0114986, + "auxiliary_loss_mlp": 0.0110125, + "balance_loss_clip": 1.00179291, + "balance_loss_mlp": 1.00054145, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 2.285754494715216, + "language_loss": 0.72910655, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75161767, + "num_input_tokens_seen": 292474300, + "step": 13556, + "time_per_iteration": 2.659471273422241 + }, + { + "auxiliary_loss_clip": 0.01147563, + "auxiliary_loss_mlp": 0.0110067, + "balance_loss_clip": 1.00192845, + "balance_loss_mlp": 1.00039113, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 2.045092277437589, + "language_loss": 0.80539334, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.82787567, + "num_input_tokens_seen": 292492420, + "step": 13557, + "time_per_iteration": 2.629922866821289 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01101371, + "balance_loss_clip": 1.0017575, + "balance_loss_mlp": 1.00061488, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 1.6546877458465943, + "language_loss": 0.6560027, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.67834014, + "num_input_tokens_seen": 292512895, + "step": 13558, + "time_per_iteration": 2.7162704467773438 + }, + { + "auxiliary_loss_clip": 0.01158211, + "auxiliary_loss_mlp": 0.01074729, + "balance_loss_clip": 1.00076962, + "balance_loss_mlp": 0.99996072, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.791031021563361, + "language_loss": 0.57021827, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.59254766, + "num_input_tokens_seen": 292566580, + "step": 13559, + "time_per_iteration": 3.0398693084716797 + }, + { + "auxiliary_loss_clip": 0.01128958, + "auxiliary_loss_mlp": 0.01074783, + "balance_loss_clip": 1.00078702, + "balance_loss_mlp": 1.0000149, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6788110817584543, + "language_loss": 0.553877, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57591438, + "num_input_tokens_seen": 292621490, + "step": 13560, + "time_per_iteration": 3.02711820602417 + }, + { + "auxiliary_loss_clip": 0.01134799, + "auxiliary_loss_mlp": 0.0110101, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.00049269, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.7233583258378602, + "language_loss": 0.67641753, + "learning_rate": 3.470942348696948e-07, + "loss": 0.69877565, + "num_input_tokens_seen": 292638660, + "step": 13561, + "time_per_iteration": 2.70417857170105 + }, + { + "auxiliary_loss_clip": 0.01148128, + "auxiliary_loss_mlp": 0.01101343, + "balance_loss_clip": 1.00175273, + "balance_loss_mlp": 1.00044453, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.558824335945036, + "language_loss": 0.81534231, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83783698, + "num_input_tokens_seen": 292658545, + "step": 13562, + "time_per_iteration": 2.6829240322113037 + }, + { + "auxiliary_loss_clip": 0.01132865, + "auxiliary_loss_mlp": 0.01101566, + "balance_loss_clip": 1.00190735, + "balance_loss_mlp": 1.00047636, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.5397894115892976, + "language_loss": 0.7197305, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74207485, + "num_input_tokens_seen": 292678460, + "step": 13563, + "time_per_iteration": 2.7393693923950195 + }, + { + "auxiliary_loss_clip": 0.01053693, + "auxiliary_loss_mlp": 0.01101037, + "balance_loss_clip": 1.00158584, + "balance_loss_mlp": 1.00042415, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.471204062193317, + "language_loss": 0.69873643, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.72028375, + "num_input_tokens_seen": 292699815, + "step": 13564, + "time_per_iteration": 3.0874733924865723 + }, + { + "auxiliary_loss_clip": 0.01131114, + "auxiliary_loss_mlp": 0.01101635, + "balance_loss_clip": 1.00165915, + "balance_loss_mlp": 1.00045037, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 1.988073264812325, + "language_loss": 0.70072699, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72305453, + "num_input_tokens_seen": 292717370, + "step": 13565, + "time_per_iteration": 3.0295188426971436 + }, + { + "auxiliary_loss_clip": 0.01149663, + "auxiliary_loss_mlp": 0.0110104, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00057018, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 2.425583323807871, + "language_loss": 0.78835845, + "learning_rate": 3.459986724180188e-07, + "loss": 0.81086546, + "num_input_tokens_seen": 292737110, + "step": 13566, + "time_per_iteration": 4.216582536697388 + }, + { + "auxiliary_loss_clip": 0.01132838, + "auxiliary_loss_mlp": 0.01100598, + "balance_loss_clip": 1.00184405, + "balance_loss_mlp": 1.00050998, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.6438233958888244, + "language_loss": 0.82336658, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84570098, + "num_input_tokens_seen": 292756510, + "step": 13567, + "time_per_iteration": 2.8031976222991943 + }, + { + "auxiliary_loss_clip": 0.01164198, + "auxiliary_loss_mlp": 0.01100081, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00037444, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 1.8942832255197433, + "language_loss": 0.79568899, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81833178, + "num_input_tokens_seen": 292776710, + "step": 13568, + "time_per_iteration": 4.097700119018555 + }, + { + "auxiliary_loss_clip": 0.01133247, + "auxiliary_loss_mlp": 0.01099678, + "balance_loss_clip": 1.00172758, + "balance_loss_mlp": 1.00044847, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 2.431492899322159, + "language_loss": 0.77381247, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79614174, + "num_input_tokens_seen": 292794350, + "step": 13569, + "time_per_iteration": 2.6772541999816895 + }, + { + "auxiliary_loss_clip": 0.01164301, + "auxiliary_loss_mlp": 0.01101078, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00046587, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 2.297654890374813, + "language_loss": 0.58767366, + "learning_rate": 3.451233513649199e-07, + "loss": 0.61032748, + "num_input_tokens_seen": 292814005, + "step": 13570, + "time_per_iteration": 2.6595585346221924 + }, + { + "auxiliary_loss_clip": 0.01149885, + "auxiliary_loss_mlp": 0.01102339, + "balance_loss_clip": 1.00194097, + "balance_loss_mlp": 1.0005821, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 1.9322959237903357, + "language_loss": 0.8254934, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84801567, + "num_input_tokens_seen": 292833485, + "step": 13571, + "time_per_iteration": 2.7338037490844727 + }, + { + "auxiliary_loss_clip": 0.01133176, + "auxiliary_loss_mlp": 0.01101638, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.00073969, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.4232165686700835, + "language_loss": 0.78927219, + "learning_rate": 3.446860673237142e-07, + "loss": 0.8116203, + "num_input_tokens_seen": 292848045, + "step": 13572, + "time_per_iteration": 2.6556055545806885 + }, + { + "auxiliary_loss_clip": 0.01164356, + "auxiliary_loss_mlp": 0.01101052, + "balance_loss_clip": 1.00193143, + "balance_loss_mlp": 1.00053513, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.5544547852053034, + "language_loss": 0.64881325, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.6714673, + "num_input_tokens_seen": 292869965, + "step": 13573, + "time_per_iteration": 2.66963267326355 + }, + { + "auxiliary_loss_clip": 0.01116128, + "auxiliary_loss_mlp": 0.01100404, + "balance_loss_clip": 1.00172174, + "balance_loss_mlp": 1.00055408, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.7714009336544527, + "language_loss": 0.75104523, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77321059, + "num_input_tokens_seen": 292889680, + "step": 13574, + "time_per_iteration": 2.7626771926879883 + }, + { + "auxiliary_loss_clip": 0.01149587, + "auxiliary_loss_mlp": 0.01101762, + "balance_loss_clip": 1.00177169, + "balance_loss_mlp": 1.0004822, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 1.7509120447083566, + "language_loss": 0.59630477, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61881828, + "num_input_tokens_seen": 292912360, + "step": 13575, + "time_per_iteration": 2.779816150665283 + }, + { + "auxiliary_loss_clip": 0.01067958, + "auxiliary_loss_mlp": 0.01101908, + "balance_loss_clip": 1.00158, + "balance_loss_mlp": 1.00043702, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 1.8306008611184568, + "language_loss": 0.74531144, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76701009, + "num_input_tokens_seen": 292928325, + "step": 13576, + "time_per_iteration": 2.8355345726013184 + }, + { + "auxiliary_loss_clip": 0.01126334, + "auxiliary_loss_mlp": 0.01074701, + "balance_loss_clip": 1.00069201, + "balance_loss_mlp": 0.99993342, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.825021862838297, + "language_loss": 0.58634788, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60835826, + "num_input_tokens_seen": 292992795, + "step": 13577, + "time_per_iteration": 3.2825284004211426 + }, + { + "auxiliary_loss_clip": 0.01102434, + "auxiliary_loss_mlp": 0.01100645, + "balance_loss_clip": 1.00184011, + "balance_loss_mlp": 1.0005095, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 2.421099916777343, + "language_loss": 0.70499206, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.72702289, + "num_input_tokens_seen": 293011950, + "step": 13578, + "time_per_iteration": 2.854623317718506 + }, + { + "auxiliary_loss_clip": 0.01116511, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_clip": 1.00168657, + "balance_loss_mlp": 1.00054848, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 1.8854947614864377, + "language_loss": 0.73849243, + "learning_rate": 3.431575508590172e-07, + "loss": 0.76067203, + "num_input_tokens_seen": 293030175, + "step": 13579, + "time_per_iteration": 2.74560546875 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01101111, + "balance_loss_clip": 1.00191224, + "balance_loss_mlp": 1.00040305, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 1.8072081138029767, + "language_loss": 0.79247439, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81512958, + "num_input_tokens_seen": 293047980, + "step": 13580, + "time_per_iteration": 2.6406667232513428 + }, + { + "auxiliary_loss_clip": 0.01117636, + "auxiliary_loss_mlp": 0.0110088, + "balance_loss_clip": 1.00168169, + "balance_loss_mlp": 1.00060165, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 1.8740941470793313, + "language_loss": 0.68610126, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.7082864, + "num_input_tokens_seen": 293067030, + "step": 13581, + "time_per_iteration": 2.7155303955078125 + }, + { + "auxiliary_loss_clip": 0.01164459, + "auxiliary_loss_mlp": 0.01100761, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00043428, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.5271479750514312, + "language_loss": 0.59617609, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.6188283, + "num_input_tokens_seen": 293085575, + "step": 13582, + "time_per_iteration": 2.6376125812530518 + }, + { + "auxiliary_loss_clip": 0.01116712, + "auxiliary_loss_mlp": 0.00747233, + "balance_loss_clip": 1.00170434, + "balance_loss_mlp": 1.00041902, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.6319950036736555, + "language_loss": 0.82420146, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84284091, + "num_input_tokens_seen": 293108200, + "step": 13583, + "time_per_iteration": 2.7848868370056152 + }, + { + "auxiliary_loss_clip": 0.01134532, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00040877, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 1.9769619788771233, + "language_loss": 0.74363297, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76599044, + "num_input_tokens_seen": 293126020, + "step": 13584, + "time_per_iteration": 2.726151466369629 + }, + { + "auxiliary_loss_clip": 0.01147314, + "auxiliary_loss_mlp": 0.01101901, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00043035, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 1.561896507750269, + "language_loss": 0.7486254, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.77111757, + "num_input_tokens_seen": 293144620, + "step": 13585, + "time_per_iteration": 2.747734785079956 + }, + { + "auxiliary_loss_clip": 0.01131026, + "auxiliary_loss_mlp": 0.01100972, + "balance_loss_clip": 1.00179136, + "balance_loss_mlp": 1.00045478, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.655298059330485, + "language_loss": 0.69656926, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71888924, + "num_input_tokens_seen": 293162850, + "step": 13586, + "time_per_iteration": 4.202932119369507 + }, + { + "auxiliary_loss_clip": 0.01084113, + "auxiliary_loss_mlp": 0.01100702, + "balance_loss_clip": 1.001616, + "balance_loss_mlp": 1.00056648, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.4811806288725475, + "language_loss": 0.60828829, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.63013649, + "num_input_tokens_seen": 293181620, + "step": 13587, + "time_per_iteration": 2.8174729347229004 + }, + { + "auxiliary_loss_clip": 0.01147781, + "auxiliary_loss_mlp": 0.01102041, + "balance_loss_clip": 1.00177693, + "balance_loss_mlp": 1.00056982, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 3.251393364105336, + "language_loss": 0.68931758, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71181583, + "num_input_tokens_seen": 293200270, + "step": 13588, + "time_per_iteration": 2.6736068725585938 + }, + { + "auxiliary_loss_clip": 0.01133174, + "auxiliary_loss_mlp": 0.01101468, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.00047374, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.4935768026033622, + "language_loss": 0.73174667, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75409311, + "num_input_tokens_seen": 293218960, + "step": 13589, + "time_per_iteration": 2.809105396270752 + }, + { + "auxiliary_loss_clip": 0.01147528, + "auxiliary_loss_mlp": 0.01101183, + "balance_loss_clip": 1.00177813, + "balance_loss_mlp": 1.00047517, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 2.111523156668551, + "language_loss": 0.73346978, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75595689, + "num_input_tokens_seen": 293236450, + "step": 13590, + "time_per_iteration": 2.642897367477417 + }, + { + "auxiliary_loss_clip": 0.01164493, + "auxiliary_loss_mlp": 0.01102352, + "balance_loss_clip": 1.00195169, + "balance_loss_mlp": 1.00049973, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 1.8215664325444163, + "language_loss": 0.6524117, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67508018, + "num_input_tokens_seen": 293256480, + "step": 13591, + "time_per_iteration": 4.053126335144043 + }, + { + "auxiliary_loss_clip": 0.01164422, + "auxiliary_loss_mlp": 0.01101906, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00053012, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 1.7748252642333986, + "language_loss": 0.67733765, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70000094, + "num_input_tokens_seen": 293274960, + "step": 13592, + "time_per_iteration": 2.6117422580718994 + }, + { + "auxiliary_loss_clip": 0.01134483, + "auxiliary_loss_mlp": 0.01101359, + "balance_loss_clip": 1.00195837, + "balance_loss_mlp": 1.00036478, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 1.8599593451954874, + "language_loss": 0.66503775, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68739617, + "num_input_tokens_seen": 293295945, + "step": 13593, + "time_per_iteration": 2.700491189956665 + }, + { + "auxiliary_loss_clip": 0.01149044, + "auxiliary_loss_mlp": 0.01100266, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00051177, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 1.9467147719812428, + "language_loss": 0.6941393, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71663237, + "num_input_tokens_seen": 293313300, + "step": 13594, + "time_per_iteration": 2.6533615589141846 + }, + { + "auxiliary_loss_clip": 0.01164474, + "auxiliary_loss_mlp": 0.0110233, + "balance_loss_clip": 1.00184441, + "balance_loss_mlp": 1.00043035, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 1.8378753699742625, + "language_loss": 0.65547013, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.6781382, + "num_input_tokens_seen": 293333085, + "step": 13595, + "time_per_iteration": 2.6012229919433594 + }, + { + "auxiliary_loss_clip": 0.01115497, + "auxiliary_loss_mlp": 0.01101942, + "balance_loss_clip": 1.00168586, + "balance_loss_mlp": 1.00047159, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.5825219902692822, + "language_loss": 0.78721029, + "learning_rate": 3.394582618976658e-07, + "loss": 0.8093847, + "num_input_tokens_seen": 293351895, + "step": 13596, + "time_per_iteration": 2.772336006164551 + }, + { + "auxiliary_loss_clip": 0.01132745, + "auxiliary_loss_mlp": 0.01100238, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00034046, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 3.476155227352761, + "language_loss": 0.58288038, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60521024, + "num_input_tokens_seen": 293371165, + "step": 13597, + "time_per_iteration": 2.695727586746216 + }, + { + "auxiliary_loss_clip": 0.01099269, + "auxiliary_loss_mlp": 0.01099461, + "balance_loss_clip": 1.00157189, + "balance_loss_mlp": 1.00037408, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 1.4645272150481332, + "language_loss": 0.82370883, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84569609, + "num_input_tokens_seen": 293391150, + "step": 13598, + "time_per_iteration": 2.8185014724731445 + }, + { + "auxiliary_loss_clip": 0.01066916, + "auxiliary_loss_mlp": 0.01101753, + "balance_loss_clip": 1.00150609, + "balance_loss_mlp": 1.00047255, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 1.7232002620698128, + "language_loss": 0.82678539, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.848472, + "num_input_tokens_seen": 293409440, + "step": 13599, + "time_per_iteration": 2.871924877166748 + }, + { + "auxiliary_loss_clip": 0.01101241, + "auxiliary_loss_mlp": 0.01100644, + "balance_loss_clip": 1.00159061, + "balance_loss_mlp": 1.00050831, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 2.7596693727914525, + "language_loss": 0.83856922, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.86058807, + "num_input_tokens_seen": 293428995, + "step": 13600, + "time_per_iteration": 2.8287205696105957 + }, + { + "auxiliary_loss_clip": 0.01114131, + "auxiliary_loss_mlp": 0.01101146, + "balance_loss_clip": 1.00169909, + "balance_loss_mlp": 1.0004859, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.7918160638361613, + "language_loss": 0.74118459, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76333737, + "num_input_tokens_seen": 293449155, + "step": 13601, + "time_per_iteration": 2.7521286010742188 + }, + { + "auxiliary_loss_clip": 0.01114491, + "auxiliary_loss_mlp": 0.01102351, + "balance_loss_clip": 1.00177228, + "balance_loss_mlp": 1.00049901, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.5368341837330095, + "language_loss": 0.67832446, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.70049286, + "num_input_tokens_seen": 293466125, + "step": 13602, + "time_per_iteration": 2.6499345302581787 + }, + { + "auxiliary_loss_clip": 0.011158, + "auxiliary_loss_mlp": 0.01101252, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00035357, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 2.2503220647959585, + "language_loss": 0.84084868, + "learning_rate": 3.379403122624718e-07, + "loss": 0.86301917, + "num_input_tokens_seen": 293481345, + "step": 13603, + "time_per_iteration": 4.114781141281128 + }, + { + "auxiliary_loss_clip": 0.01097834, + "auxiliary_loss_mlp": 0.01101246, + "balance_loss_clip": 1.00165939, + "balance_loss_mlp": 1.00039482, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.5928400173990338, + "language_loss": 0.69194341, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71393418, + "num_input_tokens_seen": 293502330, + "step": 13604, + "time_per_iteration": 2.8166940212249756 + }, + { + "auxiliary_loss_clip": 0.01131875, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_clip": 1.0019989, + "balance_loss_mlp": 1.00052285, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 1.709072349233728, + "language_loss": 0.74198306, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76431501, + "num_input_tokens_seen": 293521415, + "step": 13605, + "time_per_iteration": 2.689866542816162 + }, + { + "auxiliary_loss_clip": 0.01114941, + "auxiliary_loss_mlp": 0.01100271, + "balance_loss_clip": 1.00174284, + "balance_loss_mlp": 1.00051713, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 2.22755779866851, + "language_loss": 0.7404685, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76262063, + "num_input_tokens_seen": 293539245, + "step": 13606, + "time_per_iteration": 4.141143798828125 + }, + { + "auxiliary_loss_clip": 0.0116424, + "auxiliary_loss_mlp": 0.01100785, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.000458, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.7217372328868328, + "language_loss": 0.65491629, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67756653, + "num_input_tokens_seen": 293560640, + "step": 13607, + "time_per_iteration": 2.6669774055480957 + }, + { + "auxiliary_loss_clip": 0.01130843, + "auxiliary_loss_mlp": 0.01101125, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00041699, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.8694510899674461, + "language_loss": 0.70480871, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72712839, + "num_input_tokens_seen": 293579465, + "step": 13608, + "time_per_iteration": 2.665836811065674 + }, + { + "auxiliary_loss_clip": 0.01149579, + "auxiliary_loss_mlp": 0.01100867, + "balance_loss_clip": 1.00185668, + "balance_loss_mlp": 1.00054073, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 2.3243905370523925, + "language_loss": 0.79458046, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81708491, + "num_input_tokens_seen": 293600540, + "step": 13609, + "time_per_iteration": 2.7277817726135254 + }, + { + "auxiliary_loss_clip": 0.01111961, + "auxiliary_loss_mlp": 0.01074807, + "balance_loss_clip": 1.00082231, + "balance_loss_mlp": 1.00003898, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.7535243953677951, + "language_loss": 0.55921686, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.58108455, + "num_input_tokens_seen": 293665160, + "step": 13610, + "time_per_iteration": 3.3529136180877686 + }, + { + "auxiliary_loss_clip": 0.01084129, + "auxiliary_loss_mlp": 0.00747213, + "balance_loss_clip": 1.00168133, + "balance_loss_mlp": 1.0004909, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 1.872991937634758, + "language_loss": 0.77772045, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79603386, + "num_input_tokens_seen": 293683995, + "step": 13611, + "time_per_iteration": 2.7627830505371094 + }, + { + "auxiliary_loss_clip": 0.01114595, + "auxiliary_loss_mlp": 0.01103185, + "balance_loss_clip": 1.00157714, + "balance_loss_mlp": 1.00037909, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 2.3541633043940697, + "language_loss": 0.77593422, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79811203, + "num_input_tokens_seen": 293704115, + "step": 13612, + "time_per_iteration": 2.6945197582244873 + }, + { + "auxiliary_loss_clip": 0.01115131, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_clip": 1.0016892, + "balance_loss_mlp": 1.00046802, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 2.138173332689969, + "language_loss": 0.8597917, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88195664, + "num_input_tokens_seen": 293722225, + "step": 13613, + "time_per_iteration": 2.685466766357422 + }, + { + "auxiliary_loss_clip": 0.01147695, + "auxiliary_loss_mlp": 0.01101908, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00062728, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 1.7773046594305089, + "language_loss": 0.72764516, + "learning_rate": 3.355612034397746e-07, + "loss": 0.75014126, + "num_input_tokens_seen": 293743995, + "step": 13614, + "time_per_iteration": 2.6499242782592773 + }, + { + "auxiliary_loss_clip": 0.01133118, + "auxiliary_loss_mlp": 0.0110192, + "balance_loss_clip": 1.00180709, + "balance_loss_mlp": 1.00054455, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.7802533588736495, + "language_loss": 0.80861223, + "learning_rate": 3.353452993497479e-07, + "loss": 0.8309626, + "num_input_tokens_seen": 293764935, + "step": 13615, + "time_per_iteration": 2.7315330505371094 + }, + { + "auxiliary_loss_clip": 0.01149472, + "auxiliary_loss_mlp": 0.01101246, + "balance_loss_clip": 1.00169015, + "balance_loss_mlp": 1.00044215, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.7780474993753501, + "language_loss": 0.7575016, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.78000879, + "num_input_tokens_seen": 293784035, + "step": 13616, + "time_per_iteration": 2.660349130630493 + }, + { + "auxiliary_loss_clip": 0.011178, + "auxiliary_loss_mlp": 0.01100475, + "balance_loss_clip": 1.00163746, + "balance_loss_mlp": 1.00053012, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 2.162076538297941, + "language_loss": 0.75299597, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77517867, + "num_input_tokens_seen": 293803360, + "step": 13617, + "time_per_iteration": 2.7070891857147217 + }, + { + "auxiliary_loss_clip": 0.01130801, + "auxiliary_loss_mlp": 0.01100273, + "balance_loss_clip": 1.00170016, + "balance_loss_mlp": 1.00042343, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 3.2878025611139443, + "language_loss": 0.68742549, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70973629, + "num_input_tokens_seen": 293821325, + "step": 13618, + "time_per_iteration": 2.673067331314087 + }, + { + "auxiliary_loss_clip": 0.01131363, + "auxiliary_loss_mlp": 0.01102387, + "balance_loss_clip": 1.00170493, + "balance_loss_mlp": 1.00053501, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 1.932281103493104, + "language_loss": 0.6987586, + "learning_rate": 3.344823143102058e-07, + "loss": 0.72109616, + "num_input_tokens_seen": 293840315, + "step": 13619, + "time_per_iteration": 2.707291603088379 + }, + { + "auxiliary_loss_clip": 0.01081526, + "auxiliary_loss_mlp": 0.01101271, + "balance_loss_clip": 1.00160265, + "balance_loss_mlp": 1.00037193, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 1.78046075678276, + "language_loss": 0.73977959, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.76160753, + "num_input_tokens_seen": 293855685, + "step": 13620, + "time_per_iteration": 2.770095109939575 + }, + { + "auxiliary_loss_clip": 0.01132756, + "auxiliary_loss_mlp": 0.00747334, + "balance_loss_clip": 1.00168967, + "balance_loss_mlp": 1.00048316, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.7807326299040451, + "language_loss": 0.75939083, + "learning_rate": 3.340512006973011e-07, + "loss": 0.77819169, + "num_input_tokens_seen": 293875540, + "step": 13621, + "time_per_iteration": 2.7296323776245117 + }, + { + "auxiliary_loss_clip": 0.0113497, + "auxiliary_loss_mlp": 0.01101705, + "balance_loss_clip": 1.00176764, + "balance_loss_mlp": 1.00042534, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 1.949102346435176, + "language_loss": 0.65585327, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67822003, + "num_input_tokens_seen": 293896570, + "step": 13622, + "time_per_iteration": 2.752488613128662 + }, + { + "auxiliary_loss_clip": 0.01164299, + "auxiliary_loss_mlp": 0.01101188, + "balance_loss_clip": 1.00196683, + "balance_loss_mlp": 1.00038457, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 1.8015312632048484, + "language_loss": 0.74946815, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.77212304, + "num_input_tokens_seen": 293914680, + "step": 13623, + "time_per_iteration": 4.106971502304077 + }, + { + "auxiliary_loss_clip": 0.01133172, + "auxiliary_loss_mlp": 0.01101509, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00051534, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 2.64390766507531, + "language_loss": 0.63530147, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.65764821, + "num_input_tokens_seen": 293936480, + "step": 13624, + "time_per_iteration": 2.7669057846069336 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.01100557, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00065947, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.72927754287202, + "language_loss": 0.78185123, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80449837, + "num_input_tokens_seen": 293957815, + "step": 13625, + "time_per_iteration": 2.618861436843872 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.00747628, + "balance_loss_clip": 1.00180721, + "balance_loss_mlp": 1.0005703, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 2.070544347259482, + "language_loss": 0.75969893, + "learning_rate": 3.329745223345244e-07, + "loss": 0.77867329, + "num_input_tokens_seen": 293975440, + "step": 13626, + "time_per_iteration": 2.602865219116211 + }, + { + "auxiliary_loss_clip": 0.01147601, + "auxiliary_loss_mlp": 0.01100758, + "balance_loss_clip": 1.00180221, + "balance_loss_mlp": 1.00052655, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 2.2283057646939355, + "language_loss": 0.7371543, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.75963783, + "num_input_tokens_seen": 293997540, + "step": 13627, + "time_per_iteration": 2.659087657928467 + }, + { + "auxiliary_loss_clip": 0.01164478, + "auxiliary_loss_mlp": 0.01101557, + "balance_loss_clip": 1.00196314, + "balance_loss_mlp": 1.0005151, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 1.5388185924267828, + "language_loss": 0.68955612, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.7122165, + "num_input_tokens_seen": 294017030, + "step": 13628, + "time_per_iteration": 3.9466805458068848 + }, + { + "auxiliary_loss_clip": 0.01132944, + "auxiliary_loss_mlp": 0.01101652, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00056243, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.5545176152382103, + "language_loss": 0.85133088, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87367684, + "num_input_tokens_seen": 294035700, + "step": 13629, + "time_per_iteration": 2.625178098678589 + }, + { + "auxiliary_loss_clip": 0.01164238, + "auxiliary_loss_mlp": 0.01101016, + "balance_loss_clip": 1.00183868, + "balance_loss_mlp": 1.00049853, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 2.384477794224069, + "language_loss": 0.74310076, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.76575339, + "num_input_tokens_seen": 294049730, + "step": 13630, + "time_per_iteration": 2.548633575439453 + }, + { + "auxiliary_loss_clip": 0.0114942, + "auxiliary_loss_mlp": 0.01101892, + "balance_loss_clip": 1.00185454, + "balance_loss_mlp": 1.00061178, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 1.8290565102456486, + "language_loss": 0.7197125, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74222559, + "num_input_tokens_seen": 294066545, + "step": 13631, + "time_per_iteration": 2.565154552459717 + }, + { + "auxiliary_loss_clip": 0.0116426, + "auxiliary_loss_mlp": 0.01100493, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.00050044, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 1.5565662350808118, + "language_loss": 0.76637912, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78902662, + "num_input_tokens_seen": 294087455, + "step": 13632, + "time_per_iteration": 2.6263859272003174 + }, + { + "auxiliary_loss_clip": 0.01134707, + "auxiliary_loss_mlp": 0.01100671, + "balance_loss_clip": 1.00178647, + "balance_loss_mlp": 1.00043976, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1.7614715309532936, + "language_loss": 0.65568, + "learning_rate": 3.314698278332588e-07, + "loss": 0.67803383, + "num_input_tokens_seen": 294107480, + "step": 13633, + "time_per_iteration": 2.6922948360443115 + }, + { + "auxiliary_loss_clip": 0.01149397, + "auxiliary_loss_mlp": 0.01100807, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00048041, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 1.5074502445702946, + "language_loss": 0.75771439, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.7802164, + "num_input_tokens_seen": 294130115, + "step": 13634, + "time_per_iteration": 2.712947368621826 + }, + { + "auxiliary_loss_clip": 0.01086423, + "auxiliary_loss_mlp": 0.00747342, + "balance_loss_clip": 1.00165701, + "balance_loss_mlp": 1.00052595, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 2.181218183313532, + "language_loss": 0.8185699, + "learning_rate": 3.310404844338841e-07, + "loss": 0.83690751, + "num_input_tokens_seen": 294148495, + "step": 13635, + "time_per_iteration": 2.793584108352661 + }, + { + "auxiliary_loss_clip": 0.01149607, + "auxiliary_loss_mlp": 0.01101088, + "balance_loss_clip": 1.00178528, + "balance_loss_mlp": 1.00047541, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 3.1671171454654936, + "language_loss": 0.75858277, + "learning_rate": 3.308259076607949e-07, + "loss": 0.78108972, + "num_input_tokens_seen": 294169595, + "step": 13636, + "time_per_iteration": 2.7151942253112793 + }, + { + "auxiliary_loss_clip": 0.01117214, + "auxiliary_loss_mlp": 0.01100065, + "balance_loss_clip": 1.00167453, + "balance_loss_mlp": 1.00040627, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 3.9746835112066847, + "language_loss": 0.81081271, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83298552, + "num_input_tokens_seen": 294183885, + "step": 13637, + "time_per_iteration": 2.65278959274292 + }, + { + "auxiliary_loss_clip": 0.01147544, + "auxiliary_loss_mlp": 0.01100579, + "balance_loss_clip": 1.00197685, + "balance_loss_mlp": 1.00044346, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.121168670759897, + "language_loss": 0.7125566, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.7350378, + "num_input_tokens_seen": 294200150, + "step": 13638, + "time_per_iteration": 2.7480340003967285 + }, + { + "auxiliary_loss_clip": 0.01101436, + "auxiliary_loss_mlp": 0.01102431, + "balance_loss_clip": 1.00164652, + "balance_loss_mlp": 1.0005784, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 1.8396888426784694, + "language_loss": 0.79842508, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.82046366, + "num_input_tokens_seen": 294220385, + "step": 13639, + "time_per_iteration": 2.7799065113067627 + }, + { + "auxiliary_loss_clip": 0.01118837, + "auxiliary_loss_mlp": 0.01100195, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00034547, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 2.1297402181114604, + "language_loss": 0.79021877, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81240916, + "num_input_tokens_seen": 294239355, + "step": 13640, + "time_per_iteration": 2.7472822666168213 + }, + { + "auxiliary_loss_clip": 0.01116605, + "auxiliary_loss_mlp": 0.01102647, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00050831, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 1.900209188608753, + "language_loss": 0.63283849, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65503103, + "num_input_tokens_seen": 294259395, + "step": 13641, + "time_per_iteration": 4.3437840938568115 + }, + { + "auxiliary_loss_clip": 0.01099745, + "auxiliary_loss_mlp": 0.01101018, + "balance_loss_clip": 1.00165248, + "balance_loss_mlp": 1.00050044, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 2.2727619394877783, + "language_loss": 0.73668933, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75869691, + "num_input_tokens_seen": 294277365, + "step": 13642, + "time_per_iteration": 2.732762575149536 + }, + { + "auxiliary_loss_clip": 0.01131395, + "auxiliary_loss_mlp": 0.01101188, + "balance_loss_clip": 1.00179708, + "balance_loss_mlp": 1.00047982, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.9326372158423952, + "language_loss": 0.70459294, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72691882, + "num_input_tokens_seen": 294297555, + "step": 13643, + "time_per_iteration": 4.1734983921051025 + }, + { + "auxiliary_loss_clip": 0.01147977, + "auxiliary_loss_mlp": 0.01100495, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00050282, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 2.087880648360961, + "language_loss": 0.65531623, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67780089, + "num_input_tokens_seen": 294317600, + "step": 13644, + "time_per_iteration": 2.638913154602051 + }, + { + "auxiliary_loss_clip": 0.01115606, + "auxiliary_loss_mlp": 0.01102628, + "balance_loss_clip": 1.00180328, + "balance_loss_mlp": 1.00068045, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.4043196923235752, + "language_loss": 0.70853245, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.7307148, + "num_input_tokens_seen": 294340215, + "step": 13645, + "time_per_iteration": 2.721829414367676 + }, + { + "auxiliary_loss_clip": 0.01130564, + "auxiliary_loss_mlp": 0.01100657, + "balance_loss_clip": 1.00167668, + "balance_loss_mlp": 1.00052094, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 1.8096987398734417, + "language_loss": 0.71591413, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73822635, + "num_input_tokens_seen": 294358590, + "step": 13646, + "time_per_iteration": 2.7007217407226562 + }, + { + "auxiliary_loss_clip": 0.01134356, + "auxiliary_loss_mlp": 0.01102328, + "balance_loss_clip": 1.00189078, + "balance_loss_mlp": 1.00057089, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.2781745712039063, + "language_loss": 0.78736961, + "learning_rate": 3.284697424316132e-07, + "loss": 0.80973643, + "num_input_tokens_seen": 294375825, + "step": 13647, + "time_per_iteration": 2.700533151626587 + }, + { + "auxiliary_loss_clip": 0.01164091, + "auxiliary_loss_mlp": 0.01101053, + "balance_loss_clip": 1.00185883, + "balance_loss_mlp": 1.00053537, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 2.8139560271506414, + "language_loss": 0.67974186, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.70239329, + "num_input_tokens_seen": 294398500, + "step": 13648, + "time_per_iteration": 2.6240339279174805 + }, + { + "auxiliary_loss_clip": 0.01132821, + "auxiliary_loss_mlp": 0.01101858, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.0003866, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.74519180969448, + "language_loss": 0.80082059, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82316744, + "num_input_tokens_seen": 294418840, + "step": 13649, + "time_per_iteration": 2.6747820377349854 + }, + { + "auxiliary_loss_clip": 0.01148585, + "auxiliary_loss_mlp": 0.01101974, + "balance_loss_clip": 1.00205231, + "balance_loss_mlp": 1.0005033, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.596036167885907, + "language_loss": 0.68859673, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71110237, + "num_input_tokens_seen": 294438215, + "step": 13650, + "time_per_iteration": 2.598095655441284 + }, + { + "auxiliary_loss_clip": 0.01132898, + "auxiliary_loss_mlp": 0.0110203, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00046372, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 2.1737533296056712, + "language_loss": 0.60510051, + "learning_rate": 3.276148560452001e-07, + "loss": 0.62744975, + "num_input_tokens_seen": 294455260, + "step": 13651, + "time_per_iteration": 2.5944533348083496 + }, + { + "auxiliary_loss_clip": 0.01118419, + "auxiliary_loss_mlp": 0.00747389, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00053787, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 1.8450972780535368, + "language_loss": 0.7235651, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.74222314, + "num_input_tokens_seen": 294473205, + "step": 13652, + "time_per_iteration": 2.67108154296875 + }, + { + "auxiliary_loss_clip": 0.0113082, + "auxiliary_loss_mlp": 0.01100277, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.00052285, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 1.941885217946954, + "language_loss": 0.72874022, + "learning_rate": 3.271877933216558e-07, + "loss": 0.75105119, + "num_input_tokens_seen": 294490645, + "step": 13653, + "time_per_iteration": 2.634385347366333 + }, + { + "auxiliary_loss_clip": 0.01101677, + "auxiliary_loss_mlp": 0.01102796, + "balance_loss_clip": 1.0017395, + "balance_loss_mlp": 1.00037122, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 1.7902604702815808, + "language_loss": 0.62837869, + "learning_rate": 3.269743571056451e-07, + "loss": 0.65042341, + "num_input_tokens_seen": 294513500, + "step": 13654, + "time_per_iteration": 2.8816754817962646 + }, + { + "auxiliary_loss_clip": 0.01119798, + "auxiliary_loss_mlp": 0.01101002, + "balance_loss_clip": 1.00164258, + "balance_loss_mlp": 1.0004847, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 2.8266321836193335, + "language_loss": 0.70066619, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72287422, + "num_input_tokens_seen": 294535710, + "step": 13655, + "time_per_iteration": 2.7580268383026123 + }, + { + "auxiliary_loss_clip": 0.01130791, + "auxiliary_loss_mlp": 0.01100855, + "balance_loss_clip": 1.00173724, + "balance_loss_mlp": 1.00052857, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 2.515623601545804, + "language_loss": 0.81838083, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84069723, + "num_input_tokens_seen": 294554055, + "step": 13656, + "time_per_iteration": 2.665060043334961 + }, + { + "auxiliary_loss_clip": 0.01132551, + "auxiliary_loss_mlp": 0.01100439, + "balance_loss_clip": 1.00178599, + "balance_loss_mlp": 1.00049365, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.2640042062209926, + "language_loss": 0.73777062, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76010048, + "num_input_tokens_seen": 294570390, + "step": 13657, + "time_per_iteration": 2.6698246002197266 + }, + { + "auxiliary_loss_clip": 0.01118165, + "auxiliary_loss_mlp": 0.01100093, + "balance_loss_clip": 1.00172734, + "balance_loss_mlp": 1.0004822, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 2.19067836611278, + "language_loss": 0.55737603, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57955855, + "num_input_tokens_seen": 294593050, + "step": 13658, + "time_per_iteration": 2.746356964111328 + }, + { + "auxiliary_loss_clip": 0.01085407, + "auxiliary_loss_mlp": 0.01101403, + "balance_loss_clip": 1.00172377, + "balance_loss_mlp": 1.00050414, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.060180481219799, + "language_loss": 0.79079306, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81266111, + "num_input_tokens_seen": 294608550, + "step": 13659, + "time_per_iteration": 2.8433926105499268 + }, + { + "auxiliary_loss_clip": 0.0114741, + "auxiliary_loss_mlp": 0.01099924, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.00050354, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 2.350925559321035, + "language_loss": 0.60258192, + "learning_rate": 3.256950723599887e-07, + "loss": 0.62505525, + "num_input_tokens_seen": 294630380, + "step": 13660, + "time_per_iteration": 4.3162453174591064 + }, + { + "auxiliary_loss_clip": 0.01149699, + "auxiliary_loss_mlp": 0.01101461, + "balance_loss_clip": 1.00196242, + "balance_loss_mlp": 1.00046682, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 2.117922079744404, + "language_loss": 0.72245902, + "learning_rate": 3.254820804029075e-07, + "loss": 0.74497068, + "num_input_tokens_seen": 294648655, + "step": 13661, + "time_per_iteration": 2.6035261154174805 + }, + { + "auxiliary_loss_clip": 0.01148483, + "auxiliary_loss_mlp": 0.01102292, + "balance_loss_clip": 1.00191128, + "balance_loss_mlp": 1.00053525, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 1.9839896367057874, + "language_loss": 0.74641478, + "learning_rate": 3.252691519437143e-07, + "loss": 0.76892257, + "num_input_tokens_seen": 294666915, + "step": 13662, + "time_per_iteration": 2.637533664703369 + }, + { + "auxiliary_loss_clip": 0.01158123, + "auxiliary_loss_mlp": 0.0107471, + "balance_loss_clip": 1.00071001, + "balance_loss_mlp": 0.99994165, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.7426139644378984, + "language_loss": 0.54071081, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56303912, + "num_input_tokens_seen": 294731545, + "step": 13663, + "time_per_iteration": 3.222086191177368 + }, + { + "auxiliary_loss_clip": 0.01103422, + "auxiliary_loss_mlp": 0.01101721, + "balance_loss_clip": 1.00172043, + "balance_loss_mlp": 1.00044036, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.1718736074980134, + "language_loss": 0.65822548, + "learning_rate": 3.248434855512838e-07, + "loss": 0.68027693, + "num_input_tokens_seen": 294748745, + "step": 13664, + "time_per_iteration": 2.7600772380828857 + }, + { + "auxiliary_loss_clip": 0.01132485, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_clip": 1.00186312, + "balance_loss_mlp": 1.00044417, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.569637570488794, + "language_loss": 0.75288504, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77521384, + "num_input_tokens_seen": 294768955, + "step": 13665, + "time_per_iteration": 2.68660569190979 + }, + { + "auxiliary_loss_clip": 0.01147994, + "auxiliary_loss_mlp": 0.00747444, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00050271, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 2.226021401463398, + "language_loss": 0.65527409, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67422843, + "num_input_tokens_seen": 294789250, + "step": 13666, + "time_per_iteration": 4.180990219116211 + }, + { + "auxiliary_loss_clip": 0.01097631, + "auxiliary_loss_mlp": 0.01100935, + "balance_loss_clip": 1.00164068, + "balance_loss_mlp": 1.00041747, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.8118362993328576, + "language_loss": 0.77130562, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.79329127, + "num_input_tokens_seen": 294809760, + "step": 13667, + "time_per_iteration": 2.737842559814453 + }, + { + "auxiliary_loss_clip": 0.01114382, + "auxiliary_loss_mlp": 0.01101162, + "balance_loss_clip": 1.00170779, + "balance_loss_mlp": 1.00054967, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 2.039185663724637, + "language_loss": 0.77576101, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79791653, + "num_input_tokens_seen": 294826495, + "step": 13668, + "time_per_iteration": 2.630941152572632 + }, + { + "auxiliary_loss_clip": 0.01097335, + "auxiliary_loss_mlp": 0.01101263, + "balance_loss_clip": 1.00161755, + "balance_loss_mlp": 1.00045967, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 1.8672790171064806, + "language_loss": 0.73649347, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75847948, + "num_input_tokens_seen": 294845370, + "step": 13669, + "time_per_iteration": 2.7606635093688965 + }, + { + "auxiliary_loss_clip": 0.01147741, + "auxiliary_loss_mlp": 0.01100672, + "balance_loss_clip": 1.0019083, + "balance_loss_mlp": 1.00039315, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.8748154221154427, + "language_loss": 0.78492081, + "learning_rate": 3.235680111625161e-07, + "loss": 0.80740494, + "num_input_tokens_seen": 294863740, + "step": 13670, + "time_per_iteration": 2.6020259857177734 + }, + { + "auxiliary_loss_clip": 0.01147313, + "auxiliary_loss_mlp": 0.01102694, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00055587, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 1.6112566322757371, + "language_loss": 0.74794424, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.77044427, + "num_input_tokens_seen": 294882815, + "step": 13671, + "time_per_iteration": 2.6632893085479736 + }, + { + "auxiliary_loss_clip": 0.0114772, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_clip": 1.00180089, + "balance_loss_mlp": 1.00055075, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 1.9385511496172856, + "language_loss": 0.76306462, + "learning_rate": 3.23143361510728e-07, + "loss": 0.78556865, + "num_input_tokens_seen": 294901985, + "step": 13672, + "time_per_iteration": 2.604538679122925 + }, + { + "auxiliary_loss_clip": 0.01099829, + "auxiliary_loss_mlp": 0.01101836, + "balance_loss_clip": 1.00161326, + "balance_loss_mlp": 1.00041246, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.6604391063627464, + "language_loss": 0.74898928, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.77100593, + "num_input_tokens_seen": 294919705, + "step": 13673, + "time_per_iteration": 2.7161312103271484 + }, + { + "auxiliary_loss_clip": 0.01131351, + "auxiliary_loss_mlp": 0.01100783, + "balance_loss_clip": 1.00174153, + "balance_loss_mlp": 1.00045609, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 2.7548505099728637, + "language_loss": 0.79778361, + "learning_rate": 3.227189662052254e-07, + "loss": 0.82010496, + "num_input_tokens_seen": 294939900, + "step": 13674, + "time_per_iteration": 2.6966729164123535 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01101235, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00062239, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 3.395419012307893, + "language_loss": 0.69941622, + "learning_rate": 3.225068639524484e-07, + "loss": 0.72175884, + "num_input_tokens_seen": 294959110, + "step": 13675, + "time_per_iteration": 2.6690473556518555 + }, + { + "auxiliary_loss_clip": 0.01149809, + "auxiliary_loss_mlp": 0.01100218, + "balance_loss_clip": 1.00196445, + "balance_loss_mlp": 1.00051188, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.5765488572013573, + "language_loss": 0.74584901, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76834929, + "num_input_tokens_seen": 294978660, + "step": 13676, + "time_per_iteration": 2.665727138519287 + }, + { + "auxiliary_loss_clip": 0.01131465, + "auxiliary_loss_mlp": 0.01101069, + "balance_loss_clip": 1.00172782, + "balance_loss_mlp": 1.00055146, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 2.2010150866247056, + "language_loss": 0.80402267, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.82634801, + "num_input_tokens_seen": 294998075, + "step": 13677, + "time_per_iteration": 2.619459867477417 + }, + { + "auxiliary_loss_clip": 0.0114994, + "auxiliary_loss_mlp": 0.01102163, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00059652, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 1.7926724233891427, + "language_loss": 0.70200104, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72452205, + "num_input_tokens_seen": 295015950, + "step": 13678, + "time_per_iteration": 2.606837272644043 + }, + { + "auxiliary_loss_clip": 0.01164108, + "auxiliary_loss_mlp": 0.01100992, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.0003798, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.7538959950439603, + "language_loss": 0.71290827, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73555923, + "num_input_tokens_seen": 295036800, + "step": 13679, + "time_per_iteration": 4.042535781860352 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01100757, + "balance_loss_clip": 1.00174654, + "balance_loss_mlp": 1.00033486, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 1.9607684148844438, + "language_loss": 0.69922864, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72155994, + "num_input_tokens_seen": 295055300, + "step": 13680, + "time_per_iteration": 2.641491413116455 + }, + { + "auxiliary_loss_clip": 0.01115263, + "auxiliary_loss_mlp": 0.01101952, + "balance_loss_clip": 1.00177598, + "balance_loss_mlp": 1.00048149, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 1.7727645690921892, + "language_loss": 0.59775782, + "learning_rate": 3.21235586541986e-07, + "loss": 0.61992997, + "num_input_tokens_seen": 295076420, + "step": 13681, + "time_per_iteration": 2.800832748413086 + }, + { + "auxiliary_loss_clip": 0.01133298, + "auxiliary_loss_mlp": 0.01102534, + "balance_loss_clip": 1.00175345, + "balance_loss_mlp": 1.00058603, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.5428293540544005, + "language_loss": 0.692765, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.7151233, + "num_input_tokens_seen": 295100540, + "step": 13682, + "time_per_iteration": 4.222346067428589 + }, + { + "auxiliary_loss_clip": 0.01164299, + "auxiliary_loss_mlp": 0.01102214, + "balance_loss_clip": 1.00188529, + "balance_loss_mlp": 1.00055242, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 2.3500049379886803, + "language_loss": 0.79349542, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81616056, + "num_input_tokens_seen": 295120180, + "step": 13683, + "time_per_iteration": 2.622910499572754 + }, + { + "auxiliary_loss_clip": 0.01164157, + "auxiliary_loss_mlp": 0.01100248, + "balance_loss_clip": 1.00186181, + "balance_loss_mlp": 1.00054169, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 2.183665410795634, + "language_loss": 0.85881507, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88145918, + "num_input_tokens_seen": 295138530, + "step": 13684, + "time_per_iteration": 2.5827159881591797 + }, + { + "auxiliary_loss_clip": 0.01164053, + "auxiliary_loss_mlp": 0.01099565, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00052595, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 2.113619705857542, + "language_loss": 0.8003937, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.82302988, + "num_input_tokens_seen": 295160260, + "step": 13685, + "time_per_iteration": 2.6153204441070557 + }, + { + "auxiliary_loss_clip": 0.01116562, + "auxiliary_loss_mlp": 0.01101091, + "balance_loss_clip": 1.0017221, + "balance_loss_mlp": 1.00047839, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.6483983429294817, + "language_loss": 0.69095707, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.71313357, + "num_input_tokens_seen": 295177055, + "step": 13686, + "time_per_iteration": 2.7195427417755127 + }, + { + "auxiliary_loss_clip": 0.01114474, + "auxiliary_loss_mlp": 0.01101618, + "balance_loss_clip": 1.00156212, + "balance_loss_mlp": 1.00052834, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 2.287823963582307, + "language_loss": 0.77956808, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80172896, + "num_input_tokens_seen": 295193870, + "step": 13687, + "time_per_iteration": 2.648643970489502 + }, + { + "auxiliary_loss_clip": 0.01147479, + "auxiliary_loss_mlp": 0.01100926, + "balance_loss_clip": 1.00181127, + "balance_loss_mlp": 1.0003612, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 1.9948932715957162, + "language_loss": 0.72780013, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.75028425, + "num_input_tokens_seen": 295211040, + "step": 13688, + "time_per_iteration": 2.601494312286377 + }, + { + "auxiliary_loss_clip": 0.0116433, + "auxiliary_loss_mlp": 0.00747367, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00051212, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 1.6353396086182173, + "language_loss": 0.7336781, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.7527951, + "num_input_tokens_seen": 295231300, + "step": 13689, + "time_per_iteration": 2.5726511478424072 + }, + { + "auxiliary_loss_clip": 0.0114971, + "auxiliary_loss_mlp": 0.01101139, + "balance_loss_clip": 1.00175238, + "balance_loss_mlp": 1.0004313, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 2.1871283066347, + "language_loss": 0.6875332, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.7100417, + "num_input_tokens_seen": 295251045, + "step": 13690, + "time_per_iteration": 2.6308631896972656 + }, + { + "auxiliary_loss_clip": 0.01104202, + "auxiliary_loss_mlp": 0.01101593, + "balance_loss_clip": 1.00170636, + "balance_loss_mlp": 1.00050378, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 1.8109233887363785, + "language_loss": 0.85385823, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87591624, + "num_input_tokens_seen": 295270225, + "step": 13691, + "time_per_iteration": 2.7031991481781006 + }, + { + "auxiliary_loss_clip": 0.01149668, + "auxiliary_loss_mlp": 0.01101053, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.0004406, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 1.7358291710795006, + "language_loss": 0.77032793, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79283512, + "num_input_tokens_seen": 295288950, + "step": 13692, + "time_per_iteration": 2.637915849685669 + }, + { + "auxiliary_loss_clip": 0.01147467, + "auxiliary_loss_mlp": 0.0110071, + "balance_loss_clip": 1.00180638, + "balance_loss_mlp": 1.00038362, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.5853811970581482, + "language_loss": 0.71611297, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73859471, + "num_input_tokens_seen": 295309405, + "step": 13693, + "time_per_iteration": 2.621337413787842 + }, + { + "auxiliary_loss_clip": 0.01118047, + "auxiliary_loss_mlp": 0.01100697, + "balance_loss_clip": 1.00161958, + "balance_loss_mlp": 1.00065637, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.3328257505418366, + "language_loss": 0.83884007, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.86102748, + "num_input_tokens_seen": 295331115, + "step": 13694, + "time_per_iteration": 2.7561416625976562 + }, + { + "auxiliary_loss_clip": 0.01082724, + "auxiliary_loss_mlp": 0.01101704, + "balance_loss_clip": 1.00146937, + "balance_loss_mlp": 1.00042367, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.6707576558178987, + "language_loss": 0.76924533, + "learning_rate": 3.182781878250118e-07, + "loss": 0.79108959, + "num_input_tokens_seen": 295350495, + "step": 13695, + "time_per_iteration": 2.8637123107910156 + }, + { + "auxiliary_loss_clip": 0.01132551, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00048363, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 1.9424084475965664, + "language_loss": 0.80786628, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.83020562, + "num_input_tokens_seen": 295368225, + "step": 13696, + "time_per_iteration": 2.7442634105682373 + }, + { + "auxiliary_loss_clip": 0.0114206, + "auxiliary_loss_mlp": 0.01074404, + "balance_loss_clip": 1.00083125, + "balance_loss_mlp": 1.00001764, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.7415659687422851, + "language_loss": 0.63828439, + "learning_rate": 3.178567221188393e-07, + "loss": 0.66044903, + "num_input_tokens_seen": 295430035, + "step": 13697, + "time_per_iteration": 3.30047869682312 + }, + { + "auxiliary_loss_clip": 0.01113985, + "auxiliary_loss_mlp": 0.01100222, + "balance_loss_clip": 1.00163031, + "balance_loss_mlp": 1.0002768, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 2.028667387487589, + "language_loss": 0.73149121, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75363326, + "num_input_tokens_seen": 295447765, + "step": 13698, + "time_per_iteration": 4.226579666137695 + }, + { + "auxiliary_loss_clip": 0.01120195, + "auxiliary_loss_mlp": 0.01101683, + "balance_loss_clip": 1.00175798, + "balance_loss_mlp": 1.00045073, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 2.4989886250543534, + "language_loss": 0.71535802, + "learning_rate": 3.174355115608305e-07, + "loss": 0.73757684, + "num_input_tokens_seen": 295464810, + "step": 13699, + "time_per_iteration": 2.7271602153778076 + }, + { + "auxiliary_loss_clip": 0.01132847, + "auxiliary_loss_mlp": 0.01100202, + "balance_loss_clip": 1.00179887, + "balance_loss_mlp": 1.0004003, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 2.4788405880694055, + "language_loss": 0.82130361, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84363413, + "num_input_tokens_seen": 295482605, + "step": 13700, + "time_per_iteration": 2.6783361434936523 + }, + { + "auxiliary_loss_clip": 0.01130909, + "auxiliary_loss_mlp": 0.01101398, + "balance_loss_clip": 1.00174284, + "balance_loss_mlp": 1.00054669, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 3.426190538598376, + "language_loss": 0.73375738, + "learning_rate": 3.170145562148763e-07, + "loss": 0.75608045, + "num_input_tokens_seen": 295503780, + "step": 13701, + "time_per_iteration": 2.7313883304595947 + }, + { + "auxiliary_loss_clip": 0.01147612, + "auxiliary_loss_mlp": 0.01101747, + "balance_loss_clip": 1.00165772, + "balance_loss_mlp": 1.00046635, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 1.9571744314774089, + "language_loss": 0.69510198, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71759558, + "num_input_tokens_seen": 295522035, + "step": 13702, + "time_per_iteration": 2.679257869720459 + }, + { + "auxiliary_loss_clip": 0.01100009, + "auxiliary_loss_mlp": 0.01100774, + "balance_loss_clip": 1.00167108, + "balance_loss_mlp": 1.00044775, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 2.0712585188102484, + "language_loss": 0.74529588, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.76730371, + "num_input_tokens_seen": 295541190, + "step": 13703, + "time_per_iteration": 2.7358126640319824 + }, + { + "auxiliary_loss_clip": 0.01164598, + "auxiliary_loss_mlp": 0.01103064, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00054359, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 2.578502500294616, + "language_loss": 0.69984329, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.72251999, + "num_input_tokens_seen": 295558860, + "step": 13704, + "time_per_iteration": 3.967869281768799 + }, + { + "auxiliary_loss_clip": 0.01164224, + "auxiliary_loss_mlp": 0.01100727, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00049567, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 1.5571739655915149, + "language_loss": 0.64214242, + "learning_rate": 3.161734114144916e-07, + "loss": 0.66479194, + "num_input_tokens_seen": 295578155, + "step": 13705, + "time_per_iteration": 2.6291184425354004 + }, + { + "auxiliary_loss_clip": 0.0116435, + "auxiliary_loss_mlp": 0.01101601, + "balance_loss_clip": 1.00184989, + "balance_loss_mlp": 1.00051212, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 1.546552240421712, + "language_loss": 0.69340569, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71606517, + "num_input_tokens_seen": 295599170, + "step": 13706, + "time_per_iteration": 2.6073310375213623 + }, + { + "auxiliary_loss_clip": 0.01133206, + "auxiliary_loss_mlp": 0.01101816, + "balance_loss_clip": 1.00195432, + "balance_loss_mlp": 1.00048852, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.7290120473706463, + "language_loss": 0.69363451, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71598476, + "num_input_tokens_seen": 295617465, + "step": 13707, + "time_per_iteration": 2.7029855251312256 + }, + { + "auxiliary_loss_clip": 0.01117097, + "auxiliary_loss_mlp": 0.0110135, + "balance_loss_clip": 1.00166082, + "balance_loss_mlp": 1.00045133, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 2.051764093357108, + "language_loss": 0.78843915, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81062365, + "num_input_tokens_seen": 295634960, + "step": 13708, + "time_per_iteration": 2.8800013065338135 + }, + { + "auxiliary_loss_clip": 0.01149942, + "auxiliary_loss_mlp": 0.01101317, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00041795, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 1.923871284430361, + "language_loss": 0.68537164, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70788419, + "num_input_tokens_seen": 295652725, + "step": 13709, + "time_per_iteration": 2.6494252681732178 + }, + { + "auxiliary_loss_clip": 0.01103758, + "auxiliary_loss_mlp": 0.01101429, + "balance_loss_clip": 1.00172067, + "balance_loss_mlp": 1.00053012, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 1.911809013536468, + "language_loss": 0.82697558, + "learning_rate": 3.151234171183319e-07, + "loss": 0.8490274, + "num_input_tokens_seen": 295671195, + "step": 13710, + "time_per_iteration": 2.728219747543335 + }, + { + "auxiliary_loss_clip": 0.01147052, + "auxiliary_loss_mlp": 0.01100864, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.00048983, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 1.8988708752423862, + "language_loss": 0.78127331, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80375248, + "num_input_tokens_seen": 295689130, + "step": 13711, + "time_per_iteration": 2.6516013145446777 + }, + { + "auxiliary_loss_clip": 0.01118247, + "auxiliary_loss_mlp": 0.01101587, + "balance_loss_clip": 1.00169313, + "balance_loss_mlp": 1.00040233, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 2.0612684014919354, + "language_loss": 0.6556704, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.67786872, + "num_input_tokens_seen": 295706385, + "step": 13712, + "time_per_iteration": 2.7101240158081055 + }, + { + "auxiliary_loss_clip": 0.01148344, + "auxiliary_loss_mlp": 0.01100441, + "balance_loss_clip": 1.00203729, + "balance_loss_mlp": 1.00054324, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.606646279247819, + "language_loss": 0.74375927, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76624715, + "num_input_tokens_seen": 295727925, + "step": 13713, + "time_per_iteration": 2.695756435394287 + }, + { + "auxiliary_loss_clip": 0.0114789, + "auxiliary_loss_mlp": 0.01101873, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00030708, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 2.0837267644949473, + "language_loss": 0.80999613, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83249378, + "num_input_tokens_seen": 295744420, + "step": 13714, + "time_per_iteration": 2.659907102584839 + }, + { + "auxiliary_loss_clip": 0.01147541, + "auxiliary_loss_mlp": 0.01100913, + "balance_loss_clip": 1.00189686, + "balance_loss_mlp": 1.00049114, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.7684833034760428, + "language_loss": 0.65942329, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68190783, + "num_input_tokens_seen": 295765105, + "step": 13715, + "time_per_iteration": 2.811426877975464 + }, + { + "auxiliary_loss_clip": 0.01117379, + "auxiliary_loss_mlp": 0.01101257, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.00045395, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 1.7754056460702246, + "language_loss": 0.75375462, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.77594101, + "num_input_tokens_seen": 295784200, + "step": 13716, + "time_per_iteration": 4.284864664077759 + }, + { + "auxiliary_loss_clip": 0.01110797, + "auxiliary_loss_mlp": 0.01075214, + "balance_loss_clip": 1.0009433, + "balance_loss_mlp": 1.00006473, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7121238481028911, + "language_loss": 0.58953553, + "learning_rate": 3.136561087351175e-07, + "loss": 0.61139566, + "num_input_tokens_seen": 295846555, + "step": 13717, + "time_per_iteration": 3.3964197635650635 + }, + { + "auxiliary_loss_clip": 0.01149633, + "auxiliary_loss_mlp": 0.00747265, + "balance_loss_clip": 1.00196385, + "balance_loss_mlp": 1.0004195, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 1.9191363864792, + "language_loss": 0.79405224, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81302118, + "num_input_tokens_seen": 295863425, + "step": 13718, + "time_per_iteration": 2.692861318588257 + }, + { + "auxiliary_loss_clip": 0.01148532, + "auxiliary_loss_mlp": 0.01100654, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.0004704, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.7182077227970818, + "language_loss": 0.68932027, + "learning_rate": 3.132374531662778e-07, + "loss": 0.71181214, + "num_input_tokens_seen": 295880925, + "step": 13719, + "time_per_iteration": 4.122766971588135 + }, + { + "auxiliary_loss_clip": 0.01132857, + "auxiliary_loss_mlp": 0.01101892, + "balance_loss_clip": 1.00165892, + "balance_loss_mlp": 1.00042057, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 4.470200732246392, + "language_loss": 0.69741422, + "learning_rate": 3.13028221321197e-07, + "loss": 0.71976173, + "num_input_tokens_seen": 295898205, + "step": 13720, + "time_per_iteration": 2.716238260269165 + }, + { + "auxiliary_loss_clip": 0.01066491, + "auxiliary_loss_mlp": 0.01101766, + "balance_loss_clip": 1.001719, + "balance_loss_mlp": 1.00039101, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 1.6116266568392512, + "language_loss": 0.76314664, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.78482926, + "num_input_tokens_seen": 295918130, + "step": 13721, + "time_per_iteration": 3.2745559215545654 + }, + { + "auxiliary_loss_clip": 0.01086027, + "auxiliary_loss_mlp": 0.01100422, + "balance_loss_clip": 1.00161421, + "balance_loss_mlp": 1.0002867, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 2.236434673916244, + "language_loss": 0.77813196, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.7999965, + "num_input_tokens_seen": 295937760, + "step": 13722, + "time_per_iteration": 3.170448064804077 + }, + { + "auxiliary_loss_clip": 0.01164331, + "auxiliary_loss_mlp": 0.01100297, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.00044775, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.7643001175418205, + "language_loss": 0.62482488, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.64747119, + "num_input_tokens_seen": 295957585, + "step": 13723, + "time_per_iteration": 2.6601827144622803 + }, + { + "auxiliary_loss_clip": 0.011643, + "auxiliary_loss_mlp": 0.0110112, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00055456, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.5237101214157152, + "language_loss": 0.74488771, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76754189, + "num_input_tokens_seen": 295977135, + "step": 13724, + "time_per_iteration": 2.660759449005127 + }, + { + "auxiliary_loss_clip": 0.01116279, + "auxiliary_loss_mlp": 0.01102104, + "balance_loss_clip": 1.00175452, + "balance_loss_mlp": 1.00053811, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.787156981286404, + "language_loss": 0.63547826, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.65766209, + "num_input_tokens_seen": 295996265, + "step": 13725, + "time_per_iteration": 2.7676148414611816 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01101002, + "balance_loss_clip": 1.00191844, + "balance_loss_mlp": 1.00043654, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 28.571425729269983, + "language_loss": 0.81924617, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.84158468, + "num_input_tokens_seen": 296014745, + "step": 13726, + "time_per_iteration": 2.7279138565063477 + }, + { + "auxiliary_loss_clip": 0.01149562, + "auxiliary_loss_mlp": 0.0110072, + "balance_loss_clip": 1.00183129, + "balance_loss_mlp": 1.00048923, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.8108451575087832, + "language_loss": 0.70234358, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72484642, + "num_input_tokens_seen": 296036960, + "step": 13727, + "time_per_iteration": 2.766557216644287 + }, + { + "auxiliary_loss_clip": 0.01149259, + "auxiliary_loss_mlp": 0.01101964, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00058806, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 2.022550606314986, + "language_loss": 0.62790751, + "learning_rate": 3.113566701515036e-07, + "loss": 0.65041971, + "num_input_tokens_seen": 296056540, + "step": 13728, + "time_per_iteration": 2.7073733806610107 + }, + { + "auxiliary_loss_clip": 0.01133138, + "auxiliary_loss_mlp": 0.01102833, + "balance_loss_clip": 1.00179553, + "balance_loss_mlp": 1.00059867, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 3.1186245560001615, + "language_loss": 0.7138344, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73619407, + "num_input_tokens_seen": 296077950, + "step": 13729, + "time_per_iteration": 2.741219997406006 + }, + { + "auxiliary_loss_clip": 0.01128228, + "auxiliary_loss_mlp": 0.01074795, + "balance_loss_clip": 1.00084257, + "balance_loss_mlp": 1.00002658, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8417460978776803, + "language_loss": 0.62651718, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64854741, + "num_input_tokens_seen": 296127060, + "step": 13730, + "time_per_iteration": 3.067247152328491 + }, + { + "auxiliary_loss_clip": 0.01099062, + "auxiliary_loss_mlp": 0.01101188, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.00057578, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 3.259863452442923, + "language_loss": 0.63240069, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65440327, + "num_input_tokens_seen": 296147775, + "step": 13731, + "time_per_iteration": 2.9817442893981934 + }, + { + "auxiliary_loss_clip": 0.01120306, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.00186074, + "balance_loss_mlp": 1.00052321, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 2.4065670440468483, + "language_loss": 0.69896114, + "learning_rate": 3.105224311177812e-07, + "loss": 0.71763921, + "num_input_tokens_seen": 296163560, + "step": 13732, + "time_per_iteration": 2.696932792663574 + }, + { + "auxiliary_loss_clip": 0.01147897, + "auxiliary_loss_mlp": 0.01102673, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00062966, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 2.469435434630619, + "language_loss": 0.70993167, + "learning_rate": 3.103140315024817e-07, + "loss": 0.73243737, + "num_input_tokens_seen": 296178730, + "step": 13733, + "time_per_iteration": 2.6589269638061523 + }, + { + "auxiliary_loss_clip": 0.01164155, + "auxiliary_loss_mlp": 0.01100844, + "balance_loss_clip": 1.00179887, + "balance_loss_mlp": 1.00042224, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.462741166983702, + "language_loss": 0.82243901, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84508896, + "num_input_tokens_seen": 296200175, + "step": 13734, + "time_per_iteration": 2.625012159347534 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.01100694, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.00046325, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.8562707401271208, + "language_loss": 0.82702595, + "learning_rate": 3.098974244989676e-07, + "loss": 0.84936202, + "num_input_tokens_seen": 296219305, + "step": 13735, + "time_per_iteration": 2.6924238204956055 + }, + { + "auxiliary_loss_clip": 0.01149569, + "auxiliary_loss_mlp": 0.01100542, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00050163, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 1.8742715098198106, + "language_loss": 0.71046931, + "learning_rate": 3.096892171265497e-07, + "loss": 0.73297042, + "num_input_tokens_seen": 296236945, + "step": 13736, + "time_per_iteration": 4.1535890102386475 + }, + { + "auxiliary_loss_clip": 0.01142573, + "auxiliary_loss_mlp": 0.01074789, + "balance_loss_clip": 1.00087333, + "balance_loss_mlp": 1.00002146, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8595566626186644, + "language_loss": 0.67959344, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.70176709, + "num_input_tokens_seen": 296294685, + "step": 13737, + "time_per_iteration": 3.1703040599823 + }, + { + "auxiliary_loss_clip": 0.0113302, + "auxiliary_loss_mlp": 0.01102255, + "balance_loss_clip": 1.00182974, + "balance_loss_mlp": 1.00049841, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 2.0121817547375858, + "language_loss": 0.69655806, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71891081, + "num_input_tokens_seen": 296314790, + "step": 13738, + "time_per_iteration": 2.765735387802124 + }, + { + "auxiliary_loss_clip": 0.01132577, + "auxiliary_loss_mlp": 0.01103981, + "balance_loss_clip": 1.00175691, + "balance_loss_mlp": 1.00050771, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 2.2887232247602753, + "language_loss": 0.62439966, + "learning_rate": 3.090649796213911e-07, + "loss": 0.64676529, + "num_input_tokens_seen": 296335355, + "step": 13739, + "time_per_iteration": 2.8503403663635254 + }, + { + "auxiliary_loss_clip": 0.01127276, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_clip": 1.00070751, + "balance_loss_mlp": 1.00004053, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.837787098931456, + "language_loss": 0.59348589, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61550671, + "num_input_tokens_seen": 296399885, + "step": 13740, + "time_per_iteration": 3.405073404312134 + }, + { + "auxiliary_loss_clip": 0.01164468, + "auxiliary_loss_mlp": 0.01102578, + "balance_loss_clip": 1.00189817, + "balance_loss_mlp": 1.00053477, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 2.2427039672873144, + "language_loss": 0.75491703, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77758747, + "num_input_tokens_seen": 296417660, + "step": 13741, + "time_per_iteration": 4.413954734802246 + }, + { + "auxiliary_loss_clip": 0.01147702, + "auxiliary_loss_mlp": 0.01101322, + "balance_loss_clip": 1.00178254, + "balance_loss_mlp": 1.00051832, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 1.8726988756969265, + "language_loss": 0.6255157, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64800596, + "num_input_tokens_seen": 296438255, + "step": 13742, + "time_per_iteration": 2.6907870769500732 + }, + { + "auxiliary_loss_clip": 0.01101916, + "auxiliary_loss_mlp": 0.01102708, + "balance_loss_clip": 1.0016402, + "balance_loss_mlp": 1.00047362, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 3.489945861940146, + "language_loss": 0.66086936, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68291557, + "num_input_tokens_seen": 296454485, + "step": 13743, + "time_per_iteration": 3.1352126598358154 + }, + { + "auxiliary_loss_clip": 0.01134617, + "auxiliary_loss_mlp": 0.01102092, + "balance_loss_clip": 1.0019151, + "balance_loss_mlp": 1.00042987, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 3.5420386071363064, + "language_loss": 0.66538775, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.68775487, + "num_input_tokens_seen": 296473740, + "step": 13744, + "time_per_iteration": 2.8125665187835693 + }, + { + "auxiliary_loss_clip": 0.01115792, + "auxiliary_loss_mlp": 0.01101817, + "balance_loss_clip": 1.0017724, + "balance_loss_mlp": 1.00053644, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 1.734632623959675, + "language_loss": 0.75341213, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77558815, + "num_input_tokens_seen": 296493355, + "step": 13745, + "time_per_iteration": 2.724358081817627 + }, + { + "auxiliary_loss_clip": 0.01132464, + "auxiliary_loss_mlp": 0.00747334, + "balance_loss_clip": 1.00173378, + "balance_loss_mlp": 1.00046206, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.8085591928651807, + "language_loss": 0.79136848, + "learning_rate": 3.076106700253709e-07, + "loss": 0.81016648, + "num_input_tokens_seen": 296510520, + "step": 13746, + "time_per_iteration": 2.671596050262451 + }, + { + "auxiliary_loss_clip": 0.01148031, + "auxiliary_loss_mlp": 0.01102979, + "balance_loss_clip": 1.00182521, + "balance_loss_mlp": 1.00055432, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 2.12549515494086, + "language_loss": 0.68459976, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70710981, + "num_input_tokens_seen": 296528265, + "step": 13747, + "time_per_iteration": 2.5859909057617188 + }, + { + "auxiliary_loss_clip": 0.01149346, + "auxiliary_loss_mlp": 0.01101387, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00039268, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 1.8223273469742587, + "language_loss": 0.75396371, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77647102, + "num_input_tokens_seen": 296547810, + "step": 13748, + "time_per_iteration": 2.657132387161255 + }, + { + "auxiliary_loss_clip": 0.01114166, + "auxiliary_loss_mlp": 0.01101033, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00061142, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 2.8116041069502398, + "language_loss": 0.63642025, + "learning_rate": 3.069883569603102e-07, + "loss": 0.6585722, + "num_input_tokens_seen": 296565940, + "step": 13749, + "time_per_iteration": 2.6698238849639893 + }, + { + "auxiliary_loss_clip": 0.01132848, + "auxiliary_loss_mlp": 0.01101043, + "balance_loss_clip": 1.00175571, + "balance_loss_mlp": 1.00043058, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.727468568285058, + "language_loss": 0.73473817, + "learning_rate": 3.067810476598132e-07, + "loss": 0.7570771, + "num_input_tokens_seen": 296585090, + "step": 13750, + "time_per_iteration": 2.7605175971984863 + }, + { + "auxiliary_loss_clip": 0.0114773, + "auxiliary_loss_mlp": 0.01101784, + "balance_loss_clip": 1.00191796, + "balance_loss_mlp": 1.00050414, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 1.8945703874708217, + "language_loss": 0.6573956, + "learning_rate": 3.065738025663496e-07, + "loss": 0.67989075, + "num_input_tokens_seen": 296604950, + "step": 13751, + "time_per_iteration": 2.645735502243042 + }, + { + "auxiliary_loss_clip": 0.01132959, + "auxiliary_loss_mlp": 0.01101477, + "balance_loss_clip": 1.00175202, + "balance_loss_mlp": 1.0004828, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.8378907954099475, + "language_loss": 0.60832256, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.63066691, + "num_input_tokens_seen": 296627780, + "step": 13752, + "time_per_iteration": 3.069694757461548 + }, + { + "auxiliary_loss_clip": 0.01141424, + "auxiliary_loss_mlp": 0.01074752, + "balance_loss_clip": 1.00071251, + "balance_loss_mlp": 0.99998373, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7741968291468926, + "language_loss": 0.57561976, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59778154, + "num_input_tokens_seen": 296683850, + "step": 13753, + "time_per_iteration": 3.42014741897583 + }, + { + "auxiliary_loss_clip": 0.01095959, + "auxiliary_loss_mlp": 0.00745487, + "balance_loss_clip": 1.00076485, + "balance_loss_mlp": 1.00024319, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.7026998074137921, + "language_loss": 0.54992414, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56833857, + "num_input_tokens_seen": 296741420, + "step": 13754, + "time_per_iteration": 4.767058849334717 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.01100422, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00052452, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 1.9086141400693843, + "language_loss": 0.6915189, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71370232, + "num_input_tokens_seen": 296759620, + "step": 13755, + "time_per_iteration": 3.421433210372925 + }, + { + "auxiliary_loss_clip": 0.01099654, + "auxiliary_loss_mlp": 0.01100446, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00059581, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 2.5325222804203538, + "language_loss": 0.69885367, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.72085464, + "num_input_tokens_seen": 296777275, + "step": 13756, + "time_per_iteration": 4.244386434555054 + }, + { + "auxiliary_loss_clip": 0.01148063, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00065935, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 1.7506360969971773, + "language_loss": 0.7259376, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74843192, + "num_input_tokens_seen": 296796655, + "step": 13757, + "time_per_iteration": 4.89493989944458 + }, + { + "auxiliary_loss_clip": 0.01147964, + "auxiliary_loss_mlp": 0.01102495, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.00045204, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.591511316339512, + "language_loss": 0.69233066, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71483517, + "num_input_tokens_seen": 296813705, + "step": 13758, + "time_per_iteration": 3.1675214767456055 + }, + { + "auxiliary_loss_clip": 0.01132135, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_clip": 1.00172043, + "balance_loss_mlp": 1.00048423, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.6035854375571235, + "language_loss": 0.69435751, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.71667981, + "num_input_tokens_seen": 296833985, + "step": 13759, + "time_per_iteration": 2.99208927154541 + }, + { + "auxiliary_loss_clip": 0.01132352, + "auxiliary_loss_mlp": 0.01101472, + "balance_loss_clip": 1.00173533, + "balance_loss_mlp": 1.00047827, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.8219516330060666, + "language_loss": 0.71146715, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73380536, + "num_input_tokens_seen": 296850150, + "step": 13760, + "time_per_iteration": 3.260481595993042 + }, + { + "auxiliary_loss_clip": 0.01118211, + "auxiliary_loss_mlp": 0.01101001, + "balance_loss_clip": 1.00192034, + "balance_loss_mlp": 1.00048399, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.6965626486216674, + "language_loss": 0.77459729, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79678941, + "num_input_tokens_seen": 296869585, + "step": 13761, + "time_per_iteration": 2.802783727645874 + }, + { + "auxiliary_loss_clip": 0.0111525, + "auxiliary_loss_mlp": 0.01099848, + "balance_loss_clip": 1.00165546, + "balance_loss_mlp": 1.00052297, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 2.0972536012169023, + "language_loss": 0.69794524, + "learning_rate": 3.042983464482387e-07, + "loss": 0.72009623, + "num_input_tokens_seen": 296887710, + "step": 13762, + "time_per_iteration": 2.8747739791870117 + }, + { + "auxiliary_loss_clip": 0.01099828, + "auxiliary_loss_mlp": 0.01099983, + "balance_loss_clip": 1.00158954, + "balance_loss_mlp": 1.0004189, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 2.0269234725706013, + "language_loss": 0.69884008, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.72083819, + "num_input_tokens_seen": 296906265, + "step": 13763, + "time_per_iteration": 2.918440818786621 + }, + { + "auxiliary_loss_clip": 0.01114179, + "auxiliary_loss_mlp": 0.01074763, + "balance_loss_clip": 1.00073719, + "balance_loss_mlp": 0.99999505, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8440200328202846, + "language_loss": 0.65126503, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67315447, + "num_input_tokens_seen": 296971290, + "step": 13764, + "time_per_iteration": 3.355816125869751 + }, + { + "auxiliary_loss_clip": 0.01147893, + "auxiliary_loss_mlp": 0.01101943, + "balance_loss_clip": 1.00191534, + "balance_loss_mlp": 1.00056756, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 1.9401195787454404, + "language_loss": 0.77971423, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.8022126, + "num_input_tokens_seen": 296989060, + "step": 13765, + "time_per_iteration": 2.616849184036255 + }, + { + "auxiliary_loss_clip": 0.01084961, + "auxiliary_loss_mlp": 0.01101877, + "balance_loss_clip": 1.00166655, + "balance_loss_mlp": 1.00035822, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.5633233166443155, + "language_loss": 0.62545258, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64732099, + "num_input_tokens_seen": 297011300, + "step": 13766, + "time_per_iteration": 2.8845696449279785 + }, + { + "auxiliary_loss_clip": 0.01116434, + "auxiliary_loss_mlp": 0.01100901, + "balance_loss_clip": 1.00170326, + "balance_loss_mlp": 1.00038338, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 1.618936862938933, + "language_loss": 0.82597893, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84815228, + "num_input_tokens_seen": 297030350, + "step": 13767, + "time_per_iteration": 2.842601776123047 + }, + { + "auxiliary_loss_clip": 0.01131514, + "auxiliary_loss_mlp": 0.01102418, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00066066, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 2.6225362532525462, + "language_loss": 0.68958664, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71192598, + "num_input_tokens_seen": 297049710, + "step": 13768, + "time_per_iteration": 2.8189523220062256 + }, + { + "auxiliary_loss_clip": 0.01087589, + "auxiliary_loss_mlp": 0.01100059, + "balance_loss_clip": 1.00162876, + "balance_loss_mlp": 1.00049555, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 1.7860996243449667, + "language_loss": 0.74377668, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76565313, + "num_input_tokens_seen": 297070510, + "step": 13769, + "time_per_iteration": 2.9610791206359863 + }, + { + "auxiliary_loss_clip": 0.01130836, + "auxiliary_loss_mlp": 0.01101335, + "balance_loss_clip": 1.00171137, + "balance_loss_mlp": 1.00043678, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 1.8506601124583604, + "language_loss": 0.74456823, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76689005, + "num_input_tokens_seen": 297092585, + "step": 13770, + "time_per_iteration": 2.8947620391845703 + }, + { + "auxiliary_loss_clip": 0.01116271, + "auxiliary_loss_mlp": 0.01101413, + "balance_loss_clip": 1.00166965, + "balance_loss_mlp": 1.00051463, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.7302109284671665, + "language_loss": 0.75688398, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.77906078, + "num_input_tokens_seen": 297110055, + "step": 13771, + "time_per_iteration": 2.7341878414154053 + }, + { + "auxiliary_loss_clip": 0.0116428, + "auxiliary_loss_mlp": 0.01101047, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.0006249, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.568070919043963, + "language_loss": 0.72746074, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.75011396, + "num_input_tokens_seen": 297132170, + "step": 13772, + "time_per_iteration": 2.6801364421844482 + }, + { + "auxiliary_loss_clip": 0.01131876, + "auxiliary_loss_mlp": 0.01100248, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00049376, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.3449530194823573, + "language_loss": 0.74807578, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.77039701, + "num_input_tokens_seen": 297149515, + "step": 13773, + "time_per_iteration": 2.6584620475769043 + }, + { + "auxiliary_loss_clip": 0.0109961, + "auxiliary_loss_mlp": 0.01101094, + "balance_loss_clip": 1.00170577, + "balance_loss_mlp": 1.0005765, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 3.558898752346497, + "language_loss": 0.75891, + "learning_rate": 3.01824904601915e-07, + "loss": 0.78091705, + "num_input_tokens_seen": 297170320, + "step": 13774, + "time_per_iteration": 4.358517646789551 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.00747383, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00048518, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.636808931213865, + "language_loss": 0.75561255, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77422905, + "num_input_tokens_seen": 297189935, + "step": 13775, + "time_per_iteration": 2.7999730110168457 + }, + { + "auxiliary_loss_clip": 0.01164501, + "auxiliary_loss_mlp": 0.01102435, + "balance_loss_clip": 1.00188017, + "balance_loss_mlp": 1.00048733, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 1.8561610297647966, + "language_loss": 0.73719597, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75986528, + "num_input_tokens_seen": 297210885, + "step": 13776, + "time_per_iteration": 2.7150890827178955 + }, + { + "auxiliary_loss_clip": 0.01103263, + "auxiliary_loss_mlp": 0.01101353, + "balance_loss_clip": 1.00179684, + "balance_loss_mlp": 1.00045395, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 2.3092050307247196, + "language_loss": 0.77618092, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79822707, + "num_input_tokens_seen": 297228500, + "step": 13777, + "time_per_iteration": 2.837049722671509 + }, + { + "auxiliary_loss_clip": 0.01147575, + "auxiliary_loss_mlp": 0.0110136, + "balance_loss_clip": 1.00189137, + "balance_loss_mlp": 1.00050938, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 2.047718270806807, + "language_loss": 0.82804459, + "learning_rate": 3.010024839590604e-07, + "loss": 0.8505339, + "num_input_tokens_seen": 297249470, + "step": 13778, + "time_per_iteration": 4.134535789489746 + }, + { + "auxiliary_loss_clip": 0.01149571, + "auxiliary_loss_mlp": 0.01100342, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00039709, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.9964149768084132, + "language_loss": 0.74081248, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.76331162, + "num_input_tokens_seen": 297265970, + "step": 13779, + "time_per_iteration": 2.6313889026641846 + }, + { + "auxiliary_loss_clip": 0.01112953, + "auxiliary_loss_mlp": 0.01074739, + "balance_loss_clip": 1.00085521, + "balance_loss_mlp": 0.99997103, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.7680041965787983, + "language_loss": 0.56728166, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58915859, + "num_input_tokens_seen": 297325525, + "step": 13780, + "time_per_iteration": 3.282137632369995 + }, + { + "auxiliary_loss_clip": 0.01116434, + "auxiliary_loss_mlp": 0.01101719, + "balance_loss_clip": 1.00174809, + "balance_loss_mlp": 1.00043869, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 2.1997615830402997, + "language_loss": 0.80059409, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.8227756, + "num_input_tokens_seen": 297345025, + "step": 13781, + "time_per_iteration": 2.8218629360198975 + }, + { + "auxiliary_loss_clip": 0.01101529, + "auxiliary_loss_mlp": 0.01102316, + "balance_loss_clip": 1.00172913, + "balance_loss_mlp": 1.00046325, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 2.004739832721798, + "language_loss": 0.7553733, + "learning_rate": 3.001810941346543e-07, + "loss": 0.7774117, + "num_input_tokens_seen": 297363570, + "step": 13782, + "time_per_iteration": 2.7770044803619385 + }, + { + "auxiliary_loss_clip": 0.01147617, + "auxiliary_loss_mlp": 0.01101024, + "balance_loss_clip": 1.00166416, + "balance_loss_mlp": 1.00036359, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.7328040192518304, + "language_loss": 0.76109719, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78358358, + "num_input_tokens_seen": 297385385, + "step": 13783, + "time_per_iteration": 2.7486324310302734 + }, + { + "auxiliary_loss_clip": 0.01164168, + "auxiliary_loss_mlp": 0.01101954, + "balance_loss_clip": 1.00179875, + "balance_loss_mlp": 1.00038826, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.8245718045611659, + "language_loss": 0.73748082, + "learning_rate": 2.997707859351304e-07, + "loss": 0.76014209, + "num_input_tokens_seen": 297403950, + "step": 13784, + "time_per_iteration": 2.5660946369171143 + }, + { + "auxiliary_loss_clip": 0.0114769, + "auxiliary_loss_mlp": 0.01102065, + "balance_loss_clip": 1.00169814, + "balance_loss_mlp": 1.00059414, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.5327918733284762, + "language_loss": 0.69795126, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72044879, + "num_input_tokens_seen": 297424565, + "step": 13785, + "time_per_iteration": 2.7343528270721436 + }, + { + "auxiliary_loss_clip": 0.01130971, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.00046301, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 2.406217593634712, + "language_loss": 0.68622267, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70854223, + "num_input_tokens_seen": 297445180, + "step": 13786, + "time_per_iteration": 2.6763100624084473 + }, + { + "auxiliary_loss_clip": 0.01098969, + "auxiliary_loss_mlp": 0.01101995, + "balance_loss_clip": 1.00164747, + "balance_loss_mlp": 1.00052381, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.709574220571509, + "language_loss": 0.77382684, + "learning_rate": 2.991558072017426e-07, + "loss": 0.79583645, + "num_input_tokens_seen": 297463790, + "step": 13787, + "time_per_iteration": 2.753481388092041 + }, + { + "auxiliary_loss_clip": 0.01148096, + "auxiliary_loss_mlp": 0.01100861, + "balance_loss_clip": 1.0019455, + "balance_loss_mlp": 1.00053477, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 2.0352506241306676, + "language_loss": 0.80240214, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82489175, + "num_input_tokens_seen": 297480100, + "step": 13788, + "time_per_iteration": 2.619722366333008 + }, + { + "auxiliary_loss_clip": 0.01130842, + "auxiliary_loss_mlp": 0.01100795, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00056338, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 1.6540505276129966, + "language_loss": 0.70971763, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73203397, + "num_input_tokens_seen": 297499890, + "step": 13789, + "time_per_iteration": 3.071178913116455 + }, + { + "auxiliary_loss_clip": 0.01118334, + "auxiliary_loss_mlp": 0.011018, + "balance_loss_clip": 1.00158656, + "balance_loss_mlp": 1.00051999, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 2.000557134590706, + "language_loss": 0.68003023, + "learning_rate": 2.985414089339813e-07, + "loss": 0.70223153, + "num_input_tokens_seen": 297521440, + "step": 13790, + "time_per_iteration": 3.1247785091400146 + }, + { + "auxiliary_loss_clip": 0.01147783, + "auxiliary_loss_mlp": 0.0110107, + "balance_loss_clip": 1.00180423, + "balance_loss_mlp": 1.00040948, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 1.6958745386126433, + "language_loss": 0.77643526, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.79892385, + "num_input_tokens_seen": 297539920, + "step": 13791, + "time_per_iteration": 2.81238055229187 + }, + { + "auxiliary_loss_clip": 0.01132825, + "auxiliary_loss_mlp": 0.01101101, + "balance_loss_clip": 1.00198567, + "balance_loss_mlp": 1.00039256, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 3.383744549779619, + "language_loss": 0.69873786, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72107708, + "num_input_tokens_seen": 297560000, + "step": 13792, + "time_per_iteration": 4.592029333114624 + }, + { + "auxiliary_loss_clip": 0.01130905, + "auxiliary_loss_mlp": 0.01101763, + "balance_loss_clip": 1.00175881, + "balance_loss_mlp": 1.0004828, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 3.1785770765809875, + "language_loss": 0.65245247, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67477912, + "num_input_tokens_seen": 297579300, + "step": 13793, + "time_per_iteration": 2.768982410430908 + }, + { + "auxiliary_loss_clip": 0.01085417, + "auxiliary_loss_mlp": 0.01101344, + "balance_loss_clip": 1.00154746, + "balance_loss_mlp": 1.00044501, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 1.7658556774398169, + "language_loss": 0.66375494, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68562251, + "num_input_tokens_seen": 297598095, + "step": 13794, + "time_per_iteration": 2.8374741077423096 + }, + { + "auxiliary_loss_clip": 0.011643, + "auxiliary_loss_mlp": 0.01101807, + "balance_loss_clip": 1.0017544, + "balance_loss_mlp": 1.00062239, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 1.9359954148318406, + "language_loss": 0.66355157, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68621272, + "num_input_tokens_seen": 297615955, + "step": 13795, + "time_per_iteration": 4.2032105922698975 + }, + { + "auxiliary_loss_clip": 0.01052232, + "auxiliary_loss_mlp": 0.01100343, + "balance_loss_clip": 1.00147319, + "balance_loss_mlp": 1.00049293, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.7292799129682028, + "language_loss": 0.66215599, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68368173, + "num_input_tokens_seen": 297636285, + "step": 13796, + "time_per_iteration": 3.0070230960845947 + }, + { + "auxiliary_loss_clip": 0.01103503, + "auxiliary_loss_mlp": 0.01100537, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.00049639, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 4.498143677716011, + "language_loss": 0.71829522, + "learning_rate": 2.971100715196666e-07, + "loss": 0.74033564, + "num_input_tokens_seen": 297653315, + "step": 13797, + "time_per_iteration": 3.2743544578552246 + }, + { + "auxiliary_loss_clip": 0.01068092, + "auxiliary_loss_mlp": 0.01101307, + "balance_loss_clip": 1.00156379, + "balance_loss_mlp": 1.0005039, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 9.133627760725657, + "language_loss": 0.71990341, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74159741, + "num_input_tokens_seen": 297673480, + "step": 13798, + "time_per_iteration": 2.9788146018981934 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01100526, + "balance_loss_clip": 1.00180316, + "balance_loss_mlp": 1.00048542, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 5.29534109233004, + "language_loss": 0.7645359, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78674167, + "num_input_tokens_seen": 297693250, + "step": 13799, + "time_per_iteration": 2.830364227294922 + }, + { + "auxiliary_loss_clip": 0.01164138, + "auxiliary_loss_mlp": 0.01101532, + "balance_loss_clip": 1.00185382, + "balance_loss_mlp": 1.00053787, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 1.8085254809176603, + "language_loss": 0.6747579, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69741458, + "num_input_tokens_seen": 297710975, + "step": 13800, + "time_per_iteration": 2.9983394145965576 + }, + { + "auxiliary_loss_clip": 0.01101272, + "auxiliary_loss_mlp": 0.01102431, + "balance_loss_clip": 1.00166416, + "balance_loss_mlp": 1.0004828, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 1.838941664164911, + "language_loss": 0.74393165, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76596868, + "num_input_tokens_seen": 297730860, + "step": 13801, + "time_per_iteration": 2.932480812072754 + }, + { + "auxiliary_loss_clip": 0.01101142, + "auxiliary_loss_mlp": 0.01100735, + "balance_loss_clip": 1.00169873, + "balance_loss_mlp": 1.00040829, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 4.065248934917771, + "language_loss": 0.73579192, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75781071, + "num_input_tokens_seen": 297749765, + "step": 13802, + "time_per_iteration": 2.90720272064209 + }, + { + "auxiliary_loss_clip": 0.01134989, + "auxiliary_loss_mlp": 0.01101434, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00043988, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.6121125356392372, + "language_loss": 0.74703133, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.76939559, + "num_input_tokens_seen": 297770380, + "step": 13803, + "time_per_iteration": 3.953739643096924 + }, + { + "auxiliary_loss_clip": 0.01147741, + "auxiliary_loss_mlp": 0.01100516, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00052333, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.7259094128843169, + "language_loss": 0.79208398, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81456649, + "num_input_tokens_seen": 297789440, + "step": 13804, + "time_per_iteration": 4.147565841674805 + }, + { + "auxiliary_loss_clip": 0.01164299, + "auxiliary_loss_mlp": 0.01100614, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00047874, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.5642236451785236, + "language_loss": 0.73165977, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75430888, + "num_input_tokens_seen": 297810425, + "step": 13805, + "time_per_iteration": 3.1621313095092773 + }, + { + "auxiliary_loss_clip": 0.01148033, + "auxiliary_loss_mlp": 0.00747322, + "balance_loss_clip": 1.0017904, + "balance_loss_mlp": 1.00043213, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 1.9454252031186265, + "language_loss": 0.77502966, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79398316, + "num_input_tokens_seen": 297827680, + "step": 13806, + "time_per_iteration": 3.3342013359069824 + }, + { + "auxiliary_loss_clip": 0.01147994, + "auxiliary_loss_mlp": 0.01101415, + "balance_loss_clip": 1.00186467, + "balance_loss_mlp": 1.00042057, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.8317544530127916, + "language_loss": 0.63412333, + "learning_rate": 2.950707932112444e-07, + "loss": 0.6566174, + "num_input_tokens_seen": 297848005, + "step": 13807, + "time_per_iteration": 3.0549890995025635 + }, + { + "auxiliary_loss_clip": 0.01147796, + "auxiliary_loss_mlp": 0.01101843, + "balance_loss_clip": 1.00195599, + "balance_loss_mlp": 1.00041986, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 2.219511993346683, + "language_loss": 0.73145688, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75395328, + "num_input_tokens_seen": 297866730, + "step": 13808, + "time_per_iteration": 2.7823283672332764 + }, + { + "auxiliary_loss_clip": 0.01134348, + "auxiliary_loss_mlp": 0.01102475, + "balance_loss_clip": 1.00201488, + "balance_loss_mlp": 1.00071776, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.675580806299648, + "language_loss": 0.66376781, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68613601, + "num_input_tokens_seen": 297886390, + "step": 13809, + "time_per_iteration": 3.2257964611053467 + }, + { + "auxiliary_loss_clip": 0.01164389, + "auxiliary_loss_mlp": 0.01101149, + "balance_loss_clip": 1.00189185, + "balance_loss_mlp": 1.00034595, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 1.9648160189760346, + "language_loss": 0.74041295, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76306832, + "num_input_tokens_seen": 297905110, + "step": 13810, + "time_per_iteration": 2.7548437118530273 + }, + { + "auxiliary_loss_clip": 0.01115726, + "auxiliary_loss_mlp": 0.01100572, + "balance_loss_clip": 1.00168943, + "balance_loss_mlp": 1.0005796, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.5119071531355996, + "language_loss": 0.80787843, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83004141, + "num_input_tokens_seen": 297925460, + "step": 13811, + "time_per_iteration": 4.476428985595703 + }, + { + "auxiliary_loss_clip": 0.01118225, + "auxiliary_loss_mlp": 0.01101654, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.0006125, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 1.7828520192574828, + "language_loss": 0.73598766, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75818646, + "num_input_tokens_seen": 297941760, + "step": 13812, + "time_per_iteration": 2.7440080642700195 + }, + { + "auxiliary_loss_clip": 0.01130993, + "auxiliary_loss_mlp": 0.01100678, + "balance_loss_clip": 1.00175178, + "balance_loss_mlp": 1.00054181, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.5645964909796535, + "language_loss": 0.78408277, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80639946, + "num_input_tokens_seen": 297959745, + "step": 13813, + "time_per_iteration": 2.703606128692627 + }, + { + "auxiliary_loss_clip": 0.01084183, + "auxiliary_loss_mlp": 0.00747527, + "balance_loss_clip": 1.00160754, + "balance_loss_mlp": 1.00059545, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 1.8748943680583199, + "language_loss": 0.71088803, + "learning_rate": 2.93647144674658e-07, + "loss": 0.72920513, + "num_input_tokens_seen": 297977665, + "step": 13814, + "time_per_iteration": 2.846540689468384 + }, + { + "auxiliary_loss_clip": 0.01164617, + "auxiliary_loss_mlp": 0.01103181, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00056624, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 2.274518707288352, + "language_loss": 0.68115717, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.70383513, + "num_input_tokens_seen": 297993525, + "step": 13815, + "time_per_iteration": 2.6134798526763916 + }, + { + "auxiliary_loss_clip": 0.01147673, + "auxiliary_loss_mlp": 0.01101127, + "balance_loss_clip": 1.00193095, + "balance_loss_mlp": 1.00046694, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 1.9329959186221952, + "language_loss": 0.7567358, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.7792238, + "num_input_tokens_seen": 298012920, + "step": 13816, + "time_per_iteration": 4.193552494049072 + }, + { + "auxiliary_loss_clip": 0.01116529, + "auxiliary_loss_mlp": 0.01100967, + "balance_loss_clip": 1.00171757, + "balance_loss_mlp": 1.0005455, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.7698010286724861, + "language_loss": 0.81402916, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83620411, + "num_input_tokens_seen": 298033310, + "step": 13817, + "time_per_iteration": 2.76689076423645 + }, + { + "auxiliary_loss_clip": 0.01147717, + "auxiliary_loss_mlp": 0.01101479, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.00048542, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 3.4912237217637045, + "language_loss": 0.78192788, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80441988, + "num_input_tokens_seen": 298053530, + "step": 13818, + "time_per_iteration": 2.6504416465759277 + }, + { + "auxiliary_loss_clip": 0.01134264, + "auxiliary_loss_mlp": 0.01100763, + "balance_loss_clip": 1.00182712, + "balance_loss_mlp": 1.00043678, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 1.7897308871355644, + "language_loss": 0.82197797, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84432822, + "num_input_tokens_seen": 298069305, + "step": 13819, + "time_per_iteration": 2.7511394023895264 + }, + { + "auxiliary_loss_clip": 0.01141596, + "auxiliary_loss_mlp": 0.01074736, + "balance_loss_clip": 1.0007751, + "balance_loss_mlp": 0.99996769, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7585907055843765, + "language_loss": 0.56270337, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58486664, + "num_input_tokens_seen": 298125830, + "step": 13820, + "time_per_iteration": 3.2057278156280518 + }, + { + "auxiliary_loss_clip": 0.01149663, + "auxiliary_loss_mlp": 0.01099805, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00038433, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 2.1622846608995787, + "language_loss": 0.68400562, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70650029, + "num_input_tokens_seen": 298142320, + "step": 13821, + "time_per_iteration": 2.5884745121002197 + }, + { + "auxiliary_loss_clip": 0.01071663, + "auxiliary_loss_mlp": 0.01101543, + "balance_loss_clip": 1.00158644, + "balance_loss_mlp": 1.00054932, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 4.565972197835152, + "language_loss": 0.69312084, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71485293, + "num_input_tokens_seen": 298161845, + "step": 13822, + "time_per_iteration": 2.9058499336242676 + }, + { + "auxiliary_loss_clip": 0.01100478, + "auxiliary_loss_mlp": 0.01100289, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.00053489, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.7278392097220043, + "language_loss": 0.62028033, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64228797, + "num_input_tokens_seen": 298184165, + "step": 13823, + "time_per_iteration": 2.8578317165374756 + }, + { + "auxiliary_loss_clip": 0.01127227, + "auxiliary_loss_mlp": 0.01075064, + "balance_loss_clip": 1.00070119, + "balance_loss_mlp": 0.99991435, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.865634469805333, + "language_loss": 0.61951327, + "learning_rate": 2.916188616354669e-07, + "loss": 0.64153612, + "num_input_tokens_seen": 298251720, + "step": 13824, + "time_per_iteration": 3.251464366912842 + }, + { + "auxiliary_loss_clip": 0.01164321, + "auxiliary_loss_mlp": 0.01101417, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00051785, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.6141236866783486, + "language_loss": 0.74591744, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76857483, + "num_input_tokens_seen": 298271910, + "step": 13825, + "time_per_iteration": 2.576969861984253 + }, + { + "auxiliary_loss_clip": 0.01099647, + "auxiliary_loss_mlp": 0.00747515, + "balance_loss_clip": 1.00168598, + "balance_loss_mlp": 1.0005579, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 2.305681301848705, + "language_loss": 0.8027575, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82122916, + "num_input_tokens_seen": 298288105, + "step": 13826, + "time_per_iteration": 2.848644971847534 + }, + { + "auxiliary_loss_clip": 0.01164363, + "auxiliary_loss_mlp": 0.01102182, + "balance_loss_clip": 1.00190127, + "balance_loss_mlp": 1.00052011, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.6834145900764286, + "language_loss": 0.67944056, + "learning_rate": 2.910116396226914e-07, + "loss": 0.702106, + "num_input_tokens_seen": 298307600, + "step": 13827, + "time_per_iteration": 2.6520566940307617 + }, + { + "auxiliary_loss_clip": 0.01149083, + "auxiliary_loss_mlp": 0.01101012, + "balance_loss_clip": 1.00182831, + "balance_loss_mlp": 1.00058985, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 2.132742711655169, + "language_loss": 0.74273604, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.76523697, + "num_input_tokens_seen": 298323055, + "step": 13828, + "time_per_iteration": 2.6589152812957764 + }, + { + "auxiliary_loss_clip": 0.01119964, + "auxiliary_loss_mlp": 0.01101689, + "balance_loss_clip": 1.00169063, + "balance_loss_mlp": 1.00050402, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 1.674173524275904, + "language_loss": 0.67227137, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69448793, + "num_input_tokens_seen": 298346950, + "step": 13829, + "time_per_iteration": 4.361804962158203 + }, + { + "auxiliary_loss_clip": 0.01131273, + "auxiliary_loss_mlp": 0.01102389, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.00034547, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 1.6201802316392258, + "language_loss": 0.82972193, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.85205853, + "num_input_tokens_seen": 298366315, + "step": 13830, + "time_per_iteration": 2.6719729900360107 + }, + { + "auxiliary_loss_clip": 0.01147759, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00058639, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 2.3887533501924327, + "language_loss": 0.74568617, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76818246, + "num_input_tokens_seen": 298385185, + "step": 13831, + "time_per_iteration": 2.571539878845215 + }, + { + "auxiliary_loss_clip": 0.01164312, + "auxiliary_loss_mlp": 0.01102271, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00060916, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.735331117928805, + "language_loss": 0.71291965, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73558545, + "num_input_tokens_seen": 298402335, + "step": 13832, + "time_per_iteration": 4.002293586730957 + }, + { + "auxiliary_loss_clip": 0.01133328, + "auxiliary_loss_mlp": 0.01101817, + "balance_loss_clip": 1.00178015, + "balance_loss_mlp": 1.00039387, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.9752399349654424, + "language_loss": 0.84718966, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86954117, + "num_input_tokens_seen": 298423370, + "step": 13833, + "time_per_iteration": 2.675644874572754 + }, + { + "auxiliary_loss_clip": 0.0113231, + "auxiliary_loss_mlp": 0.01102012, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00054121, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 1.7031430037350306, + "language_loss": 0.76316571, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78550887, + "num_input_tokens_seen": 298444835, + "step": 13834, + "time_per_iteration": 2.751408338546753 + }, + { + "auxiliary_loss_clip": 0.01164144, + "auxiliary_loss_mlp": 0.00747361, + "balance_loss_clip": 1.00183117, + "balance_loss_mlp": 1.0005486, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 2.2011196788201643, + "language_loss": 0.79783678, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81695175, + "num_input_tokens_seen": 298461845, + "step": 13835, + "time_per_iteration": 2.565093517303467 + }, + { + "auxiliary_loss_clip": 0.0114776, + "auxiliary_loss_mlp": 0.01102776, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00054264, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 6.8508977374671005, + "language_loss": 0.80766881, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.83017415, + "num_input_tokens_seen": 298479095, + "step": 13836, + "time_per_iteration": 2.6865835189819336 + }, + { + "auxiliary_loss_clip": 0.01131294, + "auxiliary_loss_mlp": 0.01099972, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00031328, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 3.2410198095169225, + "language_loss": 0.77802753, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.80034024, + "num_input_tokens_seen": 298494475, + "step": 13837, + "time_per_iteration": 3.3279125690460205 + }, + { + "auxiliary_loss_clip": 0.01164532, + "auxiliary_loss_mlp": 0.0110233, + "balance_loss_clip": 1.00186324, + "balance_loss_mlp": 1.00047767, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 1.9216693688473616, + "language_loss": 0.83342516, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85609376, + "num_input_tokens_seen": 298513185, + "step": 13838, + "time_per_iteration": 4.993895053863525 + }, + { + "auxiliary_loss_clip": 0.01134904, + "auxiliary_loss_mlp": 0.01101794, + "balance_loss_clip": 1.00201654, + "balance_loss_mlp": 1.00046635, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 2.104546850789703, + "language_loss": 0.74341482, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76578176, + "num_input_tokens_seen": 298531885, + "step": 13839, + "time_per_iteration": 4.500841379165649 + }, + { + "auxiliary_loss_clip": 0.0114968, + "auxiliary_loss_mlp": 0.01101751, + "balance_loss_clip": 1.00201678, + "balance_loss_mlp": 1.00047016, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 2.586492440021968, + "language_loss": 0.67848676, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.70100105, + "num_input_tokens_seen": 298554905, + "step": 13840, + "time_per_iteration": 2.904930830001831 + }, + { + "auxiliary_loss_clip": 0.01100489, + "auxiliary_loss_mlp": 0.01101554, + "balance_loss_clip": 1.00171089, + "balance_loss_mlp": 1.00046468, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 1.8358104867554728, + "language_loss": 0.79407299, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81609344, + "num_input_tokens_seen": 298571185, + "step": 13841, + "time_per_iteration": 2.8869850635528564 + }, + { + "auxiliary_loss_clip": 0.01115716, + "auxiliary_loss_mlp": 0.01102126, + "balance_loss_clip": 1.00192583, + "balance_loss_mlp": 1.00046456, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 2.1283479295014027, + "language_loss": 0.67904103, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70121944, + "num_input_tokens_seen": 298588505, + "step": 13842, + "time_per_iteration": 2.9303109645843506 + }, + { + "auxiliary_loss_clip": 0.01133087, + "auxiliary_loss_mlp": 0.01101912, + "balance_loss_clip": 1.00186431, + "balance_loss_mlp": 1.00034547, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 2.0101527678297617, + "language_loss": 0.72774124, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.7500912, + "num_input_tokens_seen": 298609295, + "step": 13843, + "time_per_iteration": 2.979421377182007 + }, + { + "auxiliary_loss_clip": 0.01116703, + "auxiliary_loss_mlp": 0.01101004, + "balance_loss_clip": 1.00177073, + "balance_loss_mlp": 1.00043917, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 1.7338248958432223, + "language_loss": 0.77689576, + "learning_rate": 2.875817378128975e-07, + "loss": 0.79907286, + "num_input_tokens_seen": 298625765, + "step": 13844, + "time_per_iteration": 3.238581895828247 + }, + { + "auxiliary_loss_clip": 0.01124487, + "auxiliary_loss_mlp": 0.01074746, + "balance_loss_clip": 1.00062943, + "balance_loss_mlp": 0.99997789, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.7908882337627676, + "language_loss": 0.5528264, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57481873, + "num_input_tokens_seen": 298683005, + "step": 13845, + "time_per_iteration": 4.659953832626343 + }, + { + "auxiliary_loss_clip": 0.01149403, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_clip": 1.00200045, + "balance_loss_mlp": 1.00065827, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 1.7666422578248318, + "language_loss": 0.7534821, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77600121, + "num_input_tokens_seen": 298703060, + "step": 13846, + "time_per_iteration": 2.726022481918335 + }, + { + "auxiliary_loss_clip": 0.01101254, + "auxiliary_loss_mlp": 0.01101997, + "balance_loss_clip": 1.0016669, + "balance_loss_mlp": 1.00043035, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.6666159540228722, + "language_loss": 0.78773636, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.8097688, + "num_input_tokens_seen": 298721765, + "step": 13847, + "time_per_iteration": 2.8025946617126465 + }, + { + "auxiliary_loss_clip": 0.01084066, + "auxiliary_loss_mlp": 0.01100236, + "balance_loss_clip": 1.00147963, + "balance_loss_mlp": 1.00048149, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.7284854617112169, + "language_loss": 0.74828279, + "learning_rate": 2.867774279753175e-07, + "loss": 0.77012587, + "num_input_tokens_seen": 298740825, + "step": 13848, + "time_per_iteration": 2.854043483734131 + }, + { + "auxiliary_loss_clip": 0.01148066, + "auxiliary_loss_mlp": 0.01100995, + "balance_loss_clip": 1.00185061, + "balance_loss_mlp": 1.00038218, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 2.8797095733731, + "language_loss": 0.63406664, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.6565572, + "num_input_tokens_seen": 298758515, + "step": 13849, + "time_per_iteration": 4.186585903167725 + }, + { + "auxiliary_loss_clip": 0.01134356, + "auxiliary_loss_mlp": 0.01102006, + "balance_loss_clip": 1.00172198, + "balance_loss_mlp": 1.00043964, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 2.34734785831268, + "language_loss": 0.79853654, + "learning_rate": 2.863756628194638e-07, + "loss": 0.8209002, + "num_input_tokens_seen": 298776375, + "step": 13850, + "time_per_iteration": 4.851027250289917 + }, + { + "auxiliary_loss_clip": 0.01116241, + "auxiliary_loss_mlp": 0.01100495, + "balance_loss_clip": 1.00167847, + "balance_loss_mlp": 1.00050259, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.8472014141921682, + "language_loss": 0.7848137, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80698109, + "num_input_tokens_seen": 298795135, + "step": 13851, + "time_per_iteration": 2.841682195663452 + }, + { + "auxiliary_loss_clip": 0.01143147, + "auxiliary_loss_mlp": 0.01075093, + "balance_loss_clip": 1.00080776, + "balance_loss_mlp": 0.99994391, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7790771391177826, + "language_loss": 0.55792499, + "learning_rate": 2.859741575868344e-07, + "loss": 0.58010739, + "num_input_tokens_seen": 298855475, + "step": 13852, + "time_per_iteration": 3.252931833267212 + }, + { + "auxiliary_loss_clip": 0.01149527, + "auxiliary_loss_mlp": 0.01101429, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00043535, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 2.5527041718082457, + "language_loss": 0.67414427, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69665384, + "num_input_tokens_seen": 298875875, + "step": 13853, + "time_per_iteration": 2.8239500522613525 + }, + { + "auxiliary_loss_clip": 0.01133362, + "auxiliary_loss_mlp": 0.01100795, + "balance_loss_clip": 1.00187075, + "balance_loss_mlp": 1.0005641, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.6219224022809973, + "language_loss": 0.78448844, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80683005, + "num_input_tokens_seen": 298895950, + "step": 13854, + "time_per_iteration": 4.186717748641968 + }, + { + "auxiliary_loss_clip": 0.01158289, + "auxiliary_loss_mlp": 0.0107474, + "balance_loss_clip": 1.00076365, + "balance_loss_mlp": 0.99997157, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7716227593696627, + "language_loss": 0.58661842, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60894871, + "num_input_tokens_seen": 298955770, + "step": 13855, + "time_per_iteration": 4.651487112045288 + }, + { + "auxiliary_loss_clip": 0.01147849, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00046754, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 3.155582039680547, + "language_loss": 0.71546942, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73797208, + "num_input_tokens_seen": 298976545, + "step": 13856, + "time_per_iteration": 2.7192087173461914 + }, + { + "auxiliary_loss_clip": 0.01149725, + "auxiliary_loss_mlp": 0.01101163, + "balance_loss_clip": 1.00188076, + "balance_loss_mlp": 1.00055051, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.583099866813062, + "language_loss": 0.75498176, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77749062, + "num_input_tokens_seen": 298996750, + "step": 13857, + "time_per_iteration": 2.7537147998809814 + }, + { + "auxiliary_loss_clip": 0.01099843, + "auxiliary_loss_mlp": 0.01099679, + "balance_loss_clip": 1.0017426, + "balance_loss_mlp": 1.00040185, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.9620118903455561, + "language_loss": 0.73353755, + "learning_rate": 2.847712020370958e-07, + "loss": 0.7555328, + "num_input_tokens_seen": 299014895, + "step": 13858, + "time_per_iteration": 2.893073081970215 + }, + { + "auxiliary_loss_clip": 0.01164468, + "auxiliary_loss_mlp": 0.01102843, + "balance_loss_clip": 1.00188851, + "balance_loss_mlp": 1.00051367, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 1.7099541367079096, + "language_loss": 0.73175848, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75443161, + "num_input_tokens_seen": 299032855, + "step": 13859, + "time_per_iteration": 2.677267074584961 + }, + { + "auxiliary_loss_clip": 0.01147316, + "auxiliary_loss_mlp": 0.01099264, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00036824, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 2.115003241047941, + "language_loss": 0.79366827, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81613404, + "num_input_tokens_seen": 299052055, + "step": 13860, + "time_per_iteration": 4.824054718017578 + }, + { + "auxiliary_loss_clip": 0.01065378, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_clip": 1.00133097, + "balance_loss_mlp": 1.00044799, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 2.4166159108465153, + "language_loss": 0.81978881, + "learning_rate": 2.841706022218644e-07, + "loss": 0.8414489, + "num_input_tokens_seen": 299075285, + "step": 13861, + "time_per_iteration": 2.9295589923858643 + }, + { + "auxiliary_loss_clip": 0.01164264, + "auxiliary_loss_mlp": 0.01101821, + "balance_loss_clip": 1.00187743, + "balance_loss_mlp": 1.00054049, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 1.8284604510526425, + "language_loss": 0.79211658, + "learning_rate": 2.839705324021806e-07, + "loss": 0.81477743, + "num_input_tokens_seen": 299092520, + "step": 13862, + "time_per_iteration": 2.7467262744903564 + }, + { + "auxiliary_loss_clip": 0.01149645, + "auxiliary_loss_mlp": 0.01101319, + "balance_loss_clip": 1.00177097, + "balance_loss_mlp": 1.00037265, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 1.8793648261168394, + "language_loss": 0.74978936, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77229893, + "num_input_tokens_seen": 299109450, + "step": 13863, + "time_per_iteration": 2.6464669704437256 + }, + { + "auxiliary_loss_clip": 0.01102183, + "auxiliary_loss_mlp": 0.00747399, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00062037, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.9107554567694924, + "language_loss": 0.74874038, + "learning_rate": 2.835705879864232e-07, + "loss": 0.76723623, + "num_input_tokens_seen": 299129540, + "step": 13864, + "time_per_iteration": 2.82991099357605 + }, + { + "auxiliary_loss_clip": 0.01130267, + "auxiliary_loss_mlp": 0.01101762, + "balance_loss_clip": 1.00163627, + "balance_loss_mlp": 1.00048184, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 2.2616594267261805, + "language_loss": 0.68967032, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71199059, + "num_input_tokens_seen": 299148670, + "step": 13865, + "time_per_iteration": 2.733177661895752 + }, + { + "auxiliary_loss_clip": 0.01148045, + "auxiliary_loss_mlp": 0.0110179, + "balance_loss_clip": 1.00178957, + "balance_loss_mlp": 1.00060487, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 1.5660592839769814, + "language_loss": 0.75031942, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77281773, + "num_input_tokens_seen": 299169330, + "step": 13866, + "time_per_iteration": 4.729495048522949 + }, + { + "auxiliary_loss_clip": 0.01127231, + "auxiliary_loss_mlp": 0.01074866, + "balance_loss_clip": 1.00091314, + "balance_loss_mlp": 1.00009775, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8638036489436918, + "language_loss": 0.63045251, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65247357, + "num_input_tokens_seen": 299220980, + "step": 13867, + "time_per_iteration": 4.530324220657349 + }, + { + "auxiliary_loss_clip": 0.01130622, + "auxiliary_loss_mlp": 0.01101166, + "balance_loss_clip": 1.00172853, + "balance_loss_mlp": 1.00055313, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 1.88158930002892, + "language_loss": 0.72243077, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74474859, + "num_input_tokens_seen": 299240130, + "step": 13868, + "time_per_iteration": 2.7389938831329346 + }, + { + "auxiliary_loss_clip": 0.01131189, + "auxiliary_loss_mlp": 0.01101055, + "balance_loss_clip": 1.00186288, + "balance_loss_mlp": 1.00044286, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.4366452508706424, + "language_loss": 0.80343592, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82575834, + "num_input_tokens_seen": 299260705, + "step": 13869, + "time_per_iteration": 4.34047532081604 + }, + { + "auxiliary_loss_clip": 0.01148719, + "auxiliary_loss_mlp": 0.01101026, + "balance_loss_clip": 1.00185609, + "balance_loss_mlp": 1.00050902, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 1.558169770542068, + "language_loss": 0.82712382, + "learning_rate": 2.823723170738028e-07, + "loss": 0.8496213, + "num_input_tokens_seen": 299278925, + "step": 13870, + "time_per_iteration": 4.380739688873291 + }, + { + "auxiliary_loss_clip": 0.01132026, + "auxiliary_loss_mlp": 0.01101699, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00032341, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.4995809341848374, + "language_loss": 0.70511973, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72745693, + "num_input_tokens_seen": 299291580, + "step": 13871, + "time_per_iteration": 2.9056546688079834 + }, + { + "auxiliary_loss_clip": 0.01147548, + "auxiliary_loss_mlp": 0.01100359, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.00050974, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 2.139378456408253, + "language_loss": 0.68689227, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.70937139, + "num_input_tokens_seen": 299310385, + "step": 13872, + "time_per_iteration": 2.7175610065460205 + }, + { + "auxiliary_loss_clip": 0.01130827, + "auxiliary_loss_mlp": 0.01101046, + "balance_loss_clip": 1.00169897, + "balance_loss_mlp": 1.0003376, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 2.19957313431727, + "language_loss": 0.7350207, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75733942, + "num_input_tokens_seen": 299327660, + "step": 13873, + "time_per_iteration": 2.7073147296905518 + }, + { + "auxiliary_loss_clip": 0.01130834, + "auxiliary_loss_mlp": 0.01103125, + "balance_loss_clip": 1.00177336, + "balance_loss_mlp": 1.0005095, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 12.840646345091312, + "language_loss": 0.75404978, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77638936, + "num_input_tokens_seen": 299343685, + "step": 13874, + "time_per_iteration": 2.692596197128296 + }, + { + "auxiliary_loss_clip": 0.01133054, + "auxiliary_loss_mlp": 0.01100949, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.00033617, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 2.4671518307029023, + "language_loss": 0.66440976, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68674982, + "num_input_tokens_seen": 299363305, + "step": 13875, + "time_per_iteration": 2.7287089824676514 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.0110063, + "balance_loss_clip": 1.00175166, + "balance_loss_mlp": 1.000494, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.678761727673428, + "language_loss": 0.79683375, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81887573, + "num_input_tokens_seen": 299382630, + "step": 13876, + "time_per_iteration": 5.066567897796631 + }, + { + "auxiliary_loss_clip": 0.01149327, + "auxiliary_loss_mlp": 0.011, + "balance_loss_clip": 1.00190043, + "balance_loss_mlp": 1.00048423, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 2.181151183786371, + "language_loss": 0.8675797, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89007294, + "num_input_tokens_seen": 299402385, + "step": 13877, + "time_per_iteration": 2.878227472305298 + }, + { + "auxiliary_loss_clip": 0.01119402, + "auxiliary_loss_mlp": 0.0109981, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00053239, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 2.152466901018827, + "language_loss": 0.69052017, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71271235, + "num_input_tokens_seen": 299419820, + "step": 13878, + "time_per_iteration": 2.801504373550415 + }, + { + "auxiliary_loss_clip": 0.01132767, + "auxiliary_loss_mlp": 0.01101124, + "balance_loss_clip": 1.0017705, + "balance_loss_mlp": 1.0004164, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 2.0748399509646465, + "language_loss": 0.79092985, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81326878, + "num_input_tokens_seen": 299436265, + "step": 13879, + "time_per_iteration": 2.780137062072754 + }, + { + "auxiliary_loss_clip": 0.01098117, + "auxiliary_loss_mlp": 0.01100139, + "balance_loss_clip": 1.00156569, + "balance_loss_mlp": 1.00038481, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 2.1534686911739542, + "language_loss": 0.83222979, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85421234, + "num_input_tokens_seen": 299451660, + "step": 13880, + "time_per_iteration": 2.7913613319396973 + }, + { + "auxiliary_loss_clip": 0.01132626, + "auxiliary_loss_mlp": 0.01101846, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00037527, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.5214721595702985, + "language_loss": 0.78238785, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80473256, + "num_input_tokens_seen": 299472070, + "step": 13881, + "time_per_iteration": 2.754774570465088 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01100045, + "balance_loss_clip": 1.00172234, + "balance_loss_mlp": 1.00048137, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 2.1894522575342266, + "language_loss": 0.78486043, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80702424, + "num_input_tokens_seen": 299486725, + "step": 13882, + "time_per_iteration": 2.880962371826172 + }, + { + "auxiliary_loss_clip": 0.01118021, + "auxiliary_loss_mlp": 0.01101122, + "balance_loss_clip": 1.00177693, + "balance_loss_mlp": 1.00069976, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 2.179839272755763, + "language_loss": 0.80106467, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82325608, + "num_input_tokens_seen": 299505435, + "step": 13883, + "time_per_iteration": 2.7314453125 + }, + { + "auxiliary_loss_clip": 0.01147531, + "auxiliary_loss_mlp": 0.01101616, + "balance_loss_clip": 1.00175822, + "balance_loss_mlp": 1.00043142, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 1.8450926591636108, + "language_loss": 0.7396394, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76213086, + "num_input_tokens_seen": 299523555, + "step": 13884, + "time_per_iteration": 2.686567544937134 + }, + { + "auxiliary_loss_clip": 0.01131614, + "auxiliary_loss_mlp": 0.01103318, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.00051153, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 1.62049266946892, + "language_loss": 0.70684111, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72919035, + "num_input_tokens_seen": 299541660, + "step": 13885, + "time_per_iteration": 2.7520618438720703 + }, + { + "auxiliary_loss_clip": 0.01114673, + "auxiliary_loss_mlp": 0.01100944, + "balance_loss_clip": 1.00161314, + "balance_loss_mlp": 1.00052214, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 2.3148154398525764, + "language_loss": 0.70043248, + "learning_rate": 2.791883957449912e-07, + "loss": 0.72258866, + "num_input_tokens_seen": 299562465, + "step": 13886, + "time_per_iteration": 2.8237133026123047 + }, + { + "auxiliary_loss_clip": 0.0111634, + "auxiliary_loss_mlp": 0.01100318, + "balance_loss_clip": 1.00166821, + "balance_loss_mlp": 1.00032556, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 1.5486266586515487, + "language_loss": 0.79248703, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81465364, + "num_input_tokens_seen": 299582700, + "step": 13887, + "time_per_iteration": 4.4139769077301025 + }, + { + "auxiliary_loss_clip": 0.01131532, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.00181818, + "balance_loss_mlp": 1.00057769, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 2.7851178084951966, + "language_loss": 0.64505035, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.66384071, + "num_input_tokens_seen": 299600310, + "step": 13888, + "time_per_iteration": 2.6789889335632324 + }, + { + "auxiliary_loss_clip": 0.01131453, + "auxiliary_loss_mlp": 0.01101492, + "balance_loss_clip": 1.00179064, + "balance_loss_mlp": 1.00030708, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 4.945857415822722, + "language_loss": 0.6638248, + "learning_rate": 2.785932692855244e-07, + "loss": 0.68615419, + "num_input_tokens_seen": 299617025, + "step": 13889, + "time_per_iteration": 2.6220109462738037 + }, + { + "auxiliary_loss_clip": 0.01149797, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_clip": 1.00192523, + "balance_loss_mlp": 1.00039291, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 7.079182609939619, + "language_loss": 0.68443298, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70693529, + "num_input_tokens_seen": 299633050, + "step": 13890, + "time_per_iteration": 2.6062161922454834 + }, + { + "auxiliary_loss_clip": 0.01131077, + "auxiliary_loss_mlp": 0.01103001, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.00067151, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.758132392388368, + "language_loss": 0.5942021, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61654282, + "num_input_tokens_seen": 299646445, + "step": 13891, + "time_per_iteration": 4.1376447677612305 + }, + { + "auxiliary_loss_clip": 0.01147534, + "auxiliary_loss_mlp": 0.01101059, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00039887, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 2.0633580678940326, + "language_loss": 0.71643811, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73892403, + "num_input_tokens_seen": 299662665, + "step": 13892, + "time_per_iteration": 2.791799306869507 + }, + { + "auxiliary_loss_clip": 0.01164207, + "auxiliary_loss_mlp": 0.01100133, + "balance_loss_clip": 1.00189471, + "balance_loss_mlp": 1.00047445, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.7537947047177729, + "language_loss": 0.65962142, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.6822648, + "num_input_tokens_seen": 299683585, + "step": 13893, + "time_per_iteration": 3.1435656547546387 + }, + { + "auxiliary_loss_clip": 0.01131547, + "auxiliary_loss_mlp": 0.01101127, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00041938, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 2.044590002606593, + "language_loss": 0.78011417, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80244088, + "num_input_tokens_seen": 299702680, + "step": 13894, + "time_per_iteration": 2.6646134853363037 + }, + { + "auxiliary_loss_clip": 0.01149736, + "auxiliary_loss_mlp": 0.01100137, + "balance_loss_clip": 1.00199902, + "balance_loss_mlp": 1.00052547, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.6797207673331012, + "language_loss": 0.72733414, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74983287, + "num_input_tokens_seen": 299721050, + "step": 13895, + "time_per_iteration": 2.6259946823120117 + }, + { + "auxiliary_loss_clip": 0.01147632, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_clip": 1.0017283, + "balance_loss_mlp": 1.00053835, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 1.9792428555877648, + "language_loss": 0.72060049, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74310076, + "num_input_tokens_seen": 299738255, + "step": 13896, + "time_per_iteration": 2.6360833644866943 + }, + { + "auxiliary_loss_clip": 0.01149352, + "auxiliary_loss_mlp": 0.01100841, + "balance_loss_clip": 1.00180447, + "balance_loss_mlp": 1.00041914, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.415444778177357, + "language_loss": 0.59046519, + "learning_rate": 2.770091380848423e-07, + "loss": 0.61296713, + "num_input_tokens_seen": 299761315, + "step": 13897, + "time_per_iteration": 2.973345994949341 + }, + { + "auxiliary_loss_clip": 0.01158151, + "auxiliary_loss_mlp": 0.00745419, + "balance_loss_clip": 1.00072408, + "balance_loss_mlp": 1.00032794, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.6951166038642284, + "language_loss": 0.57646739, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59550297, + "num_input_tokens_seen": 299828735, + "step": 13898, + "time_per_iteration": 3.179316759109497 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01101824, + "balance_loss_clip": 1.00192547, + "balance_loss_mlp": 1.0004487, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 1.8379115468097917, + "language_loss": 0.80030364, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.82280535, + "num_input_tokens_seen": 299848395, + "step": 13899, + "time_per_iteration": 2.639683961868286 + }, + { + "auxiliary_loss_clip": 0.0116442, + "auxiliary_loss_mlp": 0.01101299, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00049543, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.6385277153669395, + "language_loss": 0.68769991, + "learning_rate": 2.764161667219749e-07, + "loss": 0.71035719, + "num_input_tokens_seen": 299871665, + "step": 13900, + "time_per_iteration": 2.7690517902374268 + }, + { + "auxiliary_loss_clip": 0.01132935, + "auxiliary_loss_mlp": 0.01100051, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.0003922, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.454760418055502, + "language_loss": 0.70792699, + "learning_rate": 2.762186403079716e-07, + "loss": 0.7302568, + "num_input_tokens_seen": 299891960, + "step": 13901, + "time_per_iteration": 2.7139275074005127 + }, + { + "auxiliary_loss_clip": 0.01101326, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_clip": 1.00157118, + "balance_loss_mlp": 1.00054169, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.567179234973761, + "language_loss": 0.80108476, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82312101, + "num_input_tokens_seen": 299905070, + "step": 13902, + "time_per_iteration": 2.74157977104187 + }, + { + "auxiliary_loss_clip": 0.0114908, + "auxiliary_loss_mlp": 0.01100566, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00043035, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.9201626346552723, + "language_loss": 0.62552351, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64801997, + "num_input_tokens_seen": 299925130, + "step": 13903, + "time_per_iteration": 2.639324188232422 + }, + { + "auxiliary_loss_clip": 0.01134776, + "auxiliary_loss_mlp": 0.01102113, + "balance_loss_clip": 1.00189209, + "balance_loss_mlp": 1.00045168, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 1.6464666104802186, + "language_loss": 0.74322456, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76559347, + "num_input_tokens_seen": 299943845, + "step": 13904, + "time_per_iteration": 2.703733444213867 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01100448, + "balance_loss_clip": 1.00172031, + "balance_loss_mlp": 1.00050318, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 2.0624288929062957, + "language_loss": 0.72893322, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.75127017, + "num_input_tokens_seen": 299961620, + "step": 13905, + "time_per_iteration": 4.04280424118042 + }, + { + "auxiliary_loss_clip": 0.01149577, + "auxiliary_loss_mlp": 0.01100162, + "balance_loss_clip": 1.00194311, + "balance_loss_mlp": 1.00059867, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 1.9329833454954777, + "language_loss": 0.66669565, + "learning_rate": 2.752319888771e-07, + "loss": 0.68919301, + "num_input_tokens_seen": 299982170, + "step": 13906, + "time_per_iteration": 4.0920655727386475 + }, + { + "auxiliary_loss_clip": 0.01147595, + "auxiliary_loss_mlp": 0.01101174, + "balance_loss_clip": 1.00178158, + "balance_loss_mlp": 1.00046611, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.8901067665222293, + "language_loss": 0.74327254, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76576018, + "num_input_tokens_seen": 300001330, + "step": 13907, + "time_per_iteration": 2.6110363006591797 + }, + { + "auxiliary_loss_clip": 0.01113749, + "auxiliary_loss_mlp": 0.0110181, + "balance_loss_clip": 1.0016073, + "balance_loss_mlp": 1.00048256, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 2.1329185987519117, + "language_loss": 0.75231504, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77447069, + "num_input_tokens_seen": 300020645, + "step": 13908, + "time_per_iteration": 2.719162702560425 + }, + { + "auxiliary_loss_clip": 0.01147072, + "auxiliary_loss_mlp": 0.01102375, + "balance_loss_clip": 1.00179315, + "balance_loss_mlp": 1.00052297, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 2.1027905203682704, + "language_loss": 0.71420765, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73670214, + "num_input_tokens_seen": 300039945, + "step": 13909, + "time_per_iteration": 2.6248085498809814 + }, + { + "auxiliary_loss_clip": 0.01164424, + "auxiliary_loss_mlp": 0.00747489, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.00055707, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 2.0245184582187252, + "language_loss": 0.73325622, + "learning_rate": 2.744438449482338e-07, + "loss": 0.7523753, + "num_input_tokens_seen": 300058260, + "step": 13910, + "time_per_iteration": 2.5374598503112793 + }, + { + "auxiliary_loss_clip": 0.0114955, + "auxiliary_loss_mlp": 0.00747489, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00056648, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 1.7359980228723217, + "language_loss": 0.73379993, + "learning_rate": 2.742469725305001e-07, + "loss": 0.7527703, + "num_input_tokens_seen": 300076720, + "step": 13911, + "time_per_iteration": 2.610497236251831 + }, + { + "auxiliary_loss_clip": 0.01132668, + "auxiliary_loss_mlp": 0.01101501, + "balance_loss_clip": 1.00191939, + "balance_loss_mlp": 1.00055504, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 1.908560021291832, + "language_loss": 0.78590858, + "learning_rate": 2.740501655534946e-07, + "loss": 0.80825031, + "num_input_tokens_seen": 300092950, + "step": 13912, + "time_per_iteration": 2.589869737625122 + }, + { + "auxiliary_loss_clip": 0.01149658, + "auxiliary_loss_mlp": 0.01100868, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00054169, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 2.134959078029412, + "language_loss": 0.78861147, + "learning_rate": 2.738534240246797e-07, + "loss": 0.8111167, + "num_input_tokens_seen": 300110950, + "step": 13913, + "time_per_iteration": 2.616947889328003 + }, + { + "auxiliary_loss_clip": 0.01147574, + "auxiliary_loss_mlp": 0.01101993, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00042725, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 1.9168193012880774, + "language_loss": 0.73577315, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75826883, + "num_input_tokens_seen": 300128705, + "step": 13914, + "time_per_iteration": 2.6157774925231934 + }, + { + "auxiliary_loss_clip": 0.01100232, + "auxiliary_loss_mlp": 0.0110038, + "balance_loss_clip": 1.00168395, + "balance_loss_mlp": 1.00048292, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.6952920936933105, + "language_loss": 0.7131865, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73519266, + "num_input_tokens_seen": 300148635, + "step": 13915, + "time_per_iteration": 2.763882875442505 + }, + { + "auxiliary_loss_clip": 0.01116172, + "auxiliary_loss_mlp": 0.01101558, + "balance_loss_clip": 1.00163591, + "balance_loss_mlp": 1.00046802, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 2.37987325806459, + "language_loss": 0.7249459, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74712324, + "num_input_tokens_seen": 300165490, + "step": 13916, + "time_per_iteration": 2.6642370223999023 + }, + { + "auxiliary_loss_clip": 0.01116202, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.0017941, + "balance_loss_mlp": 1.00056195, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 2.2679993712504993, + "language_loss": 0.74828136, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76691842, + "num_input_tokens_seen": 300182130, + "step": 13917, + "time_per_iteration": 2.686222553253174 + }, + { + "auxiliary_loss_clip": 0.01164261, + "auxiliary_loss_mlp": 0.01100028, + "balance_loss_clip": 1.00203371, + "balance_loss_mlp": 1.00055933, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.519605251773453, + "language_loss": 0.79061979, + "learning_rate": 2.728706983644933e-07, + "loss": 0.8132627, + "num_input_tokens_seen": 300203050, + "step": 13918, + "time_per_iteration": 2.736530303955078 + }, + { + "auxiliary_loss_clip": 0.01098133, + "auxiliary_loss_mlp": 0.01102088, + "balance_loss_clip": 1.00161755, + "balance_loss_mlp": 1.0005213, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.5966208303773062, + "language_loss": 0.67841721, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70041943, + "num_input_tokens_seen": 300224380, + "step": 13919, + "time_per_iteration": 2.825328826904297 + }, + { + "auxiliary_loss_clip": 0.01149296, + "auxiliary_loss_mlp": 0.01100838, + "balance_loss_clip": 1.00184751, + "balance_loss_mlp": 1.00041592, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 1.7509144918325645, + "language_loss": 0.73484629, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.75734758, + "num_input_tokens_seen": 300242915, + "step": 13920, + "time_per_iteration": 2.6033458709716797 + }, + { + "auxiliary_loss_clip": 0.01134914, + "auxiliary_loss_mlp": 0.01100616, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.00048041, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 2.176202640408048, + "language_loss": 0.69108093, + "learning_rate": 2.722818488237566e-07, + "loss": 0.71343625, + "num_input_tokens_seen": 300261905, + "step": 13921, + "time_per_iteration": 2.671337604522705 + }, + { + "auxiliary_loss_clip": 0.01149392, + "auxiliary_loss_mlp": 0.01102363, + "balance_loss_clip": 1.00192797, + "balance_loss_mlp": 1.00051045, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 1.8350001709497799, + "language_loss": 0.8512392, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87375677, + "num_input_tokens_seen": 300281145, + "step": 13922, + "time_per_iteration": 2.7284414768218994 + }, + { + "auxiliary_loss_clip": 0.01114536, + "auxiliary_loss_mlp": 0.00747371, + "balance_loss_clip": 1.00172114, + "balance_loss_mlp": 1.00055122, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.668463122025337, + "language_loss": 0.71861649, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73723561, + "num_input_tokens_seen": 300301610, + "step": 13923, + "time_per_iteration": 2.743121385574341 + }, + { + "auxiliary_loss_clip": 0.01132198, + "auxiliary_loss_mlp": 0.01100271, + "balance_loss_clip": 1.00178969, + "balance_loss_mlp": 1.00051689, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 3.5877116284441226, + "language_loss": 0.76188767, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78421235, + "num_input_tokens_seen": 300319420, + "step": 13924, + "time_per_iteration": 4.335861921310425 + }, + { + "auxiliary_loss_clip": 0.011318, + "auxiliary_loss_mlp": 0.01099749, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00042331, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.4881571660667112, + "language_loss": 0.64475942, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66707492, + "num_input_tokens_seen": 300341325, + "step": 13925, + "time_per_iteration": 2.8057315349578857 + }, + { + "auxiliary_loss_clip": 0.0113234, + "auxiliary_loss_mlp": 0.01101359, + "balance_loss_clip": 1.00173306, + "balance_loss_mlp": 1.00046051, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 2.5012694918225913, + "language_loss": 0.74369663, + "learning_rate": 2.713017433265543e-07, + "loss": 0.76603365, + "num_input_tokens_seen": 300361620, + "step": 13926, + "time_per_iteration": 2.784679889678955 + }, + { + "auxiliary_loss_clip": 0.0114763, + "auxiliary_loss_mlp": 0.01101375, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00052357, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 1.6740532922052356, + "language_loss": 0.71276325, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73525321, + "num_input_tokens_seen": 300378675, + "step": 13927, + "time_per_iteration": 2.5732240676879883 + }, + { + "auxiliary_loss_clip": 0.01126763, + "auxiliary_loss_mlp": 0.01075169, + "balance_loss_clip": 1.00094223, + "balance_loss_mlp": 1.00001991, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.8697977361613052, + "language_loss": 0.588108, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.61012727, + "num_input_tokens_seen": 300449740, + "step": 13928, + "time_per_iteration": 3.358644962310791 + }, + { + "auxiliary_loss_clip": 0.01116442, + "auxiliary_loss_mlp": 0.01101605, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00061095, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.6176824891105808, + "language_loss": 0.70013356, + "learning_rate": 2.707144665977068e-07, + "loss": 0.722314, + "num_input_tokens_seen": 300470000, + "step": 13929, + "time_per_iteration": 4.098991632461548 + }, + { + "auxiliary_loss_clip": 0.01147679, + "auxiliary_loss_mlp": 0.01103074, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00036383, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.584708605048535, + "language_loss": 0.66926348, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69177103, + "num_input_tokens_seen": 300494975, + "step": 13930, + "time_per_iteration": 2.793750762939453 + }, + { + "auxiliary_loss_clip": 0.01098326, + "auxiliary_loss_mlp": 0.01101081, + "balance_loss_clip": 1.00173879, + "balance_loss_mlp": 1.00046825, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.6643758261431227, + "language_loss": 0.71295309, + "learning_rate": 2.703232766395067e-07, + "loss": 0.7349472, + "num_input_tokens_seen": 300513175, + "step": 13931, + "time_per_iteration": 2.6970131397247314 + }, + { + "auxiliary_loss_clip": 0.01118258, + "auxiliary_loss_mlp": 0.01101121, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00055599, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.5847498714651174, + "language_loss": 0.71705115, + "learning_rate": 2.701277800409705e-07, + "loss": 0.73924494, + "num_input_tokens_seen": 300533770, + "step": 13932, + "time_per_iteration": 2.7078514099121094 + }, + { + "auxiliary_loss_clip": 0.01067337, + "auxiliary_loss_mlp": 0.01100622, + "balance_loss_clip": 1.00150228, + "balance_loss_mlp": 1.00039077, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.7465100794638593, + "language_loss": 0.67013896, + "learning_rate": 2.699323490393628e-07, + "loss": 0.69181848, + "num_input_tokens_seen": 300552995, + "step": 13933, + "time_per_iteration": 2.806277275085449 + }, + { + "auxiliary_loss_clip": 0.01132928, + "auxiliary_loss_mlp": 0.01100757, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00052619, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 2.4664777694037188, + "language_loss": 0.76276171, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78509855, + "num_input_tokens_seen": 300570275, + "step": 13934, + "time_per_iteration": 2.6193697452545166 + }, + { + "auxiliary_loss_clip": 0.01147643, + "auxiliary_loss_mlp": 0.01101476, + "balance_loss_clip": 1.00193942, + "balance_loss_mlp": 1.00057757, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 2.2912916060544823, + "language_loss": 0.77601695, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79850811, + "num_input_tokens_seen": 300590875, + "step": 13935, + "time_per_iteration": 2.6715235710144043 + }, + { + "auxiliary_loss_clip": 0.01114513, + "auxiliary_loss_mlp": 0.01101762, + "balance_loss_clip": 1.00167942, + "balance_loss_mlp": 1.00052941, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 2.9172071018293995, + "language_loss": 0.57067221, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.59283495, + "num_input_tokens_seen": 300607490, + "step": 13936, + "time_per_iteration": 2.64528226852417 + }, + { + "auxiliary_loss_clip": 0.01149407, + "auxiliary_loss_mlp": 0.01101168, + "balance_loss_clip": 1.00181544, + "balance_loss_mlp": 1.00045955, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 4.483748266121133, + "language_loss": 0.89448929, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91699505, + "num_input_tokens_seen": 300623635, + "step": 13937, + "time_per_iteration": 2.630411148071289 + }, + { + "auxiliary_loss_clip": 0.01149079, + "auxiliary_loss_mlp": 0.01101626, + "balance_loss_clip": 1.00180221, + "balance_loss_mlp": 1.00044096, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 1.777938576566966, + "language_loss": 0.81593859, + "learning_rate": 2.689561782445313e-07, + "loss": 0.8384456, + "num_input_tokens_seen": 300643835, + "step": 13938, + "time_per_iteration": 2.6146061420440674 + }, + { + "auxiliary_loss_clip": 0.01149189, + "auxiliary_loss_mlp": 0.01102279, + "balance_loss_clip": 1.00186515, + "balance_loss_mlp": 1.00052261, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.8543478610957107, + "language_loss": 0.7051751, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.7276898, + "num_input_tokens_seen": 300662500, + "step": 13939, + "time_per_iteration": 2.649301528930664 + }, + { + "auxiliary_loss_clip": 0.01115864, + "auxiliary_loss_mlp": 0.01101915, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.00063539, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 1.973940473741974, + "language_loss": 0.75709093, + "learning_rate": 2.6856616936428e-07, + "loss": 0.77926874, + "num_input_tokens_seen": 300681480, + "step": 13940, + "time_per_iteration": 2.699305534362793 + }, + { + "auxiliary_loss_clip": 0.01149726, + "auxiliary_loss_mlp": 0.01101598, + "balance_loss_clip": 1.00212836, + "balance_loss_mlp": 1.00050879, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.857244412216227, + "language_loss": 0.76524413, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.7877574, + "num_input_tokens_seen": 300699165, + "step": 13941, + "time_per_iteration": 2.6398813724517822 + }, + { + "auxiliary_loss_clip": 0.01099733, + "auxiliary_loss_mlp": 0.01102722, + "balance_loss_clip": 1.00159907, + "balance_loss_mlp": 1.0004878, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.1727219447262445, + "language_loss": 0.73700827, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75903285, + "num_input_tokens_seen": 300714615, + "step": 13942, + "time_per_iteration": 4.180186033248901 + }, + { + "auxiliary_loss_clip": 0.0110174, + "auxiliary_loss_mlp": 0.0110247, + "balance_loss_clip": 1.0016911, + "balance_loss_mlp": 1.00052261, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.556974629997717, + "language_loss": 0.79562509, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81766713, + "num_input_tokens_seen": 300734860, + "step": 13943, + "time_per_iteration": 2.7554023265838623 + }, + { + "auxiliary_loss_clip": 0.01101135, + "auxiliary_loss_mlp": 0.01100905, + "balance_loss_clip": 1.00166428, + "balance_loss_mlp": 1.00043583, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 2.1898901444065864, + "language_loss": 0.85212529, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87414569, + "num_input_tokens_seen": 300752735, + "step": 13944, + "time_per_iteration": 4.149780750274658 + }, + { + "auxiliary_loss_clip": 0.01142095, + "auxiliary_loss_mlp": 0.00745422, + "balance_loss_clip": 1.00076604, + "balance_loss_mlp": 1.00031972, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.634850592988262, + "language_loss": 0.50288266, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52175784, + "num_input_tokens_seen": 300820760, + "step": 13945, + "time_per_iteration": 3.3132753372192383 + }, + { + "auxiliary_loss_clip": 0.01103338, + "auxiliary_loss_mlp": 0.01100479, + "balance_loss_clip": 1.00175583, + "balance_loss_mlp": 1.00048602, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 1.5805221167072556, + "language_loss": 0.65120423, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67324233, + "num_input_tokens_seen": 300840025, + "step": 13946, + "time_per_iteration": 2.7286057472229004 + }, + { + "auxiliary_loss_clip": 0.01099578, + "auxiliary_loss_mlp": 0.01101636, + "balance_loss_clip": 1.00158548, + "balance_loss_mlp": 1.00054669, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.5524896879405266, + "language_loss": 0.67392933, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69594145, + "num_input_tokens_seen": 300860380, + "step": 13947, + "time_per_iteration": 2.7959518432617188 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01101769, + "balance_loss_clip": 1.00181234, + "balance_loss_mlp": 1.00039303, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.6187852782035832, + "language_loss": 0.70043433, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.72278285, + "num_input_tokens_seen": 300881895, + "step": 13948, + "time_per_iteration": 2.700084686279297 + }, + { + "auxiliary_loss_clip": 0.01130969, + "auxiliary_loss_mlp": 0.01099593, + "balance_loss_clip": 1.00171566, + "balance_loss_mlp": 1.00060141, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 1.7462415961894873, + "language_loss": 0.85198653, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.87429214, + "num_input_tokens_seen": 300901575, + "step": 13949, + "time_per_iteration": 2.733769655227661 + }, + { + "auxiliary_loss_clip": 0.01131158, + "auxiliary_loss_mlp": 0.0110053, + "balance_loss_clip": 1.0017885, + "balance_loss_mlp": 1.00048935, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 1.849325200359107, + "language_loss": 0.7074464, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72976327, + "num_input_tokens_seen": 300919735, + "step": 13950, + "time_per_iteration": 2.6330373287200928 + }, + { + "auxiliary_loss_clip": 0.01148427, + "auxiliary_loss_mlp": 0.01100753, + "balance_loss_clip": 1.00200415, + "balance_loss_mlp": 1.00052214, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 1.738331418057398, + "language_loss": 0.64658445, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66907626, + "num_input_tokens_seen": 300939150, + "step": 13951, + "time_per_iteration": 2.647094964981079 + }, + { + "auxiliary_loss_clip": 0.01148112, + "auxiliary_loss_mlp": 0.01100736, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.00040996, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.706307981274135, + "language_loss": 0.70570433, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72819281, + "num_input_tokens_seen": 300959730, + "step": 13952, + "time_per_iteration": 2.6092689037323 + }, + { + "auxiliary_loss_clip": 0.01147504, + "auxiliary_loss_mlp": 0.01101154, + "balance_loss_clip": 1.00179219, + "balance_loss_mlp": 1.00044572, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 2.3122851715735284, + "language_loss": 0.72704244, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.749529, + "num_input_tokens_seen": 300976120, + "step": 13953, + "time_per_iteration": 2.673593044281006 + }, + { + "auxiliary_loss_clip": 0.01085271, + "auxiliary_loss_mlp": 0.01101484, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00048947, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 1.8186525232451365, + "language_loss": 0.67840171, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70026922, + "num_input_tokens_seen": 300995080, + "step": 13954, + "time_per_iteration": 2.724668264389038 + }, + { + "auxiliary_loss_clip": 0.01130442, + "auxiliary_loss_mlp": 0.0110077, + "balance_loss_clip": 1.00188375, + "balance_loss_mlp": 1.00044394, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.7353725582164177, + "language_loss": 0.73277932, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75509143, + "num_input_tokens_seen": 301012920, + "step": 13955, + "time_per_iteration": 2.664386749267578 + }, + { + "auxiliary_loss_clip": 0.01085158, + "auxiliary_loss_mlp": 0.01101765, + "balance_loss_clip": 1.00162625, + "balance_loss_mlp": 1.0004847, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 2.7857211417157495, + "language_loss": 0.66378069, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68564993, + "num_input_tokens_seen": 301028875, + "step": 13956, + "time_per_iteration": 2.8194806575775146 + }, + { + "auxiliary_loss_clip": 0.01147636, + "auxiliary_loss_mlp": 0.01101977, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00050604, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 2.4730008100063468, + "language_loss": 0.7947005, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81719661, + "num_input_tokens_seen": 301050115, + "step": 13957, + "time_per_iteration": 2.807783603668213 + }, + { + "auxiliary_loss_clip": 0.01083171, + "auxiliary_loss_mlp": 0.01076076, + "balance_loss_clip": 1.00155449, + "balance_loss_mlp": 1.00016308, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7513189298457973, + "language_loss": 0.53362614, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55521864, + "num_input_tokens_seen": 301114155, + "step": 13958, + "time_per_iteration": 3.513976812362671 + }, + { + "auxiliary_loss_clip": 0.01147481, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_clip": 1.00181174, + "balance_loss_mlp": 1.00044954, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 2.193586928575335, + "language_loss": 0.72978783, + "learning_rate": 2.648741917459574e-07, + "loss": 0.7522732, + "num_input_tokens_seen": 301133150, + "step": 13959, + "time_per_iteration": 3.203298568725586 + }, + { + "auxiliary_loss_clip": 0.01116221, + "auxiliary_loss_mlp": 0.01101635, + "balance_loss_clip": 1.00180852, + "balance_loss_mlp": 1.00035453, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 1.6227387114727074, + "language_loss": 0.55152297, + "learning_rate": 2.646805346545169e-07, + "loss": 0.5737015, + "num_input_tokens_seen": 301153600, + "step": 13960, + "time_per_iteration": 2.847783327102661 + }, + { + "auxiliary_loss_clip": 0.01113058, + "auxiliary_loss_mlp": 0.01074898, + "balance_loss_clip": 1.00067663, + "balance_loss_mlp": 1.00013018, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.7910100753285818, + "language_loss": 0.60768926, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62956882, + "num_input_tokens_seen": 301214335, + "step": 13961, + "time_per_iteration": 3.3539206981658936 + }, + { + "auxiliary_loss_clip": 0.0110165, + "auxiliary_loss_mlp": 0.0110085, + "balance_loss_clip": 1.00153959, + "balance_loss_mlp": 1.00042772, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.268555586557364, + "language_loss": 0.68574172, + "learning_rate": 2.642934178894405e-07, + "loss": 0.70776671, + "num_input_tokens_seen": 301228960, + "step": 13962, + "time_per_iteration": 4.306610584259033 + }, + { + "auxiliary_loss_clip": 0.01116247, + "auxiliary_loss_mlp": 0.01101572, + "balance_loss_clip": 1.00169301, + "balance_loss_mlp": 1.00038695, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 1.872481754329432, + "language_loss": 0.7325508, + "learning_rate": 2.640999582304841e-07, + "loss": 0.75472897, + "num_input_tokens_seen": 301245875, + "step": 13963, + "time_per_iteration": 2.726269006729126 + }, + { + "auxiliary_loss_clip": 0.01132867, + "auxiliary_loss_mlp": 0.01100794, + "balance_loss_clip": 1.00176239, + "balance_loss_mlp": 1.00051534, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 1.631287013375027, + "language_loss": 0.76557952, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78791612, + "num_input_tokens_seen": 301265550, + "step": 13964, + "time_per_iteration": 2.7384018898010254 + }, + { + "auxiliary_loss_clip": 0.01130959, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_clip": 1.00169182, + "balance_loss_mlp": 1.00047934, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 2.2446208702711994, + "language_loss": 0.78432941, + "learning_rate": 2.637132363964161e-07, + "loss": 0.80665654, + "num_input_tokens_seen": 301282035, + "step": 13965, + "time_per_iteration": 2.926790952682495 + }, + { + "auxiliary_loss_clip": 0.01147798, + "auxiliary_loss_mlp": 0.01100515, + "balance_loss_clip": 1.00175881, + "balance_loss_mlp": 1.00056982, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 2.1053706567866173, + "language_loss": 0.65389824, + "learning_rate": 2.635199742359684e-07, + "loss": 0.67638135, + "num_input_tokens_seen": 301305210, + "step": 13966, + "time_per_iteration": 6.542032241821289 + }, + { + "auxiliary_loss_clip": 0.01131166, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_clip": 1.00174165, + "balance_loss_mlp": 1.00052214, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.7936278438489779, + "language_loss": 0.7430734, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76538974, + "num_input_tokens_seen": 301324885, + "step": 13967, + "time_per_iteration": 4.254976749420166 + }, + { + "auxiliary_loss_clip": 0.01130958, + "auxiliary_loss_mlp": 0.01100858, + "balance_loss_clip": 1.00169778, + "balance_loss_mlp": 1.00034094, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 2.0216612177231736, + "language_loss": 0.82789218, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85021037, + "num_input_tokens_seen": 301343070, + "step": 13968, + "time_per_iteration": 3.9567859172821045 + }, + { + "auxiliary_loss_clip": 0.01131438, + "auxiliary_loss_mlp": 0.01101073, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00046015, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 2.5801947663978124, + "language_loss": 0.77259934, + "learning_rate": 2.629405828689075e-07, + "loss": 0.7949245, + "num_input_tokens_seen": 301359280, + "step": 13969, + "time_per_iteration": 2.9047927856445312 + }, + { + "auxiliary_loss_clip": 0.01132947, + "auxiliary_loss_mlp": 0.01103285, + "balance_loss_clip": 1.00171661, + "balance_loss_mlp": 1.00038362, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 2.258306368973128, + "language_loss": 0.77216482, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79452711, + "num_input_tokens_seen": 301376465, + "step": 13970, + "time_per_iteration": 2.7761762142181396 + }, + { + "auxiliary_loss_clip": 0.01131128, + "auxiliary_loss_mlp": 0.01102272, + "balance_loss_clip": 1.00177491, + "balance_loss_mlp": 1.00051522, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 1.9315596479820518, + "language_loss": 0.72113407, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74346805, + "num_input_tokens_seen": 301396000, + "step": 13971, + "time_per_iteration": 2.7669143676757812 + }, + { + "auxiliary_loss_clip": 0.01132808, + "auxiliary_loss_mlp": 0.01101264, + "balance_loss_clip": 1.00175917, + "balance_loss_mlp": 1.00046039, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 1.8772067183211196, + "language_loss": 0.7717514, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79409212, + "num_input_tokens_seen": 301413160, + "step": 13972, + "time_per_iteration": 2.7105214595794678 + }, + { + "auxiliary_loss_clip": 0.01084152, + "auxiliary_loss_mlp": 0.01100306, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.00050449, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.395977206737369, + "language_loss": 0.68319196, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70503652, + "num_input_tokens_seen": 301433325, + "step": 13973, + "time_per_iteration": 2.915774345397949 + }, + { + "auxiliary_loss_clip": 0.01130711, + "auxiliary_loss_mlp": 0.01100945, + "balance_loss_clip": 1.001652, + "balance_loss_mlp": 1.00042725, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 2.0038316116259463, + "language_loss": 0.78245991, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80477655, + "num_input_tokens_seen": 301450265, + "step": 13974, + "time_per_iteration": 2.67917799949646 + }, + { + "auxiliary_loss_clip": 0.01147965, + "auxiliary_loss_mlp": 0.01101018, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.00050092, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.5444219900239888, + "language_loss": 0.72586232, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74835211, + "num_input_tokens_seen": 301470760, + "step": 13975, + "time_per_iteration": 2.6731064319610596 + }, + { + "auxiliary_loss_clip": 0.01134281, + "auxiliary_loss_mlp": 0.01101273, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.000422, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 1.7082523320051224, + "language_loss": 0.72347152, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74582702, + "num_input_tokens_seen": 301489425, + "step": 13976, + "time_per_iteration": 2.826291561126709 + }, + { + "auxiliary_loss_clip": 0.01164312, + "auxiliary_loss_mlp": 0.00747401, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.00058508, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.7365056062888684, + "language_loss": 0.72120392, + "learning_rate": 2.61398438016311e-07, + "loss": 0.7403211, + "num_input_tokens_seen": 301508885, + "step": 13977, + "time_per_iteration": 2.7067532539367676 + }, + { + "auxiliary_loss_clip": 0.01147838, + "auxiliary_loss_mlp": 0.01100468, + "balance_loss_clip": 1.00179338, + "balance_loss_mlp": 1.00042796, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 2.1758390509638064, + "language_loss": 0.68483448, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70731753, + "num_input_tokens_seen": 301533780, + "step": 13978, + "time_per_iteration": 2.7875113487243652 + }, + { + "auxiliary_loss_clip": 0.01118149, + "auxiliary_loss_mlp": 0.01099403, + "balance_loss_clip": 1.00176907, + "balance_loss_mlp": 1.00050652, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.8687453663658784, + "language_loss": 0.78035939, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80253488, + "num_input_tokens_seen": 301551775, + "step": 13979, + "time_per_iteration": 2.694533109664917 + }, + { + "auxiliary_loss_clip": 0.01147985, + "auxiliary_loss_mlp": 0.01101004, + "balance_loss_clip": 1.0018748, + "balance_loss_mlp": 1.00039101, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 2.1307212352831324, + "language_loss": 0.78288341, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80537331, + "num_input_tokens_seen": 301570495, + "step": 13980, + "time_per_iteration": 4.136633396148682 + }, + { + "auxiliary_loss_clip": 0.01116087, + "auxiliary_loss_mlp": 0.01101139, + "balance_loss_clip": 1.0017817, + "balance_loss_mlp": 1.00047827, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.6851577629755574, + "language_loss": 0.86818355, + "learning_rate": 2.606289476268757e-07, + "loss": 0.89035583, + "num_input_tokens_seen": 301591705, + "step": 13981, + "time_per_iteration": 4.2086968421936035 + }, + { + "auxiliary_loss_clip": 0.01149481, + "auxiliary_loss_mlp": 0.01101426, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.0005753, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 1.9881926287002, + "language_loss": 0.68190515, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.70441413, + "num_input_tokens_seen": 301611670, + "step": 13982, + "time_per_iteration": 2.6615774631500244 + }, + { + "auxiliary_loss_clip": 0.01101236, + "auxiliary_loss_mlp": 0.01101384, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00058031, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 1.6591629075281888, + "language_loss": 0.68562663, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70765281, + "num_input_tokens_seen": 301632540, + "step": 13983, + "time_per_iteration": 2.856217622756958 + }, + { + "auxiliary_loss_clip": 0.01117957, + "auxiliary_loss_mlp": 0.01101832, + "balance_loss_clip": 1.00169408, + "balance_loss_mlp": 1.00064659, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.996142114732626, + "language_loss": 0.78970361, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.81190157, + "num_input_tokens_seen": 301651480, + "step": 13984, + "time_per_iteration": 2.7450292110443115 + }, + { + "auxiliary_loss_clip": 0.01147527, + "auxiliary_loss_mlp": 0.01101044, + "balance_loss_clip": 1.00178957, + "balance_loss_mlp": 1.00043154, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 3.702840772861175, + "language_loss": 0.60093963, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62342536, + "num_input_tokens_seen": 301670010, + "step": 13985, + "time_per_iteration": 2.6585679054260254 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01101396, + "balance_loss_clip": 1.00161576, + "balance_loss_mlp": 1.00049758, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 2.826114596117275, + "language_loss": 0.81777775, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.83982193, + "num_input_tokens_seen": 301689785, + "step": 13986, + "time_per_iteration": 2.737313747406006 + }, + { + "auxiliary_loss_clip": 0.01131157, + "auxiliary_loss_mlp": 0.00747371, + "balance_loss_clip": 1.00179005, + "balance_loss_mlp": 1.00043178, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 1.7252092884271135, + "language_loss": 0.65887547, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67766076, + "num_input_tokens_seen": 301712225, + "step": 13987, + "time_per_iteration": 2.738807201385498 + }, + { + "auxiliary_loss_clip": 0.0116435, + "auxiliary_loss_mlp": 0.00747522, + "balance_loss_clip": 1.00190878, + "balance_loss_mlp": 1.00064468, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 2.3455709589149567, + "language_loss": 0.67583168, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69495046, + "num_input_tokens_seen": 301730955, + "step": 13988, + "time_per_iteration": 2.6528940200805664 + }, + { + "auxiliary_loss_clip": 0.0114812, + "auxiliary_loss_mlp": 0.01102303, + "balance_loss_clip": 1.00200665, + "balance_loss_mlp": 1.00045085, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 2.2845840466920504, + "language_loss": 0.81187689, + "learning_rate": 2.590931332560622e-07, + "loss": 0.8343811, + "num_input_tokens_seen": 301746930, + "step": 13989, + "time_per_iteration": 2.6112122535705566 + }, + { + "auxiliary_loss_clip": 0.01149735, + "auxiliary_loss_mlp": 0.01101355, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00040841, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.8457479400846148, + "language_loss": 0.75488043, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77739131, + "num_input_tokens_seen": 301766945, + "step": 13990, + "time_per_iteration": 2.6776394844055176 + }, + { + "auxiliary_loss_clip": 0.01147808, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_clip": 1.00178862, + "balance_loss_mlp": 1.00054669, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.6087916316507156, + "language_loss": 0.80615526, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.8286382, + "num_input_tokens_seen": 301785460, + "step": 13991, + "time_per_iteration": 2.702349901199341 + }, + { + "auxiliary_loss_clip": 0.01113849, + "auxiliary_loss_mlp": 0.01100305, + "balance_loss_clip": 1.00158191, + "balance_loss_mlp": 1.0004549, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 2.9391663350094457, + "language_loss": 0.70882124, + "learning_rate": 2.585182919204105e-07, + "loss": 0.73096275, + "num_input_tokens_seen": 301804180, + "step": 13992, + "time_per_iteration": 2.7019660472869873 + }, + { + "auxiliary_loss_clip": 0.01117926, + "auxiliary_loss_mlp": 0.01101046, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00038576, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 1.6520381661785295, + "language_loss": 0.76636517, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78855491, + "num_input_tokens_seen": 301823670, + "step": 13993, + "time_per_iteration": 2.7055835723876953 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01102891, + "balance_loss_clip": 1.00187421, + "balance_loss_mlp": 1.0004667, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 2.22129373350975, + "language_loss": 0.74126697, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76379395, + "num_input_tokens_seen": 301845890, + "step": 13994, + "time_per_iteration": 2.6956698894500732 + }, + { + "auxiliary_loss_clip": 0.01149465, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_clip": 1.0019393, + "balance_loss_mlp": 1.00042295, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.6832983007248232, + "language_loss": 0.59431148, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61681366, + "num_input_tokens_seen": 301863985, + "step": 13995, + "time_per_iteration": 2.582990884780884 + }, + { + "auxiliary_loss_clip": 0.01147, + "auxiliary_loss_mlp": 0.01100243, + "balance_loss_clip": 1.00178885, + "balance_loss_mlp": 1.00048852, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.5800045442122201, + "language_loss": 0.71642941, + "learning_rate": 2.577527613603163e-07, + "loss": 0.73890185, + "num_input_tokens_seen": 301882765, + "step": 13996, + "time_per_iteration": 2.8188552856445312 + }, + { + "auxiliary_loss_clip": 0.01132829, + "auxiliary_loss_mlp": 0.01100193, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.00039101, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 2.363220837742483, + "language_loss": 0.64352059, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66585076, + "num_input_tokens_seen": 301902720, + "step": 13997, + "time_per_iteration": 2.769526958465576 + }, + { + "auxiliary_loss_clip": 0.01131852, + "auxiliary_loss_mlp": 0.011023, + "balance_loss_clip": 1.00175643, + "balance_loss_mlp": 1.00044715, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 2.3751169247419432, + "language_loss": 0.82298446, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84532601, + "num_input_tokens_seen": 301921245, + "step": 13998, + "time_per_iteration": 2.685152292251587 + }, + { + "auxiliary_loss_clip": 0.01147531, + "auxiliary_loss_mlp": 0.00747493, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00050795, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 1.8942721127355109, + "language_loss": 0.80424714, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82319736, + "num_input_tokens_seen": 301942320, + "step": 13999, + "time_per_iteration": 2.6835784912109375 + }, + { + "auxiliary_loss_clip": 0.01149845, + "auxiliary_loss_mlp": 0.01101455, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00046086, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 1.9443136003262818, + "language_loss": 0.66784513, + "learning_rate": 2.569882878592096e-07, + "loss": 0.69035804, + "num_input_tokens_seen": 301963110, + "step": 14000, + "time_per_iteration": 4.199470281600952 + }, + { + "auxiliary_loss_clip": 0.01149249, + "auxiliary_loss_mlp": 0.01102535, + "balance_loss_clip": 1.00187552, + "balance_loss_mlp": 1.00058699, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.4044694701243046, + "language_loss": 0.79414165, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81665945, + "num_input_tokens_seen": 301984915, + "step": 14001, + "time_per_iteration": 2.670106887817383 + }, + { + "auxiliary_loss_clip": 0.01083433, + "auxiliary_loss_mlp": 0.01099926, + "balance_loss_clip": 1.00165176, + "balance_loss_mlp": 1.000458, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.7588172367961, + "language_loss": 0.78418589, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80601954, + "num_input_tokens_seen": 302004095, + "step": 14002, + "time_per_iteration": 2.8407232761383057 + }, + { + "auxiliary_loss_clip": 0.01097443, + "auxiliary_loss_mlp": 0.00747345, + "balance_loss_clip": 1.0015347, + "balance_loss_mlp": 1.0004406, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.3758930155072675, + "language_loss": 0.78099775, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.79944563, + "num_input_tokens_seen": 302027250, + "step": 14003, + "time_per_iteration": 2.8019936084747314 + }, + { + "auxiliary_loss_clip": 0.01131299, + "auxiliary_loss_mlp": 0.01101512, + "balance_loss_clip": 1.00184023, + "balance_loss_mlp": 1.00042224, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 1.742399456062498, + "language_loss": 0.65630281, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67863089, + "num_input_tokens_seen": 302046950, + "step": 14004, + "time_per_iteration": 4.062957048416138 + }, + { + "auxiliary_loss_clip": 0.01149792, + "auxiliary_loss_mlp": 0.01102105, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00053835, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 2.0257263568510995, + "language_loss": 0.75906146, + "learning_rate": 2.560341831785724e-07, + "loss": 0.78158039, + "num_input_tokens_seen": 302065470, + "step": 14005, + "time_per_iteration": 2.6914501190185547 + }, + { + "auxiliary_loss_clip": 0.01116363, + "auxiliary_loss_mlp": 0.00747385, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00049186, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.921086106371412, + "language_loss": 0.77232885, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79096627, + "num_input_tokens_seen": 302083190, + "step": 14006, + "time_per_iteration": 2.7045977115631104 + }, + { + "auxiliary_loss_clip": 0.01147799, + "auxiliary_loss_mlp": 0.01101144, + "balance_loss_clip": 1.00176907, + "balance_loss_mlp": 1.00062656, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.8698828254796156, + "language_loss": 0.77160877, + "learning_rate": 2.556530041751932e-07, + "loss": 0.7940982, + "num_input_tokens_seen": 302098820, + "step": 14007, + "time_per_iteration": 2.5756378173828125 + }, + { + "auxiliary_loss_clip": 0.01131926, + "auxiliary_loss_mlp": 0.01101331, + "balance_loss_clip": 1.00196159, + "balance_loss_mlp": 1.00043273, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 1.8453337824184437, + "language_loss": 0.65644276, + "learning_rate": 2.554625138886102e-07, + "loss": 0.67877531, + "num_input_tokens_seen": 302117075, + "step": 14008, + "time_per_iteration": 2.679619073867798 + }, + { + "auxiliary_loss_clip": 0.01141797, + "auxiliary_loss_mlp": 0.01074851, + "balance_loss_clip": 1.00074589, + "balance_loss_mlp": 1.00008285, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7076330803498174, + "language_loss": 0.5692482, + "learning_rate": 2.552720897550631e-07, + "loss": 0.59141469, + "num_input_tokens_seen": 302179735, + "step": 14009, + "time_per_iteration": 3.241926670074463 + }, + { + "auxiliary_loss_clip": 0.01083412, + "auxiliary_loss_mlp": 0.01099959, + "balance_loss_clip": 1.00173557, + "balance_loss_mlp": 1.00053883, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.5791059395985767, + "language_loss": 0.77723205, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.79906577, + "num_input_tokens_seen": 302202055, + "step": 14010, + "time_per_iteration": 2.7926321029663086 + }, + { + "auxiliary_loss_clip": 0.01164215, + "auxiliary_loss_mlp": 0.01102774, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00054049, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.5986114432994563, + "language_loss": 0.72585785, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74852777, + "num_input_tokens_seen": 302221360, + "step": 14011, + "time_per_iteration": 2.6056673526763916 + }, + { + "auxiliary_loss_clip": 0.01147834, + "auxiliary_loss_mlp": 0.01101433, + "balance_loss_clip": 1.00181329, + "balance_loss_mlp": 1.00062931, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 1.8300631878673053, + "language_loss": 0.84321868, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86571133, + "num_input_tokens_seen": 302240715, + "step": 14012, + "time_per_iteration": 2.663071393966675 + }, + { + "auxiliary_loss_clip": 0.01164005, + "auxiliary_loss_mlp": 0.0109994, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00051939, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 2.315377572764299, + "language_loss": 0.68030316, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70294261, + "num_input_tokens_seen": 302260950, + "step": 14013, + "time_per_iteration": 2.6779720783233643 + }, + { + "auxiliary_loss_clip": 0.01164487, + "auxiliary_loss_mlp": 0.01101954, + "balance_loss_clip": 1.00195169, + "balance_loss_mlp": 1.0004828, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 2.310555022332564, + "language_loss": 0.78844804, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.81111246, + "num_input_tokens_seen": 302277500, + "step": 14014, + "time_per_iteration": 2.507563829421997 + }, + { + "auxiliary_loss_clip": 0.01117971, + "auxiliary_loss_mlp": 0.00747272, + "balance_loss_clip": 1.00164914, + "balance_loss_mlp": 1.00054336, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.881517632979826, + "language_loss": 0.67413449, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69278693, + "num_input_tokens_seen": 302297930, + "step": 14015, + "time_per_iteration": 2.7490274906158447 + }, + { + "auxiliary_loss_clip": 0.01164372, + "auxiliary_loss_mlp": 0.0110115, + "balance_loss_clip": 1.0020144, + "balance_loss_mlp": 1.00034618, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 2.2055830958494766, + "language_loss": 0.75728589, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.77994108, + "num_input_tokens_seen": 302315735, + "step": 14016, + "time_per_iteration": 2.6674163341522217 + }, + { + "auxiliary_loss_clip": 0.01133034, + "auxiliary_loss_mlp": 0.01101149, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.0005368, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 1.8494629104845004, + "language_loss": 0.79040188, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81274372, + "num_input_tokens_seen": 302332790, + "step": 14017, + "time_per_iteration": 2.6752941608428955 + }, + { + "auxiliary_loss_clip": 0.01130918, + "auxiliary_loss_mlp": 0.01100314, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.00051165, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 3.184961609397497, + "language_loss": 0.62806374, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.65037602, + "num_input_tokens_seen": 302346490, + "step": 14018, + "time_per_iteration": 5.277268171310425 + }, + { + "auxiliary_loss_clip": 0.01147649, + "auxiliary_loss_mlp": 0.01100546, + "balance_loss_clip": 1.00179279, + "balance_loss_mlp": 1.00050545, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 1.845526037696844, + "language_loss": 0.7898705, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81235242, + "num_input_tokens_seen": 302363235, + "step": 14019, + "time_per_iteration": 4.037928104400635 + }, + { + "auxiliary_loss_clip": 0.01118185, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.0004828, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.7547901384251292, + "language_loss": 0.78263021, + "learning_rate": 2.531817924498265e-07, + "loss": 0.80482578, + "num_input_tokens_seen": 302383270, + "step": 14020, + "time_per_iteration": 2.774981737136841 + }, + { + "auxiliary_loss_clip": 0.01147816, + "auxiliary_loss_mlp": 0.0110128, + "balance_loss_clip": 1.00173318, + "balance_loss_mlp": 1.00047636, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.6454768286065722, + "language_loss": 0.71608311, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73857415, + "num_input_tokens_seen": 302401355, + "step": 14021, + "time_per_iteration": 2.6875672340393066 + }, + { + "auxiliary_loss_clip": 0.01132164, + "auxiliary_loss_mlp": 0.01102451, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00059855, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 4.018755521336973, + "language_loss": 0.69343227, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71577841, + "num_input_tokens_seen": 302419515, + "step": 14022, + "time_per_iteration": 2.739513397216797 + }, + { + "auxiliary_loss_clip": 0.01086713, + "auxiliary_loss_mlp": 0.01102254, + "balance_loss_clip": 1.00169122, + "balance_loss_mlp": 1.00049734, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 1.954113589777309, + "language_loss": 0.72364545, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74553514, + "num_input_tokens_seen": 302438280, + "step": 14023, + "time_per_iteration": 2.8521978855133057 + }, + { + "auxiliary_loss_clip": 0.01147006, + "auxiliary_loss_mlp": 0.01101308, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.00050473, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.5067181802333556, + "language_loss": 0.66629791, + "learning_rate": 2.524236710204559e-07, + "loss": 0.68878102, + "num_input_tokens_seen": 302460860, + "step": 14024, + "time_per_iteration": 2.6939971446990967 + }, + { + "auxiliary_loss_clip": 0.01147493, + "auxiliary_loss_mlp": 0.01100726, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00039935, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 2.5504226218222277, + "language_loss": 0.80743241, + "learning_rate": 2.522343063158261e-07, + "loss": 0.82991469, + "num_input_tokens_seen": 302476980, + "step": 14025, + "time_per_iteration": 2.583313226699829 + }, + { + "auxiliary_loss_clip": 0.01147406, + "auxiliary_loss_mlp": 0.01100003, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00048697, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.458202614155336, + "language_loss": 0.77511615, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.79759026, + "num_input_tokens_seen": 302496380, + "step": 14026, + "time_per_iteration": 2.643601894378662 + }, + { + "auxiliary_loss_clip": 0.01134969, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00070953, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.4669916569658448, + "language_loss": 0.82883579, + "learning_rate": 2.518557757400945e-07, + "loss": 0.85119581, + "num_input_tokens_seen": 302516845, + "step": 14027, + "time_per_iteration": 2.6832275390625 + }, + { + "auxiliary_loss_clip": 0.01132635, + "auxiliary_loss_mlp": 0.011012, + "balance_loss_clip": 1.00169444, + "balance_loss_mlp": 1.00039697, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.4302695653879682, + "language_loss": 0.56844991, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.59078825, + "num_input_tokens_seen": 302538865, + "step": 14028, + "time_per_iteration": 2.8345582485198975 + }, + { + "auxiliary_loss_clip": 0.01132918, + "auxiliary_loss_mlp": 0.01101853, + "balance_loss_clip": 1.00198293, + "balance_loss_mlp": 1.00047708, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 1.7625814989865498, + "language_loss": 0.63871539, + "learning_rate": 2.51477510323578e-07, + "loss": 0.66106308, + "num_input_tokens_seen": 302557970, + "step": 14029, + "time_per_iteration": 2.6883647441864014 + }, + { + "auxiliary_loss_clip": 0.01164215, + "auxiliary_loss_mlp": 0.01100489, + "balance_loss_clip": 1.00200105, + "balance_loss_mlp": 1.00054359, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.5441796945856971, + "language_loss": 0.75103509, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77368212, + "num_input_tokens_seen": 302578915, + "step": 14030, + "time_per_iteration": 2.6611549854278564 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01100275, + "balance_loss_clip": 1.00174379, + "balance_loss_mlp": 1.00042546, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 2.0794051791190227, + "language_loss": 0.83571124, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85804045, + "num_input_tokens_seen": 302596300, + "step": 14031, + "time_per_iteration": 2.6457293033599854 + }, + { + "auxiliary_loss_clip": 0.01132228, + "auxiliary_loss_mlp": 0.01100625, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00044131, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 1.9662945859194805, + "language_loss": 0.80113876, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82346737, + "num_input_tokens_seen": 302614975, + "step": 14032, + "time_per_iteration": 2.676593542098999 + }, + { + "auxiliary_loss_clip": 0.01116225, + "auxiliary_loss_mlp": 0.01101087, + "balance_loss_clip": 1.00167561, + "balance_loss_mlp": 1.00037861, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.5144344499761409, + "language_loss": 0.75562853, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77780163, + "num_input_tokens_seen": 302636415, + "step": 14033, + "time_per_iteration": 2.769116163253784 + }, + { + "auxiliary_loss_clip": 0.01116035, + "auxiliary_loss_mlp": 0.01101204, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00059128, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.936750223191056, + "language_loss": 0.83461177, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85678411, + "num_input_tokens_seen": 302653605, + "step": 14034, + "time_per_iteration": 2.705110788345337 + }, + { + "auxiliary_loss_clip": 0.01118132, + "auxiliary_loss_mlp": 0.01101219, + "balance_loss_clip": 1.0017035, + "balance_loss_mlp": 1.0004158, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 2.1349849583852274, + "language_loss": 0.78327847, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80547202, + "num_input_tokens_seen": 302673965, + "step": 14035, + "time_per_iteration": 2.7669527530670166 + }, + { + "auxiliary_loss_clip": 0.01149487, + "auxiliary_loss_mlp": 0.0110158, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00049043, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 2.5408335785133853, + "language_loss": 0.72132242, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74383312, + "num_input_tokens_seen": 302695560, + "step": 14036, + "time_per_iteration": 2.7116951942443848 + }, + { + "auxiliary_loss_clip": 0.01164036, + "auxiliary_loss_mlp": 0.0109926, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00041151, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 1.910051667988117, + "language_loss": 0.69637966, + "learning_rate": 2.49967101396557e-07, + "loss": 0.71901262, + "num_input_tokens_seen": 302713480, + "step": 14037, + "time_per_iteration": 4.203657150268555 + }, + { + "auxiliary_loss_clip": 0.011641, + "auxiliary_loss_mlp": 0.01100192, + "balance_loss_clip": 1.00184488, + "balance_loss_mlp": 1.00043797, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 2.2147358056721225, + "language_loss": 0.68892121, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.71156412, + "num_input_tokens_seen": 302736860, + "step": 14038, + "time_per_iteration": 2.7435555458068848 + }, + { + "auxiliary_loss_clip": 0.01102704, + "auxiliary_loss_mlp": 0.01101033, + "balance_loss_clip": 1.001876, + "balance_loss_mlp": 1.00051582, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.7202241740497726, + "language_loss": 0.76443666, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78647405, + "num_input_tokens_seen": 302757745, + "step": 14039, + "time_per_iteration": 2.784548282623291 + }, + { + "auxiliary_loss_clip": 0.01164507, + "auxiliary_loss_mlp": 0.01102144, + "balance_loss_clip": 1.00193965, + "balance_loss_mlp": 1.00048268, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 2.1116276113986934, + "language_loss": 0.79358709, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81625354, + "num_input_tokens_seen": 302774885, + "step": 14040, + "time_per_iteration": 2.6135475635528564 + }, + { + "auxiliary_loss_clip": 0.01116651, + "auxiliary_loss_mlp": 0.01100828, + "balance_loss_clip": 1.00173283, + "balance_loss_mlp": 1.00040603, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 2.3306855160890168, + "language_loss": 0.69573379, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71790862, + "num_input_tokens_seen": 302791035, + "step": 14041, + "time_per_iteration": 2.6854803562164307 + }, + { + "auxiliary_loss_clip": 0.01133083, + "auxiliary_loss_mlp": 0.01101814, + "balance_loss_clip": 1.00179148, + "balance_loss_mlp": 1.00053406, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 1.784621449201415, + "language_loss": 0.68983525, + "learning_rate": 2.490252523307341e-07, + "loss": 0.71218425, + "num_input_tokens_seen": 302808650, + "step": 14042, + "time_per_iteration": 4.186027526855469 + }, + { + "auxiliary_loss_clip": 0.01130655, + "auxiliary_loss_mlp": 0.01100865, + "balance_loss_clip": 1.00171852, + "balance_loss_mlp": 1.00053883, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 1.9044890912383006, + "language_loss": 0.74769759, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.7700128, + "num_input_tokens_seen": 302824605, + "step": 14043, + "time_per_iteration": 2.7024059295654297 + }, + { + "auxiliary_loss_clip": 0.0116425, + "auxiliary_loss_mlp": 0.00747444, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.0005281, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 2.0826684429894824, + "language_loss": 0.72564811, + "learning_rate": 2.486489774343865e-07, + "loss": 0.74476504, + "num_input_tokens_seen": 302840170, + "step": 14044, + "time_per_iteration": 2.550630807876587 + }, + { + "auxiliary_loss_clip": 0.01131887, + "auxiliary_loss_mlp": 0.01099884, + "balance_loss_clip": 1.00171626, + "balance_loss_mlp": 1.00041556, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.6869637771385912, + "language_loss": 0.751441, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77375865, + "num_input_tokens_seen": 302858320, + "step": 14045, + "time_per_iteration": 2.736109972000122 + }, + { + "auxiliary_loss_clip": 0.01134234, + "auxiliary_loss_mlp": 0.00747378, + "balance_loss_clip": 1.00172055, + "balance_loss_mlp": 1.0005312, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 1.7192005598708295, + "language_loss": 0.78617936, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80499548, + "num_input_tokens_seen": 302875255, + "step": 14046, + "time_per_iteration": 2.654186248779297 + }, + { + "auxiliary_loss_clip": 0.01132771, + "auxiliary_loss_mlp": 0.01101146, + "balance_loss_clip": 1.00182855, + "balance_loss_mlp": 1.00043821, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 1.8542352791517995, + "language_loss": 0.78066385, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80300295, + "num_input_tokens_seen": 302894690, + "step": 14047, + "time_per_iteration": 2.730473279953003 + }, + { + "auxiliary_loss_clip": 0.01133153, + "auxiliary_loss_mlp": 0.01100749, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.0004704, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 2.5896956803101143, + "language_loss": 0.72025335, + "learning_rate": 2.478972246355935e-07, + "loss": 0.74259233, + "num_input_tokens_seen": 302912405, + "step": 14048, + "time_per_iteration": 2.728104591369629 + }, + { + "auxiliary_loss_clip": 0.01050698, + "auxiliary_loss_mlp": 0.0110068, + "balance_loss_clip": 1.0014236, + "balance_loss_mlp": 1.00063968, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.8323897908903073, + "language_loss": 0.73437309, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75588685, + "num_input_tokens_seen": 302932525, + "step": 14049, + "time_per_iteration": 3.1499903202056885 + }, + { + "auxiliary_loss_clip": 0.01141819, + "auxiliary_loss_mlp": 0.00745382, + "balance_loss_clip": 1.00074244, + "balance_loss_mlp": 1.00014901, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.799700658258744, + "language_loss": 0.60601836, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62489039, + "num_input_tokens_seen": 302991285, + "step": 14050, + "time_per_iteration": 3.4564414024353027 + }, + { + "auxiliary_loss_clip": 0.01134561, + "auxiliary_loss_mlp": 0.00747418, + "balance_loss_clip": 1.00172794, + "balance_loss_mlp": 1.00053108, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 2.237157858665365, + "language_loss": 0.72349966, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74231941, + "num_input_tokens_seen": 303009515, + "step": 14051, + "time_per_iteration": 2.727835178375244 + }, + { + "auxiliary_loss_clip": 0.01147491, + "auxiliary_loss_mlp": 0.01100452, + "balance_loss_clip": 1.00191116, + "balance_loss_mlp": 1.00045896, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 1.7809925157890591, + "language_loss": 0.74975586, + "learning_rate": 2.471465348753547e-07, + "loss": 0.77223533, + "num_input_tokens_seen": 303026905, + "step": 14052, + "time_per_iteration": 2.677288770675659 + }, + { + "auxiliary_loss_clip": 0.01133161, + "auxiliary_loss_mlp": 0.01099563, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00047612, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 7.741382994755861, + "language_loss": 0.73492068, + "learning_rate": 2.469590285884575e-07, + "loss": 0.75724792, + "num_input_tokens_seen": 303045245, + "step": 14053, + "time_per_iteration": 2.709324836730957 + }, + { + "auxiliary_loss_clip": 0.0114962, + "auxiliary_loss_mlp": 0.01100606, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00042284, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 2.029332982873086, + "language_loss": 0.7393167, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76181895, + "num_input_tokens_seen": 303065205, + "step": 14054, + "time_per_iteration": 2.771850347518921 + }, + { + "auxiliary_loss_clip": 0.01149372, + "auxiliary_loss_mlp": 0.01101584, + "balance_loss_clip": 1.00182843, + "balance_loss_mlp": 1.00049472, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.694954097859403, + "language_loss": 0.78274643, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.80525595, + "num_input_tokens_seen": 303088250, + "step": 14055, + "time_per_iteration": 2.7997052669525146 + }, + { + "auxiliary_loss_clip": 0.01147565, + "auxiliary_loss_mlp": 0.01100486, + "balance_loss_clip": 1.00182641, + "balance_loss_mlp": 1.00054073, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.7810366040113388, + "language_loss": 0.73053396, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75301445, + "num_input_tokens_seen": 303109280, + "step": 14056, + "time_per_iteration": 4.287774324417114 + }, + { + "auxiliary_loss_clip": 0.01149845, + "auxiliary_loss_mlp": 0.01102156, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00058961, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.3948065518249733, + "language_loss": 0.67886311, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.70138311, + "num_input_tokens_seen": 303126075, + "step": 14057, + "time_per_iteration": 4.603338718414307 + }, + { + "auxiliary_loss_clip": 0.01101521, + "auxiliary_loss_mlp": 0.01101269, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00037074, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.6044648860914872, + "language_loss": 0.77709997, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79912788, + "num_input_tokens_seen": 303146920, + "step": 14058, + "time_per_iteration": 2.9066107273101807 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01101354, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00055087, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.6384228317208038, + "language_loss": 0.69854867, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72120637, + "num_input_tokens_seen": 303167885, + "step": 14059, + "time_per_iteration": 2.6340582370758057 + }, + { + "auxiliary_loss_clip": 0.01164494, + "auxiliary_loss_mlp": 0.01102694, + "balance_loss_clip": 1.00191009, + "balance_loss_mlp": 1.00055575, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 2.091672716053271, + "language_loss": 0.57318974, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59586167, + "num_input_tokens_seen": 303185000, + "step": 14060, + "time_per_iteration": 2.655251979827881 + }, + { + "auxiliary_loss_clip": 0.01133116, + "auxiliary_loss_mlp": 0.01102917, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00068343, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 7.91543438863581, + "language_loss": 0.75681174, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77917206, + "num_input_tokens_seen": 303205210, + "step": 14061, + "time_per_iteration": 2.692922592163086 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_clip": 1.0019784, + "balance_loss_mlp": 1.00040126, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 3.7942888471222336, + "language_loss": 0.71370018, + "learning_rate": 2.452744642558013e-07, + "loss": 0.73606211, + "num_input_tokens_seen": 303224655, + "step": 14062, + "time_per_iteration": 2.6724557876586914 + }, + { + "auxiliary_loss_clip": 0.01096343, + "auxiliary_loss_mlp": 0.01075636, + "balance_loss_clip": 1.00137711, + "balance_loss_mlp": 1.00010526, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6373214461215757, + "language_loss": 0.52638972, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54810953, + "num_input_tokens_seen": 303289645, + "step": 14063, + "time_per_iteration": 3.3487331867218018 + }, + { + "auxiliary_loss_clip": 0.01116434, + "auxiliary_loss_mlp": 0.01099775, + "balance_loss_clip": 1.00184417, + "balance_loss_mlp": 1.00049734, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 2.294046551380162, + "language_loss": 0.8237803, + "learning_rate": 2.449008483773378e-07, + "loss": 0.84594232, + "num_input_tokens_seen": 303308350, + "step": 14064, + "time_per_iteration": 2.6961216926574707 + }, + { + "auxiliary_loss_clip": 0.01147871, + "auxiliary_loss_mlp": 0.01101659, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00056982, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 2.8370310208135936, + "language_loss": 0.72468668, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74718201, + "num_input_tokens_seen": 303325230, + "step": 14065, + "time_per_iteration": 2.676748752593994 + }, + { + "auxiliary_loss_clip": 0.01116127, + "auxiliary_loss_mlp": 0.01099014, + "balance_loss_clip": 1.00175011, + "balance_loss_mlp": 1.00049901, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.675773460814667, + "language_loss": 0.77520657, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79735798, + "num_input_tokens_seen": 303345810, + "step": 14066, + "time_per_iteration": 2.834489345550537 + }, + { + "auxiliary_loss_clip": 0.01117275, + "auxiliary_loss_mlp": 0.01101851, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00047553, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.6967156984657428, + "language_loss": 0.69977349, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72196472, + "num_input_tokens_seen": 303365140, + "step": 14067, + "time_per_iteration": 2.7649476528167725 + }, + { + "auxiliary_loss_clip": 0.01117159, + "auxiliary_loss_mlp": 0.01100316, + "balance_loss_clip": 1.00173938, + "balance_loss_mlp": 1.00051403, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 1.7199336104592569, + "language_loss": 0.71048903, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.73266387, + "num_input_tokens_seen": 303386150, + "step": 14068, + "time_per_iteration": 2.7990572452545166 + }, + { + "auxiliary_loss_clip": 0.01112116, + "auxiliary_loss_mlp": 0.01074886, + "balance_loss_clip": 1.00099778, + "balance_loss_mlp": 1.00011826, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6948135715365859, + "language_loss": 0.60480207, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62667209, + "num_input_tokens_seen": 303453770, + "step": 14069, + "time_per_iteration": 3.3698079586029053 + }, + { + "auxiliary_loss_clip": 0.01133171, + "auxiliary_loss_mlp": 0.01100733, + "balance_loss_clip": 1.00170851, + "balance_loss_mlp": 1.00050187, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.559593908996745, + "language_loss": 0.74395978, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76629883, + "num_input_tokens_seen": 303474520, + "step": 14070, + "time_per_iteration": 2.6741106510162354 + }, + { + "auxiliary_loss_clip": 0.01099592, + "auxiliary_loss_mlp": 0.01100565, + "balance_loss_clip": 1.00164557, + "balance_loss_mlp": 1.00042892, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.7012640705150237, + "language_loss": 0.67075604, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69275761, + "num_input_tokens_seen": 303497345, + "step": 14071, + "time_per_iteration": 2.892444372177124 + }, + { + "auxiliary_loss_clip": 0.01141552, + "auxiliary_loss_mlp": 0.00745405, + "balance_loss_clip": 1.00072575, + "balance_loss_mlp": 1.00024116, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7345420831183459, + "language_loss": 0.61011648, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.62898612, + "num_input_tokens_seen": 303554890, + "step": 14072, + "time_per_iteration": 3.0514564514160156 + }, + { + "auxiliary_loss_clip": 0.01114966, + "auxiliary_loss_mlp": 0.01101335, + "balance_loss_clip": 1.00166368, + "balance_loss_mlp": 1.00043654, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 1.7016908054036444, + "language_loss": 0.72324312, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74540615, + "num_input_tokens_seen": 303574380, + "step": 14073, + "time_per_iteration": 2.7520880699157715 + }, + { + "auxiliary_loss_clip": 0.01130551, + "auxiliary_loss_mlp": 0.011026, + "balance_loss_clip": 1.00171363, + "balance_loss_mlp": 1.0004611, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 2.260862935494901, + "language_loss": 0.78195751, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80428898, + "num_input_tokens_seen": 303594910, + "step": 14074, + "time_per_iteration": 2.799987554550171 + }, + { + "auxiliary_loss_clip": 0.01147424, + "auxiliary_loss_mlp": 0.0110081, + "balance_loss_clip": 1.00183868, + "balance_loss_mlp": 1.00048304, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 2.1746759800631814, + "language_loss": 0.75529397, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.7777763, + "num_input_tokens_seen": 303613520, + "step": 14075, + "time_per_iteration": 4.2613584995269775 + }, + { + "auxiliary_loss_clip": 0.01130837, + "auxiliary_loss_mlp": 0.01100536, + "balance_loss_clip": 1.00179446, + "balance_loss_mlp": 1.00040042, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 3.0281815982786515, + "language_loss": 0.73280025, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.75511396, + "num_input_tokens_seen": 303631225, + "step": 14076, + "time_per_iteration": 2.7393901348114014 + }, + { + "auxiliary_loss_clip": 0.01133035, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00044763, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 2.038718033190789, + "language_loss": 0.78076172, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.80311406, + "num_input_tokens_seen": 303649175, + "step": 14077, + "time_per_iteration": 2.723360300064087 + }, + { + "auxiliary_loss_clip": 0.01116155, + "auxiliary_loss_mlp": 0.01102211, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00045371, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 2.3396818179264867, + "language_loss": 0.75540012, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77758384, + "num_input_tokens_seen": 303665915, + "step": 14078, + "time_per_iteration": 2.7463932037353516 + }, + { + "auxiliary_loss_clip": 0.01101129, + "auxiliary_loss_mlp": 0.0110099, + "balance_loss_clip": 1.00166059, + "balance_loss_mlp": 1.00037754, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.164425691016578, + "language_loss": 0.85313416, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87515533, + "num_input_tokens_seen": 303679985, + "step": 14079, + "time_per_iteration": 4.1879026889801025 + }, + { + "auxiliary_loss_clip": 0.01117965, + "auxiliary_loss_mlp": 0.01102848, + "balance_loss_clip": 1.00176859, + "balance_loss_mlp": 1.0006144, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 2.5207615440802207, + "language_loss": 0.59167045, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61387861, + "num_input_tokens_seen": 303698470, + "step": 14080, + "time_per_iteration": 2.769622802734375 + }, + { + "auxiliary_loss_clip": 0.01131661, + "auxiliary_loss_mlp": 0.01102316, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00036836, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.379600280587999, + "language_loss": 0.6590237, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.68136346, + "num_input_tokens_seen": 303716415, + "step": 14081, + "time_per_iteration": 2.635307788848877 + }, + { + "auxiliary_loss_clip": 0.0114944, + "auxiliary_loss_mlp": 0.01101633, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00044823, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 2.2001576066063935, + "language_loss": 0.73424631, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75675702, + "num_input_tokens_seen": 303734490, + "step": 14082, + "time_per_iteration": 2.6659319400787354 + }, + { + "auxiliary_loss_clip": 0.01101383, + "auxiliary_loss_mlp": 0.01100892, + "balance_loss_clip": 1.00152159, + "balance_loss_mlp": 1.00046968, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 1.889343145574525, + "language_loss": 0.76372588, + "learning_rate": 2.413647829539809e-07, + "loss": 0.7857486, + "num_input_tokens_seen": 303752310, + "step": 14083, + "time_per_iteration": 2.712954521179199 + }, + { + "auxiliary_loss_clip": 0.01100008, + "auxiliary_loss_mlp": 0.0110246, + "balance_loss_clip": 1.00171983, + "balance_loss_mlp": 1.00051236, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.9240625723206666, + "language_loss": 0.65853381, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68055844, + "num_input_tokens_seen": 303776065, + "step": 14084, + "time_per_iteration": 2.816143274307251 + }, + { + "auxiliary_loss_clip": 0.0109899, + "auxiliary_loss_mlp": 0.01100673, + "balance_loss_clip": 1.0016613, + "balance_loss_mlp": 1.00053728, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 1.9768831570032293, + "language_loss": 0.69685352, + "learning_rate": 2.409939651426938e-07, + "loss": 0.71885014, + "num_input_tokens_seen": 303793500, + "step": 14085, + "time_per_iteration": 2.6855335235595703 + }, + { + "auxiliary_loss_clip": 0.01100224, + "auxiliary_loss_mlp": 0.01100581, + "balance_loss_clip": 1.0015924, + "balance_loss_mlp": 1.00034976, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.5694043558464008, + "language_loss": 0.70958227, + "learning_rate": 2.408086562860634e-07, + "loss": 0.73159033, + "num_input_tokens_seen": 303814835, + "step": 14086, + "time_per_iteration": 2.8076815605163574 + }, + { + "auxiliary_loss_clip": 0.01149819, + "auxiliary_loss_mlp": 0.01100707, + "balance_loss_clip": 1.00192618, + "balance_loss_mlp": 1.00057173, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 2.44259217989961, + "language_loss": 0.74578667, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.76829195, + "num_input_tokens_seen": 303834505, + "step": 14087, + "time_per_iteration": 2.6586859226226807 + }, + { + "auxiliary_loss_clip": 0.01131498, + "auxiliary_loss_mlp": 0.0110108, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00046706, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.4916488402547659, + "language_loss": 0.7397784, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76210421, + "num_input_tokens_seen": 303855050, + "step": 14088, + "time_per_iteration": 2.683051347732544 + }, + { + "auxiliary_loss_clip": 0.01147484, + "auxiliary_loss_mlp": 0.01101517, + "balance_loss_clip": 1.00180447, + "balance_loss_mlp": 1.00052345, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 1.9617111919887498, + "language_loss": 0.71659255, + "learning_rate": 2.402531299965387e-07, + "loss": 0.73908257, + "num_input_tokens_seen": 303875635, + "step": 14089, + "time_per_iteration": 2.632840156555176 + }, + { + "auxiliary_loss_clip": 0.01164205, + "auxiliary_loss_mlp": 0.01101383, + "balance_loss_clip": 1.00199962, + "balance_loss_mlp": 1.00053191, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.488388698932953, + "language_loss": 0.79333806, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81599396, + "num_input_tokens_seen": 303896750, + "step": 14090, + "time_per_iteration": 2.64680552482605 + }, + { + "auxiliary_loss_clip": 0.01084971, + "auxiliary_loss_mlp": 0.01101856, + "balance_loss_clip": 1.00166571, + "balance_loss_mlp": 1.0005759, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 2.286011145098148, + "language_loss": 0.76881659, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.79068482, + "num_input_tokens_seen": 303915435, + "step": 14091, + "time_per_iteration": 2.8017938137054443 + }, + { + "auxiliary_loss_clip": 0.01158193, + "auxiliary_loss_mlp": 0.01074781, + "balance_loss_clip": 1.00075197, + "balance_loss_mlp": 1.00001264, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.8273399498356707, + "language_loss": 0.59400982, + "learning_rate": 2.396982042749982e-07, + "loss": 0.6163395, + "num_input_tokens_seen": 303977245, + "step": 14092, + "time_per_iteration": 3.2134063243865967 + }, + { + "auxiliary_loss_clip": 0.01132769, + "auxiliary_loss_mlp": 0.01101635, + "balance_loss_clip": 1.0017761, + "balance_loss_mlp": 1.00044966, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 2.0663942767672054, + "language_loss": 0.7017321, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72407615, + "num_input_tokens_seen": 303996055, + "step": 14093, + "time_per_iteration": 4.284192085266113 + }, + { + "auxiliary_loss_clip": 0.01164054, + "auxiliary_loss_mlp": 0.01099982, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00046575, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 1.9212564077163188, + "language_loss": 0.83476669, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85740703, + "num_input_tokens_seen": 304012205, + "step": 14094, + "time_per_iteration": 2.588348627090454 + }, + { + "auxiliary_loss_clip": 0.01148616, + "auxiliary_loss_mlp": 0.01100262, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00050783, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 1.7427123061191836, + "language_loss": 0.71444023, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73692894, + "num_input_tokens_seen": 304033475, + "step": 14095, + "time_per_iteration": 4.091628551483154 + }, + { + "auxiliary_loss_clip": 0.01147624, + "auxiliary_loss_mlp": 0.00747372, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00059628, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 1.7736260675215534, + "language_loss": 0.80819106, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.82714105, + "num_input_tokens_seen": 304051845, + "step": 14096, + "time_per_iteration": 2.6544435024261475 + }, + { + "auxiliary_loss_clip": 0.01147633, + "auxiliary_loss_mlp": 0.01101211, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00040817, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.8969535490985188, + "language_loss": 0.77445471, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79694313, + "num_input_tokens_seen": 304069965, + "step": 14097, + "time_per_iteration": 2.70768141746521 + }, + { + "auxiliary_loss_clip": 0.01117339, + "auxiliary_loss_mlp": 0.01100803, + "balance_loss_clip": 1.00175011, + "balance_loss_mlp": 1.00042903, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.7235891112980828, + "language_loss": 0.80538946, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82757092, + "num_input_tokens_seen": 304086805, + "step": 14098, + "time_per_iteration": 2.7167906761169434 + }, + { + "auxiliary_loss_clip": 0.01149861, + "auxiliary_loss_mlp": 0.00747446, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.00057769, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 2.2305004425040336, + "language_loss": 0.71535945, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.73433256, + "num_input_tokens_seen": 304105865, + "step": 14099, + "time_per_iteration": 2.657973289489746 + }, + { + "auxiliary_loss_clip": 0.01149654, + "auxiliary_loss_mlp": 0.01101338, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00048661, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 2.269806424353073, + "language_loss": 0.63792551, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.66043544, + "num_input_tokens_seen": 304128300, + "step": 14100, + "time_per_iteration": 2.7137691974639893 + }, + { + "auxiliary_loss_clip": 0.01147609, + "auxiliary_loss_mlp": 0.01101896, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00042486, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 2.372380622202238, + "language_loss": 0.73808038, + "learning_rate": 2.380370324111085e-07, + "loss": 0.76057541, + "num_input_tokens_seen": 304143695, + "step": 14101, + "time_per_iteration": 2.6554203033447266 + }, + { + "auxiliary_loss_clip": 0.01149147, + "auxiliary_loss_mlp": 0.01101294, + "balance_loss_clip": 1.00180387, + "balance_loss_mlp": 1.00044322, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 1.5330958604708445, + "language_loss": 0.71244228, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73494661, + "num_input_tokens_seen": 304165800, + "step": 14102, + "time_per_iteration": 2.6753172874450684 + }, + { + "auxiliary_loss_clip": 0.01133315, + "auxiliary_loss_mlp": 0.01101759, + "balance_loss_clip": 1.00189865, + "balance_loss_mlp": 1.00038362, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 1.9366782515075116, + "language_loss": 0.8181355, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.84048617, + "num_input_tokens_seen": 304182910, + "step": 14103, + "time_per_iteration": 2.673264980316162 + }, + { + "auxiliary_loss_clip": 0.01164315, + "auxiliary_loss_mlp": 0.01101487, + "balance_loss_clip": 1.00204754, + "balance_loss_mlp": 1.00054049, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 2.1909951716747513, + "language_loss": 0.78359282, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80625081, + "num_input_tokens_seen": 304200175, + "step": 14104, + "time_per_iteration": 2.564610481262207 + }, + { + "auxiliary_loss_clip": 0.01147677, + "auxiliary_loss_mlp": 0.01102076, + "balance_loss_clip": 1.00188255, + "balance_loss_mlp": 1.0005095, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 1.89288504235094, + "language_loss": 0.78869647, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.811194, + "num_input_tokens_seen": 304217775, + "step": 14105, + "time_per_iteration": 2.6371068954467773 + }, + { + "auxiliary_loss_clip": 0.01133107, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_clip": 1.0018034, + "balance_loss_mlp": 1.00043333, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 1.8190267688743942, + "language_loss": 0.50164384, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.52399778, + "num_input_tokens_seen": 304235760, + "step": 14106, + "time_per_iteration": 2.6567609310150146 + }, + { + "auxiliary_loss_clip": 0.0111405, + "auxiliary_loss_mlp": 0.01101365, + "balance_loss_clip": 1.00161219, + "balance_loss_mlp": 1.00056183, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 1.993218483971104, + "language_loss": 0.75622904, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.7783832, + "num_input_tokens_seen": 304253985, + "step": 14107, + "time_per_iteration": 2.7186200618743896 + }, + { + "auxiliary_loss_clip": 0.01115744, + "auxiliary_loss_mlp": 0.01100273, + "balance_loss_clip": 1.00171089, + "balance_loss_mlp": 1.00037611, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.5197246827243271, + "language_loss": 0.7326414, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75480157, + "num_input_tokens_seen": 304276785, + "step": 14108, + "time_per_iteration": 2.822084903717041 + }, + { + "auxiliary_loss_clip": 0.01164096, + "auxiliary_loss_mlp": 0.01100938, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00051641, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.803953916436362, + "language_loss": 0.72390783, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74655813, + "num_input_tokens_seen": 304296310, + "step": 14109, + "time_per_iteration": 2.660423755645752 + }, + { + "auxiliary_loss_clip": 0.01083258, + "auxiliary_loss_mlp": 0.01101605, + "balance_loss_clip": 1.00181353, + "balance_loss_mlp": 1.00051594, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 2.3729106295034246, + "language_loss": 0.73810816, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.75995678, + "num_input_tokens_seen": 304311715, + "step": 14110, + "time_per_iteration": 2.735302686691284 + }, + { + "auxiliary_loss_clip": 0.01083516, + "auxiliary_loss_mlp": 0.01100413, + "balance_loss_clip": 1.00157237, + "balance_loss_mlp": 1.00051582, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.6030191347757792, + "language_loss": 0.76426089, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78610027, + "num_input_tokens_seen": 304331910, + "step": 14111, + "time_per_iteration": 2.8191967010498047 + }, + { + "auxiliary_loss_clip": 0.0116418, + "auxiliary_loss_mlp": 0.01099616, + "balance_loss_clip": 1.00194263, + "balance_loss_mlp": 1.00052905, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 1.5458706557913462, + "language_loss": 0.67297006, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69560796, + "num_input_tokens_seen": 304351405, + "step": 14112, + "time_per_iteration": 2.6777842044830322 + }, + { + "auxiliary_loss_clip": 0.01149505, + "auxiliary_loss_mlp": 0.01101496, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00050223, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.4214331480953715, + "language_loss": 0.7355479, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75805795, + "num_input_tokens_seen": 304372935, + "step": 14113, + "time_per_iteration": 4.274285078048706 + }, + { + "auxiliary_loss_clip": 0.01114339, + "auxiliary_loss_mlp": 0.01100498, + "balance_loss_clip": 1.00180948, + "balance_loss_mlp": 1.00036263, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 1.8531349342331274, + "language_loss": 0.66727674, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.68942511, + "num_input_tokens_seen": 304393070, + "step": 14114, + "time_per_iteration": 2.784705877304077 + }, + { + "auxiliary_loss_clip": 0.01164396, + "auxiliary_loss_mlp": 0.01101134, + "balance_loss_clip": 1.00193048, + "balance_loss_mlp": 1.00056934, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 2.013533844037306, + "language_loss": 0.78704792, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.80970323, + "num_input_tokens_seen": 304411195, + "step": 14115, + "time_per_iteration": 2.6255035400390625 + }, + { + "auxiliary_loss_clip": 0.01164408, + "auxiliary_loss_mlp": 0.01101294, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00049043, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 2.549118787621117, + "language_loss": 0.79321861, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81587565, + "num_input_tokens_seen": 304429425, + "step": 14116, + "time_per_iteration": 2.6046948432922363 + }, + { + "auxiliary_loss_clip": 0.01147771, + "auxiliary_loss_mlp": 0.01101505, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00051057, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 1.9657833818477242, + "language_loss": 0.68489158, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70738435, + "num_input_tokens_seen": 304447460, + "step": 14117, + "time_per_iteration": 4.049837112426758 + }, + { + "auxiliary_loss_clip": 0.01134955, + "auxiliary_loss_mlp": 0.01101241, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00039017, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 2.014804629550899, + "language_loss": 0.65492362, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.67728555, + "num_input_tokens_seen": 304468230, + "step": 14118, + "time_per_iteration": 2.68485689163208 + }, + { + "auxiliary_loss_clip": 0.01116369, + "auxiliary_loss_mlp": 0.01100743, + "balance_loss_clip": 1.00171697, + "balance_loss_mlp": 1.00060701, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.8293907880383917, + "language_loss": 0.72989571, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75206685, + "num_input_tokens_seen": 304484860, + "step": 14119, + "time_per_iteration": 2.793731212615967 + }, + { + "auxiliary_loss_clip": 0.01115948, + "auxiliary_loss_mlp": 0.01101699, + "balance_loss_clip": 1.00174379, + "balance_loss_mlp": 1.00041902, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 1.874079912408962, + "language_loss": 0.77800739, + "learning_rate": 2.345478926864446e-07, + "loss": 0.80018389, + "num_input_tokens_seen": 304503575, + "step": 14120, + "time_per_iteration": 2.708754062652588 + }, + { + "auxiliary_loss_clip": 0.01148134, + "auxiliary_loss_mlp": 0.01101852, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00047648, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 1.6386565998670695, + "language_loss": 0.7604574, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.78295732, + "num_input_tokens_seen": 304525005, + "step": 14121, + "time_per_iteration": 2.63748836517334 + }, + { + "auxiliary_loss_clip": 0.01108676, + "auxiliary_loss_mlp": 0.0107458, + "balance_loss_clip": 1.00089574, + "balance_loss_mlp": 1.00019383, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.805173022958172, + "language_loss": 0.60093886, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62277144, + "num_input_tokens_seen": 304585220, + "step": 14122, + "time_per_iteration": 3.278427839279175 + }, + { + "auxiliary_loss_clip": 0.01147678, + "auxiliary_loss_mlp": 0.01100699, + "balance_loss_clip": 1.0018934, + "balance_loss_mlp": 1.00056314, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 1.733705615661335, + "language_loss": 0.7997762, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.8222599, + "num_input_tokens_seen": 304604665, + "step": 14123, + "time_per_iteration": 2.6420388221740723 + }, + { + "auxiliary_loss_clip": 0.01147317, + "auxiliary_loss_mlp": 0.01100164, + "balance_loss_clip": 1.00194359, + "balance_loss_mlp": 1.00055289, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 2.3809794392973433, + "language_loss": 0.83279842, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85527325, + "num_input_tokens_seen": 304620600, + "step": 14124, + "time_per_iteration": 2.6776511669158936 + }, + { + "auxiliary_loss_clip": 0.0110008, + "auxiliary_loss_mlp": 0.01101243, + "balance_loss_clip": 1.00171185, + "balance_loss_mlp": 1.00043941, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 2.011014439428748, + "language_loss": 0.71621418, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.73822743, + "num_input_tokens_seen": 304639540, + "step": 14125, + "time_per_iteration": 2.7359063625335693 + }, + { + "auxiliary_loss_clip": 0.01164428, + "auxiliary_loss_mlp": 0.01102995, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.0005703, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.5744189630107457, + "language_loss": 0.73527372, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75794792, + "num_input_tokens_seen": 304660595, + "step": 14126, + "time_per_iteration": 2.6478137969970703 + }, + { + "auxiliary_loss_clip": 0.01101147, + "auxiliary_loss_mlp": 0.01100789, + "balance_loss_clip": 1.00165033, + "balance_loss_mlp": 1.00046241, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.944658666764551, + "language_loss": 0.67530322, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69732261, + "num_input_tokens_seen": 304679580, + "step": 14127, + "time_per_iteration": 2.7221720218658447 + }, + { + "auxiliary_loss_clip": 0.01117861, + "auxiliary_loss_mlp": 0.00747578, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00063491, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 2.616057577914846, + "language_loss": 0.69186163, + "learning_rate": 2.330860086502211e-07, + "loss": 0.71051604, + "num_input_tokens_seen": 304698385, + "step": 14128, + "time_per_iteration": 2.8545758724212646 + }, + { + "auxiliary_loss_clip": 0.01135062, + "auxiliary_loss_mlp": 0.01100814, + "balance_loss_clip": 1.00189984, + "balance_loss_mlp": 1.00058317, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 2.444447962997155, + "language_loss": 0.77783102, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.80018973, + "num_input_tokens_seen": 304715430, + "step": 14129, + "time_per_iteration": 2.7082784175872803 + }, + { + "auxiliary_loss_clip": 0.01083204, + "auxiliary_loss_mlp": 0.01101306, + "balance_loss_clip": 1.0016706, + "balance_loss_mlp": 1.00050247, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 1.7782295507346768, + "language_loss": 0.68213892, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70398402, + "num_input_tokens_seen": 304734345, + "step": 14130, + "time_per_iteration": 2.875997304916382 + }, + { + "auxiliary_loss_clip": 0.01146948, + "auxiliary_loss_mlp": 0.0110096, + "balance_loss_clip": 1.00173259, + "balance_loss_mlp": 1.00044239, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 4.201813362742759, + "language_loss": 0.71457255, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73705161, + "num_input_tokens_seen": 304755030, + "step": 14131, + "time_per_iteration": 6.743203163146973 + }, + { + "auxiliary_loss_clip": 0.01115665, + "auxiliary_loss_mlp": 0.01100541, + "balance_loss_clip": 1.00153697, + "balance_loss_mlp": 1.00040472, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.8806966094917021, + "language_loss": 0.68382853, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70599055, + "num_input_tokens_seen": 304774320, + "step": 14132, + "time_per_iteration": 2.713653087615967 + }, + { + "auxiliary_loss_clip": 0.01164203, + "auxiliary_loss_mlp": 0.01100765, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00048614, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.7601958676758385, + "language_loss": 0.7004621, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72311181, + "num_input_tokens_seen": 304795355, + "step": 14133, + "time_per_iteration": 2.596151828765869 + }, + { + "auxiliary_loss_clip": 0.01098099, + "auxiliary_loss_mlp": 0.00745481, + "balance_loss_clip": 1.00090516, + "balance_loss_mlp": 1.00030065, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.7277809941553959, + "language_loss": 0.57604909, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59448493, + "num_input_tokens_seen": 304863915, + "step": 14134, + "time_per_iteration": 3.4059371948242188 + }, + { + "auxiliary_loss_clip": 0.01116548, + "auxiliary_loss_mlp": 0.01101429, + "balance_loss_clip": 1.00173783, + "balance_loss_mlp": 1.00033963, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 2.4620379714596945, + "language_loss": 0.78950149, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81168127, + "num_input_tokens_seen": 304881555, + "step": 14135, + "time_per_iteration": 2.8145105838775635 + }, + { + "auxiliary_loss_clip": 0.0114784, + "auxiliary_loss_mlp": 0.01102155, + "balance_loss_clip": 1.00164127, + "balance_loss_mlp": 1.00049353, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 1.7697839902284933, + "language_loss": 0.63445228, + "learning_rate": 2.316284127127044e-07, + "loss": 0.6569522, + "num_input_tokens_seen": 304898760, + "step": 14136, + "time_per_iteration": 2.559666156768799 + }, + { + "auxiliary_loss_clip": 0.01148005, + "auxiliary_loss_mlp": 0.01102129, + "balance_loss_clip": 1.00188577, + "balance_loss_mlp": 1.00056267, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 2.1668088805054526, + "language_loss": 0.84031868, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.86281997, + "num_input_tokens_seen": 304915465, + "step": 14137, + "time_per_iteration": 2.621422529220581 + }, + { + "auxiliary_loss_clip": 0.01118071, + "auxiliary_loss_mlp": 0.01100176, + "balance_loss_clip": 1.00173461, + "balance_loss_mlp": 1.00051701, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 2.340290298314954, + "language_loss": 0.79236603, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.81454849, + "num_input_tokens_seen": 304933190, + "step": 14138, + "time_per_iteration": 2.74820876121521 + }, + { + "auxiliary_loss_clip": 0.01147561, + "auxiliary_loss_mlp": 0.0110132, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00061178, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.8798895442073682, + "language_loss": 0.64470613, + "learning_rate": 2.310829204839073e-07, + "loss": 0.6671949, + "num_input_tokens_seen": 304951110, + "step": 14139, + "time_per_iteration": 2.643718719482422 + }, + { + "auxiliary_loss_clip": 0.01114704, + "auxiliary_loss_mlp": 0.01101522, + "balance_loss_clip": 1.00168717, + "balance_loss_mlp": 1.00038481, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 2.480820080358728, + "language_loss": 0.70704126, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72920346, + "num_input_tokens_seen": 304969095, + "step": 14140, + "time_per_iteration": 2.6527273654937744 + }, + { + "auxiliary_loss_clip": 0.01117099, + "auxiliary_loss_mlp": 0.01102593, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00055015, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 2.3701232042417844, + "language_loss": 0.63958621, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.6617831, + "num_input_tokens_seen": 304989315, + "step": 14141, + "time_per_iteration": 2.7463347911834717 + }, + { + "auxiliary_loss_clip": 0.01116157, + "auxiliary_loss_mlp": 0.0110094, + "balance_loss_clip": 1.00179958, + "balance_loss_mlp": 1.00047028, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.6622950960737533, + "language_loss": 0.70923978, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.73141074, + "num_input_tokens_seen": 305011020, + "step": 14142, + "time_per_iteration": 2.8075525760650635 + }, + { + "auxiliary_loss_clip": 0.01101755, + "auxiliary_loss_mlp": 0.0110101, + "balance_loss_clip": 1.00160587, + "balance_loss_mlp": 1.00039732, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 7.750164657056955, + "language_loss": 0.65333408, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67536175, + "num_input_tokens_seen": 305033550, + "step": 14143, + "time_per_iteration": 2.7738258838653564 + }, + { + "auxiliary_loss_clip": 0.01115441, + "auxiliary_loss_mlp": 0.0074734, + "balance_loss_clip": 1.00162756, + "balance_loss_mlp": 1.00040627, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 1.8535207575039, + "language_loss": 0.68210596, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.70073378, + "num_input_tokens_seen": 305052885, + "step": 14144, + "time_per_iteration": 2.714246988296509 + }, + { + "auxiliary_loss_clip": 0.01083545, + "auxiliary_loss_mlp": 0.01100104, + "balance_loss_clip": 1.00173318, + "balance_loss_mlp": 1.00054097, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 2.0764966062715775, + "language_loss": 0.64927602, + "learning_rate": 2.299937473050777e-07, + "loss": 0.67111248, + "num_input_tokens_seen": 305071995, + "step": 14145, + "time_per_iteration": 2.705258846282959 + }, + { + "auxiliary_loss_clip": 0.01134584, + "auxiliary_loss_mlp": 0.01100219, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00046444, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.710427270501196, + "language_loss": 0.85406649, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.87641454, + "num_input_tokens_seen": 305090190, + "step": 14146, + "time_per_iteration": 2.6754045486450195 + }, + { + "auxiliary_loss_clip": 0.01164169, + "auxiliary_loss_mlp": 0.0110032, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00037491, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 1.9672577330227188, + "language_loss": 0.83710968, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85975456, + "num_input_tokens_seen": 305109355, + "step": 14147, + "time_per_iteration": 2.5477278232574463 + }, + { + "auxiliary_loss_clip": 0.01149786, + "auxiliary_loss_mlp": 0.01101353, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00045419, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 2.900392611526223, + "language_loss": 0.85149211, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87400347, + "num_input_tokens_seen": 305124165, + "step": 14148, + "time_per_iteration": 2.5476741790771484 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.0110127, + "balance_loss_clip": 1.0018754, + "balance_loss_mlp": 1.00041914, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 2.2746114637102677, + "language_loss": 0.71506208, + "learning_rate": 2.292689741370204e-07, + "loss": 0.73740494, + "num_input_tokens_seen": 305143940, + "step": 14149, + "time_per_iteration": 2.625136137008667 + }, + { + "auxiliary_loss_clip": 0.01130343, + "auxiliary_loss_mlp": 0.01101371, + "balance_loss_clip": 1.00180173, + "balance_loss_mlp": 1.00047207, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.745482892837506, + "language_loss": 0.76112682, + "learning_rate": 2.290879486935804e-07, + "loss": 0.78344393, + "num_input_tokens_seen": 305163505, + "step": 14150, + "time_per_iteration": 4.188263654708862 + }, + { + "auxiliary_loss_clip": 0.01118297, + "auxiliary_loss_mlp": 0.0110105, + "balance_loss_clip": 1.00180161, + "balance_loss_mlp": 1.00062799, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.7443913849402546, + "language_loss": 0.72074419, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74293768, + "num_input_tokens_seen": 305182325, + "step": 14151, + "time_per_iteration": 2.6901142597198486 + }, + { + "auxiliary_loss_clip": 0.01079243, + "auxiliary_loss_mlp": 0.01074892, + "balance_loss_clip": 1.00072587, + "balance_loss_mlp": 1.00012445, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8861723175646622, + "language_loss": 0.59653425, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61807561, + "num_input_tokens_seen": 305230775, + "step": 14152, + "time_per_iteration": 3.1074793338775635 + }, + { + "auxiliary_loss_clip": 0.0112719, + "auxiliary_loss_mlp": 0.01074739, + "balance_loss_clip": 1.00081897, + "balance_loss_mlp": 0.99997073, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.7701042018256089, + "language_loss": 0.61201572, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63403499, + "num_input_tokens_seen": 305296000, + "step": 14153, + "time_per_iteration": 3.441038131713867 + }, + { + "auxiliary_loss_clip": 0.01149012, + "auxiliary_loss_mlp": 0.01101111, + "balance_loss_clip": 1.00178647, + "balance_loss_mlp": 1.00045061, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 2.0416897726479633, + "language_loss": 0.80732083, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.82982206, + "num_input_tokens_seen": 305314705, + "step": 14154, + "time_per_iteration": 2.619466781616211 + }, + { + "auxiliary_loss_clip": 0.0111447, + "auxiliary_loss_mlp": 0.01099642, + "balance_loss_clip": 1.00155079, + "balance_loss_mlp": 1.00041258, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 3.0180501040418357, + "language_loss": 0.7970227, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81916386, + "num_input_tokens_seen": 305333870, + "step": 14155, + "time_per_iteration": 4.107757329940796 + }, + { + "auxiliary_loss_clip": 0.01114451, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_clip": 1.0015738, + "balance_loss_mlp": 1.00045538, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 2.1791726352335457, + "language_loss": 0.7032994, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.7254622, + "num_input_tokens_seen": 305352780, + "step": 14156, + "time_per_iteration": 2.7122137546539307 + }, + { + "auxiliary_loss_clip": 0.01114264, + "auxiliary_loss_mlp": 0.01100011, + "balance_loss_clip": 1.0016712, + "balance_loss_mlp": 1.00044739, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 1.6793648277785653, + "language_loss": 0.73524225, + "learning_rate": 2.278226512621386e-07, + "loss": 0.75738496, + "num_input_tokens_seen": 305371370, + "step": 14157, + "time_per_iteration": 2.7071444988250732 + }, + { + "auxiliary_loss_clip": 0.01067964, + "auxiliary_loss_mlp": 0.01099361, + "balance_loss_clip": 1.00164938, + "balance_loss_mlp": 1.00046539, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 1.9345514925884444, + "language_loss": 0.79181647, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.81348968, + "num_input_tokens_seen": 305387955, + "step": 14158, + "time_per_iteration": 2.91357684135437 + }, + { + "auxiliary_loss_clip": 0.01149326, + "auxiliary_loss_mlp": 0.0110088, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.0004586, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 1.9195755631008813, + "language_loss": 0.787274, + "learning_rate": 2.27461742417828e-07, + "loss": 0.80977613, + "num_input_tokens_seen": 305406285, + "step": 14159, + "time_per_iteration": 3.0247597694396973 + }, + { + "auxiliary_loss_clip": 0.01134156, + "auxiliary_loss_mlp": 0.01101323, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00061476, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 1.7209388099283962, + "language_loss": 0.7122885, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73464328, + "num_input_tokens_seen": 305424500, + "step": 14160, + "time_per_iteration": 2.6377387046813965 + }, + { + "auxiliary_loss_clip": 0.01149393, + "auxiliary_loss_mlp": 0.01103163, + "balance_loss_clip": 1.00194252, + "balance_loss_mlp": 1.00045252, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 2.1424634103834266, + "language_loss": 0.70375824, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72628379, + "num_input_tokens_seen": 305442990, + "step": 14161, + "time_per_iteration": 2.7811503410339355 + }, + { + "auxiliary_loss_clip": 0.01149463, + "auxiliary_loss_mlp": 0.01101634, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.00035357, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 2.0999277654857176, + "language_loss": 0.77734983, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.79986072, + "num_input_tokens_seen": 305463065, + "step": 14162, + "time_per_iteration": 2.6560778617858887 + }, + { + "auxiliary_loss_clip": 0.01149643, + "auxiliary_loss_mlp": 0.01101121, + "balance_loss_clip": 1.00194991, + "balance_loss_mlp": 1.00050855, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 2.6616721974416913, + "language_loss": 0.76637208, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.78887975, + "num_input_tokens_seen": 305489070, + "step": 14163, + "time_per_iteration": 2.7577714920043945 + }, + { + "auxiliary_loss_clip": 0.0112645, + "auxiliary_loss_mlp": 0.01074447, + "balance_loss_clip": 1.00091004, + "balance_loss_mlp": 1.00005996, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.6983709803444321, + "language_loss": 0.55038178, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57239074, + "num_input_tokens_seen": 305551490, + "step": 14164, + "time_per_iteration": 3.2641875743865967 + }, + { + "auxiliary_loss_clip": 0.01149023, + "auxiliary_loss_mlp": 0.01100686, + "balance_loss_clip": 1.0018518, + "balance_loss_mlp": 1.00050271, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 1.9431990369611882, + "language_loss": 0.72388148, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.7463786, + "num_input_tokens_seen": 305570535, + "step": 14165, + "time_per_iteration": 2.5972769260406494 + }, + { + "auxiliary_loss_clip": 0.01116482, + "auxiliary_loss_mlp": 0.01100056, + "balance_loss_clip": 1.00174534, + "balance_loss_mlp": 1.00054026, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 2.333852063428745, + "language_loss": 0.67530537, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69747066, + "num_input_tokens_seen": 305590800, + "step": 14166, + "time_per_iteration": 2.6664209365844727 + }, + { + "auxiliary_loss_clip": 0.01134451, + "auxiliary_loss_mlp": 0.01100708, + "balance_loss_clip": 1.00174975, + "balance_loss_mlp": 1.00052452, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 2.4483333514864296, + "language_loss": 0.73852849, + "learning_rate": 2.260207961805125e-07, + "loss": 0.76088011, + "num_input_tokens_seen": 305609495, + "step": 14167, + "time_per_iteration": 2.65824294090271 + }, + { + "auxiliary_loss_clip": 0.01164262, + "auxiliary_loss_mlp": 0.01101343, + "balance_loss_clip": 1.00188792, + "balance_loss_mlp": 1.00044441, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.9432666020884435, + "language_loss": 0.80515838, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82781446, + "num_input_tokens_seen": 305629420, + "step": 14168, + "time_per_iteration": 5.126105785369873 + }, + { + "auxiliary_loss_clip": 0.01164209, + "auxiliary_loss_mlp": 0.01100234, + "balance_loss_clip": 1.00184429, + "balance_loss_mlp": 1.00043225, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 1.8333490416642608, + "language_loss": 0.76161432, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78425872, + "num_input_tokens_seen": 305649835, + "step": 14169, + "time_per_iteration": 4.079454660415649 + }, + { + "auxiliary_loss_clip": 0.01164468, + "auxiliary_loss_mlp": 0.01101929, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00045776, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 2.1792962565067953, + "language_loss": 0.63638365, + "learning_rate": 2.254815511000452e-07, + "loss": 0.6590476, + "num_input_tokens_seen": 305668840, + "step": 14170, + "time_per_iteration": 2.596888542175293 + }, + { + "auxiliary_loss_clip": 0.01148211, + "auxiliary_loss_mlp": 0.01100432, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00034344, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.0659541366939083, + "language_loss": 0.86525893, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88774538, + "num_input_tokens_seen": 305686955, + "step": 14171, + "time_per_iteration": 2.7539727687835693 + }, + { + "auxiliary_loss_clip": 0.01149843, + "auxiliary_loss_mlp": 0.01101605, + "balance_loss_clip": 1.00196779, + "balance_loss_mlp": 1.00042021, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.7152483739299336, + "language_loss": 0.54350972, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.56602418, + "num_input_tokens_seen": 305706290, + "step": 14172, + "time_per_iteration": 2.647831916809082 + }, + { + "auxiliary_loss_clip": 0.01132667, + "auxiliary_loss_mlp": 0.01099621, + "balance_loss_clip": 1.00179255, + "balance_loss_mlp": 1.00048649, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 3.511614170529204, + "language_loss": 0.69825149, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.72057438, + "num_input_tokens_seen": 305723835, + "step": 14173, + "time_per_iteration": 2.6954901218414307 + }, + { + "auxiliary_loss_clip": 0.01131071, + "auxiliary_loss_mlp": 0.0074741, + "balance_loss_clip": 1.00169718, + "balance_loss_mlp": 1.00055575, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 4.537835316895053, + "language_loss": 0.77359092, + "learning_rate": 2.247634997500205e-07, + "loss": 0.79237568, + "num_input_tokens_seen": 305741655, + "step": 14174, + "time_per_iteration": 2.692460536956787 + }, + { + "auxiliary_loss_clip": 0.01118339, + "auxiliary_loss_mlp": 0.00747338, + "balance_loss_clip": 1.00189877, + "balance_loss_mlp": 1.00042534, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.5591838197387438, + "language_loss": 0.81878793, + "learning_rate": 2.245841551883676e-07, + "loss": 0.83744466, + "num_input_tokens_seen": 305761890, + "step": 14175, + "time_per_iteration": 2.7518608570098877 + }, + { + "auxiliary_loss_clip": 0.01164325, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_clip": 1.00189173, + "balance_loss_mlp": 1.00039947, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 2.201439111218446, + "language_loss": 0.65831417, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.68096989, + "num_input_tokens_seen": 305779190, + "step": 14176, + "time_per_iteration": 2.56845760345459 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.0074728, + "balance_loss_clip": 1.00190067, + "balance_loss_mlp": 1.00054526, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 1.8209735215562914, + "language_loss": 0.78396142, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80276293, + "num_input_tokens_seen": 305799870, + "step": 14177, + "time_per_iteration": 2.7981417179107666 + }, + { + "auxiliary_loss_clip": 0.01130601, + "auxiliary_loss_mlp": 0.01100915, + "balance_loss_clip": 1.00180507, + "balance_loss_mlp": 1.00039804, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 1.6988289492183646, + "language_loss": 0.7316463, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75396144, + "num_input_tokens_seen": 305819695, + "step": 14178, + "time_per_iteration": 2.7480454444885254 + }, + { + "auxiliary_loss_clip": 0.01114607, + "auxiliary_loss_mlp": 0.01101994, + "balance_loss_clip": 1.00166667, + "balance_loss_mlp": 1.00061846, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.6322404084787332, + "language_loss": 0.74688166, + "learning_rate": 2.238674502491935e-07, + "loss": 0.76904762, + "num_input_tokens_seen": 305837270, + "step": 14179, + "time_per_iteration": 2.6506378650665283 + }, + { + "auxiliary_loss_clip": 0.01164147, + "auxiliary_loss_mlp": 0.01100557, + "balance_loss_clip": 1.0019244, + "balance_loss_mlp": 1.00042129, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 1.9040054401712119, + "language_loss": 0.81447822, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83712524, + "num_input_tokens_seen": 305855250, + "step": 14180, + "time_per_iteration": 2.594242572784424 + }, + { + "auxiliary_loss_clip": 0.01101875, + "auxiliary_loss_mlp": 0.01101262, + "balance_loss_clip": 1.00174713, + "balance_loss_mlp": 1.00055408, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 2.2176278232873146, + "language_loss": 0.61708283, + "learning_rate": 2.235095018591815e-07, + "loss": 0.6391142, + "num_input_tokens_seen": 305875660, + "step": 14181, + "time_per_iteration": 2.7547128200531006 + }, + { + "auxiliary_loss_clip": 0.01164133, + "auxiliary_loss_mlp": 0.0109994, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00051975, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.058770645817758, + "language_loss": 0.7215988, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74423957, + "num_input_tokens_seen": 305892415, + "step": 14182, + "time_per_iteration": 2.619474172592163 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01101195, + "balance_loss_clip": 1.0015533, + "balance_loss_mlp": 1.00053477, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 3.783105489012894, + "language_loss": 0.70632958, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.72833395, + "num_input_tokens_seen": 305912665, + "step": 14183, + "time_per_iteration": 2.74340558052063 + }, + { + "auxiliary_loss_clip": 0.01132825, + "auxiliary_loss_mlp": 0.0110069, + "balance_loss_clip": 1.00194013, + "balance_loss_mlp": 1.0004586, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 1.6848903511356539, + "language_loss": 0.72737622, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.74971139, + "num_input_tokens_seen": 305931515, + "step": 14184, + "time_per_iteration": 2.7181873321533203 + }, + { + "auxiliary_loss_clip": 0.01164395, + "auxiliary_loss_mlp": 0.01100194, + "balance_loss_clip": 1.00205588, + "balance_loss_mlp": 1.00048733, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 4.676155505448226, + "language_loss": 0.7663635, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.78900945, + "num_input_tokens_seen": 305949965, + "step": 14185, + "time_per_iteration": 2.5502171516418457 + }, + { + "auxiliary_loss_clip": 0.01118488, + "auxiliary_loss_mlp": 0.01101118, + "balance_loss_clip": 1.00177181, + "balance_loss_mlp": 1.00031471, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 1.8601266565009449, + "language_loss": 0.79907632, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.82127237, + "num_input_tokens_seen": 305967820, + "step": 14186, + "time_per_iteration": 2.721316337585449 + }, + { + "auxiliary_loss_clip": 0.01130862, + "auxiliary_loss_mlp": 0.01101371, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00037646, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.7751145594469289, + "language_loss": 0.62926024, + "learning_rate": 2.224372736588449e-07, + "loss": 0.6515826, + "num_input_tokens_seen": 305985505, + "step": 14187, + "time_per_iteration": 4.202656984329224 + }, + { + "auxiliary_loss_clip": 0.01100683, + "auxiliary_loss_mlp": 0.01102142, + "balance_loss_clip": 1.00175095, + "balance_loss_mlp": 1.00048029, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.4400129679481026, + "language_loss": 0.7665422, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.78857046, + "num_input_tokens_seen": 306005220, + "step": 14188, + "time_per_iteration": 2.8001863956451416 + }, + { + "auxiliary_loss_clip": 0.01147711, + "auxiliary_loss_mlp": 0.01101293, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00039458, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.4914123240158543, + "language_loss": 0.78372747, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80621749, + "num_input_tokens_seen": 306023785, + "step": 14189, + "time_per_iteration": 2.6867177486419678 + }, + { + "auxiliary_loss_clip": 0.01133387, + "auxiliary_loss_mlp": 0.01101287, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00038791, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 6.760972832589224, + "language_loss": 0.79655528, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81890202, + "num_input_tokens_seen": 306041600, + "step": 14190, + "time_per_iteration": 2.63820743560791 + }, + { + "auxiliary_loss_clip": 0.01097689, + "auxiliary_loss_mlp": 0.01100874, + "balance_loss_clip": 1.00165546, + "balance_loss_mlp": 1.00045204, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 2.1923282327172786, + "language_loss": 0.76033801, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78232366, + "num_input_tokens_seen": 306060345, + "step": 14191, + "time_per_iteration": 2.769373655319214 + }, + { + "auxiliary_loss_clip": 0.01147643, + "auxiliary_loss_mlp": 0.01101122, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00041378, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 2.0929499976561994, + "language_loss": 0.69418997, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.71667755, + "num_input_tokens_seen": 306078285, + "step": 14192, + "time_per_iteration": 2.646077871322632 + }, + { + "auxiliary_loss_clip": 0.01147865, + "auxiliary_loss_mlp": 0.0110308, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00055957, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.686438954595209, + "language_loss": 0.63094604, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.6534555, + "num_input_tokens_seen": 306093760, + "step": 14193, + "time_per_iteration": 4.123745918273926 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01101209, + "balance_loss_clip": 1.00195551, + "balance_loss_mlp": 1.00050092, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 1.7056558293475688, + "language_loss": 0.76447916, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78682363, + "num_input_tokens_seen": 306112595, + "step": 14194, + "time_per_iteration": 2.723280429840088 + }, + { + "auxiliary_loss_clip": 0.01164334, + "auxiliary_loss_mlp": 0.01100981, + "balance_loss_clip": 1.00190449, + "balance_loss_mlp": 1.00046372, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 1.728712143116125, + "language_loss": 0.69641566, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.71906877, + "num_input_tokens_seen": 306131800, + "step": 14195, + "time_per_iteration": 2.5908029079437256 + }, + { + "auxiliary_loss_clip": 0.01131193, + "auxiliary_loss_mlp": 0.01100891, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.0003742, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 2.40412372587673, + "language_loss": 0.85748523, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.8798061, + "num_input_tokens_seen": 306150590, + "step": 14196, + "time_per_iteration": 2.7842018604278564 + }, + { + "auxiliary_loss_clip": 0.01126576, + "auxiliary_loss_mlp": 0.01074281, + "balance_loss_clip": 1.00067508, + "balance_loss_mlp": 0.99989432, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7583162582066464, + "language_loss": 0.55055141, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57255995, + "num_input_tokens_seen": 306205850, + "step": 14197, + "time_per_iteration": 3.179896116256714 + }, + { + "auxiliary_loss_clip": 0.01114923, + "auxiliary_loss_mlp": 0.00747292, + "balance_loss_clip": 1.00167036, + "balance_loss_mlp": 1.00041211, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 5.296156070838691, + "language_loss": 0.8137337, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83235586, + "num_input_tokens_seen": 306225220, + "step": 14198, + "time_per_iteration": 2.783256769180298 + }, + { + "auxiliary_loss_clip": 0.01164245, + "auxiliary_loss_mlp": 0.01100756, + "balance_loss_clip": 1.00191283, + "balance_loss_mlp": 1.00047684, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 1.394566013014969, + "language_loss": 0.68331409, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70596409, + "num_input_tokens_seen": 306249865, + "step": 14199, + "time_per_iteration": 2.8934836387634277 + }, + { + "auxiliary_loss_clip": 0.01115373, + "auxiliary_loss_mlp": 0.01099635, + "balance_loss_clip": 1.00159419, + "balance_loss_mlp": 1.00045323, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 5.101121143970468, + "language_loss": 0.86367208, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88582218, + "num_input_tokens_seen": 306270215, + "step": 14200, + "time_per_iteration": 2.6584689617156982 + }, + { + "auxiliary_loss_clip": 0.0111646, + "auxiliary_loss_mlp": 0.01100583, + "balance_loss_clip": 1.00162053, + "balance_loss_mlp": 1.00035191, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 3.0148743154096564, + "language_loss": 0.78104448, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.80321497, + "num_input_tokens_seen": 306288960, + "step": 14201, + "time_per_iteration": 2.6727750301361084 + }, + { + "auxiliary_loss_clip": 0.01130957, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_clip": 1.00174427, + "balance_loss_mlp": 1.0003984, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 1.7120153628522905, + "language_loss": 0.69009858, + "learning_rate": 2.19767322694256e-07, + "loss": 0.7124154, + "num_input_tokens_seen": 306308735, + "step": 14202, + "time_per_iteration": 2.6441314220428467 + }, + { + "auxiliary_loss_clip": 0.01147478, + "auxiliary_loss_mlp": 0.01101249, + "balance_loss_clip": 1.00180221, + "balance_loss_mlp": 1.00058818, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 1.668742492532875, + "language_loss": 0.80331743, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82580471, + "num_input_tokens_seen": 306329015, + "step": 14203, + "time_per_iteration": 2.683751106262207 + }, + { + "auxiliary_loss_clip": 0.01134819, + "auxiliary_loss_mlp": 0.01101816, + "balance_loss_clip": 1.00189769, + "balance_loss_mlp": 1.00053537, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 2.9642393233486675, + "language_loss": 0.66188103, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.68424737, + "num_input_tokens_seen": 306349085, + "step": 14204, + "time_per_iteration": 2.7143986225128174 + }, + { + "auxiliary_loss_clip": 0.01164372, + "auxiliary_loss_mlp": 0.0110162, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00053072, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 2.4128454996434545, + "language_loss": 0.60034549, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.62300545, + "num_input_tokens_seen": 306365385, + "step": 14205, + "time_per_iteration": 2.5654425621032715 + }, + { + "auxiliary_loss_clip": 0.01130955, + "auxiliary_loss_mlp": 0.01101426, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00033689, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 2.521051778195928, + "language_loss": 0.72291249, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74523628, + "num_input_tokens_seen": 306384585, + "step": 14206, + "time_per_iteration": 4.4907755851745605 + }, + { + "auxiliary_loss_clip": 0.01148078, + "auxiliary_loss_mlp": 0.0110025, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00040078, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 3.6019768558897725, + "language_loss": 0.76763356, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.79011691, + "num_input_tokens_seen": 306401565, + "step": 14207, + "time_per_iteration": 4.1087658405303955 + }, + { + "auxiliary_loss_clip": 0.01164329, + "auxiliary_loss_mlp": 0.01101206, + "balance_loss_clip": 1.00187016, + "balance_loss_mlp": 1.00049829, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.8816803541221105, + "language_loss": 0.85383737, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87649274, + "num_input_tokens_seen": 306419995, + "step": 14208, + "time_per_iteration": 2.5713608264923096 + }, + { + "auxiliary_loss_clip": 0.0113409, + "auxiliary_loss_mlp": 0.01100935, + "balance_loss_clip": 1.00182855, + "balance_loss_mlp": 1.00041771, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.4863795438137324, + "language_loss": 0.66075331, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.68310356, + "num_input_tokens_seen": 306439240, + "step": 14209, + "time_per_iteration": 2.694309711456299 + }, + { + "auxiliary_loss_clip": 0.01101428, + "auxiliary_loss_mlp": 0.01100496, + "balance_loss_clip": 1.00170851, + "balance_loss_mlp": 1.00045562, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 1.9605434366792094, + "language_loss": 0.7069447, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72896397, + "num_input_tokens_seen": 306458425, + "step": 14210, + "time_per_iteration": 2.817042827606201 + }, + { + "auxiliary_loss_clip": 0.01132994, + "auxiliary_loss_mlp": 0.01100869, + "balance_loss_clip": 1.00181842, + "balance_loss_mlp": 1.00044775, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.5454321092101733, + "language_loss": 0.69910419, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72144282, + "num_input_tokens_seen": 306477210, + "step": 14211, + "time_per_iteration": 2.7421250343322754 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01101534, + "balance_loss_clip": 1.0017848, + "balance_loss_mlp": 1.0004921, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 2.182754617858342, + "language_loss": 0.8112874, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83363199, + "num_input_tokens_seen": 306495820, + "step": 14212, + "time_per_iteration": 2.633399724960327 + }, + { + "auxiliary_loss_clip": 0.01117857, + "auxiliary_loss_mlp": 0.01101894, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00042343, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 1.9229787604218633, + "language_loss": 0.65942395, + "learning_rate": 2.178190108088105e-07, + "loss": 0.68162143, + "num_input_tokens_seen": 306516420, + "step": 14213, + "time_per_iteration": 2.990119218826294 + }, + { + "auxiliary_loss_clip": 0.01164174, + "auxiliary_loss_mlp": 0.01100307, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.0003618, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 1.8232787106546238, + "language_loss": 0.78110409, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80374891, + "num_input_tokens_seen": 306534785, + "step": 14214, + "time_per_iteration": 2.6487927436828613 + }, + { + "auxiliary_loss_clip": 0.0113232, + "auxiliary_loss_mlp": 0.01101724, + "balance_loss_clip": 1.00177693, + "balance_loss_mlp": 1.00030065, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 3.745653390342807, + "language_loss": 0.66852093, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.69086128, + "num_input_tokens_seen": 306552440, + "step": 14215, + "time_per_iteration": 2.7297065258026123 + }, + { + "auxiliary_loss_clip": 0.01164243, + "auxiliary_loss_mlp": 0.01100684, + "balance_loss_clip": 1.00188553, + "balance_loss_mlp": 1.000453, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 1.9148135112530564, + "language_loss": 0.62922138, + "learning_rate": 2.172890718362279e-07, + "loss": 0.65187061, + "num_input_tokens_seen": 306573600, + "step": 14216, + "time_per_iteration": 2.839712619781494 + }, + { + "auxiliary_loss_clip": 0.0111621, + "auxiliary_loss_mlp": 0.01101347, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.00044847, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 1.656233147541407, + "language_loss": 0.65441525, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67659092, + "num_input_tokens_seen": 306592840, + "step": 14217, + "time_per_iteration": 2.731351375579834 + }, + { + "auxiliary_loss_clip": 0.01147387, + "auxiliary_loss_mlp": 0.01101074, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00046146, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 1.7049346282123201, + "language_loss": 0.64919835, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67168295, + "num_input_tokens_seen": 306613210, + "step": 14218, + "time_per_iteration": 2.683227062225342 + }, + { + "auxiliary_loss_clip": 0.01149338, + "auxiliary_loss_mlp": 0.01101066, + "balance_loss_clip": 1.00177693, + "balance_loss_mlp": 1.00045371, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 1.78860778293923, + "language_loss": 0.70196217, + "learning_rate": 2.167597412688238e-07, + "loss": 0.7244662, + "num_input_tokens_seen": 306631620, + "step": 14219, + "time_per_iteration": 2.679299831390381 + }, + { + "auxiliary_loss_clip": 0.01132909, + "auxiliary_loss_mlp": 0.01102214, + "balance_loss_clip": 1.00172377, + "balance_loss_mlp": 1.00055254, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 2.2768550230135, + "language_loss": 0.66832113, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.6906724, + "num_input_tokens_seen": 306646695, + "step": 14220, + "time_per_iteration": 2.6781716346740723 + }, + { + "auxiliary_loss_clip": 0.01164087, + "auxiliary_loss_mlp": 0.01100104, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00035012, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 2.6689300406673526, + "language_loss": 0.71484619, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73748815, + "num_input_tokens_seen": 306665465, + "step": 14221, + "time_per_iteration": 2.5907232761383057 + }, + { + "auxiliary_loss_clip": 0.01116684, + "auxiliary_loss_mlp": 0.01101569, + "balance_loss_clip": 1.00174868, + "balance_loss_mlp": 1.00057459, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 2.3411685726506453, + "language_loss": 0.60024577, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.6224283, + "num_input_tokens_seen": 306685950, + "step": 14222, + "time_per_iteration": 2.751617670059204 + }, + { + "auxiliary_loss_clip": 0.01134419, + "auxiliary_loss_mlp": 0.01099976, + "balance_loss_clip": 1.00172973, + "balance_loss_mlp": 1.00050735, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.7436771110821883, + "language_loss": 0.8420279, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.8643719, + "num_input_tokens_seen": 306705740, + "step": 14223, + "time_per_iteration": 2.7203285694122314 + }, + { + "auxiliary_loss_clip": 0.01147679, + "auxiliary_loss_mlp": 0.01101111, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00054646, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.5215189167094807, + "language_loss": 0.73880035, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76128829, + "num_input_tokens_seen": 306725065, + "step": 14224, + "time_per_iteration": 2.7015395164489746 + }, + { + "auxiliary_loss_clip": 0.01132772, + "auxiliary_loss_mlp": 0.00747416, + "balance_loss_clip": 1.00186014, + "balance_loss_mlp": 1.00057948, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 1.867201155597137, + "language_loss": 0.76002359, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77882546, + "num_input_tokens_seen": 306743630, + "step": 14225, + "time_per_iteration": 4.175179481506348 + }, + { + "auxiliary_loss_clip": 0.01085932, + "auxiliary_loss_mlp": 0.01099943, + "balance_loss_clip": 1.00161278, + "balance_loss_mlp": 1.00061727, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.8534062602163563, + "language_loss": 0.77132356, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79318231, + "num_input_tokens_seen": 306763105, + "step": 14226, + "time_per_iteration": 2.8357744216918945 + }, + { + "auxiliary_loss_clip": 0.01164359, + "auxiliary_loss_mlp": 0.01102307, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00054955, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 2.063264817878808, + "language_loss": 0.54975092, + "learning_rate": 2.153511688875702e-07, + "loss": 0.57241756, + "num_input_tokens_seen": 306779875, + "step": 14227, + "time_per_iteration": 2.666973829269409 + }, + { + "auxiliary_loss_clip": 0.01133325, + "auxiliary_loss_mlp": 0.00747381, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00043607, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 3.4166936574552382, + "language_loss": 0.65639603, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67520303, + "num_input_tokens_seen": 306800015, + "step": 14228, + "time_per_iteration": 2.681504487991333 + }, + { + "auxiliary_loss_clip": 0.01114732, + "auxiliary_loss_mlp": 0.01101371, + "balance_loss_clip": 1.00166094, + "balance_loss_mlp": 1.00047195, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 2.268631162537537, + "language_loss": 0.74137533, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.76353639, + "num_input_tokens_seen": 306814160, + "step": 14229, + "time_per_iteration": 2.739684581756592 + }, + { + "auxiliary_loss_clip": 0.01147438, + "auxiliary_loss_mlp": 0.01101284, + "balance_loss_clip": 1.0017817, + "balance_loss_mlp": 1.00038528, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 2.693703927767595, + "language_loss": 0.72606206, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.74854922, + "num_input_tokens_seen": 306833310, + "step": 14230, + "time_per_iteration": 2.6102964878082275 + }, + { + "auxiliary_loss_clip": 0.0114967, + "auxiliary_loss_mlp": 0.01101648, + "balance_loss_clip": 1.00191033, + "balance_loss_mlp": 1.00041556, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 2.1606104226631, + "language_loss": 0.82483065, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.8473438, + "num_input_tokens_seen": 306851345, + "step": 14231, + "time_per_iteration": 4.01720666885376 + }, + { + "auxiliary_loss_clip": 0.01149216, + "auxiliary_loss_mlp": 0.01101592, + "balance_loss_clip": 1.00194001, + "balance_loss_mlp": 1.00050235, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 1.9234009753275818, + "language_loss": 0.68287826, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.7053864, + "num_input_tokens_seen": 306871040, + "step": 14232, + "time_per_iteration": 2.6382362842559814 + }, + { + "auxiliary_loss_clip": 0.01130865, + "auxiliary_loss_mlp": 0.01101636, + "balance_loss_clip": 1.00182426, + "balance_loss_mlp": 1.00045097, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.7333067691519277, + "language_loss": 0.67232955, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69465458, + "num_input_tokens_seen": 306891625, + "step": 14233, + "time_per_iteration": 2.6582541465759277 + }, + { + "auxiliary_loss_clip": 0.01147934, + "auxiliary_loss_mlp": 0.01100952, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00052965, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.8089966473762775, + "language_loss": 0.77030432, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.79279321, + "num_input_tokens_seen": 306910020, + "step": 14234, + "time_per_iteration": 2.660181999206543 + }, + { + "auxiliary_loss_clip": 0.01113271, + "auxiliary_loss_mlp": 0.010746, + "balance_loss_clip": 1.00082517, + "balance_loss_mlp": 1.0002135, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7527113913317633, + "language_loss": 0.57948297, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60136169, + "num_input_tokens_seen": 306969505, + "step": 14235, + "time_per_iteration": 3.229121208190918 + }, + { + "auxiliary_loss_clip": 0.01127261, + "auxiliary_loss_mlp": 0.01074853, + "balance_loss_clip": 1.00074697, + "balance_loss_mlp": 1.00008476, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.781693995940045, + "language_loss": 0.56717181, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58919299, + "num_input_tokens_seen": 307027710, + "step": 14236, + "time_per_iteration": 3.1682488918304443 + }, + { + "auxiliary_loss_clip": 0.01130881, + "auxiliary_loss_mlp": 0.01100828, + "balance_loss_clip": 1.00178981, + "balance_loss_mlp": 1.00050116, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.803175250907554, + "language_loss": 0.6995647, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72188181, + "num_input_tokens_seen": 307045515, + "step": 14237, + "time_per_iteration": 2.6664390563964844 + }, + { + "auxiliary_loss_clip": 0.01133101, + "auxiliary_loss_mlp": 0.01100058, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.00039911, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.1212673818574928, + "language_loss": 0.63541818, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.65774977, + "num_input_tokens_seen": 307064470, + "step": 14238, + "time_per_iteration": 2.663886547088623 + }, + { + "auxiliary_loss_clip": 0.01164104, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00051522, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 1.6028384797518815, + "language_loss": 0.69040525, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71304226, + "num_input_tokens_seen": 307083900, + "step": 14239, + "time_per_iteration": 2.610816240310669 + }, + { + "auxiliary_loss_clip": 0.01164333, + "auxiliary_loss_mlp": 0.01101404, + "balance_loss_clip": 1.00192285, + "balance_loss_mlp": 1.00040972, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 2.1374957467718807, + "language_loss": 0.66533762, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68799496, + "num_input_tokens_seen": 307104590, + "step": 14240, + "time_per_iteration": 2.6948022842407227 + }, + { + "auxiliary_loss_clip": 0.0111676, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_clip": 1.00169742, + "balance_loss_mlp": 1.00039077, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 1.8131453688432637, + "language_loss": 0.6263845, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64856458, + "num_input_tokens_seen": 307125580, + "step": 14241, + "time_per_iteration": 2.777848958969116 + }, + { + "auxiliary_loss_clip": 0.011645, + "auxiliary_loss_mlp": 0.01102054, + "balance_loss_clip": 1.00186443, + "balance_loss_mlp": 1.00058341, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 2.013811203275034, + "language_loss": 0.74439716, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76706266, + "num_input_tokens_seen": 307147625, + "step": 14242, + "time_per_iteration": 2.6266696453094482 + }, + { + "auxiliary_loss_clip": 0.01069562, + "auxiliary_loss_mlp": 0.01102336, + "balance_loss_clip": 1.00145864, + "balance_loss_mlp": 1.00067401, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 2.0966639290752793, + "language_loss": 0.76143646, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78315544, + "num_input_tokens_seen": 307164665, + "step": 14243, + "time_per_iteration": 2.8320486545562744 + }, + { + "auxiliary_loss_clip": 0.0113127, + "auxiliary_loss_mlp": 0.00747373, + "balance_loss_clip": 1.00178039, + "balance_loss_mlp": 1.00057518, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 1.9372252512217647, + "language_loss": 0.68441236, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70319879, + "num_input_tokens_seen": 307182530, + "step": 14244, + "time_per_iteration": 2.688493013381958 + }, + { + "auxiliary_loss_clip": 0.01143731, + "auxiliary_loss_mlp": 0.01074366, + "balance_loss_clip": 1.00071883, + "balance_loss_mlp": 0.9999792, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7522869039699511, + "language_loss": 0.58395624, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60613728, + "num_input_tokens_seen": 307241240, + "step": 14245, + "time_per_iteration": 6.177520513534546 + }, + { + "auxiliary_loss_clip": 0.01147592, + "auxiliary_loss_mlp": 0.01102753, + "balance_loss_clip": 1.00187051, + "balance_loss_mlp": 1.00042355, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.9152662658569932, + "language_loss": 0.7741859, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79668939, + "num_input_tokens_seen": 307261485, + "step": 14246, + "time_per_iteration": 2.7698450088500977 + }, + { + "auxiliary_loss_clip": 0.01134815, + "auxiliary_loss_mlp": 0.01101288, + "balance_loss_clip": 1.00177348, + "balance_loss_mlp": 1.00038934, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 1.8991500765631066, + "language_loss": 0.81295609, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83531713, + "num_input_tokens_seen": 307279160, + "step": 14247, + "time_per_iteration": 2.744677782058716 + }, + { + "auxiliary_loss_clip": 0.01131049, + "auxiliary_loss_mlp": 0.01100379, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00043356, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 1.6740517320732509, + "language_loss": 0.77320921, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79552346, + "num_input_tokens_seen": 307297920, + "step": 14248, + "time_per_iteration": 2.6999263763427734 + }, + { + "auxiliary_loss_clip": 0.01101334, + "auxiliary_loss_mlp": 0.01101789, + "balance_loss_clip": 1.00174487, + "balance_loss_mlp": 1.00050855, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 1.671189102536639, + "language_loss": 0.7796073, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.80163848, + "num_input_tokens_seen": 307318320, + "step": 14249, + "time_per_iteration": 2.861467123031616 + }, + { + "auxiliary_loss_clip": 0.01132925, + "auxiliary_loss_mlp": 0.01100733, + "balance_loss_clip": 1.00181377, + "balance_loss_mlp": 1.00050211, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 1.8111610721307156, + "language_loss": 0.78261018, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80494678, + "num_input_tokens_seen": 307336720, + "step": 14250, + "time_per_iteration": 2.672020673751831 + }, + { + "auxiliary_loss_clip": 0.01132715, + "auxiliary_loss_mlp": 0.01100536, + "balance_loss_clip": 1.0018301, + "balance_loss_mlp": 1.00049591, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 2.0607428000580312, + "language_loss": 0.79541767, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81775016, + "num_input_tokens_seen": 307354120, + "step": 14251, + "time_per_iteration": 2.7423934936523438 + }, + { + "auxiliary_loss_clip": 0.01114225, + "auxiliary_loss_mlp": 0.01100079, + "balance_loss_clip": 1.00149703, + "balance_loss_mlp": 1.00051546, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 1.8764516090423111, + "language_loss": 0.61326206, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63540506, + "num_input_tokens_seen": 307373165, + "step": 14252, + "time_per_iteration": 2.7509634494781494 + }, + { + "auxiliary_loss_clip": 0.01132454, + "auxiliary_loss_mlp": 0.01101391, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00044465, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.7911688015468814, + "language_loss": 0.70096052, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.72329891, + "num_input_tokens_seen": 307391000, + "step": 14253, + "time_per_iteration": 2.6586997509002686 + }, + { + "auxiliary_loss_clip": 0.01126406, + "auxiliary_loss_mlp": 0.01074741, + "balance_loss_clip": 1.00070548, + "balance_loss_mlp": 0.99997294, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.7810962817056013, + "language_loss": 0.59163147, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61364299, + "num_input_tokens_seen": 307452865, + "step": 14254, + "time_per_iteration": 3.286200523376465 + }, + { + "auxiliary_loss_clip": 0.01130693, + "auxiliary_loss_mlp": 0.01100713, + "balance_loss_clip": 1.00173616, + "balance_loss_mlp": 1.000386, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.8843517890493933, + "language_loss": 0.81083632, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83315045, + "num_input_tokens_seen": 307471940, + "step": 14255, + "time_per_iteration": 2.7367494106292725 + }, + { + "auxiliary_loss_clip": 0.01164052, + "auxiliary_loss_mlp": 0.01100147, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.0004406, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 2.407982116770104, + "language_loss": 0.67496365, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69760561, + "num_input_tokens_seen": 307488745, + "step": 14256, + "time_per_iteration": 2.6542880535125732 + }, + { + "auxiliary_loss_clip": 0.01147732, + "auxiliary_loss_mlp": 0.01101606, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00037384, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.6296987193037065, + "language_loss": 0.70172048, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.72421384, + "num_input_tokens_seen": 307506855, + "step": 14257, + "time_per_iteration": 2.6584582328796387 + }, + { + "auxiliary_loss_clip": 0.01117624, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_clip": 1.00172198, + "balance_loss_mlp": 1.00045407, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 2.1062106547470143, + "language_loss": 0.76963329, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.79181874, + "num_input_tokens_seen": 307526115, + "step": 14258, + "time_per_iteration": 2.784878730773926 + }, + { + "auxiliary_loss_clip": 0.01149093, + "auxiliary_loss_mlp": 0.00747244, + "balance_loss_clip": 1.00192428, + "balance_loss_mlp": 1.00063229, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 2.7591907347296347, + "language_loss": 0.67833352, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69729692, + "num_input_tokens_seen": 307545230, + "step": 14259, + "time_per_iteration": 2.6825344562530518 + }, + { + "auxiliary_loss_clip": 0.01149512, + "auxiliary_loss_mlp": 0.01101003, + "balance_loss_clip": 1.00177372, + "balance_loss_mlp": 1.00048566, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.8156550832538016, + "language_loss": 0.77255309, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79505825, + "num_input_tokens_seen": 307564900, + "step": 14260, + "time_per_iteration": 2.7085020542144775 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.01101504, + "balance_loss_clip": 1.00185978, + "balance_loss_mlp": 1.00041437, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 1.8699505180576275, + "language_loss": 0.74182576, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76416367, + "num_input_tokens_seen": 307583500, + "step": 14261, + "time_per_iteration": 2.676722288131714 + }, + { + "auxiliary_loss_clip": 0.01148365, + "auxiliary_loss_mlp": 0.01101043, + "balance_loss_clip": 1.00197196, + "balance_loss_mlp": 1.00052607, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 2.2449927965050946, + "language_loss": 0.78751826, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.8100124, + "num_input_tokens_seen": 307601430, + "step": 14262, + "time_per_iteration": 2.6909632682800293 + }, + { + "auxiliary_loss_clip": 0.01116045, + "auxiliary_loss_mlp": 0.01100465, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.00051939, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.7068177917997742, + "language_loss": 0.67959952, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.70176464, + "num_input_tokens_seen": 307621495, + "step": 14263, + "time_per_iteration": 4.239221572875977 + }, + { + "auxiliary_loss_clip": 0.01114347, + "auxiliary_loss_mlp": 0.00747369, + "balance_loss_clip": 1.00170469, + "balance_loss_mlp": 1.00051308, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 2.0245167052154587, + "language_loss": 0.79801464, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81663179, + "num_input_tokens_seen": 307640840, + "step": 14264, + "time_per_iteration": 2.726778984069824 + }, + { + "auxiliary_loss_clip": 0.01118041, + "auxiliary_loss_mlp": 0.01101655, + "balance_loss_clip": 1.00173569, + "balance_loss_mlp": 1.00047052, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.238885790143465, + "language_loss": 0.69726604, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.71946299, + "num_input_tokens_seen": 307663820, + "step": 14265, + "time_per_iteration": 2.833475351333618 + }, + { + "auxiliary_loss_clip": 0.01164231, + "auxiliary_loss_mlp": 0.0109981, + "balance_loss_clip": 1.00197458, + "balance_loss_mlp": 1.00034142, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 1.9172954175857166, + "language_loss": 0.66338474, + "learning_rate": 2.085464646918027e-07, + "loss": 0.68602514, + "num_input_tokens_seen": 307682385, + "step": 14266, + "time_per_iteration": 2.629899740219116 + }, + { + "auxiliary_loss_clip": 0.01132056, + "auxiliary_loss_mlp": 0.01100927, + "balance_loss_clip": 1.00180566, + "balance_loss_mlp": 1.00045705, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 1.5683255016940596, + "language_loss": 0.75313312, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77546299, + "num_input_tokens_seen": 307704680, + "step": 14267, + "time_per_iteration": 2.8078036308288574 + }, + { + "auxiliary_loss_clip": 0.01147496, + "auxiliary_loss_mlp": 0.01100316, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00041914, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.889524543364662, + "language_loss": 0.87955314, + "learning_rate": 2.082002873852946e-07, + "loss": 0.9020313, + "num_input_tokens_seen": 307723245, + "step": 14268, + "time_per_iteration": 2.7655394077301025 + }, + { + "auxiliary_loss_clip": 0.0114774, + "auxiliary_loss_mlp": 0.01102567, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00061941, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 2.1270814964967006, + "language_loss": 0.72950435, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.75200748, + "num_input_tokens_seen": 307742510, + "step": 14269, + "time_per_iteration": 4.006585121154785 + }, + { + "auxiliary_loss_clip": 0.0114974, + "auxiliary_loss_mlp": 0.01100696, + "balance_loss_clip": 1.00183499, + "balance_loss_mlp": 1.00046468, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 2.3289941824805167, + "language_loss": 0.66129243, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68379676, + "num_input_tokens_seen": 307766030, + "step": 14270, + "time_per_iteration": 2.8122611045837402 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01100395, + "balance_loss_clip": 1.00183845, + "balance_loss_mlp": 1.00044966, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.5408259384661764, + "language_loss": 0.73643476, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.75878882, + "num_input_tokens_seen": 307785800, + "step": 14271, + "time_per_iteration": 2.7701120376586914 + }, + { + "auxiliary_loss_clip": 0.01093715, + "auxiliary_loss_mlp": 0.00745315, + "balance_loss_clip": 1.00089872, + "balance_loss_mlp": 1.00008297, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.794867957552163, + "language_loss": 0.59412527, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61251557, + "num_input_tokens_seen": 307850995, + "step": 14272, + "time_per_iteration": 3.401761770248413 + }, + { + "auxiliary_loss_clip": 0.01132875, + "auxiliary_loss_mlp": 0.01102527, + "balance_loss_clip": 1.00188327, + "balance_loss_mlp": 1.00048387, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 1.747752154244232, + "language_loss": 0.75180578, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77415973, + "num_input_tokens_seen": 307868585, + "step": 14273, + "time_per_iteration": 2.6906449794769287 + }, + { + "auxiliary_loss_clip": 0.01147381, + "auxiliary_loss_mlp": 0.01100572, + "balance_loss_clip": 1.00174832, + "balance_loss_mlp": 1.00048375, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 1.8794200287106828, + "language_loss": 0.82073504, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84321457, + "num_input_tokens_seen": 307886820, + "step": 14274, + "time_per_iteration": 2.615406036376953 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01075226, + "balance_loss_clip": 1.00074553, + "balance_loss_mlp": 1.00007677, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.8876617277042518, + "language_loss": 0.60834658, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.63051951, + "num_input_tokens_seen": 307944020, + "step": 14275, + "time_per_iteration": 3.2388546466827393 + }, + { + "auxiliary_loss_clip": 0.01147879, + "auxiliary_loss_mlp": 0.01101785, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00036144, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 3.1043033731444782, + "language_loss": 0.58804476, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61054134, + "num_input_tokens_seen": 307961055, + "step": 14276, + "time_per_iteration": 2.640780210494995 + }, + { + "auxiliary_loss_clip": 0.01130905, + "auxiliary_loss_mlp": 0.01101252, + "balance_loss_clip": 1.0017823, + "balance_loss_mlp": 1.00054407, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 4.878967148840553, + "language_loss": 0.76384091, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78616256, + "num_input_tokens_seen": 307978690, + "step": 14277, + "time_per_iteration": 2.6482033729553223 + }, + { + "auxiliary_loss_clip": 0.01130926, + "auxiliary_loss_mlp": 0.0110172, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00044012, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.9071661121576111, + "language_loss": 0.83822715, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.86055362, + "num_input_tokens_seen": 307995870, + "step": 14278, + "time_per_iteration": 2.689058303833008 + }, + { + "auxiliary_loss_clip": 0.01131077, + "auxiliary_loss_mlp": 0.01103262, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00055122, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 2.164716597074675, + "language_loss": 0.74673891, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76908231, + "num_input_tokens_seen": 308013645, + "step": 14279, + "time_per_iteration": 2.6800384521484375 + }, + { + "auxiliary_loss_clip": 0.01164224, + "auxiliary_loss_mlp": 0.01100601, + "balance_loss_clip": 1.00192738, + "balance_loss_mlp": 1.00046515, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 3.4946151288665357, + "language_loss": 0.6615088, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68415707, + "num_input_tokens_seen": 308032490, + "step": 14280, + "time_per_iteration": 2.5815887451171875 + }, + { + "auxiliary_loss_clip": 0.01147475, + "auxiliary_loss_mlp": 0.01100171, + "balance_loss_clip": 1.00182009, + "balance_loss_mlp": 1.00046444, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 1.8758454455356384, + "language_loss": 0.62529123, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64776766, + "num_input_tokens_seen": 308052110, + "step": 14281, + "time_per_iteration": 2.6406748294830322 + }, + { + "auxiliary_loss_clip": 0.01133042, + "auxiliary_loss_mlp": 0.00747436, + "balance_loss_clip": 1.00156569, + "balance_loss_mlp": 1.00051618, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.7023919381091062, + "language_loss": 0.73529398, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75409877, + "num_input_tokens_seen": 308070660, + "step": 14282, + "time_per_iteration": 5.159047603607178 + }, + { + "auxiliary_loss_clip": 0.01132818, + "auxiliary_loss_mlp": 0.01100352, + "balance_loss_clip": 1.0018158, + "balance_loss_mlp": 1.0004065, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 2.2833586874405127, + "language_loss": 0.75405568, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77638739, + "num_input_tokens_seen": 308089520, + "step": 14283, + "time_per_iteration": 4.155086040496826 + }, + { + "auxiliary_loss_clip": 0.01149428, + "auxiliary_loss_mlp": 0.01100823, + "balance_loss_clip": 1.00188386, + "balance_loss_mlp": 1.0003531, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 1.957602120594982, + "language_loss": 0.60116065, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.62366319, + "num_input_tokens_seen": 308111545, + "step": 14284, + "time_per_iteration": 2.7500343322753906 + }, + { + "auxiliary_loss_clip": 0.0113109, + "auxiliary_loss_mlp": 0.01099513, + "balance_loss_clip": 1.00170755, + "balance_loss_mlp": 1.00047386, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 2.214541970116008, + "language_loss": 0.75728452, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77959049, + "num_input_tokens_seen": 308129690, + "step": 14285, + "time_per_iteration": 2.7245607376098633 + }, + { + "auxiliary_loss_clip": 0.01147994, + "auxiliary_loss_mlp": 0.0110144, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00054121, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 2.0641997265916006, + "language_loss": 0.74317753, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76567185, + "num_input_tokens_seen": 308147410, + "step": 14286, + "time_per_iteration": 2.680588722229004 + }, + { + "auxiliary_loss_clip": 0.01124476, + "auxiliary_loss_mlp": 0.00745371, + "balance_loss_clip": 1.00081897, + "balance_loss_mlp": 1.00017214, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7750872708533401, + "language_loss": 0.49483627, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51353472, + "num_input_tokens_seen": 308204875, + "step": 14287, + "time_per_iteration": 3.1780102252960205 + }, + { + "auxiliary_loss_clip": 0.01147608, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00052798, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 2.083687229270518, + "language_loss": 0.79118961, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81367517, + "num_input_tokens_seen": 308225690, + "step": 14288, + "time_per_iteration": 2.8050639629364014 + }, + { + "auxiliary_loss_clip": 0.01115374, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_clip": 1.00166821, + "balance_loss_mlp": 1.00057983, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 1.945145865862827, + "language_loss": 0.81016862, + "learning_rate": 2.045818444528553e-07, + "loss": 0.83234096, + "num_input_tokens_seen": 308245255, + "step": 14289, + "time_per_iteration": 2.8304853439331055 + }, + { + "auxiliary_loss_clip": 0.01149635, + "auxiliary_loss_mlp": 0.01101923, + "balance_loss_clip": 1.00197029, + "balance_loss_mlp": 1.00054717, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 1.6782037535350969, + "language_loss": 0.65372503, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67624056, + "num_input_tokens_seen": 308261755, + "step": 14290, + "time_per_iteration": 2.7627553939819336 + }, + { + "auxiliary_loss_clip": 0.01131127, + "auxiliary_loss_mlp": 0.0110229, + "balance_loss_clip": 1.00180316, + "balance_loss_mlp": 1.00043786, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 2.30011724296339, + "language_loss": 0.55104387, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57337797, + "num_input_tokens_seen": 308285145, + "step": 14291, + "time_per_iteration": 2.787508964538574 + }, + { + "auxiliary_loss_clip": 0.01149139, + "auxiliary_loss_mlp": 0.01101008, + "balance_loss_clip": 1.00192356, + "balance_loss_mlp": 1.00039506, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 1.9524785709099286, + "language_loss": 0.71519029, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73769176, + "num_input_tokens_seen": 308304130, + "step": 14292, + "time_per_iteration": 2.6703264713287354 + }, + { + "auxiliary_loss_clip": 0.01148682, + "auxiliary_loss_mlp": 0.01100142, + "balance_loss_clip": 1.0016948, + "balance_loss_mlp": 1.00038826, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 2.47049432520847, + "language_loss": 0.71365857, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73614687, + "num_input_tokens_seen": 308324670, + "step": 14293, + "time_per_iteration": 2.7541449069976807 + }, + { + "auxiliary_loss_clip": 0.0113106, + "auxiliary_loss_mlp": 0.01100727, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00049579, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.8808110343508473, + "language_loss": 0.69124389, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.71356177, + "num_input_tokens_seen": 308344215, + "step": 14294, + "time_per_iteration": 2.8241755962371826 + }, + { + "auxiliary_loss_clip": 0.01164144, + "auxiliary_loss_mlp": 0.01100018, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00050151, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 2.379389949652444, + "language_loss": 0.77819562, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80083728, + "num_input_tokens_seen": 308360520, + "step": 14295, + "time_per_iteration": 2.7315115928649902 + }, + { + "auxiliary_loss_clip": 0.01133153, + "auxiliary_loss_mlp": 0.01101695, + "balance_loss_clip": 1.00181365, + "balance_loss_mlp": 1.00055742, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 3.7709316749273296, + "language_loss": 0.69549537, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71784383, + "num_input_tokens_seen": 308376865, + "step": 14296, + "time_per_iteration": 2.7008297443389893 + }, + { + "auxiliary_loss_clip": 0.01133041, + "auxiliary_loss_mlp": 0.01101391, + "balance_loss_clip": 1.00181735, + "balance_loss_mlp": 1.00044441, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 3.904629566579606, + "language_loss": 0.79293245, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.8152768, + "num_input_tokens_seen": 308395870, + "step": 14297, + "time_per_iteration": 2.693545341491699 + }, + { + "auxiliary_loss_clip": 0.01149628, + "auxiliary_loss_mlp": 0.01100383, + "balance_loss_clip": 1.00188041, + "balance_loss_mlp": 1.00048542, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 2.161416691642246, + "language_loss": 0.6808235, + "learning_rate": 2.030402708016954e-07, + "loss": 0.7033236, + "num_input_tokens_seen": 308417250, + "step": 14298, + "time_per_iteration": 2.670518398284912 + }, + { + "auxiliary_loss_clip": 0.01133106, + "auxiliary_loss_mlp": 0.01100264, + "balance_loss_clip": 1.00185478, + "balance_loss_mlp": 1.00060546, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 2.075179403622136, + "language_loss": 0.6839788, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.70631248, + "num_input_tokens_seen": 308434565, + "step": 14299, + "time_per_iteration": 2.7207577228546143 + }, + { + "auxiliary_loss_clip": 0.01114039, + "auxiliary_loss_mlp": 0.01101073, + "balance_loss_clip": 1.00163269, + "balance_loss_mlp": 1.00055552, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.2234496215285913, + "language_loss": 0.71430206, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73645324, + "num_input_tokens_seen": 308450040, + "step": 14300, + "time_per_iteration": 2.8014771938323975 + }, + { + "auxiliary_loss_clip": 0.01134041, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_clip": 1.00166702, + "balance_loss_mlp": 1.00051832, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.9969645375260339, + "language_loss": 0.68943304, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71178377, + "num_input_tokens_seen": 308470545, + "step": 14301, + "time_per_iteration": 4.255528450012207 + }, + { + "auxiliary_loss_clip": 0.01101143, + "auxiliary_loss_mlp": 0.01100854, + "balance_loss_clip": 1.00166774, + "balance_loss_mlp": 1.00052726, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 2.088208850977343, + "language_loss": 0.74112403, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76314402, + "num_input_tokens_seen": 308490020, + "step": 14302, + "time_per_iteration": 2.7969844341278076 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01100195, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.00048852, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.7411660193477323, + "language_loss": 0.83881152, + "learning_rate": 2.02186225623733e-07, + "loss": 0.86128724, + "num_input_tokens_seen": 308509065, + "step": 14303, + "time_per_iteration": 2.6760613918304443 + }, + { + "auxiliary_loss_clip": 0.01149619, + "auxiliary_loss_mlp": 0.01101286, + "balance_loss_clip": 1.00175631, + "balance_loss_mlp": 1.00062585, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 2.4048359227621905, + "language_loss": 0.77190739, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79441643, + "num_input_tokens_seen": 308524725, + "step": 14304, + "time_per_iteration": 2.6234264373779297 + }, + { + "auxiliary_loss_clip": 0.01164252, + "auxiliary_loss_mlp": 0.01102294, + "balance_loss_clip": 1.00182259, + "balance_loss_mlp": 1.00044203, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 3.071567067160625, + "language_loss": 0.54129946, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.56396496, + "num_input_tokens_seen": 308543525, + "step": 14305, + "time_per_iteration": 2.5878682136535645 + }, + { + "auxiliary_loss_clip": 0.01164246, + "auxiliary_loss_mlp": 0.01100563, + "balance_loss_clip": 1.00193679, + "balance_loss_mlp": 1.0003314, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 3.57296842681124, + "language_loss": 0.83709908, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.85974717, + "num_input_tokens_seen": 308557995, + "step": 14306, + "time_per_iteration": 4.039355516433716 + }, + { + "auxiliary_loss_clip": 0.01149432, + "auxiliary_loss_mlp": 0.0074735, + "balance_loss_clip": 1.0018065, + "balance_loss_mlp": 1.00044346, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.4720848656463528, + "language_loss": 0.71600932, + "learning_rate": 2.01504216561474e-07, + "loss": 0.73497713, + "num_input_tokens_seen": 308582750, + "step": 14307, + "time_per_iteration": 2.8551011085510254 + }, + { + "auxiliary_loss_clip": 0.01149688, + "auxiliary_loss_mlp": 0.00747607, + "balance_loss_clip": 1.00182533, + "balance_loss_mlp": 1.00060225, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 1.709198495912613, + "language_loss": 0.63796735, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65694034, + "num_input_tokens_seen": 308603770, + "step": 14308, + "time_per_iteration": 2.7194063663482666 + }, + { + "auxiliary_loss_clip": 0.01127022, + "auxiliary_loss_mlp": 0.0107439, + "balance_loss_clip": 1.00073719, + "balance_loss_mlp": 1.00000334, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.62416681576948, + "language_loss": 0.48441768, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50643182, + "num_input_tokens_seen": 308667735, + "step": 14309, + "time_per_iteration": 3.3219358921051025 + }, + { + "auxiliary_loss_clip": 0.01082601, + "auxiliary_loss_mlp": 0.0110259, + "balance_loss_clip": 1.00158167, + "balance_loss_mlp": 1.0006423, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 5.902792289891885, + "language_loss": 0.67460573, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69645762, + "num_input_tokens_seen": 308686300, + "step": 14310, + "time_per_iteration": 2.846252918243408 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01100418, + "balance_loss_clip": 1.00175405, + "balance_loss_mlp": 1.00052094, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.6418242767350997, + "language_loss": 0.78044093, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80231607, + "num_input_tokens_seen": 308705825, + "step": 14311, + "time_per_iteration": 2.840608835220337 + }, + { + "auxiliary_loss_clip": 0.01147518, + "auxiliary_loss_mlp": 0.01101014, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00049675, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 1.952923898688807, + "language_loss": 0.71996081, + "learning_rate": 2.006532397626639e-07, + "loss": 0.74244618, + "num_input_tokens_seen": 308723340, + "step": 14312, + "time_per_iteration": 2.6952202320098877 + }, + { + "auxiliary_loss_clip": 0.01132881, + "auxiliary_loss_mlp": 0.01100896, + "balance_loss_clip": 1.00176001, + "balance_loss_mlp": 1.0005219, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 1.8089234032018033, + "language_loss": 0.77825344, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80059117, + "num_input_tokens_seen": 308741280, + "step": 14313, + "time_per_iteration": 2.7368972301483154 + }, + { + "auxiliary_loss_clip": 0.01134774, + "auxiliary_loss_mlp": 0.01100896, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00047445, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 1.7417729741433365, + "language_loss": 0.73226392, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75462067, + "num_input_tokens_seen": 308762875, + "step": 14314, + "time_per_iteration": 2.782479763031006 + }, + { + "auxiliary_loss_clip": 0.01130749, + "auxiliary_loss_mlp": 0.0110077, + "balance_loss_clip": 1.001719, + "balance_loss_mlp": 1.00044334, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.8198492902565302, + "language_loss": 0.69275784, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71507311, + "num_input_tokens_seen": 308780315, + "step": 14315, + "time_per_iteration": 2.6419129371643066 + }, + { + "auxiliary_loss_clip": 0.01147682, + "auxiliary_loss_mlp": 0.01100765, + "balance_loss_clip": 1.00193548, + "balance_loss_mlp": 1.00062919, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 1.8603417518838903, + "language_loss": 0.72299707, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74548149, + "num_input_tokens_seen": 308799435, + "step": 14316, + "time_per_iteration": 2.6674957275390625 + }, + { + "auxiliary_loss_clip": 0.01131059, + "auxiliary_loss_mlp": 0.01101436, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00044155, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 1.806656291401864, + "language_loss": 0.82767868, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.8500036, + "num_input_tokens_seen": 308817730, + "step": 14317, + "time_per_iteration": 2.649001121520996 + }, + { + "auxiliary_loss_clip": 0.01134279, + "auxiliary_loss_mlp": 0.01100847, + "balance_loss_clip": 1.00174272, + "balance_loss_mlp": 1.00052023, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.637379421434479, + "language_loss": 0.67110014, + "learning_rate": 1.996343193113108e-07, + "loss": 0.6934514, + "num_input_tokens_seen": 308841735, + "step": 14318, + "time_per_iteration": 2.9492878913879395 + }, + { + "auxiliary_loss_clip": 0.011475, + "auxiliary_loss_mlp": 0.01100449, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00040865, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.5625340148352203, + "language_loss": 0.71203077, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.7345103, + "num_input_tokens_seen": 308865050, + "step": 14319, + "time_per_iteration": 2.8676679134368896 + }, + { + "auxiliary_loss_clip": 0.01133225, + "auxiliary_loss_mlp": 0.00747429, + "balance_loss_clip": 1.00191617, + "balance_loss_mlp": 1.00052047, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 1.6727629247844928, + "language_loss": 0.66997552, + "learning_rate": 1.992952252525839e-07, + "loss": 0.6887821, + "num_input_tokens_seen": 308885375, + "step": 14320, + "time_per_iteration": 4.441867351531982 + }, + { + "auxiliary_loss_clip": 0.01132936, + "auxiliary_loss_mlp": 0.01101224, + "balance_loss_clip": 1.00168371, + "balance_loss_mlp": 1.00046825, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 2.2080957354797777, + "language_loss": 0.7964648, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.81880641, + "num_input_tokens_seen": 308904700, + "step": 14321, + "time_per_iteration": 4.084712266921997 + }, + { + "auxiliary_loss_clip": 0.01149367, + "auxiliary_loss_mlp": 0.00747373, + "balance_loss_clip": 1.00182581, + "balance_loss_mlp": 1.00044179, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 2.782026760576467, + "language_loss": 0.71380746, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.73277491, + "num_input_tokens_seen": 308922985, + "step": 14322, + "time_per_iteration": 2.6451008319854736 + }, + { + "auxiliary_loss_clip": 0.01132588, + "auxiliary_loss_mlp": 0.01101892, + "balance_loss_clip": 1.00187063, + "balance_loss_mlp": 1.00051677, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 3.0528532707567817, + "language_loss": 0.56494313, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58728796, + "num_input_tokens_seen": 308940765, + "step": 14323, + "time_per_iteration": 2.6614251136779785 + }, + { + "auxiliary_loss_clip": 0.01115683, + "auxiliary_loss_mlp": 0.01100454, + "balance_loss_clip": 1.00170028, + "balance_loss_mlp": 1.00046146, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 1.874375871795816, + "language_loss": 0.75324756, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77540886, + "num_input_tokens_seen": 308960110, + "step": 14324, + "time_per_iteration": 2.81536602973938 + }, + { + "auxiliary_loss_clip": 0.01101245, + "auxiliary_loss_mlp": 0.01100521, + "balance_loss_clip": 1.0016768, + "balance_loss_mlp": 1.00033784, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 2.3118562988531903, + "language_loss": 0.66392612, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68594384, + "num_input_tokens_seen": 308976665, + "step": 14325, + "time_per_iteration": 2.7477574348449707 + }, + { + "auxiliary_loss_clip": 0.01147632, + "auxiliary_loss_mlp": 0.01101577, + "balance_loss_clip": 1.00178957, + "balance_loss_mlp": 1.00048709, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.6996587083600683, + "language_loss": 0.645594, + "learning_rate": 1.982795820716472e-07, + "loss": 0.66808611, + "num_input_tokens_seen": 308997015, + "step": 14326, + "time_per_iteration": 2.7320802211761475 + }, + { + "auxiliary_loss_clip": 0.01134988, + "auxiliary_loss_mlp": 0.01101449, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00055051, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 4.00530328947456, + "language_loss": 0.8404181, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.86278242, + "num_input_tokens_seen": 309015250, + "step": 14327, + "time_per_iteration": 2.675626516342163 + }, + { + "auxiliary_loss_clip": 0.01149619, + "auxiliary_loss_mlp": 0.01100475, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00048184, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 2.5835578995878623, + "language_loss": 0.75317729, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77567828, + "num_input_tokens_seen": 309034140, + "step": 14328, + "time_per_iteration": 2.6756093502044678 + }, + { + "auxiliary_loss_clip": 0.01149536, + "auxiliary_loss_mlp": 0.01100099, + "balance_loss_clip": 1.00185347, + "balance_loss_mlp": 1.00034475, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 1.683820523397643, + "language_loss": 0.80209035, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82458675, + "num_input_tokens_seen": 309055075, + "step": 14329, + "time_per_iteration": 2.7575879096984863 + }, + { + "auxiliary_loss_clip": 0.01131225, + "auxiliary_loss_mlp": 0.01101059, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00054169, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 2.168286309795194, + "language_loss": 0.77079129, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.79311419, + "num_input_tokens_seen": 309074650, + "step": 14330, + "time_per_iteration": 2.7532944679260254 + }, + { + "auxiliary_loss_clip": 0.01147619, + "auxiliary_loss_mlp": 0.01100692, + "balance_loss_clip": 1.00176096, + "balance_loss_mlp": 1.00055599, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 2.6199836692953324, + "language_loss": 0.65071964, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67320281, + "num_input_tokens_seen": 309094385, + "step": 14331, + "time_per_iteration": 2.8376622200012207 + }, + { + "auxiliary_loss_clip": 0.01131238, + "auxiliary_loss_mlp": 0.01099727, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00049734, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.6855152483076856, + "language_loss": 0.75812054, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.7804302, + "num_input_tokens_seen": 309111815, + "step": 14332, + "time_per_iteration": 2.8034346103668213 + }, + { + "auxiliary_loss_clip": 0.01147817, + "auxiliary_loss_mlp": 0.01100941, + "balance_loss_clip": 1.00168121, + "balance_loss_mlp": 1.00047135, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 2.129934970071532, + "language_loss": 0.67116082, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.6936484, + "num_input_tokens_seen": 309131385, + "step": 14333, + "time_per_iteration": 2.7677857875823975 + }, + { + "auxiliary_loss_clip": 0.01133345, + "auxiliary_loss_mlp": 0.01103214, + "balance_loss_clip": 1.0017314, + "balance_loss_mlp": 1.00040793, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 1.750766574623112, + "language_loss": 0.62042052, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64278615, + "num_input_tokens_seen": 309155020, + "step": 14334, + "time_per_iteration": 2.8466389179229736 + }, + { + "auxiliary_loss_clip": 0.01119252, + "auxiliary_loss_mlp": 0.01103101, + "balance_loss_clip": 1.0018568, + "balance_loss_mlp": 1.00067663, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 1.8857654159417845, + "language_loss": 0.69485968, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71708322, + "num_input_tokens_seen": 309172865, + "step": 14335, + "time_per_iteration": 2.777503728866577 + }, + { + "auxiliary_loss_clip": 0.0114778, + "auxiliary_loss_mlp": 0.01101626, + "balance_loss_clip": 1.00200939, + "balance_loss_mlp": 1.00044084, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 1.3864001760822897, + "language_loss": 0.82810187, + "learning_rate": 1.965923098328135e-07, + "loss": 0.85059595, + "num_input_tokens_seen": 309193575, + "step": 14336, + "time_per_iteration": 2.720153331756592 + }, + { + "auxiliary_loss_clip": 0.01164541, + "auxiliary_loss_mlp": 0.01101701, + "balance_loss_clip": 1.00194395, + "balance_loss_mlp": 1.00051618, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 2.20886743637745, + "language_loss": 0.6750493, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69771177, + "num_input_tokens_seen": 309212680, + "step": 14337, + "time_per_iteration": 2.5992941856384277 + }, + { + "auxiliary_loss_clip": 0.01119971, + "auxiliary_loss_mlp": 0.01100884, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00060487, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 2.5585291850894474, + "language_loss": 0.6734944, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69570297, + "num_input_tokens_seen": 309234485, + "step": 14338, + "time_per_iteration": 2.8591666221618652 + }, + { + "auxiliary_loss_clip": 0.01130305, + "auxiliary_loss_mlp": 0.01100891, + "balance_loss_clip": 1.00176561, + "balance_loss_mlp": 1.00051641, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 1.8943218362210008, + "language_loss": 0.61818683, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.64049876, + "num_input_tokens_seen": 309253630, + "step": 14339, + "time_per_iteration": 4.292147874832153 + }, + { + "auxiliary_loss_clip": 0.01133, + "auxiliary_loss_mlp": 0.00747414, + "balance_loss_clip": 1.00178277, + "balance_loss_mlp": 1.00051343, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 1.9037816166274606, + "language_loss": 0.62709773, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64590186, + "num_input_tokens_seen": 309270950, + "step": 14340, + "time_per_iteration": 2.6766905784606934 + }, + { + "auxiliary_loss_clip": 0.01101933, + "auxiliary_loss_mlp": 0.01099563, + "balance_loss_clip": 1.00174713, + "balance_loss_mlp": 1.00028539, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 2.127631647892058, + "language_loss": 0.80262518, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82464015, + "num_input_tokens_seen": 309288780, + "step": 14341, + "time_per_iteration": 2.823155641555786 + }, + { + "auxiliary_loss_clip": 0.01147554, + "auxiliary_loss_mlp": 0.01099962, + "balance_loss_clip": 1.00166512, + "balance_loss_mlp": 1.00049388, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 2.164914141991712, + "language_loss": 0.74785542, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.77033055, + "num_input_tokens_seen": 309310875, + "step": 14342, + "time_per_iteration": 2.7928900718688965 + }, + { + "auxiliary_loss_clip": 0.01114973, + "auxiliary_loss_mlp": 0.01101958, + "balance_loss_clip": 1.00183022, + "balance_loss_mlp": 1.00048733, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 1.7012831176228134, + "language_loss": 0.68914402, + "learning_rate": 1.95415287816028e-07, + "loss": 0.71131337, + "num_input_tokens_seen": 309329900, + "step": 14343, + "time_per_iteration": 2.7848799228668213 + }, + { + "auxiliary_loss_clip": 0.01147393, + "auxiliary_loss_mlp": 0.01101165, + "balance_loss_clip": 1.00179076, + "balance_loss_mlp": 1.0005523, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 1.8506528318693254, + "language_loss": 0.68013942, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.70262504, + "num_input_tokens_seen": 309347870, + "step": 14344, + "time_per_iteration": 4.13390851020813 + }, + { + "auxiliary_loss_clip": 0.01117224, + "auxiliary_loss_mlp": 0.01100813, + "balance_loss_clip": 1.00186348, + "balance_loss_mlp": 1.00058174, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.427355529592472, + "language_loss": 0.81247705, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83465737, + "num_input_tokens_seen": 309371695, + "step": 14345, + "time_per_iteration": 2.8376660346984863 + }, + { + "auxiliary_loss_clip": 0.01148912, + "auxiliary_loss_mlp": 0.01101098, + "balance_loss_clip": 1.00196409, + "balance_loss_mlp": 1.00048566, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.152097819078294, + "language_loss": 0.50323498, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52573508, + "num_input_tokens_seen": 309394645, + "step": 14346, + "time_per_iteration": 2.8101582527160645 + }, + { + "auxiliary_loss_clip": 0.01070987, + "auxiliary_loss_mlp": 0.01100025, + "balance_loss_clip": 1.00166607, + "balance_loss_mlp": 1.00046182, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.5986873287646963, + "language_loss": 0.75214159, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.77385169, + "num_input_tokens_seen": 309413170, + "step": 14347, + "time_per_iteration": 2.8614718914031982 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_clip": 1.00194943, + "balance_loss_mlp": 1.00043964, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 2.1271031908340365, + "language_loss": 0.80439746, + "learning_rate": 1.945766105774449e-07, + "loss": 0.82675087, + "num_input_tokens_seen": 309431315, + "step": 14348, + "time_per_iteration": 2.819490432739258 + }, + { + "auxiliary_loss_clip": 0.01147256, + "auxiliary_loss_mlp": 0.01099978, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00031877, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.660110659903758, + "language_loss": 0.66098696, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68345928, + "num_input_tokens_seen": 309453020, + "step": 14349, + "time_per_iteration": 2.803715467453003 + }, + { + "auxiliary_loss_clip": 0.01149348, + "auxiliary_loss_mlp": 0.01100939, + "balance_loss_clip": 1.00188935, + "balance_loss_mlp": 1.00056458, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 2.333655740208431, + "language_loss": 0.70146865, + "learning_rate": 1.942416188703573e-07, + "loss": 0.72397155, + "num_input_tokens_seen": 309469780, + "step": 14350, + "time_per_iteration": 2.6749143600463867 + }, + { + "auxiliary_loss_clip": 0.01133098, + "auxiliary_loss_mlp": 0.01101477, + "balance_loss_clip": 1.00185335, + "balance_loss_mlp": 1.0005784, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 2.5719289191603387, + "language_loss": 0.76999307, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.79233879, + "num_input_tokens_seen": 309489610, + "step": 14351, + "time_per_iteration": 2.767022132873535 + }, + { + "auxiliary_loss_clip": 0.01147508, + "auxiliary_loss_mlp": 0.01100282, + "balance_loss_clip": 1.00195956, + "balance_loss_mlp": 1.00043201, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 1.9942669537185729, + "language_loss": 0.84454137, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.8670193, + "num_input_tokens_seen": 309508295, + "step": 14352, + "time_per_iteration": 2.7855186462402344 + }, + { + "auxiliary_loss_clip": 0.01143334, + "auxiliary_loss_mlp": 0.01074362, + "balance_loss_clip": 1.00076795, + "balance_loss_mlp": 0.99997503, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.7869022918729563, + "language_loss": 0.61941421, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.64159119, + "num_input_tokens_seen": 309567960, + "step": 14353, + "time_per_iteration": 3.2288055419921875 + }, + { + "auxiliary_loss_clip": 0.01164151, + "auxiliary_loss_mlp": 0.01100167, + "balance_loss_clip": 1.00196803, + "balance_loss_mlp": 1.00046074, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.944005191702991, + "language_loss": 0.81540751, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.83805072, + "num_input_tokens_seen": 309586050, + "step": 14354, + "time_per_iteration": 2.560365915298462 + }, + { + "auxiliary_loss_clip": 0.01134437, + "auxiliary_loss_mlp": 0.0110057, + "balance_loss_clip": 1.0017029, + "balance_loss_mlp": 1.00043404, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 2.090198013077844, + "language_loss": 0.85315043, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87550056, + "num_input_tokens_seen": 309602910, + "step": 14355, + "time_per_iteration": 2.669215202331543 + }, + { + "auxiliary_loss_clip": 0.01116431, + "auxiliary_loss_mlp": 0.01100992, + "balance_loss_clip": 1.00180519, + "balance_loss_mlp": 1.00042725, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 2.6573979570860997, + "language_loss": 0.58809167, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.61026597, + "num_input_tokens_seen": 309621175, + "step": 14356, + "time_per_iteration": 2.72959566116333 + }, + { + "auxiliary_loss_clip": 0.01098834, + "auxiliary_loss_mlp": 0.01100978, + "balance_loss_clip": 1.00154257, + "balance_loss_mlp": 1.00050855, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.716186145777171, + "language_loss": 0.77017653, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.7921747, + "num_input_tokens_seen": 309639395, + "step": 14357, + "time_per_iteration": 2.746337890625 + }, + { + "auxiliary_loss_clip": 0.01147815, + "auxiliary_loss_mlp": 0.01102151, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00039399, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.8580793404457796, + "language_loss": 0.77541006, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79790974, + "num_input_tokens_seen": 309657265, + "step": 14358, + "time_per_iteration": 4.466938734054565 + }, + { + "auxiliary_loss_clip": 0.01120065, + "auxiliary_loss_mlp": 0.01101599, + "balance_loss_clip": 1.00177824, + "balance_loss_mlp": 1.00041413, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.6215837634108046, + "language_loss": 0.75286865, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77508533, + "num_input_tokens_seen": 309678610, + "step": 14359, + "time_per_iteration": 4.1117589473724365 + }, + { + "auxiliary_loss_clip": 0.01085757, + "auxiliary_loss_mlp": 0.01100695, + "balance_loss_clip": 1.00176692, + "balance_loss_mlp": 1.00041616, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.8589407444541914, + "language_loss": 0.70463967, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72650415, + "num_input_tokens_seen": 309697710, + "step": 14360, + "time_per_iteration": 2.8082547187805176 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01102139, + "balance_loss_clip": 1.00178516, + "balance_loss_mlp": 1.00047684, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 1.860733496688402, + "language_loss": 0.77065271, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.7928201, + "num_input_tokens_seen": 309715985, + "step": 14361, + "time_per_iteration": 2.6950531005859375 + }, + { + "auxiliary_loss_clip": 0.01158055, + "auxiliary_loss_mlp": 0.01074407, + "balance_loss_clip": 1.00070214, + "balance_loss_mlp": 1.00002003, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9428385525743191, + "language_loss": 0.58835542, + "learning_rate": 1.922374222645329e-07, + "loss": 0.61068004, + "num_input_tokens_seen": 309779930, + "step": 14362, + "time_per_iteration": 3.1425583362579346 + }, + { + "auxiliary_loss_clip": 0.01066288, + "auxiliary_loss_mlp": 0.01102054, + "balance_loss_clip": 1.0014286, + "balance_loss_mlp": 1.00048804, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 1.907183943666724, + "language_loss": 0.80829483, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82997823, + "num_input_tokens_seen": 309800580, + "step": 14363, + "time_per_iteration": 2.957080841064453 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.0110123, + "balance_loss_clip": 1.00170338, + "balance_loss_mlp": 1.00042629, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 2.855644861563666, + "language_loss": 0.72699052, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.74933016, + "num_input_tokens_seen": 309821725, + "step": 14364, + "time_per_iteration": 2.736436605453491 + }, + { + "auxiliary_loss_clip": 0.01134033, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_clip": 1.00172901, + "balance_loss_mlp": 1.00044739, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 1.5957131491005576, + "language_loss": 0.7174871, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73983991, + "num_input_tokens_seen": 309841565, + "step": 14365, + "time_per_iteration": 2.7398173809051514 + }, + { + "auxiliary_loss_clip": 0.01132577, + "auxiliary_loss_mlp": 0.01101944, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.00056803, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 1.9890248265666448, + "language_loss": 0.71399182, + "learning_rate": 1.915715498065993e-07, + "loss": 0.73633707, + "num_input_tokens_seen": 309858635, + "step": 14366, + "time_per_iteration": 2.7455015182495117 + }, + { + "auxiliary_loss_clip": 0.01132593, + "auxiliary_loss_mlp": 0.01100607, + "balance_loss_clip": 1.00180435, + "balance_loss_mlp": 1.00037575, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 1.7807282424929105, + "language_loss": 0.82181793, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.84414995, + "num_input_tokens_seen": 309877885, + "step": 14367, + "time_per_iteration": 2.736313581466675 + }, + { + "auxiliary_loss_clip": 0.01131155, + "auxiliary_loss_mlp": 0.01102009, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00044286, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 2.055648579664049, + "language_loss": 0.61319822, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63552988, + "num_input_tokens_seen": 309893140, + "step": 14368, + "time_per_iteration": 2.7438440322875977 + }, + { + "auxiliary_loss_clip": 0.01147905, + "auxiliary_loss_mlp": 0.01100801, + "balance_loss_clip": 1.00196791, + "balance_loss_mlp": 1.00047421, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 2.1547884009869134, + "language_loss": 0.76356137, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78604841, + "num_input_tokens_seen": 309914175, + "step": 14369, + "time_per_iteration": 2.7404325008392334 + }, + { + "auxiliary_loss_clip": 0.0113333, + "auxiliary_loss_mlp": 0.01101935, + "balance_loss_clip": 1.00170803, + "balance_loss_mlp": 1.0004642, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 2.330624340969294, + "language_loss": 0.64379984, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66615254, + "num_input_tokens_seen": 309932395, + "step": 14370, + "time_per_iteration": 2.7784104347229004 + }, + { + "auxiliary_loss_clip": 0.01086744, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00048816, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 1.6297145196383627, + "language_loss": 0.66315627, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68502378, + "num_input_tokens_seen": 309951720, + "step": 14371, + "time_per_iteration": 2.8062455654144287 + }, + { + "auxiliary_loss_clip": 0.01126618, + "auxiliary_loss_mlp": 0.0107438, + "balance_loss_clip": 1.00085139, + "balance_loss_mlp": 0.99999315, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8630342532237751, + "language_loss": 0.56948, + "learning_rate": 1.905747985193107e-07, + "loss": 0.59148997, + "num_input_tokens_seen": 310006120, + "step": 14372, + "time_per_iteration": 3.1647229194641113 + }, + { + "auxiliary_loss_clip": 0.01164214, + "auxiliary_loss_mlp": 0.01100629, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00058889, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 1.9904788609500432, + "language_loss": 0.79196, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81460845, + "num_input_tokens_seen": 310026740, + "step": 14373, + "time_per_iteration": 2.6640472412109375 + }, + { + "auxiliary_loss_clip": 0.01164268, + "auxiliary_loss_mlp": 0.01100762, + "balance_loss_clip": 1.00194287, + "balance_loss_mlp": 1.00043511, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 2.2825422179725976, + "language_loss": 0.6367076, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65935791, + "num_input_tokens_seen": 310044135, + "step": 14374, + "time_per_iteration": 2.6407713890075684 + }, + { + "auxiliary_loss_clip": 0.01133292, + "auxiliary_loss_mlp": 0.01100029, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00060809, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 2.237389530169865, + "language_loss": 0.77257228, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79490548, + "num_input_tokens_seen": 310061560, + "step": 14375, + "time_per_iteration": 2.690467357635498 + }, + { + "auxiliary_loss_clip": 0.01099111, + "auxiliary_loss_mlp": 0.00747224, + "balance_loss_clip": 1.00163102, + "balance_loss_mlp": 1.00034785, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 2.3734180107583387, + "language_loss": 0.60613137, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62459475, + "num_input_tokens_seen": 310087310, + "step": 14376, + "time_per_iteration": 3.118739366531372 + }, + { + "auxiliary_loss_clip": 0.01117232, + "auxiliary_loss_mlp": 0.01100905, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00053132, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 2.4116874456796094, + "language_loss": 0.66440064, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68658197, + "num_input_tokens_seen": 310106260, + "step": 14377, + "time_per_iteration": 4.231890678405762 + }, + { + "auxiliary_loss_clip": 0.01132811, + "auxiliary_loss_mlp": 0.01101556, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00065768, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.5895551281303768, + "language_loss": 0.70575106, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72809476, + "num_input_tokens_seen": 310125305, + "step": 14378, + "time_per_iteration": 2.6525518894195557 + }, + { + "auxiliary_loss_clip": 0.01141398, + "auxiliary_loss_mlp": 0.01074313, + "balance_loss_clip": 1.00067639, + "balance_loss_mlp": 0.99992603, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.8032302891958494, + "language_loss": 0.60306525, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62522238, + "num_input_tokens_seen": 310189270, + "step": 14379, + "time_per_iteration": 3.325514793395996 + }, + { + "auxiliary_loss_clip": 0.01133646, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_clip": 1.00196755, + "balance_loss_mlp": 1.00057483, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.626855851553116, + "language_loss": 0.7448796, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76722604, + "num_input_tokens_seen": 310208395, + "step": 14380, + "time_per_iteration": 2.7196407318115234 + }, + { + "auxiliary_loss_clip": 0.01135327, + "auxiliary_loss_mlp": 0.01101894, + "balance_loss_clip": 1.0019716, + "balance_loss_mlp": 1.00051856, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 4.4365561135500435, + "language_loss": 0.75269032, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.7750625, + "num_input_tokens_seen": 310227415, + "step": 14381, + "time_per_iteration": 4.051188230514526 + }, + { + "auxiliary_loss_clip": 0.01131023, + "auxiliary_loss_mlp": 0.0109973, + "balance_loss_clip": 1.00175893, + "balance_loss_mlp": 1.00045216, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 2.297909936122943, + "language_loss": 0.84210181, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86440933, + "num_input_tokens_seen": 310242625, + "step": 14382, + "time_per_iteration": 2.6717123985290527 + }, + { + "auxiliary_loss_clip": 0.01148851, + "auxiliary_loss_mlp": 0.0110073, + "balance_loss_clip": 1.00193691, + "balance_loss_mlp": 1.00045133, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 1.9507938583209345, + "language_loss": 0.75707054, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.77956629, + "num_input_tokens_seen": 310260585, + "step": 14383, + "time_per_iteration": 2.744027853012085 + }, + { + "auxiliary_loss_clip": 0.01132712, + "auxiliary_loss_mlp": 0.01100293, + "balance_loss_clip": 1.00184143, + "balance_loss_mlp": 1.00053906, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 1.7583807183913818, + "language_loss": 0.85110891, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87343895, + "num_input_tokens_seen": 310277210, + "step": 14384, + "time_per_iteration": 2.730332612991333 + }, + { + "auxiliary_loss_clip": 0.01146839, + "auxiliary_loss_mlp": 0.01099923, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00050294, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 1.871933868025928, + "language_loss": 0.80622542, + "learning_rate": 1.884236463176072e-07, + "loss": 0.82869303, + "num_input_tokens_seen": 310296610, + "step": 14385, + "time_per_iteration": 2.7280492782592773 + }, + { + "auxiliary_loss_clip": 0.01133007, + "auxiliary_loss_mlp": 0.01102673, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.0005343, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 1.9802950815013118, + "language_loss": 0.73048317, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.75283998, + "num_input_tokens_seen": 310316830, + "step": 14386, + "time_per_iteration": 2.7554614543914795 + }, + { + "auxiliary_loss_clip": 0.0114938, + "auxiliary_loss_mlp": 0.01102015, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00044858, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 2.4934785061701086, + "language_loss": 0.82126558, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84377956, + "num_input_tokens_seen": 310334355, + "step": 14387, + "time_per_iteration": 2.6750080585479736 + }, + { + "auxiliary_loss_clip": 0.01164093, + "auxiliary_loss_mlp": 0.01100324, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.00042665, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 1.9511003763576922, + "language_loss": 0.68602103, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70866513, + "num_input_tokens_seen": 310352900, + "step": 14388, + "time_per_iteration": 2.645735025405884 + }, + { + "auxiliary_loss_clip": 0.01114538, + "auxiliary_loss_mlp": 0.01099814, + "balance_loss_clip": 1.0017494, + "balance_loss_mlp": 1.0005368, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.5393648057422356, + "language_loss": 0.9035182, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92566168, + "num_input_tokens_seen": 310372855, + "step": 14389, + "time_per_iteration": 2.81832218170166 + }, + { + "auxiliary_loss_clip": 0.01098387, + "auxiliary_loss_mlp": 0.00747168, + "balance_loss_clip": 1.00166166, + "balance_loss_mlp": 1.00048304, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.5633592330925408, + "language_loss": 0.70880198, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72725749, + "num_input_tokens_seen": 310391595, + "step": 14390, + "time_per_iteration": 2.749300241470337 + }, + { + "auxiliary_loss_clip": 0.01164275, + "auxiliary_loss_mlp": 0.01101237, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00057673, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 1.7089689402959816, + "language_loss": 0.82387936, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84653449, + "num_input_tokens_seen": 310410090, + "step": 14391, + "time_per_iteration": 2.6982030868530273 + }, + { + "auxiliary_loss_clip": 0.01111911, + "auxiliary_loss_mlp": 0.01074832, + "balance_loss_clip": 1.00100088, + "balance_loss_mlp": 1.00006425, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.7994064554131598, + "language_loss": 0.67999822, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.70186567, + "num_input_tokens_seen": 310470055, + "step": 14392, + "time_per_iteration": 3.1809113025665283 + }, + { + "auxiliary_loss_clip": 0.01149776, + "auxiliary_loss_mlp": 0.01102498, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00045443, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 1.8722678682564338, + "language_loss": 0.75681823, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.77934098, + "num_input_tokens_seen": 310487665, + "step": 14393, + "time_per_iteration": 2.6482534408569336 + }, + { + "auxiliary_loss_clip": 0.01133001, + "auxiliary_loss_mlp": 0.01101187, + "balance_loss_clip": 1.00173569, + "balance_loss_mlp": 1.00047922, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 2.602923205628475, + "language_loss": 0.73722482, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.75956666, + "num_input_tokens_seen": 310506130, + "step": 14394, + "time_per_iteration": 2.6987950801849365 + }, + { + "auxiliary_loss_clip": 0.01149637, + "auxiliary_loss_mlp": 0.01101342, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00044322, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 2.1679411105007116, + "language_loss": 0.65373683, + "learning_rate": 1.867768130747036e-07, + "loss": 0.67624664, + "num_input_tokens_seen": 310532445, + "step": 14395, + "time_per_iteration": 4.505916357040405 + }, + { + "auxiliary_loss_clip": 0.0114978, + "auxiliary_loss_mlp": 0.01102338, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00048554, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 4.272224287341412, + "language_loss": 0.6751554, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.69767654, + "num_input_tokens_seen": 310552300, + "step": 14396, + "time_per_iteration": 4.0118727684021 + }, + { + "auxiliary_loss_clip": 0.01147614, + "auxiliary_loss_mlp": 0.01100982, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00055969, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 2.1036774888928798, + "language_loss": 0.69475555, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71724153, + "num_input_tokens_seen": 310572710, + "step": 14397, + "time_per_iteration": 2.6548733711242676 + }, + { + "auxiliary_loss_clip": 0.01134407, + "auxiliary_loss_mlp": 0.01100556, + "balance_loss_clip": 1.0017314, + "balance_loss_mlp": 1.00042033, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 1.6180392525696192, + "language_loss": 0.63711715, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65946668, + "num_input_tokens_seen": 310592460, + "step": 14398, + "time_per_iteration": 2.6838858127593994 + }, + { + "auxiliary_loss_clip": 0.0111787, + "auxiliary_loss_mlp": 0.01100495, + "balance_loss_clip": 1.00181663, + "balance_loss_mlp": 1.00040722, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 3.2590045396041383, + "language_loss": 0.76042265, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.7826063, + "num_input_tokens_seen": 310609375, + "step": 14399, + "time_per_iteration": 2.705044984817505 + }, + { + "auxiliary_loss_clip": 0.01147801, + "auxiliary_loss_mlp": 0.0110031, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00036478, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 2.5147729937943435, + "language_loss": 0.93263555, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95511663, + "num_input_tokens_seen": 310627405, + "step": 14400, + "time_per_iteration": 2.5732269287109375 + }, + { + "auxiliary_loss_clip": 0.01085998, + "auxiliary_loss_mlp": 0.01100785, + "balance_loss_clip": 1.00188529, + "balance_loss_mlp": 1.00064898, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 2.0076641039948426, + "language_loss": 0.67754614, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69941396, + "num_input_tokens_seen": 310649945, + "step": 14401, + "time_per_iteration": 2.865093469619751 + }, + { + "auxiliary_loss_clip": 0.01147502, + "auxiliary_loss_mlp": 0.01101626, + "balance_loss_clip": 1.00179803, + "balance_loss_mlp": 1.00044131, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 8.380983966297649, + "language_loss": 0.73597801, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.75846934, + "num_input_tokens_seen": 310668285, + "step": 14402, + "time_per_iteration": 2.6769661903381348 + }, + { + "auxiliary_loss_clip": 0.0107016, + "auxiliary_loss_mlp": 0.01100696, + "balance_loss_clip": 1.00179446, + "balance_loss_mlp": 1.00046492, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 1.7566772944406468, + "language_loss": 0.74967617, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.77138478, + "num_input_tokens_seen": 310687015, + "step": 14403, + "time_per_iteration": 2.8778107166290283 + }, + { + "auxiliary_loss_clip": 0.01133164, + "auxiliary_loss_mlp": 0.01102697, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00055897, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 1.8206608896783616, + "language_loss": 0.73029339, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75265199, + "num_input_tokens_seen": 310707580, + "step": 14404, + "time_per_iteration": 2.8177385330200195 + }, + { + "auxiliary_loss_clip": 0.01116034, + "auxiliary_loss_mlp": 0.01100831, + "balance_loss_clip": 1.00178921, + "balance_loss_mlp": 1.00050414, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 2.5530678309533505, + "language_loss": 0.70209795, + "learning_rate": 1.851368555901447e-07, + "loss": 0.72426659, + "num_input_tokens_seen": 310727300, + "step": 14405, + "time_per_iteration": 2.7894389629364014 + }, + { + "auxiliary_loss_clip": 0.01147827, + "auxiliary_loss_mlp": 0.00747418, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00050974, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 1.8350954075022694, + "language_loss": 0.66188025, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.68083274, + "num_input_tokens_seen": 310744935, + "step": 14406, + "time_per_iteration": 2.71124267578125 + }, + { + "auxiliary_loss_clip": 0.01132785, + "auxiliary_loss_mlp": 0.01100813, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.00048637, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.499999020173225, + "language_loss": 0.83385742, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.85619342, + "num_input_tokens_seen": 310765085, + "step": 14407, + "time_per_iteration": 2.7140727043151855 + }, + { + "auxiliary_loss_clip": 0.01147455, + "auxiliary_loss_mlp": 0.01100953, + "balance_loss_clip": 1.00188756, + "balance_loss_mlp": 1.00062633, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.9572782674941014, + "language_loss": 0.69998926, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72247332, + "num_input_tokens_seen": 310783260, + "step": 14408, + "time_per_iteration": 2.655318021774292 + }, + { + "auxiliary_loss_clip": 0.01147726, + "auxiliary_loss_mlp": 0.01100834, + "balance_loss_clip": 1.00185001, + "balance_loss_mlp": 1.00055528, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 2.070936320424662, + "language_loss": 0.77058041, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79306602, + "num_input_tokens_seen": 310801970, + "step": 14409, + "time_per_iteration": 2.64310359954834 + }, + { + "auxiliary_loss_clip": 0.01147631, + "auxiliary_loss_mlp": 0.01101965, + "balance_loss_clip": 1.00183606, + "balance_loss_mlp": 1.00049424, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 1.7787514613363444, + "language_loss": 0.76778758, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79028356, + "num_input_tokens_seen": 310822070, + "step": 14410, + "time_per_iteration": 2.791431427001953 + }, + { + "auxiliary_loss_clip": 0.01116628, + "auxiliary_loss_mlp": 0.01101296, + "balance_loss_clip": 1.00169921, + "balance_loss_mlp": 1.00058806, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 2.6516177401533985, + "language_loss": 0.77667129, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79885054, + "num_input_tokens_seen": 310838355, + "step": 14411, + "time_per_iteration": 2.653979778289795 + }, + { + "auxiliary_loss_clip": 0.01130483, + "auxiliary_loss_mlp": 0.01100318, + "balance_loss_clip": 1.00156939, + "balance_loss_mlp": 1.00056338, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 1.7264851193612958, + "language_loss": 0.73725367, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.75956166, + "num_input_tokens_seen": 310856055, + "step": 14412, + "time_per_iteration": 2.61801815032959 + }, + { + "auxiliary_loss_clip": 0.01149124, + "auxiliary_loss_mlp": 0.00747167, + "balance_loss_clip": 1.00186622, + "balance_loss_mlp": 1.00046492, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 1.6645872206723735, + "language_loss": 0.69800979, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71697271, + "num_input_tokens_seen": 310876695, + "step": 14413, + "time_per_iteration": 2.6309478282928467 + }, + { + "auxiliary_loss_clip": 0.01147381, + "auxiliary_loss_mlp": 0.0110046, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00051522, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.517285067547055, + "language_loss": 0.62579012, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.64826852, + "num_input_tokens_seen": 310893880, + "step": 14414, + "time_per_iteration": 2.6114158630371094 + }, + { + "auxiliary_loss_clip": 0.01115781, + "auxiliary_loss_mlp": 0.00747258, + "balance_loss_clip": 1.00173557, + "balance_loss_mlp": 1.0004077, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 1.662354608804366, + "language_loss": 0.63661182, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.6552422, + "num_input_tokens_seen": 310914145, + "step": 14415, + "time_per_iteration": 4.43119478225708 + }, + { + "auxiliary_loss_clip": 0.01109382, + "auxiliary_loss_mlp": 0.01074465, + "balance_loss_clip": 1.00081956, + "balance_loss_mlp": 1.0000782, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.7990394627740824, + "language_loss": 0.60415262, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62599105, + "num_input_tokens_seen": 310972825, + "step": 14416, + "time_per_iteration": 3.346714496612549 + }, + { + "auxiliary_loss_clip": 0.01148048, + "auxiliary_loss_mlp": 0.00747441, + "balance_loss_clip": 1.00175881, + "balance_loss_mlp": 1.00051236, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.6324650480192875, + "language_loss": 0.74337316, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76232803, + "num_input_tokens_seen": 310992050, + "step": 14417, + "time_per_iteration": 2.665555715560913 + }, + { + "auxiliary_loss_clip": 0.01130728, + "auxiliary_loss_mlp": 0.01100446, + "balance_loss_clip": 1.00173855, + "balance_loss_mlp": 1.00054836, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 1.5594976252469315, + "language_loss": 0.75267118, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77498293, + "num_input_tokens_seen": 311011105, + "step": 14418, + "time_per_iteration": 2.710482597351074 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01099668, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00053334, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.613770014113135, + "language_loss": 0.68281484, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70528722, + "num_input_tokens_seen": 311032080, + "step": 14419, + "time_per_iteration": 4.036527872085571 + }, + { + "auxiliary_loss_clip": 0.01149539, + "auxiliary_loss_mlp": 0.01100404, + "balance_loss_clip": 1.00173557, + "balance_loss_mlp": 1.00050688, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.6420543363371753, + "language_loss": 0.78830242, + "learning_rate": 1.826898250065465e-07, + "loss": 0.8108018, + "num_input_tokens_seen": 311049735, + "step": 14420, + "time_per_iteration": 2.6278598308563232 + }, + { + "auxiliary_loss_clip": 0.01149637, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00048494, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.620970895012667, + "language_loss": 0.83855754, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.86106014, + "num_input_tokens_seen": 311067675, + "step": 14421, + "time_per_iteration": 2.7086851596832275 + }, + { + "auxiliary_loss_clip": 0.01126011, + "auxiliary_loss_mlp": 0.01074585, + "balance_loss_clip": 1.00096631, + "balance_loss_mlp": 1.00019801, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.7001560974330155, + "language_loss": 0.49125424, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51326019, + "num_input_tokens_seen": 311126605, + "step": 14422, + "time_per_iteration": 3.2191848754882812 + }, + { + "auxiliary_loss_clip": 0.01131277, + "auxiliary_loss_mlp": 0.00747222, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00044203, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 2.0223055500807448, + "language_loss": 0.73483753, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75362253, + "num_input_tokens_seen": 311147325, + "step": 14423, + "time_per_iteration": 2.7862954139709473 + }, + { + "auxiliary_loss_clip": 0.01117756, + "auxiliary_loss_mlp": 0.01098719, + "balance_loss_clip": 1.00179338, + "balance_loss_mlp": 1.00034761, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.63480898281589, + "language_loss": 0.7705307, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.79269546, + "num_input_tokens_seen": 311165385, + "step": 14424, + "time_per_iteration": 2.7055203914642334 + }, + { + "auxiliary_loss_clip": 0.01117693, + "auxiliary_loss_mlp": 0.01100032, + "balance_loss_clip": 1.0017333, + "balance_loss_mlp": 1.00046897, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 1.9341744342915723, + "language_loss": 0.71635562, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73853284, + "num_input_tokens_seen": 311185860, + "step": 14425, + "time_per_iteration": 2.8464252948760986 + }, + { + "auxiliary_loss_clip": 0.01147662, + "auxiliary_loss_mlp": 0.01102165, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00050306, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.722109890554731, + "language_loss": 0.68215132, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70464951, + "num_input_tokens_seen": 311205810, + "step": 14426, + "time_per_iteration": 2.7120532989501953 + }, + { + "auxiliary_loss_clip": 0.01115697, + "auxiliary_loss_mlp": 0.01101475, + "balance_loss_clip": 1.00192702, + "balance_loss_mlp": 1.00038564, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 2.2263476588923496, + "language_loss": 0.70893335, + "learning_rate": 1.815531824008234e-07, + "loss": 0.73110509, + "num_input_tokens_seen": 311226080, + "step": 14427, + "time_per_iteration": 2.794664144515991 + }, + { + "auxiliary_loss_clip": 0.0111893, + "auxiliary_loss_mlp": 0.01100305, + "balance_loss_clip": 1.00173044, + "balance_loss_mlp": 1.00040746, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 11.518120065691946, + "language_loss": 0.6804136, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70260596, + "num_input_tokens_seen": 311246380, + "step": 14428, + "time_per_iteration": 2.7804651260375977 + }, + { + "auxiliary_loss_clip": 0.01131051, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_clip": 1.0016706, + "balance_loss_mlp": 1.00049543, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 2.8172295110953485, + "language_loss": 0.69983673, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72215354, + "num_input_tokens_seen": 311266465, + "step": 14429, + "time_per_iteration": 2.727628469467163 + }, + { + "auxiliary_loss_clip": 0.01131943, + "auxiliary_loss_mlp": 0.01100936, + "balance_loss_clip": 1.00182295, + "balance_loss_mlp": 1.00037146, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 1.9138344620421053, + "language_loss": 0.67084026, + "learning_rate": 1.810670840677151e-07, + "loss": 0.693169, + "num_input_tokens_seen": 311285075, + "step": 14430, + "time_per_iteration": 2.6695687770843506 + }, + { + "auxiliary_loss_clip": 0.01099619, + "auxiliary_loss_mlp": 0.01101188, + "balance_loss_clip": 1.00165892, + "balance_loss_mlp": 1.00067091, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 1.9684672684003848, + "language_loss": 0.69332498, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71533298, + "num_input_tokens_seen": 311303230, + "step": 14431, + "time_per_iteration": 2.7333121299743652 + }, + { + "auxiliary_loss_clip": 0.01149589, + "auxiliary_loss_mlp": 0.0110099, + "balance_loss_clip": 1.00189066, + "balance_loss_mlp": 1.00066376, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 3.5950668389939375, + "language_loss": 0.63386595, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65637177, + "num_input_tokens_seen": 311318070, + "step": 14432, + "time_per_iteration": 2.668212413787842 + }, + { + "auxiliary_loss_clip": 0.01148669, + "auxiliary_loss_mlp": 0.01100633, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00064015, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 41.98832646909237, + "language_loss": 0.78200769, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80450076, + "num_input_tokens_seen": 311334885, + "step": 14433, + "time_per_iteration": 4.168629884719849 + }, + { + "auxiliary_loss_clip": 0.01126745, + "auxiliary_loss_mlp": 0.01074324, + "balance_loss_clip": 1.00082636, + "balance_loss_mlp": 0.99993753, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7044965974562125, + "language_loss": 0.5847367, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60674739, + "num_input_tokens_seen": 311399780, + "step": 14434, + "time_per_iteration": 4.6681084632873535 + }, + { + "auxiliary_loss_clip": 0.01132891, + "auxiliary_loss_mlp": 0.01100035, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.00047171, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.6544275865616724, + "language_loss": 0.79751509, + "learning_rate": 1.802582997433628e-07, + "loss": 0.81984437, + "num_input_tokens_seen": 311419610, + "step": 14435, + "time_per_iteration": 2.8037753105163574 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.00747532, + "balance_loss_clip": 1.00166273, + "balance_loss_mlp": 1.00049961, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 1.8882360548548855, + "language_loss": 0.61915565, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.63795358, + "num_input_tokens_seen": 311440045, + "step": 14436, + "time_per_iteration": 2.883542060852051 + }, + { + "auxiliary_loss_clip": 0.01130882, + "auxiliary_loss_mlp": 0.01101394, + "balance_loss_clip": 1.00179827, + "balance_loss_mlp": 1.00054336, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 2.7354866879358615, + "language_loss": 0.70685995, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72918272, + "num_input_tokens_seen": 311456660, + "step": 14437, + "time_per_iteration": 2.759948492050171 + }, + { + "auxiliary_loss_clip": 0.01115546, + "auxiliary_loss_mlp": 0.01100864, + "balance_loss_clip": 1.00176013, + "balance_loss_mlp": 1.00044239, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 2.4465474519322887, + "language_loss": 0.80240953, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82457358, + "num_input_tokens_seen": 311475460, + "step": 14438, + "time_per_iteration": 2.9837491512298584 + }, + { + "auxiliary_loss_clip": 0.01147685, + "auxiliary_loss_mlp": 0.01099742, + "balance_loss_clip": 1.00169289, + "balance_loss_mlp": 1.00046468, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 1.925766567703513, + "language_loss": 0.67627394, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69874823, + "num_input_tokens_seen": 311494575, + "step": 14439, + "time_per_iteration": 2.697039842605591 + }, + { + "auxiliary_loss_clip": 0.01147416, + "auxiliary_loss_mlp": 0.0110011, + "balance_loss_clip": 1.00178516, + "balance_loss_mlp": 1.00049865, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.5096588330594507, + "language_loss": 0.63714904, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.65962428, + "num_input_tokens_seen": 311515805, + "step": 14440, + "time_per_iteration": 2.9279956817626953 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01100445, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00054801, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.4616283121919031, + "language_loss": 0.65664613, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.67914462, + "num_input_tokens_seen": 311536000, + "step": 14441, + "time_per_iteration": 2.684077024459839 + }, + { + "auxiliary_loss_clip": 0.01147872, + "auxiliary_loss_mlp": 0.01100452, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00041187, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 3.4343744821725966, + "language_loss": 0.66489011, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.6873734, + "num_input_tokens_seen": 311556220, + "step": 14442, + "time_per_iteration": 2.675879955291748 + }, + { + "auxiliary_loss_clip": 0.01133006, + "auxiliary_loss_mlp": 0.01102211, + "balance_loss_clip": 1.00172126, + "balance_loss_mlp": 1.00054967, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.7866806708369685, + "language_loss": 0.72741127, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74976343, + "num_input_tokens_seen": 311572530, + "step": 14443, + "time_per_iteration": 2.6612515449523926 + }, + { + "auxiliary_loss_clip": 0.01164104, + "auxiliary_loss_mlp": 0.01101188, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.00047982, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 2.1938669968471523, + "language_loss": 0.83219659, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85484952, + "num_input_tokens_seen": 311591105, + "step": 14444, + "time_per_iteration": 2.6862571239471436 + }, + { + "auxiliary_loss_clip": 0.01114169, + "auxiliary_loss_mlp": 0.01100747, + "balance_loss_clip": 1.00163424, + "balance_loss_mlp": 1.00051618, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 2.147706466038225, + "language_loss": 0.7704711, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79262024, + "num_input_tokens_seen": 311608350, + "step": 14445, + "time_per_iteration": 2.737161636352539 + }, + { + "auxiliary_loss_clip": 0.01147086, + "auxiliary_loss_mlp": 0.01101496, + "balance_loss_clip": 1.00191331, + "balance_loss_mlp": 1.00050187, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.9481623550368745, + "language_loss": 0.68077099, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.70325685, + "num_input_tokens_seen": 311626380, + "step": 14446, + "time_per_iteration": 2.685026168823242 + }, + { + "auxiliary_loss_clip": 0.01149569, + "auxiliary_loss_mlp": 0.01101006, + "balance_loss_clip": 1.00199533, + "balance_loss_mlp": 1.00039375, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.7319375938374182, + "language_loss": 0.82683134, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.8493371, + "num_input_tokens_seen": 311644345, + "step": 14447, + "time_per_iteration": 2.7974231243133545 + }, + { + "auxiliary_loss_clip": 0.01068274, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_clip": 1.0014776, + "balance_loss_mlp": 1.00037074, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.6364835541470233, + "language_loss": 0.74291754, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76460105, + "num_input_tokens_seen": 311663340, + "step": 14448, + "time_per_iteration": 3.0677969455718994 + }, + { + "auxiliary_loss_clip": 0.0113278, + "auxiliary_loss_mlp": 0.01100624, + "balance_loss_clip": 1.00186253, + "balance_loss_mlp": 1.00039279, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.11712588491874, + "language_loss": 0.81027806, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.83261204, + "num_input_tokens_seen": 311679860, + "step": 14449, + "time_per_iteration": 2.9582326412200928 + }, + { + "auxiliary_loss_clip": 0.01110417, + "auxiliary_loss_mlp": 0.01074437, + "balance_loss_clip": 1.00085902, + "balance_loss_mlp": 1.00005007, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8023434118000015, + "language_loss": 0.60561609, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62746465, + "num_input_tokens_seen": 311738135, + "step": 14450, + "time_per_iteration": 3.218696355819702 + }, + { + "auxiliary_loss_clip": 0.0111964, + "auxiliary_loss_mlp": 0.01102125, + "balance_loss_clip": 1.00174785, + "balance_loss_mlp": 1.00036848, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.6210129343948207, + "language_loss": 0.75929892, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78151655, + "num_input_tokens_seen": 311756975, + "step": 14451, + "time_per_iteration": 2.7844293117523193 + }, + { + "auxiliary_loss_clip": 0.01147398, + "auxiliary_loss_mlp": 0.01101038, + "balance_loss_clip": 1.00178218, + "balance_loss_mlp": 1.0004251, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 3.0693070412341976, + "language_loss": 0.72614479, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74862909, + "num_input_tokens_seen": 311771830, + "step": 14452, + "time_per_iteration": 2.695652484893799 + }, + { + "auxiliary_loss_clip": 0.01132708, + "auxiliary_loss_mlp": 0.00747323, + "balance_loss_clip": 1.00179589, + "balance_loss_mlp": 1.00040984, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.9642206221213847, + "language_loss": 0.72342777, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.74222809, + "num_input_tokens_seen": 311790130, + "step": 14453, + "time_per_iteration": 4.198333024978638 + }, + { + "auxiliary_loss_clip": 0.01149569, + "auxiliary_loss_mlp": 0.01100498, + "balance_loss_clip": 1.00193298, + "balance_loss_mlp": 1.0006007, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 1.9080268665974127, + "language_loss": 0.73539162, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.75789231, + "num_input_tokens_seen": 311808360, + "step": 14454, + "time_per_iteration": 2.691220998764038 + }, + { + "auxiliary_loss_clip": 0.01164298, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_clip": 1.00204325, + "balance_loss_mlp": 1.0004853, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 1.745925492708746, + "language_loss": 0.59658307, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61923802, + "num_input_tokens_seen": 311831325, + "step": 14455, + "time_per_iteration": 2.7374966144561768 + }, + { + "auxiliary_loss_clip": 0.01132766, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_clip": 1.00178826, + "balance_loss_mlp": 1.0004847, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.46143946119353, + "language_loss": 0.80088598, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.82322031, + "num_input_tokens_seen": 311848090, + "step": 14456, + "time_per_iteration": 4.060114145278931 + }, + { + "auxiliary_loss_clip": 0.01085212, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_clip": 1.00165677, + "balance_loss_mlp": 1.00043368, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 6.413113439870717, + "language_loss": 0.74608469, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76795965, + "num_input_tokens_seen": 311867855, + "step": 14457, + "time_per_iteration": 2.821775436401367 + }, + { + "auxiliary_loss_clip": 0.01099853, + "auxiliary_loss_mlp": 0.01100464, + "balance_loss_clip": 1.00172234, + "balance_loss_mlp": 1.00042307, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.6466141460969592, + "language_loss": 0.78602225, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80802542, + "num_input_tokens_seen": 311888675, + "step": 14458, + "time_per_iteration": 2.8416693210601807 + }, + { + "auxiliary_loss_clip": 0.01149104, + "auxiliary_loss_mlp": 0.011012, + "balance_loss_clip": 1.00194335, + "balance_loss_mlp": 1.00058699, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.8089345105438748, + "language_loss": 0.71150416, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73400724, + "num_input_tokens_seen": 311907310, + "step": 14459, + "time_per_iteration": 2.6670541763305664 + }, + { + "auxiliary_loss_clip": 0.01132835, + "auxiliary_loss_mlp": 0.01100012, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00059128, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.52026084443218, + "language_loss": 0.73868477, + "learning_rate": 1.762402701923398e-07, + "loss": 0.76101327, + "num_input_tokens_seen": 311929635, + "step": 14460, + "time_per_iteration": 2.7871711254119873 + }, + { + "auxiliary_loss_clip": 0.01132443, + "auxiliary_loss_mlp": 0.01100981, + "balance_loss_clip": 1.00175476, + "balance_loss_mlp": 1.00055873, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 1.9010494850507569, + "language_loss": 0.64977753, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.67211175, + "num_input_tokens_seen": 311948800, + "step": 14461, + "time_per_iteration": 2.726379632949829 + }, + { + "auxiliary_loss_clip": 0.01149531, + "auxiliary_loss_mlp": 0.01100949, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.00071776, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 2.02282386043982, + "language_loss": 0.82439387, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84689862, + "num_input_tokens_seen": 311964090, + "step": 14462, + "time_per_iteration": 2.7040510177612305 + }, + { + "auxiliary_loss_clip": 0.01149631, + "auxiliary_loss_mlp": 0.01102053, + "balance_loss_clip": 1.00193608, + "balance_loss_mlp": 1.00048709, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 3.2981777808707196, + "language_loss": 0.65487611, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67739302, + "num_input_tokens_seen": 311981460, + "step": 14463, + "time_per_iteration": 2.7012457847595215 + }, + { + "auxiliary_loss_clip": 0.01132077, + "auxiliary_loss_mlp": 0.01101535, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.0004456, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 1.8625034265047855, + "language_loss": 0.66482717, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.68716323, + "num_input_tokens_seen": 312000115, + "step": 14464, + "time_per_iteration": 2.821582555770874 + }, + { + "auxiliary_loss_clip": 0.01130939, + "auxiliary_loss_mlp": 0.01101387, + "balance_loss_clip": 1.00169253, + "balance_loss_mlp": 1.00048852, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 2.4065594032442963, + "language_loss": 0.63166857, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.65399182, + "num_input_tokens_seen": 312020770, + "step": 14465, + "time_per_iteration": 2.706519603729248 + }, + { + "auxiliary_loss_clip": 0.01147267, + "auxiliary_loss_mlp": 0.01099485, + "balance_loss_clip": 1.00178003, + "balance_loss_mlp": 1.00058901, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 8.692754956931163, + "language_loss": 0.84705186, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.86951935, + "num_input_tokens_seen": 312041870, + "step": 14466, + "time_per_iteration": 2.693875312805176 + }, + { + "auxiliary_loss_clip": 0.01133546, + "auxiliary_loss_mlp": 0.01102811, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00067234, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.265152495874987, + "language_loss": 0.62014651, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.64251006, + "num_input_tokens_seen": 312058210, + "step": 14467, + "time_per_iteration": 2.765887498855591 + }, + { + "auxiliary_loss_clip": 0.01163993, + "auxiliary_loss_mlp": 0.01099712, + "balance_loss_clip": 1.00181079, + "balance_loss_mlp": 1.00048232, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.507572682800507, + "language_loss": 0.6891377, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71177471, + "num_input_tokens_seen": 312082665, + "step": 14468, + "time_per_iteration": 2.7501697540283203 + }, + { + "auxiliary_loss_clip": 0.01131227, + "auxiliary_loss_mlp": 0.01100731, + "balance_loss_clip": 1.00172615, + "balance_loss_mlp": 1.00059533, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.6343975953988759, + "language_loss": 0.71055543, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73287499, + "num_input_tokens_seen": 312101960, + "step": 14469, + "time_per_iteration": 2.757150173187256 + }, + { + "auxiliary_loss_clip": 0.01146834, + "auxiliary_loss_mlp": 0.01099671, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00048923, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 8.6728284248767, + "language_loss": 0.84169793, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86416298, + "num_input_tokens_seen": 312117125, + "step": 14470, + "time_per_iteration": 2.6382253170013428 + }, + { + "auxiliary_loss_clip": 0.0113198, + "auxiliary_loss_mlp": 0.01100646, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00050986, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.714127591188968, + "language_loss": 0.73042727, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.7527535, + "num_input_tokens_seen": 312135775, + "step": 14471, + "time_per_iteration": 4.198477029800415 + }, + { + "auxiliary_loss_clip": 0.01164128, + "auxiliary_loss_mlp": 0.01100474, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.00048113, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 2.400824538449274, + "language_loss": 0.78938961, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.81203568, + "num_input_tokens_seen": 312156070, + "step": 14472, + "time_per_iteration": 4.019620895385742 + }, + { + "auxiliary_loss_clip": 0.01147979, + "auxiliary_loss_mlp": 0.00747389, + "balance_loss_clip": 1.0018487, + "balance_loss_mlp": 1.00045645, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 5.195243899915829, + "language_loss": 0.7264055, + "learning_rate": 1.741679706279644e-07, + "loss": 0.74535918, + "num_input_tokens_seen": 312174380, + "step": 14473, + "time_per_iteration": 2.6440517902374268 + }, + { + "auxiliary_loss_clip": 0.01164358, + "auxiliary_loss_mlp": 0.01100916, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.0004946, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.4227999604258381, + "language_loss": 0.7220878, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74474055, + "num_input_tokens_seen": 312195130, + "step": 14474, + "time_per_iteration": 2.7191855907440186 + }, + { + "auxiliary_loss_clip": 0.01130834, + "auxiliary_loss_mlp": 0.01101338, + "balance_loss_clip": 1.00166404, + "balance_loss_mlp": 1.0005343, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 2.196302510996823, + "language_loss": 0.67268401, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69500571, + "num_input_tokens_seen": 312212300, + "step": 14475, + "time_per_iteration": 2.686983585357666 + }, + { + "auxiliary_loss_clip": 0.0116417, + "auxiliary_loss_mlp": 0.01100708, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00038195, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.8996155574122506, + "language_loss": 0.77891779, + "learning_rate": 1.736914088262349e-07, + "loss": 0.8015666, + "num_input_tokens_seen": 312231735, + "step": 14476, + "time_per_iteration": 2.614194393157959 + }, + { + "auxiliary_loss_clip": 0.0114923, + "auxiliary_loss_mlp": 0.0110082, + "balance_loss_clip": 1.00192702, + "balance_loss_mlp": 1.00035048, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 1.8302170260342612, + "language_loss": 0.72077358, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74327409, + "num_input_tokens_seen": 312253060, + "step": 14477, + "time_per_iteration": 2.6471662521362305 + }, + { + "auxiliary_loss_clip": 0.01148597, + "auxiliary_loss_mlp": 0.01100673, + "balance_loss_clip": 1.00182188, + "balance_loss_mlp": 1.00039434, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 2.1284443650437406, + "language_loss": 0.59744799, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61994064, + "num_input_tokens_seen": 312269460, + "step": 14478, + "time_per_iteration": 2.6925032138824463 + }, + { + "auxiliary_loss_clip": 0.01149711, + "auxiliary_loss_mlp": 0.01100209, + "balance_loss_clip": 1.00205576, + "balance_loss_mlp": 1.00050187, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 3.0905557368893906, + "language_loss": 0.71361482, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73611403, + "num_input_tokens_seen": 312289830, + "step": 14479, + "time_per_iteration": 2.7623531818389893 + }, + { + "auxiliary_loss_clip": 0.01130821, + "auxiliary_loss_mlp": 0.01100815, + "balance_loss_clip": 1.00172949, + "balance_loss_mlp": 1.00048888, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.5145013453578708, + "language_loss": 0.70874977, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73106617, + "num_input_tokens_seen": 312311320, + "step": 14480, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.01101197, + "auxiliary_loss_mlp": 0.01101033, + "balance_loss_clip": 1.00161505, + "balance_loss_mlp": 1.00042045, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.6684826118169398, + "language_loss": 0.70092082, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72294319, + "num_input_tokens_seen": 312332095, + "step": 14481, + "time_per_iteration": 2.9079763889312744 + }, + { + "auxiliary_loss_clip": 0.01147311, + "auxiliary_loss_mlp": 0.01100342, + "balance_loss_clip": 1.00172162, + "balance_loss_mlp": 1.0004921, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 2.1569870760415464, + "language_loss": 0.76832634, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79080284, + "num_input_tokens_seen": 312351225, + "step": 14482, + "time_per_iteration": 2.6836330890655518 + }, + { + "auxiliary_loss_clip": 0.01131788, + "auxiliary_loss_mlp": 0.01100715, + "balance_loss_clip": 1.00175428, + "balance_loss_mlp": 1.00053191, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.6999127117056514, + "language_loss": 0.76503825, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78736329, + "num_input_tokens_seen": 312369730, + "step": 14483, + "time_per_iteration": 2.756638765335083 + }, + { + "auxiliary_loss_clip": 0.01149723, + "auxiliary_loss_mlp": 0.01101994, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00047517, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 2.284763795389073, + "language_loss": 0.61989361, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64241076, + "num_input_tokens_seen": 312386780, + "step": 14484, + "time_per_iteration": 2.6298446655273438 + }, + { + "auxiliary_loss_clip": 0.01164327, + "auxiliary_loss_mlp": 0.01101419, + "balance_loss_clip": 1.00204957, + "balance_loss_mlp": 1.00052047, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 1.7860031701450856, + "language_loss": 0.68071759, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70337504, + "num_input_tokens_seen": 312404875, + "step": 14485, + "time_per_iteration": 2.747013568878174 + }, + { + "auxiliary_loss_clip": 0.01084642, + "auxiliary_loss_mlp": 0.00747436, + "balance_loss_clip": 1.00159359, + "balance_loss_mlp": 1.00052285, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.9285320647917537, + "language_loss": 0.63324022, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.65156102, + "num_input_tokens_seen": 312425280, + "step": 14486, + "time_per_iteration": 2.9278883934020996 + }, + { + "auxiliary_loss_clip": 0.01164343, + "auxiliary_loss_mlp": 0.01102675, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.0005362, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 2.3909594397308562, + "language_loss": 0.61858445, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.6412546, + "num_input_tokens_seen": 312443835, + "step": 14487, + "time_per_iteration": 2.7140703201293945 + }, + { + "auxiliary_loss_clip": 0.01132958, + "auxiliary_loss_mlp": 0.00747158, + "balance_loss_clip": 1.0016464, + "balance_loss_mlp": 1.00046468, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 2.2969774814861332, + "language_loss": 0.67784238, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69664353, + "num_input_tokens_seen": 312460830, + "step": 14488, + "time_per_iteration": 2.7032909393310547 + }, + { + "auxiliary_loss_clip": 0.01132816, + "auxiliary_loss_mlp": 0.0074711, + "balance_loss_clip": 1.00182748, + "balance_loss_mlp": 1.00040185, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 1.80551491567702, + "language_loss": 0.85706258, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87586188, + "num_input_tokens_seen": 312477575, + "step": 14489, + "time_per_iteration": 2.6929233074188232 + }, + { + "auxiliary_loss_clip": 0.01147618, + "auxiliary_loss_mlp": 0.01101479, + "balance_loss_clip": 1.00182962, + "balance_loss_mlp": 1.00048447, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 2.028860502038943, + "language_loss": 0.75878382, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.78127486, + "num_input_tokens_seen": 312492140, + "step": 14490, + "time_per_iteration": 4.2215917110443115 + }, + { + "auxiliary_loss_clip": 0.01149115, + "auxiliary_loss_mlp": 0.0110166, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.0003798, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 2.0639254063816477, + "language_loss": 0.76294434, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78545213, + "num_input_tokens_seen": 312508400, + "step": 14491, + "time_per_iteration": 2.583411931991577 + }, + { + "auxiliary_loss_clip": 0.01115243, + "auxiliary_loss_mlp": 0.01100217, + "balance_loss_clip": 1.00175166, + "balance_loss_mlp": 1.00036776, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.6550223099058499, + "language_loss": 0.67205793, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69421256, + "num_input_tokens_seen": 312525915, + "step": 14492, + "time_per_iteration": 2.8221521377563477 + }, + { + "auxiliary_loss_clip": 0.01147242, + "auxiliary_loss_mlp": 0.01099659, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00038171, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 1.6664732209237296, + "language_loss": 0.69444734, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71691632, + "num_input_tokens_seen": 312544735, + "step": 14493, + "time_per_iteration": 4.129340171813965 + }, + { + "auxiliary_loss_clip": 0.0116427, + "auxiliary_loss_mlp": 0.01102377, + "balance_loss_clip": 1.00194144, + "balance_loss_mlp": 1.00052416, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.373537747763438, + "language_loss": 0.89319885, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.9158653, + "num_input_tokens_seen": 312557910, + "step": 14494, + "time_per_iteration": 2.5386171340942383 + }, + { + "auxiliary_loss_clip": 0.01114837, + "auxiliary_loss_mlp": 0.01099664, + "balance_loss_clip": 1.00183523, + "balance_loss_mlp": 1.00048172, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 1.7419358120482042, + "language_loss": 0.59185368, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61399871, + "num_input_tokens_seen": 312580360, + "step": 14495, + "time_per_iteration": 2.8787190914154053 + }, + { + "auxiliary_loss_clip": 0.0111591, + "auxiliary_loss_mlp": 0.0110064, + "balance_loss_clip": 1.0016458, + "balance_loss_mlp": 1.00050402, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 1.9678125527429675, + "language_loss": 0.80598247, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82814801, + "num_input_tokens_seen": 312597550, + "step": 14496, + "time_per_iteration": 2.6831114292144775 + }, + { + "auxiliary_loss_clip": 0.01132161, + "auxiliary_loss_mlp": 0.01101403, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00055182, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 2.0315540632810314, + "language_loss": 0.78594375, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.8082794, + "num_input_tokens_seen": 312616435, + "step": 14497, + "time_per_iteration": 2.8025104999542236 + }, + { + "auxiliary_loss_clip": 0.01164326, + "auxiliary_loss_mlp": 0.01101926, + "balance_loss_clip": 1.00186408, + "balance_loss_mlp": 1.0003593, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 1.9519664668014927, + "language_loss": 0.66643769, + "learning_rate": 1.70215677535406e-07, + "loss": 0.68910027, + "num_input_tokens_seen": 312632770, + "step": 14498, + "time_per_iteration": 2.595189332962036 + }, + { + "auxiliary_loss_clip": 0.01113914, + "auxiliary_loss_mlp": 0.01100255, + "balance_loss_clip": 1.00172567, + "balance_loss_mlp": 1.00040579, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.9001188273891771, + "language_loss": 0.57344079, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59558249, + "num_input_tokens_seen": 312651900, + "step": 14499, + "time_per_iteration": 2.8158907890319824 + }, + { + "auxiliary_loss_clip": 0.01116279, + "auxiliary_loss_mlp": 0.01101812, + "balance_loss_clip": 1.00177205, + "balance_loss_mlp": 1.00053132, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 2.171590007232131, + "language_loss": 0.79737657, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.81955743, + "num_input_tokens_seen": 312671380, + "step": 14500, + "time_per_iteration": 2.9247589111328125 + }, + { + "auxiliary_loss_clip": 0.01147889, + "auxiliary_loss_mlp": 0.01101834, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00055385, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 2.8906971061350917, + "language_loss": 0.73245841, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.75495565, + "num_input_tokens_seen": 312689215, + "step": 14501, + "time_per_iteration": 2.6546924114227295 + }, + { + "auxiliary_loss_clip": 0.01118488, + "auxiliary_loss_mlp": 0.01101892, + "balance_loss_clip": 1.00179791, + "balance_loss_mlp": 1.00051594, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.5849058434836831, + "language_loss": 0.64311361, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66531742, + "num_input_tokens_seen": 312706400, + "step": 14502, + "time_per_iteration": 2.708902597427368 + }, + { + "auxiliary_loss_clip": 0.01132486, + "auxiliary_loss_mlp": 0.01101489, + "balance_loss_clip": 1.00175047, + "balance_loss_mlp": 1.00049472, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 2.6252206350211136, + "language_loss": 0.68784499, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.71018475, + "num_input_tokens_seen": 312727985, + "step": 14503, + "time_per_iteration": 2.8318026065826416 + }, + { + "auxiliary_loss_clip": 0.01130994, + "auxiliary_loss_mlp": 0.01100337, + "balance_loss_clip": 1.00176668, + "balance_loss_mlp": 1.00058246, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 2.288573393107711, + "language_loss": 0.69668663, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71899992, + "num_input_tokens_seen": 312745025, + "step": 14504, + "time_per_iteration": 2.6909656524658203 + }, + { + "auxiliary_loss_clip": 0.01147555, + "auxiliary_loss_mlp": 0.00747474, + "balance_loss_clip": 1.00183761, + "balance_loss_mlp": 1.00050366, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 4.3157477936716955, + "language_loss": 0.6998207, + "learning_rate": 1.691168026385552e-07, + "loss": 0.71877098, + "num_input_tokens_seen": 312764170, + "step": 14505, + "time_per_iteration": 2.672487497329712 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.01099801, + "balance_loss_clip": 1.00179803, + "balance_loss_mlp": 1.00033283, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.5569177395952418, + "language_loss": 0.78216326, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80448174, + "num_input_tokens_seen": 312783830, + "step": 14506, + "time_per_iteration": 2.697200298309326 + }, + { + "auxiliary_loss_clip": 0.01133033, + "auxiliary_loss_mlp": 0.01101744, + "balance_loss_clip": 1.0017184, + "balance_loss_mlp": 1.0004636, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.2671072560482073, + "language_loss": 0.74126422, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76361191, + "num_input_tokens_seen": 312802015, + "step": 14507, + "time_per_iteration": 2.7219016551971436 + }, + { + "auxiliary_loss_clip": 0.01084646, + "auxiliary_loss_mlp": 0.01101995, + "balance_loss_clip": 1.00162148, + "balance_loss_mlp": 1.00052357, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 3.0995604492094193, + "language_loss": 0.72118497, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74305141, + "num_input_tokens_seen": 312820650, + "step": 14508, + "time_per_iteration": 2.8074729442596436 + }, + { + "auxiliary_loss_clip": 0.0113097, + "auxiliary_loss_mlp": 0.011023, + "balance_loss_clip": 1.00180924, + "balance_loss_mlp": 1.00054276, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 1.7535470689193715, + "language_loss": 0.68327647, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.7056092, + "num_input_tokens_seen": 312841310, + "step": 14509, + "time_per_iteration": 4.165515661239624 + }, + { + "auxiliary_loss_clip": 0.01130859, + "auxiliary_loss_mlp": 0.01101331, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00047994, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 2.259992113425763, + "language_loss": 0.58226043, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60458231, + "num_input_tokens_seen": 312862100, + "step": 14510, + "time_per_iteration": 4.147928476333618 + }, + { + "auxiliary_loss_clip": 0.01164459, + "auxiliary_loss_mlp": 0.01103042, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00052178, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 2.702119002139927, + "language_loss": 0.67465413, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69732916, + "num_input_tokens_seen": 312880220, + "step": 14511, + "time_per_iteration": 2.6823971271514893 + }, + { + "auxiliary_loss_clip": 0.01087765, + "auxiliary_loss_mlp": 0.01100649, + "balance_loss_clip": 1.00160551, + "balance_loss_mlp": 1.00046563, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 1.74670351795663, + "language_loss": 0.81979197, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.84167612, + "num_input_tokens_seen": 312900765, + "step": 14512, + "time_per_iteration": 3.0490410327911377 + }, + { + "auxiliary_loss_clip": 0.01126883, + "auxiliary_loss_mlp": 0.01074474, + "balance_loss_clip": 1.00071788, + "balance_loss_mlp": 1.00008762, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.7835826826723145, + "language_loss": 0.58635449, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60836804, + "num_input_tokens_seen": 312955840, + "step": 14513, + "time_per_iteration": 3.435011148452759 + }, + { + "auxiliary_loss_clip": 0.01147586, + "auxiliary_loss_mlp": 0.01101012, + "balance_loss_clip": 1.00185049, + "balance_loss_mlp": 1.00035167, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.6812439054457784, + "language_loss": 0.76906824, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.79155421, + "num_input_tokens_seen": 312973565, + "step": 14514, + "time_per_iteration": 2.66110897064209 + }, + { + "auxiliary_loss_clip": 0.01149724, + "auxiliary_loss_mlp": 0.01101953, + "balance_loss_clip": 1.00195467, + "balance_loss_mlp": 1.00048184, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 4.052613535240791, + "language_loss": 0.65095836, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67347515, + "num_input_tokens_seen": 312994660, + "step": 14515, + "time_per_iteration": 2.6737582683563232 + }, + { + "auxiliary_loss_clip": 0.01149629, + "auxiliary_loss_mlp": 0.01101418, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.00080585, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 2.543721824673727, + "language_loss": 0.78959459, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81210506, + "num_input_tokens_seen": 313009860, + "step": 14516, + "time_per_iteration": 2.6480515003204346 + }, + { + "auxiliary_loss_clip": 0.01164362, + "auxiliary_loss_mlp": 0.01102087, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.00061584, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 2.158465883607917, + "language_loss": 0.71920639, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74187088, + "num_input_tokens_seen": 313027025, + "step": 14517, + "time_per_iteration": 2.558349370956421 + }, + { + "auxiliary_loss_clip": 0.01113859, + "auxiliary_loss_mlp": 0.010997, + "balance_loss_clip": 1.0017035, + "balance_loss_mlp": 1.00037444, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 2.107632220393425, + "language_loss": 0.72520232, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74733794, + "num_input_tokens_seen": 313046830, + "step": 14518, + "time_per_iteration": 2.7211551666259766 + }, + { + "auxiliary_loss_clip": 0.01134111, + "auxiliary_loss_mlp": 0.01099852, + "balance_loss_clip": 1.00182211, + "balance_loss_mlp": 1.00052738, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.4235722383606928, + "language_loss": 0.74266595, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76500559, + "num_input_tokens_seen": 313067715, + "step": 14519, + "time_per_iteration": 2.736679792404175 + }, + { + "auxiliary_loss_clip": 0.01147482, + "auxiliary_loss_mlp": 0.01101646, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00050855, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.702305348806926, + "language_loss": 0.76198441, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78447568, + "num_input_tokens_seen": 313082305, + "step": 14520, + "time_per_iteration": 2.6046886444091797 + }, + { + "auxiliary_loss_clip": 0.01130307, + "auxiliary_loss_mlp": 0.01101287, + "balance_loss_clip": 1.00173163, + "balance_loss_mlp": 1.00053132, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 1.8804433406138954, + "language_loss": 0.82179379, + "learning_rate": 1.666178664801816e-07, + "loss": 0.84410971, + "num_input_tokens_seen": 313101190, + "step": 14521, + "time_per_iteration": 2.7085418701171875 + }, + { + "auxiliary_loss_clip": 0.01147765, + "auxiliary_loss_mlp": 0.01101535, + "balance_loss_clip": 1.00177169, + "balance_loss_mlp": 1.00054133, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 2.2263232557775234, + "language_loss": 0.7590304, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78152335, + "num_input_tokens_seen": 313118965, + "step": 14522, + "time_per_iteration": 2.6177175045013428 + }, + { + "auxiliary_loss_clip": 0.01149346, + "auxiliary_loss_mlp": 0.00747165, + "balance_loss_clip": 1.00188231, + "balance_loss_mlp": 1.00035524, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 1.740998938829455, + "language_loss": 0.75495899, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77392405, + "num_input_tokens_seen": 313139280, + "step": 14523, + "time_per_iteration": 2.7982242107391357 + }, + { + "auxiliary_loss_clip": 0.01149486, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_clip": 1.00184083, + "balance_loss_mlp": 1.00058353, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 2.7400820982371803, + "language_loss": 0.78769934, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.81019855, + "num_input_tokens_seen": 313156655, + "step": 14524, + "time_per_iteration": 2.6381545066833496 + }, + { + "auxiliary_loss_clip": 0.01134804, + "auxiliary_loss_mlp": 0.01099935, + "balance_loss_clip": 1.00180554, + "balance_loss_mlp": 1.00056195, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 6.513545735115648, + "language_loss": 0.77632332, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.79867065, + "num_input_tokens_seen": 313174050, + "step": 14525, + "time_per_iteration": 2.7195043563842773 + }, + { + "auxiliary_loss_clip": 0.01031843, + "auxiliary_loss_mlp": 0.01101599, + "balance_loss_clip": 1.0013423, + "balance_loss_mlp": 1.0007, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.8241366682151454, + "language_loss": 0.69264662, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71398103, + "num_input_tokens_seen": 313192765, + "step": 14526, + "time_per_iteration": 3.1233863830566406 + }, + { + "auxiliary_loss_clip": 0.01097926, + "auxiliary_loss_mlp": 0.01103025, + "balance_loss_clip": 1.00157619, + "balance_loss_mlp": 1.00060034, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 1.9539093023140253, + "language_loss": 0.60870415, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.6307137, + "num_input_tokens_seen": 313210925, + "step": 14527, + "time_per_iteration": 3.101749897003174 + }, + { + "auxiliary_loss_clip": 0.01148063, + "auxiliary_loss_mlp": 0.0110274, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00069714, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.8627840806216196, + "language_loss": 0.65775824, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.68026626, + "num_input_tokens_seen": 313228250, + "step": 14528, + "time_per_iteration": 4.160964488983154 + }, + { + "auxiliary_loss_clip": 0.01118209, + "auxiliary_loss_mlp": 0.01100469, + "balance_loss_clip": 1.00184608, + "balance_loss_mlp": 1.00042844, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 2.169166269349145, + "language_loss": 0.89662272, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.91880953, + "num_input_tokens_seen": 313247880, + "step": 14529, + "time_per_iteration": 2.7352428436279297 + }, + { + "auxiliary_loss_clip": 0.01132973, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_clip": 1.00193381, + "balance_loss_mlp": 1.00049937, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 1.7576808000015516, + "language_loss": 0.84892422, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.8712517, + "num_input_tokens_seen": 313266790, + "step": 14530, + "time_per_iteration": 2.7742199897766113 + }, + { + "auxiliary_loss_clip": 0.01133384, + "auxiliary_loss_mlp": 0.01100396, + "balance_loss_clip": 1.0017693, + "balance_loss_mlp": 1.00059438, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.6623847111875367, + "language_loss": 0.74519616, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76753402, + "num_input_tokens_seen": 313286805, + "step": 14531, + "time_per_iteration": 4.712419271469116 + }, + { + "auxiliary_loss_clip": 0.01148961, + "auxiliary_loss_mlp": 0.0110001, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00068521, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 2.5796430967956705, + "language_loss": 0.61473364, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.6372233, + "num_input_tokens_seen": 313305415, + "step": 14532, + "time_per_iteration": 2.7000272274017334 + }, + { + "auxiliary_loss_clip": 0.01141427, + "auxiliary_loss_mlp": 0.01074699, + "balance_loss_clip": 1.00070965, + "balance_loss_mlp": 1.00031221, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.8190271939256296, + "language_loss": 0.5874083, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60956961, + "num_input_tokens_seen": 313369940, + "step": 14533, + "time_per_iteration": 3.3186984062194824 + }, + { + "auxiliary_loss_clip": 0.01133061, + "auxiliary_loss_mlp": 0.0110009, + "balance_loss_clip": 1.00184679, + "balance_loss_mlp": 1.00066972, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.378787130916364, + "language_loss": 0.76499033, + "learning_rate": 1.646005846335954e-07, + "loss": 0.78732187, + "num_input_tokens_seen": 313390965, + "step": 14534, + "time_per_iteration": 2.745065689086914 + }, + { + "auxiliary_loss_clip": 0.0113325, + "auxiliary_loss_mlp": 0.01101521, + "balance_loss_clip": 1.00191855, + "balance_loss_mlp": 1.00076532, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 2.0055183439557283, + "language_loss": 0.74920094, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77154869, + "num_input_tokens_seen": 313409680, + "step": 14535, + "time_per_iteration": 2.7042489051818848 + }, + { + "auxiliary_loss_clip": 0.01164195, + "auxiliary_loss_mlp": 0.01100677, + "balance_loss_clip": 1.00184917, + "balance_loss_mlp": 1.00054109, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 1.9372937869276268, + "language_loss": 0.74530911, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76795781, + "num_input_tokens_seen": 313431335, + "step": 14536, + "time_per_iteration": 2.7136762142181396 + }, + { + "auxiliary_loss_clip": 0.01133015, + "auxiliary_loss_mlp": 0.01101004, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.00067782, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1.7268026144250173, + "language_loss": 0.63793659, + "learning_rate": 1.641367279482304e-07, + "loss": 0.66027671, + "num_input_tokens_seen": 313449225, + "step": 14537, + "time_per_iteration": 2.667005777359009 + }, + { + "auxiliary_loss_clip": 0.01149546, + "auxiliary_loss_mlp": 0.01100642, + "balance_loss_clip": 1.00181699, + "balance_loss_mlp": 1.0004108, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 1.8406183653767534, + "language_loss": 0.58652592, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.6090278, + "num_input_tokens_seen": 313467715, + "step": 14538, + "time_per_iteration": 2.6826961040496826 + }, + { + "auxiliary_loss_clip": 0.01149166, + "auxiliary_loss_mlp": 0.01100468, + "balance_loss_clip": 1.00205266, + "balance_loss_mlp": 1.0005703, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 1.9899531913513588, + "language_loss": 0.68416941, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70666575, + "num_input_tokens_seen": 313486805, + "step": 14539, + "time_per_iteration": 2.6807985305786133 + }, + { + "auxiliary_loss_clip": 0.01149763, + "auxiliary_loss_mlp": 0.01101612, + "balance_loss_clip": 1.00180066, + "balance_loss_mlp": 1.0006175, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 2.639858715235154, + "language_loss": 0.74160743, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76412112, + "num_input_tokens_seen": 313504880, + "step": 14540, + "time_per_iteration": 2.6780965328216553 + }, + { + "auxiliary_loss_clip": 0.01132477, + "auxiliary_loss_mlp": 0.0110072, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.0005362, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 1.8118790555213544, + "language_loss": 0.79212272, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81445467, + "num_input_tokens_seen": 313524995, + "step": 14541, + "time_per_iteration": 2.7422237396240234 + }, + { + "auxiliary_loss_clip": 0.0110161, + "auxiliary_loss_mlp": 0.01102265, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00069857, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 3.4786001284778303, + "language_loss": 0.66397136, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.68601012, + "num_input_tokens_seen": 313541740, + "step": 14542, + "time_per_iteration": 2.7605865001678467 + }, + { + "auxiliary_loss_clip": 0.01158041, + "auxiliary_loss_mlp": 0.01074098, + "balance_loss_clip": 1.00071645, + "balance_loss_mlp": 1.00009274, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.7924384049332875, + "language_loss": 0.54466093, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56698227, + "num_input_tokens_seen": 313593445, + "step": 14543, + "time_per_iteration": 3.0014781951904297 + }, + { + "auxiliary_loss_clip": 0.0113325, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_clip": 1.00194955, + "balance_loss_mlp": 1.00054836, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 2.1764839239283527, + "language_loss": 0.69634855, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71869552, + "num_input_tokens_seen": 313615640, + "step": 14544, + "time_per_iteration": 2.7027273178100586 + }, + { + "auxiliary_loss_clip": 0.01099682, + "auxiliary_loss_mlp": 0.01099621, + "balance_loss_clip": 1.00169766, + "balance_loss_mlp": 1.0005821, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.63570542830142, + "language_loss": 0.75745463, + "learning_rate": 1.62902840325714e-07, + "loss": 0.77944767, + "num_input_tokens_seen": 313635550, + "step": 14545, + "time_per_iteration": 2.8467252254486084 + }, + { + "auxiliary_loss_clip": 0.01149606, + "auxiliary_loss_mlp": 0.00747476, + "balance_loss_clip": 1.00189447, + "balance_loss_mlp": 1.00044966, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.7483365420913268, + "language_loss": 0.6621874, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68115824, + "num_input_tokens_seen": 313659275, + "step": 14546, + "time_per_iteration": 2.931858539581299 + }, + { + "auxiliary_loss_clip": 0.01164145, + "auxiliary_loss_mlp": 0.01101063, + "balance_loss_clip": 1.00187147, + "balance_loss_mlp": 1.00049782, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.5750140071605838, + "language_loss": 0.73277605, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75542814, + "num_input_tokens_seen": 313680595, + "step": 14547, + "time_per_iteration": 4.129262208938599 + }, + { + "auxiliary_loss_clip": 0.01164487, + "auxiliary_loss_mlp": 0.01103145, + "balance_loss_clip": 1.0019443, + "balance_loss_mlp": 1.00081611, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 2.5391591744547477, + "language_loss": 0.69879627, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.72147262, + "num_input_tokens_seen": 313699730, + "step": 14548, + "time_per_iteration": 4.130057334899902 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01101562, + "balance_loss_clip": 1.00169587, + "balance_loss_mlp": 1.00066328, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 1.927455577395446, + "language_loss": 0.70402277, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.72634888, + "num_input_tokens_seen": 313720090, + "step": 14549, + "time_per_iteration": 2.6920766830444336 + }, + { + "auxiliary_loss_clip": 0.01149789, + "auxiliary_loss_mlp": 0.00747481, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00050616, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.462237864601654, + "language_loss": 0.84107786, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.86005056, + "num_input_tokens_seen": 313736795, + "step": 14550, + "time_per_iteration": 2.6772241592407227 + }, + { + "auxiliary_loss_clip": 0.01147638, + "auxiliary_loss_mlp": 0.01101429, + "balance_loss_clip": 1.0018239, + "balance_loss_mlp": 1.00072074, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 1.7051132420425785, + "language_loss": 0.71621478, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.73870552, + "num_input_tokens_seen": 313754820, + "step": 14551, + "time_per_iteration": 2.6569771766662598 + }, + { + "auxiliary_loss_clip": 0.01149423, + "auxiliary_loss_mlp": 0.00747301, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.0004549, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 2.0565746975803574, + "language_loss": 0.64508569, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.66405296, + "num_input_tokens_seen": 313775830, + "step": 14552, + "time_per_iteration": 2.668079137802124 + }, + { + "auxiliary_loss_clip": 0.0111628, + "auxiliary_loss_mlp": 0.01101611, + "balance_loss_clip": 1.00172126, + "balance_loss_mlp": 1.00042593, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 1.8147147855129688, + "language_loss": 0.79721445, + "learning_rate": 1.616734111284479e-07, + "loss": 0.8193934, + "num_input_tokens_seen": 313795745, + "step": 14553, + "time_per_iteration": 2.823148250579834 + }, + { + "auxiliary_loss_clip": 0.01149943, + "auxiliary_loss_mlp": 0.0110113, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.00061321, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 2.1090100663984344, + "language_loss": 0.69521159, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.7177223, + "num_input_tokens_seen": 313813895, + "step": 14554, + "time_per_iteration": 2.5735044479370117 + }, + { + "auxiliary_loss_clip": 0.01131377, + "auxiliary_loss_mlp": 0.00747257, + "balance_loss_clip": 1.00182784, + "balance_loss_mlp": 1.00038648, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.7416516657119294, + "language_loss": 0.84003365, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85881996, + "num_input_tokens_seen": 313834225, + "step": 14555, + "time_per_iteration": 2.7378482818603516 + }, + { + "auxiliary_loss_clip": 0.01147521, + "auxiliary_loss_mlp": 0.01100952, + "balance_loss_clip": 1.00182128, + "balance_loss_mlp": 1.00062537, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.8843847580383426, + "language_loss": 0.71018529, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73267007, + "num_input_tokens_seen": 313854430, + "step": 14556, + "time_per_iteration": 2.653383493423462 + }, + { + "auxiliary_loss_clip": 0.01134034, + "auxiliary_loss_mlp": 0.01102259, + "balance_loss_clip": 1.00176108, + "balance_loss_mlp": 1.000597, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 2.1573992422119685, + "language_loss": 0.76847011, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.790833, + "num_input_tokens_seen": 313871600, + "step": 14557, + "time_per_iteration": 2.751188278198242 + }, + { + "auxiliary_loss_clip": 0.01116388, + "auxiliary_loss_mlp": 0.01101387, + "balance_loss_clip": 1.00183272, + "balance_loss_mlp": 1.00087023, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 2.06768619392753, + "language_loss": 0.82892966, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.85110742, + "num_input_tokens_seen": 313891570, + "step": 14558, + "time_per_iteration": 2.746976375579834 + }, + { + "auxiliary_loss_clip": 0.01158049, + "auxiliary_loss_mlp": 0.01074787, + "balance_loss_clip": 1.00068188, + "balance_loss_mlp": 1.00040078, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.8062342939639073, + "language_loss": 0.56104195, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58337027, + "num_input_tokens_seen": 313951290, + "step": 14559, + "time_per_iteration": 3.1594302654266357 + }, + { + "auxiliary_loss_clip": 0.01148034, + "auxiliary_loss_mlp": 0.01099801, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00061905, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 1.6337305478193034, + "language_loss": 0.65829968, + "learning_rate": 1.606013202286407e-07, + "loss": 0.68077803, + "num_input_tokens_seen": 313968645, + "step": 14560, + "time_per_iteration": 2.692058801651001 + }, + { + "auxiliary_loss_clip": 0.01164121, + "auxiliary_loss_mlp": 0.01100866, + "balance_loss_clip": 1.00184417, + "balance_loss_mlp": 1.00063515, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 1.8415560832594637, + "language_loss": 0.78988898, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.81253886, + "num_input_tokens_seen": 313987580, + "step": 14561, + "time_per_iteration": 2.658606767654419 + }, + { + "auxiliary_loss_clip": 0.01164336, + "auxiliary_loss_mlp": 0.01101377, + "balance_loss_clip": 1.00183618, + "balance_loss_mlp": 1.00076461, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 3.3496736715777384, + "language_loss": 0.77590585, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.798563, + "num_input_tokens_seen": 314004460, + "step": 14562, + "time_per_iteration": 2.635871171951294 + }, + { + "auxiliary_loss_clip": 0.01163926, + "auxiliary_loss_mlp": 0.01098672, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00063455, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 1.5397365143486113, + "language_loss": 0.71749127, + "learning_rate": 1.601428988367981e-07, + "loss": 0.74011731, + "num_input_tokens_seen": 314026855, + "step": 14563, + "time_per_iteration": 2.6905243396759033 + }, + { + "auxiliary_loss_clip": 0.01164451, + "auxiliary_loss_mlp": 0.01101629, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.00073016, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.955583421912965, + "language_loss": 0.65344268, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67610347, + "num_input_tokens_seen": 314042830, + "step": 14564, + "time_per_iteration": 2.6796162128448486 + }, + { + "auxiliary_loss_clip": 0.01147326, + "auxiliary_loss_mlp": 0.01101071, + "balance_loss_clip": 1.00169826, + "balance_loss_mlp": 1.00074482, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.8668252924386795, + "language_loss": 0.70790046, + "learning_rate": 1.598376334037408e-07, + "loss": 0.73038441, + "num_input_tokens_seen": 314062225, + "step": 14565, + "time_per_iteration": 2.6237149238586426 + }, + { + "auxiliary_loss_clip": 0.01131415, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.00178266, + "balance_loss_mlp": 1.00071704, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 2.1426685303078723, + "language_loss": 0.77676213, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79910386, + "num_input_tokens_seen": 314082325, + "step": 14566, + "time_per_iteration": 4.316816568374634 + }, + { + "auxiliary_loss_clip": 0.01130778, + "auxiliary_loss_mlp": 0.01101311, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00069833, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.8489375788666327, + "language_loss": 0.71044111, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73276198, + "num_input_tokens_seen": 314100310, + "step": 14567, + "time_per_iteration": 2.722048044204712 + }, + { + "auxiliary_loss_clip": 0.01133458, + "auxiliary_loss_mlp": 0.00747449, + "balance_loss_clip": 1.00173068, + "balance_loss_mlp": 1.00050664, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.8581430581799065, + "language_loss": 0.74187648, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76068556, + "num_input_tokens_seen": 314121330, + "step": 14568, + "time_per_iteration": 2.740325927734375 + }, + { + "auxiliary_loss_clip": 0.01116136, + "auxiliary_loss_mlp": 0.0110133, + "balance_loss_clip": 1.00182831, + "balance_loss_mlp": 1.00071704, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 1.8101018914067963, + "language_loss": 0.87113309, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.89330775, + "num_input_tokens_seen": 314139875, + "step": 14569, + "time_per_iteration": 4.092484951019287 + }, + { + "auxiliary_loss_clip": 0.01113724, + "auxiliary_loss_mlp": 0.01100265, + "balance_loss_clip": 1.00156236, + "balance_loss_mlp": 1.00051081, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.6235268421539866, + "language_loss": 0.74027789, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76241779, + "num_input_tokens_seen": 314157850, + "step": 14570, + "time_per_iteration": 2.737130880355835 + }, + { + "auxiliary_loss_clip": 0.01147828, + "auxiliary_loss_mlp": 0.00747554, + "balance_loss_clip": 1.00195873, + "balance_loss_mlp": 1.00058568, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 1.5608292370112797, + "language_loss": 0.67828625, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69724011, + "num_input_tokens_seen": 314176720, + "step": 14571, + "time_per_iteration": 2.66215443611145 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01100408, + "balance_loss_clip": 1.00169563, + "balance_loss_mlp": 1.00074923, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 2.097963392808928, + "language_loss": 0.62465453, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64682424, + "num_input_tokens_seen": 314196645, + "step": 14572, + "time_per_iteration": 2.810175895690918 + }, + { + "auxiliary_loss_clip": 0.01147413, + "auxiliary_loss_mlp": 0.01100327, + "balance_loss_clip": 1.00179803, + "balance_loss_mlp": 1.000525, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 2.8063677050632063, + "language_loss": 0.7359305, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75840789, + "num_input_tokens_seen": 314217430, + "step": 14573, + "time_per_iteration": 2.7080533504486084 + }, + { + "auxiliary_loss_clip": 0.01098872, + "auxiliary_loss_mlp": 0.00747192, + "balance_loss_clip": 1.0016253, + "balance_loss_mlp": 1.00046599, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 2.594745151340622, + "language_loss": 0.73087895, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.74933958, + "num_input_tokens_seen": 314235310, + "step": 14574, + "time_per_iteration": 2.841885566711426 + }, + { + "auxiliary_loss_clip": 0.01147353, + "auxiliary_loss_mlp": 0.01101227, + "balance_loss_clip": 1.00175345, + "balance_loss_mlp": 1.00080562, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 2.0697669806668264, + "language_loss": 0.75590819, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.77839398, + "num_input_tokens_seen": 314252355, + "step": 14575, + "time_per_iteration": 2.658353567123413 + }, + { + "auxiliary_loss_clip": 0.01132937, + "auxiliary_loss_mlp": 0.01101016, + "balance_loss_clip": 1.00185549, + "balance_loss_mlp": 1.00068927, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 3.508456808555032, + "language_loss": 0.66636461, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.68870413, + "num_input_tokens_seen": 314272755, + "step": 14576, + "time_per_iteration": 2.75575852394104 + }, + { + "auxiliary_loss_clip": 0.01133132, + "auxiliary_loss_mlp": 0.01100125, + "balance_loss_clip": 1.00179791, + "balance_loss_mlp": 1.00075257, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 2.3730777214211103, + "language_loss": 0.66609079, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68842334, + "num_input_tokens_seen": 314291365, + "step": 14577, + "time_per_iteration": 2.63291597366333 + }, + { + "auxiliary_loss_clip": 0.01147519, + "auxiliary_loss_mlp": 0.01101592, + "balance_loss_clip": 1.00190794, + "balance_loss_mlp": 1.00069368, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 4.1718010648460035, + "language_loss": 0.71232677, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73481792, + "num_input_tokens_seen": 314310075, + "step": 14578, + "time_per_iteration": 2.676976203918457 + }, + { + "auxiliary_loss_clip": 0.01164408, + "auxiliary_loss_mlp": 0.01102004, + "balance_loss_clip": 1.00194275, + "balance_loss_mlp": 1.00058126, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 1.9685593015552003, + "language_loss": 0.71082938, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73349345, + "num_input_tokens_seen": 314325695, + "step": 14579, + "time_per_iteration": 2.606787919998169 + }, + { + "auxiliary_loss_clip": 0.01134556, + "auxiliary_loss_mlp": 0.01100307, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.0006007, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 3.4538876918924557, + "language_loss": 0.70506513, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72741377, + "num_input_tokens_seen": 314343605, + "step": 14580, + "time_per_iteration": 2.653841018676758 + }, + { + "auxiliary_loss_clip": 0.01164243, + "auxiliary_loss_mlp": 0.00747321, + "balance_loss_clip": 1.00196433, + "balance_loss_mlp": 1.00047743, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.784915142301017, + "language_loss": 0.65418041, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67329603, + "num_input_tokens_seen": 314364275, + "step": 14581, + "time_per_iteration": 2.609851598739624 + }, + { + "auxiliary_loss_clip": 0.01130625, + "auxiliary_loss_mlp": 0.01099892, + "balance_loss_clip": 1.00167549, + "balance_loss_mlp": 1.0006144, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.6917404668836273, + "language_loss": 0.73858482, + "learning_rate": 1.572541512164416e-07, + "loss": 0.76088989, + "num_input_tokens_seen": 314385140, + "step": 14582, + "time_per_iteration": 2.727104663848877 + }, + { + "auxiliary_loss_clip": 0.0116424, + "auxiliary_loss_mlp": 0.00747408, + "balance_loss_clip": 1.00191569, + "balance_loss_mlp": 1.00049186, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 2.412736389564518, + "language_loss": 0.66629475, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.68541121, + "num_input_tokens_seen": 314403715, + "step": 14583, + "time_per_iteration": 2.5972180366516113 + }, + { + "auxiliary_loss_clip": 0.01148842, + "auxiliary_loss_mlp": 0.00747307, + "balance_loss_clip": 1.00188053, + "balance_loss_mlp": 1.0004015, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.6018474348538827, + "language_loss": 0.79083145, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.809793, + "num_input_tokens_seen": 314421880, + "step": 14584, + "time_per_iteration": 2.6579928398132324 + }, + { + "auxiliary_loss_clip": 0.01114579, + "auxiliary_loss_mlp": 0.01100904, + "balance_loss_clip": 1.00166595, + "balance_loss_mlp": 1.00038719, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.5515446493234055, + "language_loss": 0.72068888, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74284363, + "num_input_tokens_seen": 314441585, + "step": 14585, + "time_per_iteration": 4.215253114700317 + }, + { + "auxiliary_loss_clip": 0.01134872, + "auxiliary_loss_mlp": 0.01100028, + "balance_loss_clip": 1.00183415, + "balance_loss_mlp": 1.00051236, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 1.9945172117871948, + "language_loss": 0.74340069, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76574969, + "num_input_tokens_seen": 314459020, + "step": 14586, + "time_per_iteration": 4.0523152351379395 + }, + { + "auxiliary_loss_clip": 0.01164153, + "auxiliary_loss_mlp": 0.0110065, + "balance_loss_clip": 1.00180292, + "balance_loss_mlp": 1.00051391, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.6634103698116576, + "language_loss": 0.78728473, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80993277, + "num_input_tokens_seen": 314478935, + "step": 14587, + "time_per_iteration": 2.541614294052124 + }, + { + "auxiliary_loss_clip": 0.01147024, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_clip": 1.00185978, + "balance_loss_mlp": 1.00069118, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.6057218972370746, + "language_loss": 0.7385428, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76102746, + "num_input_tokens_seen": 314497635, + "step": 14588, + "time_per_iteration": 2.608828544616699 + }, + { + "auxiliary_loss_clip": 0.01082531, + "auxiliary_loss_mlp": 0.00747365, + "balance_loss_clip": 1.00162768, + "balance_loss_mlp": 1.0004766, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.9303942366718378, + "language_loss": 0.66547173, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68377072, + "num_input_tokens_seen": 314515445, + "step": 14589, + "time_per_iteration": 2.7450287342071533 + }, + { + "auxiliary_loss_clip": 0.011474, + "auxiliary_loss_mlp": 0.01101126, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.0006088, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.469948302802971, + "language_loss": 0.70284653, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.72533184, + "num_input_tokens_seen": 314533040, + "step": 14590, + "time_per_iteration": 2.5804920196533203 + }, + { + "auxiliary_loss_clip": 0.01133192, + "auxiliary_loss_mlp": 0.01102176, + "balance_loss_clip": 1.00189888, + "balance_loss_mlp": 1.0006094, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 2.0022772192996388, + "language_loss": 0.74511707, + "learning_rate": 1.558945991776086e-07, + "loss": 0.76747072, + "num_input_tokens_seen": 314548280, + "step": 14591, + "time_per_iteration": 2.5667829513549805 + }, + { + "auxiliary_loss_clip": 0.01163981, + "auxiliary_loss_mlp": 0.01100483, + "balance_loss_clip": 1.00191712, + "balance_loss_mlp": 1.00044215, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 1.7058596566264597, + "language_loss": 0.79990184, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.82254642, + "num_input_tokens_seen": 314565345, + "step": 14592, + "time_per_iteration": 2.617532730102539 + }, + { + "auxiliary_loss_clip": 0.0116409, + "auxiliary_loss_mlp": 0.01099982, + "balance_loss_clip": 1.00192201, + "balance_loss_mlp": 1.00056148, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 2.2501180820666944, + "language_loss": 0.82613993, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84878063, + "num_input_tokens_seen": 314584190, + "step": 14593, + "time_per_iteration": 2.547175407409668 + }, + { + "auxiliary_loss_clip": 0.01149671, + "auxiliary_loss_mlp": 0.01099513, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00066519, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 2.2950400387722754, + "language_loss": 0.76122028, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78371215, + "num_input_tokens_seen": 314605625, + "step": 14594, + "time_per_iteration": 2.668238401412964 + }, + { + "auxiliary_loss_clip": 0.01103201, + "auxiliary_loss_mlp": 0.011013, + "balance_loss_clip": 1.00171518, + "balance_loss_mlp": 1.00059247, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 2.0924164752004537, + "language_loss": 0.77956712, + "learning_rate": 1.552921717241651e-07, + "loss": 0.80161214, + "num_input_tokens_seen": 314622630, + "step": 14595, + "time_per_iteration": 2.6803135871887207 + }, + { + "auxiliary_loss_clip": 0.01115707, + "auxiliary_loss_mlp": 0.01100888, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00060904, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 1.7764754888337653, + "language_loss": 0.70753467, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.72970057, + "num_input_tokens_seen": 314642460, + "step": 14596, + "time_per_iteration": 2.741753578186035 + }, + { + "auxiliary_loss_clip": 0.01113911, + "auxiliary_loss_mlp": 0.01100443, + "balance_loss_clip": 1.00172448, + "balance_loss_mlp": 1.0005455, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 2.2520192971353694, + "language_loss": 0.85800064, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88014424, + "num_input_tokens_seen": 314659875, + "step": 14597, + "time_per_iteration": 2.7795042991638184 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01101356, + "balance_loss_clip": 1.00188744, + "balance_loss_mlp": 1.00074303, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 1.5762064719761035, + "language_loss": 0.72622454, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.74871308, + "num_input_tokens_seen": 314680260, + "step": 14598, + "time_per_iteration": 2.649062395095825 + }, + { + "auxiliary_loss_clip": 0.01149741, + "auxiliary_loss_mlp": 0.0074728, + "balance_loss_clip": 1.00197721, + "balance_loss_mlp": 1.0004127, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.222297043841131, + "language_loss": 0.77342653, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79239666, + "num_input_tokens_seen": 314696260, + "step": 14599, + "time_per_iteration": 2.597799062728882 + }, + { + "auxiliary_loss_clip": 0.01117985, + "auxiliary_loss_mlp": 0.01100848, + "balance_loss_clip": 1.00177824, + "balance_loss_mlp": 1.00061679, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.534828079057186, + "language_loss": 0.67953414, + "learning_rate": 1.545407113589332e-07, + "loss": 0.7017225, + "num_input_tokens_seen": 314714215, + "step": 14600, + "time_per_iteration": 2.6741280555725098 + }, + { + "auxiliary_loss_clip": 0.01147623, + "auxiliary_loss_mlp": 0.01100628, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00053954, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 1.9148303566892977, + "language_loss": 0.6938597, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71634221, + "num_input_tokens_seen": 314735700, + "step": 14601, + "time_per_iteration": 2.8914220333099365 + }, + { + "auxiliary_loss_clip": 0.0114898, + "auxiliary_loss_mlp": 0.01102557, + "balance_loss_clip": 1.00191295, + "balance_loss_mlp": 1.00060964, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 2.4711921769840806, + "language_loss": 0.73175353, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75426888, + "num_input_tokens_seen": 314753335, + "step": 14602, + "time_per_iteration": 2.621511459350586 + }, + { + "auxiliary_loss_clip": 0.011642, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_clip": 1.00191903, + "balance_loss_mlp": 1.00056672, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 2.5283481321683285, + "language_loss": 0.70928979, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73194075, + "num_input_tokens_seen": 314770800, + "step": 14603, + "time_per_iteration": 2.6374552249908447 + }, + { + "auxiliary_loss_clip": 0.01126277, + "auxiliary_loss_mlp": 0.01074466, + "balance_loss_clip": 1.0007813, + "balance_loss_mlp": 1.00007904, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7312073197033424, + "language_loss": 0.54075831, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56276572, + "num_input_tokens_seen": 314837275, + "step": 14604, + "time_per_iteration": 4.749364376068115 + }, + { + "auxiliary_loss_clip": 0.01112545, + "auxiliary_loss_mlp": 0.01074872, + "balance_loss_clip": 1.0007267, + "balance_loss_mlp": 1.00048542, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.6969149546789924, + "language_loss": 0.59452057, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61639476, + "num_input_tokens_seen": 314902220, + "step": 14605, + "time_per_iteration": 3.2319018840789795 + }, + { + "auxiliary_loss_clip": 0.01099999, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_clip": 1.00168264, + "balance_loss_mlp": 1.00064278, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.6224133747274552, + "language_loss": 0.85066032, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87267196, + "num_input_tokens_seen": 314921645, + "step": 14606, + "time_per_iteration": 4.155766487121582 + }, + { + "auxiliary_loss_clip": 0.0116439, + "auxiliary_loss_mlp": 0.01102139, + "balance_loss_clip": 1.001966, + "balance_loss_mlp": 1.00057292, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 2.0393738112443063, + "language_loss": 0.70631689, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72898221, + "num_input_tokens_seen": 314939390, + "step": 14607, + "time_per_iteration": 2.6326634883880615 + }, + { + "auxiliary_loss_clip": 0.01133068, + "auxiliary_loss_mlp": 0.01100092, + "balance_loss_clip": 1.00179958, + "balance_loss_mlp": 1.00067163, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.8484457063959097, + "language_loss": 0.72286069, + "learning_rate": 1.533420140300785e-07, + "loss": 0.74519229, + "num_input_tokens_seen": 314959205, + "step": 14608, + "time_per_iteration": 2.7213997840881348 + }, + { + "auxiliary_loss_clip": 0.01149733, + "auxiliary_loss_mlp": 0.01102597, + "balance_loss_clip": 1.00183094, + "balance_loss_mlp": 1.00064874, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 2.6027283346197088, + "language_loss": 0.87240493, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89492822, + "num_input_tokens_seen": 314977485, + "step": 14609, + "time_per_iteration": 2.7706382274627686 + }, + { + "auxiliary_loss_clip": 0.01102328, + "auxiliary_loss_mlp": 0.01101964, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00063658, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.576280492721176, + "language_loss": 0.70345551, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72549838, + "num_input_tokens_seen": 314997830, + "step": 14610, + "time_per_iteration": 2.7121431827545166 + }, + { + "auxiliary_loss_clip": 0.01147756, + "auxiliary_loss_mlp": 0.00747333, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00043988, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 2.295163593417699, + "language_loss": 0.80358791, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82253885, + "num_input_tokens_seen": 315016480, + "step": 14611, + "time_per_iteration": 2.623383045196533 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01101316, + "balance_loss_clip": 1.00193036, + "balance_loss_mlp": 1.00060761, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.731554127106493, + "language_loss": 0.76531708, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78797364, + "num_input_tokens_seen": 315036135, + "step": 14612, + "time_per_iteration": 2.550603151321411 + }, + { + "auxiliary_loss_clip": 0.01098186, + "auxiliary_loss_mlp": 0.01100341, + "balance_loss_clip": 1.0017941, + "balance_loss_mlp": 1.00063396, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.8707756555920083, + "language_loss": 0.72220194, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74418724, + "num_input_tokens_seen": 315057995, + "step": 14613, + "time_per_iteration": 2.7867586612701416 + }, + { + "auxiliary_loss_clip": 0.01112695, + "auxiliary_loss_mlp": 0.01074126, + "balance_loss_clip": 1.00064325, + "balance_loss_mlp": 1.000121, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.0293873880662407, + "language_loss": 0.64569354, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66756171, + "num_input_tokens_seen": 315104010, + "step": 14614, + "time_per_iteration": 2.984041690826416 + }, + { + "auxiliary_loss_clip": 0.01158052, + "auxiliary_loss_mlp": 0.01074444, + "balance_loss_clip": 1.00068545, + "balance_loss_mlp": 1.00043857, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6561155331330698, + "language_loss": 0.58594286, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.6082679, + "num_input_tokens_seen": 315174550, + "step": 14615, + "time_per_iteration": 3.1547861099243164 + }, + { + "auxiliary_loss_clip": 0.01099038, + "auxiliary_loss_mlp": 0.01100419, + "balance_loss_clip": 1.00147986, + "balance_loss_mlp": 1.00061703, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 2.6978923553979786, + "language_loss": 0.72785068, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.74984521, + "num_input_tokens_seen": 315191825, + "step": 14616, + "time_per_iteration": 2.785322427749634 + }, + { + "auxiliary_loss_clip": 0.01158072, + "auxiliary_loss_mlp": 0.01074585, + "balance_loss_clip": 1.00069153, + "balance_loss_mlp": 1.0001986, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.8020438602913111, + "language_loss": 0.57978755, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.60211408, + "num_input_tokens_seen": 315255075, + "step": 14617, + "time_per_iteration": 3.215768814086914 + }, + { + "auxiliary_loss_clip": 0.01133595, + "auxiliary_loss_mlp": 0.01100063, + "balance_loss_clip": 1.00178123, + "balance_loss_mlp": 1.00040376, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 2.214701075932672, + "language_loss": 0.83568573, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.85802233, + "num_input_tokens_seen": 315273995, + "step": 14618, + "time_per_iteration": 2.7455344200134277 + }, + { + "auxiliary_loss_clip": 0.01130707, + "auxiliary_loss_mlp": 0.01099657, + "balance_loss_clip": 1.00181639, + "balance_loss_mlp": 1.00047481, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 2.164272150970163, + "language_loss": 0.69098699, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71329063, + "num_input_tokens_seen": 315294485, + "step": 14619, + "time_per_iteration": 2.717282772064209 + }, + { + "auxiliary_loss_clip": 0.01099857, + "auxiliary_loss_mlp": 0.01101875, + "balance_loss_clip": 1.00163186, + "balance_loss_mlp": 1.00059438, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 1.9005701630489007, + "language_loss": 0.77719581, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.79921317, + "num_input_tokens_seen": 315310420, + "step": 14620, + "time_per_iteration": 2.7492876052856445 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.0110178, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00059533, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.8725191285976348, + "language_loss": 0.79695046, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81930029, + "num_input_tokens_seen": 315330110, + "step": 14621, + "time_per_iteration": 2.8019351959228516 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_clip": 1.00174701, + "balance_loss_mlp": 1.00060344, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 3.9313646865022243, + "language_loss": 0.66883039, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.69115669, + "num_input_tokens_seen": 315350080, + "step": 14622, + "time_per_iteration": 4.273889064788818 + }, + { + "auxiliary_loss_clip": 0.01130873, + "auxiliary_loss_mlp": 0.01100734, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.000741, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.9964871640398347, + "language_loss": 0.72805393, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75037003, + "num_input_tokens_seen": 315366360, + "step": 14623, + "time_per_iteration": 2.6361560821533203 + }, + { + "auxiliary_loss_clip": 0.01100387, + "auxiliary_loss_mlp": 0.0110079, + "balance_loss_clip": 1.00162649, + "balance_loss_mlp": 1.00065446, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 2.2510484523304375, + "language_loss": 0.78581607, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80782783, + "num_input_tokens_seen": 315385890, + "step": 14624, + "time_per_iteration": 4.229667901992798 + }, + { + "auxiliary_loss_clip": 0.01149414, + "auxiliary_loss_mlp": 0.0110144, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00073242, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.7585111716104287, + "language_loss": 0.80195439, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.82446301, + "num_input_tokens_seen": 315403400, + "step": 14625, + "time_per_iteration": 2.6760432720184326 + }, + { + "auxiliary_loss_clip": 0.01130145, + "auxiliary_loss_mlp": 0.01099627, + "balance_loss_clip": 1.00174367, + "balance_loss_mlp": 1.00063562, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.5054572483211088, + "language_loss": 0.74199045, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76428819, + "num_input_tokens_seen": 315423670, + "step": 14626, + "time_per_iteration": 2.6922545433044434 + }, + { + "auxiliary_loss_clip": 0.01147453, + "auxiliary_loss_mlp": 0.01101565, + "balance_loss_clip": 1.00171697, + "balance_loss_mlp": 1.0006187, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 2.231562810527911, + "language_loss": 0.70928556, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73177576, + "num_input_tokens_seen": 315446265, + "step": 14627, + "time_per_iteration": 2.746150016784668 + }, + { + "auxiliary_loss_clip": 0.01115743, + "auxiliary_loss_mlp": 0.0110015, + "balance_loss_clip": 1.00167465, + "balance_loss_mlp": 1.00049055, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.728601065374275, + "language_loss": 0.72488201, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74704087, + "num_input_tokens_seen": 315464655, + "step": 14628, + "time_per_iteration": 2.717437744140625 + }, + { + "auxiliary_loss_clip": 0.01119016, + "auxiliary_loss_mlp": 0.011016, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00060582, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 4.272861605744887, + "language_loss": 0.68926489, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71147108, + "num_input_tokens_seen": 315481090, + "step": 14629, + "time_per_iteration": 2.6802139282226562 + }, + { + "auxiliary_loss_clip": 0.01130775, + "auxiliary_loss_mlp": 0.01099831, + "balance_loss_clip": 1.00157666, + "balance_loss_mlp": 1.00064862, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.610414637463683, + "language_loss": 0.68580574, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70811182, + "num_input_tokens_seen": 315502010, + "step": 14630, + "time_per_iteration": 2.710348606109619 + }, + { + "auxiliary_loss_clip": 0.01130741, + "auxiliary_loss_mlp": 0.01100375, + "balance_loss_clip": 1.00170672, + "balance_loss_mlp": 1.00081098, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.6915727843475912, + "language_loss": 0.74239969, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76471084, + "num_input_tokens_seen": 315523040, + "step": 14631, + "time_per_iteration": 2.7114222049713135 + }, + { + "auxiliary_loss_clip": 0.01132903, + "auxiliary_loss_mlp": 0.00747196, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00038624, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 2.126942683785575, + "language_loss": 0.6972515, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71605253, + "num_input_tokens_seen": 315541865, + "step": 14632, + "time_per_iteration": 2.7068588733673096 + }, + { + "auxiliary_loss_clip": 0.01130754, + "auxiliary_loss_mlp": 0.0110158, + "balance_loss_clip": 1.00179386, + "balance_loss_mlp": 1.00068116, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 1.8094295749127343, + "language_loss": 0.65214998, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67447329, + "num_input_tokens_seen": 315561470, + "step": 14633, + "time_per_iteration": 2.7179172039031982 + }, + { + "auxiliary_loss_clip": 0.01119232, + "auxiliary_loss_mlp": 0.01100174, + "balance_loss_clip": 1.00172961, + "balance_loss_mlp": 1.00056303, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.4078656631455537, + "language_loss": 0.84040529, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86259937, + "num_input_tokens_seen": 315583140, + "step": 14634, + "time_per_iteration": 2.7633166313171387 + }, + { + "auxiliary_loss_clip": 0.01130624, + "auxiliary_loss_mlp": 0.00747374, + "balance_loss_clip": 1.00171292, + "balance_loss_mlp": 1.00044656, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.8031791699872897, + "language_loss": 0.80363286, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.82241285, + "num_input_tokens_seen": 315601935, + "step": 14635, + "time_per_iteration": 2.744757652282715 + }, + { + "auxiliary_loss_clip": 0.01131451, + "auxiliary_loss_mlp": 0.01100718, + "balance_loss_clip": 1.00193393, + "balance_loss_mlp": 1.0005827, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 1.8532002634133555, + "language_loss": 0.6555661, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.6778878, + "num_input_tokens_seen": 315619995, + "step": 14636, + "time_per_iteration": 2.707118272781372 + }, + { + "auxiliary_loss_clip": 0.01134151, + "auxiliary_loss_mlp": 0.01100685, + "balance_loss_clip": 1.00178325, + "balance_loss_mlp": 1.00073981, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.762317808029814, + "language_loss": 0.70531869, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72766703, + "num_input_tokens_seen": 315637895, + "step": 14637, + "time_per_iteration": 2.693593740463257 + }, + { + "auxiliary_loss_clip": 0.01132557, + "auxiliary_loss_mlp": 0.01100914, + "balance_loss_clip": 1.00192761, + "balance_loss_mlp": 1.00054002, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 1.7785246758757716, + "language_loss": 0.6619463, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.68428111, + "num_input_tokens_seen": 315655520, + "step": 14638, + "time_per_iteration": 2.695916175842285 + }, + { + "auxiliary_loss_clip": 0.01147531, + "auxiliary_loss_mlp": 0.01100988, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.0005182, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 1.7031845525173466, + "language_loss": 0.58214611, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.6046313, + "num_input_tokens_seen": 315678955, + "step": 14639, + "time_per_iteration": 2.8895514011383057 + }, + { + "auxiliary_loss_clip": 0.01133131, + "auxiliary_loss_mlp": 0.01101079, + "balance_loss_clip": 1.00189579, + "balance_loss_mlp": 1.00075221, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.7093587614871772, + "language_loss": 0.7467643, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76910645, + "num_input_tokens_seen": 315700360, + "step": 14640, + "time_per_iteration": 2.7389297485351562 + }, + { + "auxiliary_loss_clip": 0.01134889, + "auxiliary_loss_mlp": 0.01101181, + "balance_loss_clip": 1.00187278, + "balance_loss_mlp": 1.00075889, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 1.7797058623165014, + "language_loss": 0.69759631, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71995699, + "num_input_tokens_seen": 315719270, + "step": 14641, + "time_per_iteration": 2.7214839458465576 + }, + { + "auxiliary_loss_clip": 0.01147954, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.00058007, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 1.8625410143269379, + "language_loss": 0.85067773, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.87317395, + "num_input_tokens_seen": 315737425, + "step": 14642, + "time_per_iteration": 4.148470163345337 + }, + { + "auxiliary_loss_clip": 0.01100198, + "auxiliary_loss_mlp": 0.01100468, + "balance_loss_clip": 1.00179422, + "balance_loss_mlp": 1.0006181, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 12.639192857733788, + "language_loss": 0.7879076, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.80991429, + "num_input_tokens_seen": 315755725, + "step": 14643, + "time_per_iteration": 2.831045389175415 + }, + { + "auxiliary_loss_clip": 0.01147522, + "auxiliary_loss_mlp": 0.01099606, + "balance_loss_clip": 1.00179791, + "balance_loss_mlp": 1.00075734, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.641754821474323, + "language_loss": 0.73179913, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75427037, + "num_input_tokens_seen": 315773835, + "step": 14644, + "time_per_iteration": 4.023958921432495 + }, + { + "auxiliary_loss_clip": 0.01164273, + "auxiliary_loss_mlp": 0.00747445, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.0005151, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 2.190897592056034, + "language_loss": 0.79338527, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81250238, + "num_input_tokens_seen": 315790615, + "step": 14645, + "time_per_iteration": 2.630343198776245 + }, + { + "auxiliary_loss_clip": 0.01149359, + "auxiliary_loss_mlp": 0.01101263, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00065053, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 1.612567583004525, + "language_loss": 0.64342248, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66592872, + "num_input_tokens_seen": 315811010, + "step": 14646, + "time_per_iteration": 2.66003155708313 + }, + { + "auxiliary_loss_clip": 0.01132074, + "auxiliary_loss_mlp": 0.01101661, + "balance_loss_clip": 1.0019809, + "balance_loss_mlp": 1.0007627, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 2.0807457318877733, + "language_loss": 0.7755968, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79793417, + "num_input_tokens_seen": 315828130, + "step": 14647, + "time_per_iteration": 2.636406898498535 + }, + { + "auxiliary_loss_clip": 0.0116424, + "auxiliary_loss_mlp": 0.01100479, + "balance_loss_clip": 1.00199199, + "balance_loss_mlp": 1.00062919, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 1.9492776804081957, + "language_loss": 0.74919653, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77184367, + "num_input_tokens_seen": 315844900, + "step": 14648, + "time_per_iteration": 2.55208158493042 + }, + { + "auxiliary_loss_clip": 0.0113312, + "auxiliary_loss_mlp": 0.01100578, + "balance_loss_clip": 1.00168669, + "balance_loss_mlp": 1.00053763, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 1.970628154723201, + "language_loss": 0.65579283, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67812979, + "num_input_tokens_seen": 315863745, + "step": 14649, + "time_per_iteration": 2.694514751434326 + }, + { + "auxiliary_loss_clip": 0.01115385, + "auxiliary_loss_mlp": 0.01101331, + "balance_loss_clip": 1.00168753, + "balance_loss_mlp": 1.00052786, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.495356339296147, + "language_loss": 0.62278062, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64494777, + "num_input_tokens_seen": 315885765, + "step": 14650, + "time_per_iteration": 2.758025646209717 + }, + { + "auxiliary_loss_clip": 0.01132742, + "auxiliary_loss_mlp": 0.01100973, + "balance_loss_clip": 1.00187337, + "balance_loss_mlp": 1.00074232, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.5782003104282514, + "language_loss": 0.72697401, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.74931121, + "num_input_tokens_seen": 315907340, + "step": 14651, + "time_per_iteration": 2.6881473064422607 + }, + { + "auxiliary_loss_clip": 0.01149582, + "auxiliary_loss_mlp": 0.0110128, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00057244, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.8389165508317507, + "language_loss": 0.72039747, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74290615, + "num_input_tokens_seen": 315924935, + "step": 14652, + "time_per_iteration": 2.627488374710083 + }, + { + "auxiliary_loss_clip": 0.01148236, + "auxiliary_loss_mlp": 0.01100457, + "balance_loss_clip": 1.00187123, + "balance_loss_mlp": 1.00065529, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 2.0335116254574843, + "language_loss": 0.74981928, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.7723062, + "num_input_tokens_seen": 315943165, + "step": 14653, + "time_per_iteration": 2.668370485305786 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_clip": 1.00185335, + "balance_loss_mlp": 1.00058079, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 2.118749294168235, + "language_loss": 0.71589619, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73855698, + "num_input_tokens_seen": 315961340, + "step": 14654, + "time_per_iteration": 2.651165008544922 + }, + { + "auxiliary_loss_clip": 0.01099228, + "auxiliary_loss_mlp": 0.01100428, + "balance_loss_clip": 1.00152588, + "balance_loss_mlp": 1.00067377, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 1.564412030217513, + "language_loss": 0.71274853, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73474514, + "num_input_tokens_seen": 315981335, + "step": 14655, + "time_per_iteration": 2.809041976928711 + }, + { + "auxiliary_loss_clip": 0.01099123, + "auxiliary_loss_mlp": 0.01100598, + "balance_loss_clip": 1.0016582, + "balance_loss_mlp": 1.00055718, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.702528756023979, + "language_loss": 0.81457448, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83657175, + "num_input_tokens_seen": 316001325, + "step": 14656, + "time_per_iteration": 2.7899560928344727 + }, + { + "auxiliary_loss_clip": 0.01132382, + "auxiliary_loss_mlp": 0.01100594, + "balance_loss_clip": 1.00186741, + "balance_loss_mlp": 1.00050604, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.716055763033671, + "language_loss": 0.68417299, + "learning_rate": 1.460978910372914e-07, + "loss": 0.7065028, + "num_input_tokens_seen": 316022540, + "step": 14657, + "time_per_iteration": 2.7218875885009766 + }, + { + "auxiliary_loss_clip": 0.01133246, + "auxiliary_loss_mlp": 0.01102149, + "balance_loss_clip": 1.00182009, + "balance_loss_mlp": 1.0006783, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 2.222787823496057, + "language_loss": 0.83997309, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.8623271, + "num_input_tokens_seen": 316037735, + "step": 14658, + "time_per_iteration": 2.797956705093384 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.01101762, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00057673, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 1.7003982462911966, + "language_loss": 0.77391094, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.7962535, + "num_input_tokens_seen": 316058105, + "step": 14659, + "time_per_iteration": 2.721189498901367 + }, + { + "auxiliary_loss_clip": 0.01131171, + "auxiliary_loss_mlp": 0.01100445, + "balance_loss_clip": 1.00175226, + "balance_loss_mlp": 1.00054717, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 2.089112553303575, + "language_loss": 0.604684, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62700015, + "num_input_tokens_seen": 316074415, + "step": 14660, + "time_per_iteration": 4.159224271774292 + }, + { + "auxiliary_loss_clip": 0.0113404, + "auxiliary_loss_mlp": 0.01100025, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.00046158, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 2.251136738322102, + "language_loss": 0.77778286, + "learning_rate": 1.455139770123972e-07, + "loss": 0.80012345, + "num_input_tokens_seen": 316094405, + "step": 14661, + "time_per_iteration": 4.159190893173218 + }, + { + "auxiliary_loss_clip": 0.01115494, + "auxiliary_loss_mlp": 0.01101257, + "balance_loss_clip": 1.00188208, + "balance_loss_mlp": 1.0008347, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 1.6591396189117986, + "language_loss": 0.76971012, + "learning_rate": 1.45368174298081e-07, + "loss": 0.79187763, + "num_input_tokens_seen": 316113390, + "step": 14662, + "time_per_iteration": 2.7614588737487793 + }, + { + "auxiliary_loss_clip": 0.01103502, + "auxiliary_loss_mlp": 0.01099349, + "balance_loss_clip": 1.00160313, + "balance_loss_mlp": 1.00050068, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 1.8243414870383114, + "language_loss": 0.73824441, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.76027286, + "num_input_tokens_seen": 316131085, + "step": 14663, + "time_per_iteration": 2.7542576789855957 + }, + { + "auxiliary_loss_clip": 0.01148635, + "auxiliary_loss_mlp": 0.00747257, + "balance_loss_clip": 1.00188446, + "balance_loss_mlp": 1.00051022, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.4897012699313905, + "language_loss": 0.69897145, + "learning_rate": 1.450767798584489e-07, + "loss": 0.71793032, + "num_input_tokens_seen": 316151440, + "step": 14664, + "time_per_iteration": 2.7292065620422363 + }, + { + "auxiliary_loss_clip": 0.01067166, + "auxiliary_loss_mlp": 0.01099292, + "balance_loss_clip": 1.0013864, + "balance_loss_mlp": 1.00053906, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 3.315377104032399, + "language_loss": 0.8114357, + "learning_rate": 1.449311881441828e-07, + "loss": 0.8331002, + "num_input_tokens_seen": 316170750, + "step": 14665, + "time_per_iteration": 2.785005807876587 + }, + { + "auxiliary_loss_clip": 0.01132978, + "auxiliary_loss_mlp": 0.01101198, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00058579, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 2.4982030411301337, + "language_loss": 0.58150458, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60384631, + "num_input_tokens_seen": 316187265, + "step": 14666, + "time_per_iteration": 2.761650323867798 + }, + { + "auxiliary_loss_clip": 0.01147707, + "auxiliary_loss_mlp": 0.01101666, + "balance_loss_clip": 1.00190139, + "balance_loss_mlp": 1.00057638, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 1.689297937203337, + "language_loss": 0.83270001, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85519373, + "num_input_tokens_seen": 316206555, + "step": 14667, + "time_per_iteration": 2.585158348083496 + }, + { + "auxiliary_loss_clip": 0.01164255, + "auxiliary_loss_mlp": 0.01100704, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.000664, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.7120619647563178, + "language_loss": 0.62210596, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.6447556, + "num_input_tokens_seen": 316225210, + "step": 14668, + "time_per_iteration": 2.565277576446533 + }, + { + "auxiliary_loss_clip": 0.01147729, + "auxiliary_loss_mlp": 0.01099809, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.0005312, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.122084609802418, + "language_loss": 0.57038188, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.5928573, + "num_input_tokens_seen": 316242685, + "step": 14669, + "time_per_iteration": 2.597585916519165 + }, + { + "auxiliary_loss_clip": 0.01164271, + "auxiliary_loss_mlp": 0.01100799, + "balance_loss_clip": 1.00191724, + "balance_loss_mlp": 1.00056744, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 1.824495642411466, + "language_loss": 0.71433604, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73698676, + "num_input_tokens_seen": 316260935, + "step": 14670, + "time_per_iteration": 2.5732908248901367 + }, + { + "auxiliary_loss_clip": 0.01149665, + "auxiliary_loss_mlp": 0.0109985, + "balance_loss_clip": 1.00188351, + "balance_loss_mlp": 1.00066793, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 2.8833097663213945, + "language_loss": 0.73840714, + "learning_rate": 1.44059115283929e-07, + "loss": 0.76090229, + "num_input_tokens_seen": 316281190, + "step": 14671, + "time_per_iteration": 2.73909068107605 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01101361, + "balance_loss_clip": 1.00166988, + "balance_loss_mlp": 1.00046229, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 1.924248307835654, + "language_loss": 0.84698558, + "learning_rate": 1.43914016096218e-07, + "loss": 0.86931038, + "num_input_tokens_seen": 316297115, + "step": 14672, + "time_per_iteration": 2.675081729888916 + }, + { + "auxiliary_loss_clip": 0.01116453, + "auxiliary_loss_mlp": 0.01100607, + "balance_loss_clip": 1.00173604, + "balance_loss_mlp": 1.00056648, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.5851970125386359, + "language_loss": 0.72475874, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74692941, + "num_input_tokens_seen": 316318235, + "step": 14673, + "time_per_iteration": 2.698969602584839 + }, + { + "auxiliary_loss_clip": 0.01126903, + "auxiliary_loss_mlp": 0.01074726, + "balance_loss_clip": 1.00066757, + "balance_loss_mlp": 1.00072122, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.8010572292336042, + "language_loss": 0.49366501, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51568127, + "num_input_tokens_seen": 316384705, + "step": 14674, + "time_per_iteration": 3.271509885787964 + }, + { + "auxiliary_loss_clip": 0.01132791, + "auxiliary_loss_mlp": 0.00747375, + "balance_loss_clip": 1.0018177, + "balance_loss_mlp": 1.00042343, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 2.1038428133062705, + "language_loss": 0.76222563, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.7810272, + "num_input_tokens_seen": 316401165, + "step": 14675, + "time_per_iteration": 2.6813242435455322 + }, + { + "auxiliary_loss_clip": 0.01132662, + "auxiliary_loss_mlp": 0.01100006, + "balance_loss_clip": 1.00171304, + "balance_loss_mlp": 1.00044203, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 2.174172607752435, + "language_loss": 0.79640955, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.8187362, + "num_input_tokens_seen": 316418780, + "step": 14676, + "time_per_iteration": 2.635446071624756 + }, + { + "auxiliary_loss_clip": 0.01111588, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_clip": 1.00153673, + "balance_loss_mlp": 1.00064492, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.6982365765243858, + "language_loss": 0.54750168, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56937927, + "num_input_tokens_seen": 316482030, + "step": 14677, + "time_per_iteration": 3.3248848915100098 + }, + { + "auxiliary_loss_clip": 0.01164141, + "auxiliary_loss_mlp": 0.01100751, + "balance_loss_clip": 1.00181854, + "balance_loss_mlp": 1.00042415, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 2.080389379702, + "language_loss": 0.65301669, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67566562, + "num_input_tokens_seen": 316499175, + "step": 14678, + "time_per_iteration": 2.593048572540283 + }, + { + "auxiliary_loss_clip": 0.01132923, + "auxiliary_loss_mlp": 0.01101127, + "balance_loss_clip": 1.00183487, + "balance_loss_mlp": 1.00051439, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 1.9876542019799446, + "language_loss": 0.71105689, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73339736, + "num_input_tokens_seen": 316519495, + "step": 14679, + "time_per_iteration": 4.194069862365723 + }, + { + "auxiliary_loss_clip": 0.01130632, + "auxiliary_loss_mlp": 0.01099632, + "balance_loss_clip": 1.00169158, + "balance_loss_mlp": 1.0004499, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.7314827899731975, + "language_loss": 0.63773727, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.66003996, + "num_input_tokens_seen": 316538180, + "step": 14680, + "time_per_iteration": 2.702991247177124 + }, + { + "auxiliary_loss_clip": 0.01164179, + "auxiliary_loss_mlp": 0.01100226, + "balance_loss_clip": 1.00195622, + "balance_loss_mlp": 1.00042367, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 3.1869311992809473, + "language_loss": 0.77091622, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79356027, + "num_input_tokens_seen": 316551750, + "step": 14681, + "time_per_iteration": 2.5419058799743652 + }, + { + "auxiliary_loss_clip": 0.01132934, + "auxiliary_loss_mlp": 0.0110134, + "balance_loss_clip": 1.00179684, + "balance_loss_mlp": 1.0005362, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.7519278997683367, + "language_loss": 0.73156559, + "learning_rate": 1.424668961888047e-07, + "loss": 0.75390828, + "num_input_tokens_seen": 316570680, + "step": 14682, + "time_per_iteration": 4.103490114212036 + }, + { + "auxiliary_loss_clip": 0.0110035, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.00165308, + "balance_loss_mlp": 1.00046277, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 4.991475981158399, + "language_loss": 0.74466532, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76667857, + "num_input_tokens_seen": 316588635, + "step": 14683, + "time_per_iteration": 2.7925963401794434 + }, + { + "auxiliary_loss_clip": 0.0111606, + "auxiliary_loss_mlp": 0.01101123, + "balance_loss_clip": 1.00175655, + "balance_loss_mlp": 1.00041533, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.8593288302305035, + "language_loss": 0.65746629, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67963815, + "num_input_tokens_seen": 316607550, + "step": 14684, + "time_per_iteration": 2.7891149520874023 + }, + { + "auxiliary_loss_clip": 0.01147424, + "auxiliary_loss_mlp": 0.01100108, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00054455, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 1.75564343075082, + "language_loss": 0.69678563, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71926093, + "num_input_tokens_seen": 316624460, + "step": 14685, + "time_per_iteration": 2.588245153427124 + }, + { + "auxiliary_loss_clip": 0.01086707, + "auxiliary_loss_mlp": 0.01101181, + "balance_loss_clip": 1.00160265, + "balance_loss_mlp": 1.00047255, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 1.9300632041036796, + "language_loss": 0.74345779, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76533663, + "num_input_tokens_seen": 316640765, + "step": 14686, + "time_per_iteration": 2.752380132675171 + }, + { + "auxiliary_loss_clip": 0.01099234, + "auxiliary_loss_mlp": 0.01100138, + "balance_loss_clip": 1.00174081, + "balance_loss_mlp": 1.0004791, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 1.7870590895542497, + "language_loss": 0.62966645, + "learning_rate": 1.417459773114007e-07, + "loss": 0.6516602, + "num_input_tokens_seen": 316656120, + "step": 14687, + "time_per_iteration": 2.6915621757507324 + }, + { + "auxiliary_loss_clip": 0.01147656, + "auxiliary_loss_mlp": 0.01101087, + "balance_loss_clip": 1.00178826, + "balance_loss_mlp": 1.0006175, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 1.9036204207834497, + "language_loss": 0.68908203, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71156949, + "num_input_tokens_seen": 316676095, + "step": 14688, + "time_per_iteration": 2.6394193172454834 + }, + { + "auxiliary_loss_clip": 0.01146935, + "auxiliary_loss_mlp": 0.01100347, + "balance_loss_clip": 1.00193048, + "balance_loss_mlp": 1.00040209, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.6198742755922002, + "language_loss": 0.67142773, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69390047, + "num_input_tokens_seen": 316696235, + "step": 14689, + "time_per_iteration": 2.6649789810180664 + }, + { + "auxiliary_loss_clip": 0.01132297, + "auxiliary_loss_mlp": 0.0110087, + "balance_loss_clip": 1.00208569, + "balance_loss_mlp": 1.00054312, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.3917677862529638, + "language_loss": 0.74588072, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76821244, + "num_input_tokens_seen": 316719680, + "step": 14690, + "time_per_iteration": 2.718057155609131 + }, + { + "auxiliary_loss_clip": 0.01134553, + "auxiliary_loss_mlp": 0.01101937, + "balance_loss_clip": 1.00203407, + "balance_loss_mlp": 1.00065696, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.4298578020633248, + "language_loss": 0.72936243, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.75172734, + "num_input_tokens_seen": 316739830, + "step": 14691, + "time_per_iteration": 2.6620142459869385 + }, + { + "auxiliary_loss_clip": 0.01114892, + "auxiliary_loss_mlp": 0.01102478, + "balance_loss_clip": 1.00166464, + "balance_loss_mlp": 1.00048232, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 2.002138321291736, + "language_loss": 0.52039886, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.54257256, + "num_input_tokens_seen": 316758105, + "step": 14692, + "time_per_iteration": 2.6439578533172607 + }, + { + "auxiliary_loss_clip": 0.01114278, + "auxiliary_loss_mlp": 0.01101052, + "balance_loss_clip": 1.00171685, + "balance_loss_mlp": 1.00053453, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 2.4845329084503684, + "language_loss": 0.60586572, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62801898, + "num_input_tokens_seen": 316777455, + "step": 14693, + "time_per_iteration": 2.6750497817993164 + }, + { + "auxiliary_loss_clip": 0.01164065, + "auxiliary_loss_mlp": 0.01099476, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00048459, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.7234248973523818, + "language_loss": 0.75146687, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77410233, + "num_input_tokens_seen": 316796300, + "step": 14694, + "time_per_iteration": 2.5664210319519043 + }, + { + "auxiliary_loss_clip": 0.0114845, + "auxiliary_loss_mlp": 0.01100431, + "balance_loss_clip": 1.00192189, + "balance_loss_mlp": 1.00048614, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 1.683948818292505, + "language_loss": 0.72408736, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74657613, + "num_input_tokens_seen": 316819090, + "step": 14695, + "time_per_iteration": 2.670098066329956 + }, + { + "auxiliary_loss_clip": 0.01147607, + "auxiliary_loss_mlp": 0.01100564, + "balance_loss_clip": 1.00192726, + "balance_loss_mlp": 1.00052357, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.6878778469709885, + "language_loss": 0.80096948, + "learning_rate": 1.404527630961998e-07, + "loss": 0.82345122, + "num_input_tokens_seen": 316839250, + "step": 14696, + "time_per_iteration": 2.7206735610961914 + }, + { + "auxiliary_loss_clip": 0.01100255, + "auxiliary_loss_mlp": 0.01100509, + "balance_loss_clip": 1.00167155, + "balance_loss_mlp": 1.00056386, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.5062266001749827, + "language_loss": 0.74848282, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.77049041, + "num_input_tokens_seen": 316861315, + "step": 14697, + "time_per_iteration": 2.7818827629089355 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01100385, + "balance_loss_clip": 1.00176167, + "balance_loss_mlp": 1.00063097, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 2.1854592775338255, + "language_loss": 0.72258776, + "learning_rate": 1.401661576761779e-07, + "loss": 0.74491823, + "num_input_tokens_seen": 316879325, + "step": 14698, + "time_per_iteration": 5.521987676620483 + }, + { + "auxiliary_loss_clip": 0.01141275, + "auxiliary_loss_mlp": 0.0107419, + "balance_loss_clip": 1.00066447, + "balance_loss_mlp": 1.00018489, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.8013518351086784, + "language_loss": 0.53775585, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55991042, + "num_input_tokens_seen": 316936425, + "step": 14699, + "time_per_iteration": 3.1804311275482178 + }, + { + "auxiliary_loss_clip": 0.01132924, + "auxiliary_loss_mlp": 0.01100524, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00048327, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.8043828897714274, + "language_loss": 0.76444393, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.78677839, + "num_input_tokens_seen": 316956360, + "step": 14700, + "time_per_iteration": 2.6876254081726074 + }, + { + "auxiliary_loss_clip": 0.01116041, + "auxiliary_loss_mlp": 0.01100185, + "balance_loss_clip": 1.00175107, + "balance_loss_mlp": 1.0003829, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 1.7044606008331507, + "language_loss": 0.73693472, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.75909698, + "num_input_tokens_seen": 316975295, + "step": 14701, + "time_per_iteration": 2.7059693336486816 + }, + { + "auxiliary_loss_clip": 0.0113416, + "auxiliary_loss_mlp": 0.01101842, + "balance_loss_clip": 1.00174141, + "balance_loss_mlp": 1.00046587, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 2.487305731885502, + "language_loss": 0.71082175, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.73318172, + "num_input_tokens_seen": 316994520, + "step": 14702, + "time_per_iteration": 2.757329225540161 + }, + { + "auxiliary_loss_clip": 0.01117959, + "auxiliary_loss_mlp": 0.01101704, + "balance_loss_clip": 1.00176275, + "balance_loss_mlp": 1.00051904, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 1.5969454921773063, + "language_loss": 0.71839988, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.74059653, + "num_input_tokens_seen": 317018095, + "step": 14703, + "time_per_iteration": 2.956477165222168 + }, + { + "auxiliary_loss_clip": 0.01098452, + "auxiliary_loss_mlp": 0.01098838, + "balance_loss_clip": 1.00156331, + "balance_loss_mlp": 1.00051379, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 1.7240462508995273, + "language_loss": 0.66600132, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68797427, + "num_input_tokens_seen": 317035755, + "step": 14704, + "time_per_iteration": 2.7851386070251465 + }, + { + "auxiliary_loss_clip": 0.01148349, + "auxiliary_loss_mlp": 0.01099943, + "balance_loss_clip": 1.00197935, + "balance_loss_mlp": 1.00047469, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.6636527027506784, + "language_loss": 0.70232135, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72480428, + "num_input_tokens_seen": 317055765, + "step": 14705, + "time_per_iteration": 2.6091320514678955 + }, + { + "auxiliary_loss_clip": 0.0113275, + "auxiliary_loss_mlp": 0.01099221, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00051594, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.6723727746777766, + "language_loss": 0.71133661, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.73365629, + "num_input_tokens_seen": 317077955, + "step": 14706, + "time_per_iteration": 2.734779119491577 + }, + { + "auxiliary_loss_clip": 0.01148599, + "auxiliary_loss_mlp": 0.01100208, + "balance_loss_clip": 1.00178552, + "balance_loss_mlp": 1.00045395, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.6131844499699721, + "language_loss": 0.74480414, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.7672922, + "num_input_tokens_seen": 317095825, + "step": 14707, + "time_per_iteration": 2.6180789470672607 + }, + { + "auxiliary_loss_clip": 0.01111763, + "auxiliary_loss_mlp": 0.01074901, + "balance_loss_clip": 1.000633, + "balance_loss_mlp": 1.00089574, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.7932965803505853, + "language_loss": 0.60421288, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62607944, + "num_input_tokens_seen": 317152875, + "step": 14708, + "time_per_iteration": 3.121610641479492 + }, + { + "auxiliary_loss_clip": 0.01132589, + "auxiliary_loss_mlp": 0.0109851, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00052035, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 1.6976555862991096, + "language_loss": 0.6695931, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.69190407, + "num_input_tokens_seen": 317176725, + "step": 14709, + "time_per_iteration": 2.811765432357788 + }, + { + "auxiliary_loss_clip": 0.01131399, + "auxiliary_loss_mlp": 0.01102097, + "balance_loss_clip": 1.00174785, + "balance_loss_mlp": 1.00062573, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.4828106912452261, + "language_loss": 0.62488508, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64722008, + "num_input_tokens_seen": 317206880, + "step": 14710, + "time_per_iteration": 2.886122941970825 + }, + { + "auxiliary_loss_clip": 0.01115661, + "auxiliary_loss_mlp": 0.01099692, + "balance_loss_clip": 1.00175047, + "balance_loss_mlp": 1.00041437, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 2.3081421817334173, + "language_loss": 0.64056313, + "learning_rate": 1.38310100580431e-07, + "loss": 0.66271663, + "num_input_tokens_seen": 317224135, + "step": 14711, + "time_per_iteration": 2.744633674621582 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01101901, + "balance_loss_clip": 1.00171971, + "balance_loss_mlp": 1.00047779, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 2.0879905627629185, + "language_loss": 0.76062316, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78280354, + "num_input_tokens_seen": 317244505, + "step": 14712, + "time_per_iteration": 2.751312017440796 + }, + { + "auxiliary_loss_clip": 0.01085238, + "auxiliary_loss_mlp": 0.01100131, + "balance_loss_clip": 1.00173068, + "balance_loss_mlp": 1.00066328, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 2.2344300404319846, + "language_loss": 0.81051254, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83236623, + "num_input_tokens_seen": 317257830, + "step": 14713, + "time_per_iteration": 2.777540683746338 + }, + { + "auxiliary_loss_clip": 0.01130669, + "auxiliary_loss_mlp": 0.01100822, + "balance_loss_clip": 1.00173163, + "balance_loss_mlp": 1.00049567, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.5763817214777862, + "language_loss": 0.55579662, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57811153, + "num_input_tokens_seen": 317278430, + "step": 14714, + "time_per_iteration": 2.727285146713257 + }, + { + "auxiliary_loss_clip": 0.01084078, + "auxiliary_loss_mlp": 0.01100196, + "balance_loss_clip": 1.00152326, + "balance_loss_mlp": 1.0005374, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.7974760546957356, + "language_loss": 0.73679549, + "learning_rate": 1.377414057838755e-07, + "loss": 0.75863826, + "num_input_tokens_seen": 317295970, + "step": 14715, + "time_per_iteration": 2.8606879711151123 + }, + { + "auxiliary_loss_clip": 0.0114918, + "auxiliary_loss_mlp": 0.01100812, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.00043797, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 1.4783811075670104, + "language_loss": 0.75180334, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77430326, + "num_input_tokens_seen": 317316185, + "step": 14716, + "time_per_iteration": 2.6543030738830566 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.01100413, + "balance_loss_clip": 1.00177109, + "balance_loss_mlp": 1.00065827, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 1.9635873097764092, + "language_loss": 0.70825195, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.73043615, + "num_input_tokens_seen": 317333275, + "step": 14717, + "time_per_iteration": 4.228890419006348 + }, + { + "auxiliary_loss_clip": 0.01149024, + "auxiliary_loss_mlp": 0.01100025, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00050902, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 2.147218405225342, + "language_loss": 0.73889798, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76138842, + "num_input_tokens_seen": 317351245, + "step": 14718, + "time_per_iteration": 2.7289113998413086 + }, + { + "auxiliary_loss_clip": 0.01083563, + "auxiliary_loss_mlp": 0.0109996, + "balance_loss_clip": 1.00155354, + "balance_loss_mlp": 1.00053954, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 1.4576244685048596, + "language_loss": 0.78482288, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80665815, + "num_input_tokens_seen": 317370740, + "step": 14719, + "time_per_iteration": 2.8249590396881104 + }, + { + "auxiliary_loss_clip": 0.01164216, + "auxiliary_loss_mlp": 0.0110039, + "balance_loss_clip": 1.00191212, + "balance_loss_mlp": 1.00049305, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 2.2057200648106123, + "language_loss": 0.72092557, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.74357164, + "num_input_tokens_seen": 317388370, + "step": 14720, + "time_per_iteration": 4.113725423812866 + }, + { + "auxiliary_loss_clip": 0.01130551, + "auxiliary_loss_mlp": 0.01101547, + "balance_loss_clip": 1.00172281, + "balance_loss_mlp": 1.00055301, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 3.1133217280916154, + "language_loss": 0.82734799, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84966898, + "num_input_tokens_seen": 317407390, + "step": 14721, + "time_per_iteration": 2.7269444465637207 + }, + { + "auxiliary_loss_clip": 0.01132273, + "auxiliary_loss_mlp": 0.01100558, + "balance_loss_clip": 1.00173402, + "balance_loss_mlp": 1.00051796, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 1.8157288796372395, + "language_loss": 0.62206668, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64439499, + "num_input_tokens_seen": 317430825, + "step": 14722, + "time_per_iteration": 2.8710806369781494 + }, + { + "auxiliary_loss_clip": 0.01147455, + "auxiliary_loss_mlp": 0.011009, + "balance_loss_clip": 1.00174785, + "balance_loss_mlp": 1.00047791, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 2.2331132492912014, + "language_loss": 0.68860841, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.71109188, + "num_input_tokens_seen": 317451905, + "step": 14723, + "time_per_iteration": 2.763270139694214 + }, + { + "auxiliary_loss_clip": 0.01118222, + "auxiliary_loss_mlp": 0.01100181, + "balance_loss_clip": 1.00172412, + "balance_loss_mlp": 1.00052249, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.6890486146203953, + "language_loss": 0.77907097, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80125511, + "num_input_tokens_seen": 317470030, + "step": 14724, + "time_per_iteration": 2.705808401107788 + }, + { + "auxiliary_loss_clip": 0.01141387, + "auxiliary_loss_mlp": 0.01074424, + "balance_loss_clip": 1.00081563, + "balance_loss_mlp": 1.0004189, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.7950957992959009, + "language_loss": 0.58881146, + "learning_rate": 1.363246127376143e-07, + "loss": 0.61096954, + "num_input_tokens_seen": 317527460, + "step": 14725, + "time_per_iteration": 3.0606164932250977 + }, + { + "auxiliary_loss_clip": 0.01132965, + "auxiliary_loss_mlp": 0.00747528, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00051999, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 2.9258480580046387, + "language_loss": 0.68741828, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.70622325, + "num_input_tokens_seen": 317544070, + "step": 14726, + "time_per_iteration": 2.626784563064575 + }, + { + "auxiliary_loss_clip": 0.01149521, + "auxiliary_loss_mlp": 0.00747296, + "balance_loss_clip": 1.00188208, + "balance_loss_mlp": 1.00046062, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.3798548086730118, + "language_loss": 0.69736594, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.7163341, + "num_input_tokens_seen": 317570275, + "step": 14727, + "time_per_iteration": 2.7830018997192383 + }, + { + "auxiliary_loss_clip": 0.01130433, + "auxiliary_loss_mlp": 0.01101082, + "balance_loss_clip": 1.00196433, + "balance_loss_mlp": 1.00046945, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.8747441263877052, + "language_loss": 0.70349789, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72581303, + "num_input_tokens_seen": 317590160, + "step": 14728, + "time_per_iteration": 2.6619417667388916 + }, + { + "auxiliary_loss_clip": 0.01116094, + "auxiliary_loss_mlp": 0.01100035, + "balance_loss_clip": 1.00168943, + "balance_loss_mlp": 1.00047135, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.264611364338224, + "language_loss": 0.66178024, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68394148, + "num_input_tokens_seen": 317608340, + "step": 14729, + "time_per_iteration": 2.622267007827759 + }, + { + "auxiliary_loss_clip": 0.01131164, + "auxiliary_loss_mlp": 0.01100076, + "balance_loss_clip": 1.00203681, + "balance_loss_mlp": 1.00060821, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.5322675443274505, + "language_loss": 0.62977535, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.65208769, + "num_input_tokens_seen": 317629910, + "step": 14730, + "time_per_iteration": 2.801715135574341 + }, + { + "auxiliary_loss_clip": 0.01115497, + "auxiliary_loss_mlp": 0.01100243, + "balance_loss_clip": 1.00166392, + "balance_loss_mlp": 1.0005362, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 2.2626250390474425, + "language_loss": 0.79381108, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81596851, + "num_input_tokens_seen": 317650265, + "step": 14731, + "time_per_iteration": 2.7009737491607666 + }, + { + "auxiliary_loss_clip": 0.01116283, + "auxiliary_loss_mlp": 0.01100533, + "balance_loss_clip": 1.0017252, + "balance_loss_mlp": 1.0006361, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.8813939929667267, + "language_loss": 0.83096081, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85312897, + "num_input_tokens_seen": 317669045, + "step": 14732, + "time_per_iteration": 2.8089964389801025 + }, + { + "auxiliary_loss_clip": 0.01125324, + "auxiliary_loss_mlp": 0.01074033, + "balance_loss_clip": 1.00088978, + "balance_loss_mlp": 1.00040948, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.8885258087220224, + "language_loss": 0.59875989, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.62075353, + "num_input_tokens_seen": 317728065, + "step": 14733, + "time_per_iteration": 3.172584056854248 + }, + { + "auxiliary_loss_clip": 0.01164367, + "auxiliary_loss_mlp": 0.0074732, + "balance_loss_clip": 1.00201702, + "balance_loss_mlp": 1.00047565, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 1.9234235963036705, + "language_loss": 0.66924667, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68836355, + "num_input_tokens_seen": 317746120, + "step": 14734, + "time_per_iteration": 2.548607587814331 + }, + { + "auxiliary_loss_clip": 0.01132875, + "auxiliary_loss_mlp": 0.01100029, + "balance_loss_clip": 1.00204039, + "balance_loss_mlp": 1.00046539, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 2.6343371909087256, + "language_loss": 0.75506878, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77739787, + "num_input_tokens_seen": 317762280, + "step": 14735, + "time_per_iteration": 5.505553960800171 + }, + { + "auxiliary_loss_clip": 0.01118265, + "auxiliary_loss_mlp": 0.01099987, + "balance_loss_clip": 1.00171041, + "balance_loss_mlp": 1.0005666, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 2.34317397260083, + "language_loss": 0.70774835, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72993088, + "num_input_tokens_seen": 317780615, + "step": 14736, + "time_per_iteration": 2.677304267883301 + }, + { + "auxiliary_loss_clip": 0.01132169, + "auxiliary_loss_mlp": 0.01101504, + "balance_loss_clip": 1.0019784, + "balance_loss_mlp": 1.00046206, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 1.7835178010208697, + "language_loss": 0.8414824, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86381912, + "num_input_tokens_seen": 317798830, + "step": 14737, + "time_per_iteration": 2.6286940574645996 + }, + { + "auxiliary_loss_clip": 0.01118424, + "auxiliary_loss_mlp": 0.01101899, + "balance_loss_clip": 1.00178111, + "balance_loss_mlp": 1.00052381, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 2.8597203660312616, + "language_loss": 0.68052357, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70272684, + "num_input_tokens_seen": 317819235, + "step": 14738, + "time_per_iteration": 2.8412580490112305 + }, + { + "auxiliary_loss_clip": 0.01147595, + "auxiliary_loss_mlp": 0.01102177, + "balance_loss_clip": 1.00176692, + "balance_loss_mlp": 1.00051546, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 2.2631641732240833, + "language_loss": 0.7500174, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77251512, + "num_input_tokens_seen": 317836785, + "step": 14739, + "time_per_iteration": 2.5828545093536377 + }, + { + "auxiliary_loss_clip": 0.01147491, + "auxiliary_loss_mlp": 0.0110039, + "balance_loss_clip": 1.00188029, + "balance_loss_mlp": 1.00049257, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 1.8666841995843901, + "language_loss": 0.87106335, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89354217, + "num_input_tokens_seen": 317854225, + "step": 14740, + "time_per_iteration": 2.641416311264038 + }, + { + "auxiliary_loss_clip": 0.01098911, + "auxiliary_loss_mlp": 0.01100091, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00052702, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 1.8836763779559356, + "language_loss": 0.6340518, + "learning_rate": 1.34072445601471e-07, + "loss": 0.6560418, + "num_input_tokens_seen": 317874865, + "step": 14741, + "time_per_iteration": 2.83156156539917 + }, + { + "auxiliary_loss_clip": 0.01164125, + "auxiliary_loss_mlp": 0.01101056, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.0004437, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 1.8886855648879206, + "language_loss": 0.72641367, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.74906552, + "num_input_tokens_seen": 317892830, + "step": 14742, + "time_per_iteration": 2.6785497665405273 + }, + { + "auxiliary_loss_clip": 0.01149476, + "auxiliary_loss_mlp": 0.00747288, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00042152, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 2.0526508652244395, + "language_loss": 0.59633696, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61530465, + "num_input_tokens_seen": 317911780, + "step": 14743, + "time_per_iteration": 2.7049713134765625 + }, + { + "auxiliary_loss_clip": 0.0111619, + "auxiliary_loss_mlp": 0.01101914, + "balance_loss_clip": 1.00175452, + "balance_loss_mlp": 1.00053847, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 1.6558003706149442, + "language_loss": 0.60113287, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.6233139, + "num_input_tokens_seen": 317932855, + "step": 14744, + "time_per_iteration": 2.7134435176849365 + }, + { + "auxiliary_loss_clip": 0.01148026, + "auxiliary_loss_mlp": 0.00747427, + "balance_loss_clip": 1.00204301, + "balance_loss_mlp": 1.00047231, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.6472270648631575, + "language_loss": 0.76856041, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78751493, + "num_input_tokens_seen": 317952090, + "step": 14745, + "time_per_iteration": 2.633195400238037 + }, + { + "auxiliary_loss_clip": 0.01164307, + "auxiliary_loss_mlp": 0.0074733, + "balance_loss_clip": 1.00193584, + "balance_loss_mlp": 1.00049591, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 1.7913289236879784, + "language_loss": 0.77446866, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79358506, + "num_input_tokens_seen": 317970370, + "step": 14746, + "time_per_iteration": 2.6254591941833496 + }, + { + "auxiliary_loss_clip": 0.01131166, + "auxiliary_loss_mlp": 0.01101139, + "balance_loss_clip": 1.00192261, + "balance_loss_mlp": 1.00047898, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 2.2220865798510667, + "language_loss": 0.76809877, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.79042184, + "num_input_tokens_seen": 317989125, + "step": 14747, + "time_per_iteration": 2.6939709186553955 + }, + { + "auxiliary_loss_clip": 0.01130931, + "auxiliary_loss_mlp": 0.00747266, + "balance_loss_clip": 1.00172019, + "balance_loss_mlp": 1.00042176, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.9455274497660933, + "language_loss": 0.82696104, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.845743, + "num_input_tokens_seen": 318007820, + "step": 14748, + "time_per_iteration": 2.6757123470306396 + }, + { + "auxiliary_loss_clip": 0.01147324, + "auxiliary_loss_mlp": 0.01100648, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00051188, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 1.8591193983142997, + "language_loss": 0.77339685, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.79587656, + "num_input_tokens_seen": 318030435, + "step": 14749, + "time_per_iteration": 2.861327648162842 + }, + { + "auxiliary_loss_clip": 0.01081998, + "auxiliary_loss_mlp": 0.00747391, + "balance_loss_clip": 1.00152504, + "balance_loss_mlp": 1.00048256, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 2.1896796601239843, + "language_loss": 0.69199395, + "learning_rate": 1.328135602550451e-07, + "loss": 0.71028781, + "num_input_tokens_seen": 318049465, + "step": 14750, + "time_per_iteration": 2.7637722492218018 + }, + { + "auxiliary_loss_clip": 0.01147562, + "auxiliary_loss_mlp": 0.01100184, + "balance_loss_clip": 1.00176358, + "balance_loss_mlp": 1.000525, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 2.1019661662260316, + "language_loss": 0.58849996, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61097747, + "num_input_tokens_seen": 318067760, + "step": 14751, + "time_per_iteration": 2.7128870487213135 + }, + { + "auxiliary_loss_clip": 0.01164194, + "auxiliary_loss_mlp": 0.01101272, + "balance_loss_clip": 1.00192702, + "balance_loss_mlp": 1.00037336, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.0532350941025377, + "language_loss": 0.81647044, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.8391251, + "num_input_tokens_seen": 318082785, + "step": 14752, + "time_per_iteration": 2.5092873573303223 + }, + { + "auxiliary_loss_clip": 0.01130907, + "auxiliary_loss_mlp": 0.01101285, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.0006243, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 1.9514495889907122, + "language_loss": 0.79918253, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82150447, + "num_input_tokens_seen": 318101925, + "step": 14753, + "time_per_iteration": 2.6939120292663574 + }, + { + "auxiliary_loss_clip": 0.01164067, + "auxiliary_loss_mlp": 0.01100617, + "balance_loss_clip": 1.00188529, + "balance_loss_mlp": 1.00052929, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 3.1479914130794064, + "language_loss": 0.65292317, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.67556995, + "num_input_tokens_seen": 318119945, + "step": 14754, + "time_per_iteration": 4.102837562561035 + }, + { + "auxiliary_loss_clip": 0.01164208, + "auxiliary_loss_mlp": 0.01100974, + "balance_loss_clip": 1.00192487, + "balance_loss_mlp": 1.0005517, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 2.105672992436401, + "language_loss": 0.74313092, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76578271, + "num_input_tokens_seen": 318139685, + "step": 14755, + "time_per_iteration": 2.6619176864624023 + }, + { + "auxiliary_loss_clip": 0.01134869, + "auxiliary_loss_mlp": 0.0110116, + "balance_loss_clip": 1.00177836, + "balance_loss_mlp": 1.00059474, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.4594830768279896, + "language_loss": 0.78025031, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.80261058, + "num_input_tokens_seen": 318160375, + "step": 14756, + "time_per_iteration": 2.7634146213531494 + }, + { + "auxiliary_loss_clip": 0.0113193, + "auxiliary_loss_mlp": 0.01100277, + "balance_loss_clip": 1.00172186, + "balance_loss_mlp": 1.00042701, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 1.8009160391493593, + "language_loss": 0.76195383, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78427589, + "num_input_tokens_seen": 318177995, + "step": 14757, + "time_per_iteration": 2.666369676589966 + }, + { + "auxiliary_loss_clip": 0.01088136, + "auxiliary_loss_mlp": 0.0110004, + "balance_loss_clip": 1.00175238, + "balance_loss_mlp": 1.00066686, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 1.847893241610183, + "language_loss": 0.68255115, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70443285, + "num_input_tokens_seen": 318197030, + "step": 14758, + "time_per_iteration": 4.171246767044067 + }, + { + "auxiliary_loss_clip": 0.01164399, + "auxiliary_loss_mlp": 0.01101747, + "balance_loss_clip": 1.00200987, + "balance_loss_mlp": 1.00046659, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.6515612528101578, + "language_loss": 0.68959391, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71225536, + "num_input_tokens_seen": 318221780, + "step": 14759, + "time_per_iteration": 2.8309803009033203 + }, + { + "auxiliary_loss_clip": 0.0116414, + "auxiliary_loss_mlp": 0.01100088, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.00047719, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 1.9847884301539624, + "language_loss": 0.74429786, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76694018, + "num_input_tokens_seen": 318239710, + "step": 14760, + "time_per_iteration": 2.5484426021575928 + }, + { + "auxiliary_loss_clip": 0.01131425, + "auxiliary_loss_mlp": 0.01101555, + "balance_loss_clip": 1.00176084, + "balance_loss_mlp": 1.00056136, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 3.0863765899496762, + "language_loss": 0.76527667, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.78760648, + "num_input_tokens_seen": 318257425, + "step": 14761, + "time_per_iteration": 2.6750447750091553 + }, + { + "auxiliary_loss_clip": 0.01164222, + "auxiliary_loss_mlp": 0.01100268, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00060916, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 1.7973997651623699, + "language_loss": 0.61720788, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63985276, + "num_input_tokens_seen": 318278485, + "step": 14762, + "time_per_iteration": 2.6241419315338135 + }, + { + "auxiliary_loss_clip": 0.01149516, + "auxiliary_loss_mlp": 0.01101124, + "balance_loss_clip": 1.00197542, + "balance_loss_mlp": 1.00051093, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 2.044674069442678, + "language_loss": 0.64320201, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66570842, + "num_input_tokens_seen": 318297560, + "step": 14763, + "time_per_iteration": 2.6272518634796143 + }, + { + "auxiliary_loss_clip": 0.01131129, + "auxiliary_loss_mlp": 0.00747296, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00048268, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.7869887508795017, + "language_loss": 0.70746469, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.72624898, + "num_input_tokens_seen": 318313060, + "step": 14764, + "time_per_iteration": 2.6295578479766846 + }, + { + "auxiliary_loss_clip": 0.01164382, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_clip": 1.00186634, + "balance_loss_mlp": 1.0004499, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.1170298761694117, + "language_loss": 0.65859783, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68125236, + "num_input_tokens_seen": 318332030, + "step": 14765, + "time_per_iteration": 2.6229960918426514 + }, + { + "auxiliary_loss_clip": 0.01115326, + "auxiliary_loss_mlp": 0.01099192, + "balance_loss_clip": 1.00172079, + "balance_loss_mlp": 1.00063038, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.7912572683249253, + "language_loss": 0.76770449, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78984964, + "num_input_tokens_seen": 318351090, + "step": 14766, + "time_per_iteration": 2.7385261058807373 + }, + { + "auxiliary_loss_clip": 0.01132801, + "auxiliary_loss_mlp": 0.01100092, + "balance_loss_clip": 1.00189626, + "balance_loss_mlp": 1.00043297, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 1.8815328145387715, + "language_loss": 0.73023826, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75256717, + "num_input_tokens_seen": 318372000, + "step": 14767, + "time_per_iteration": 2.6915788650512695 + }, + { + "auxiliary_loss_clip": 0.01164123, + "auxiliary_loss_mlp": 0.01099986, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00051749, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 1.6928778661427226, + "language_loss": 0.70938128, + "learning_rate": 1.303129987538778e-07, + "loss": 0.7320224, + "num_input_tokens_seen": 318391530, + "step": 14768, + "time_per_iteration": 2.6856400966644287 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01100209, + "balance_loss_clip": 1.00191963, + "balance_loss_mlp": 1.00054979, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.6122528054535452, + "language_loss": 0.69935942, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72185552, + "num_input_tokens_seen": 318410690, + "step": 14769, + "time_per_iteration": 2.649728536605835 + }, + { + "auxiliary_loss_clip": 0.01132256, + "auxiliary_loss_mlp": 0.01099673, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00039542, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 1.8906088552175724, + "language_loss": 0.6718297, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69414896, + "num_input_tokens_seen": 318427380, + "step": 14770, + "time_per_iteration": 2.651371479034424 + }, + { + "auxiliary_loss_clip": 0.01147093, + "auxiliary_loss_mlp": 0.01099503, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00055921, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 1.6886155981394928, + "language_loss": 0.65267754, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67514348, + "num_input_tokens_seen": 318448530, + "step": 14771, + "time_per_iteration": 2.6306331157684326 + }, + { + "auxiliary_loss_clip": 0.01133032, + "auxiliary_loss_mlp": 0.01100335, + "balance_loss_clip": 1.00186956, + "balance_loss_mlp": 1.00048578, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 3.718937995794565, + "language_loss": 0.82187164, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84420526, + "num_input_tokens_seen": 318468655, + "step": 14772, + "time_per_iteration": 2.721316337585449 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01098359, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.00036871, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.4865183239437716, + "language_loss": 0.76294947, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78527892, + "num_input_tokens_seen": 318488740, + "step": 14773, + "time_per_iteration": 4.102840900421143 + }, + { + "auxiliary_loss_clip": 0.01132304, + "auxiliary_loss_mlp": 0.01100069, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00050569, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.7966962920882166, + "language_loss": 0.74852955, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77085328, + "num_input_tokens_seen": 318508810, + "step": 14774, + "time_per_iteration": 2.7200264930725098 + }, + { + "auxiliary_loss_clip": 0.01100046, + "auxiliary_loss_mlp": 0.00747212, + "balance_loss_clip": 1.00167942, + "balance_loss_mlp": 1.00037456, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 2.347857571053607, + "language_loss": 0.71589565, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.73436826, + "num_input_tokens_seen": 318526860, + "step": 14775, + "time_per_iteration": 2.811986207962036 + }, + { + "auxiliary_loss_clip": 0.01164193, + "auxiliary_loss_mlp": 0.01100122, + "balance_loss_clip": 1.00195968, + "balance_loss_mlp": 1.00046325, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 2.752190588709122, + "language_loss": 0.80220127, + "learning_rate": 1.292090097299432e-07, + "loss": 0.82484442, + "num_input_tokens_seen": 318545180, + "step": 14776, + "time_per_iteration": 2.6271743774414062 + }, + { + "auxiliary_loss_clip": 0.01149579, + "auxiliary_loss_mlp": 0.01101189, + "balance_loss_clip": 1.00171375, + "balance_loss_mlp": 1.00038576, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 3.2182646097777536, + "language_loss": 0.69511914, + "learning_rate": 1.290713302796802e-07, + "loss": 0.71762693, + "num_input_tokens_seen": 318564350, + "step": 14777, + "time_per_iteration": 2.6711859703063965 + }, + { + "auxiliary_loss_clip": 0.01149461, + "auxiliary_loss_mlp": 0.0110055, + "balance_loss_clip": 1.00179362, + "balance_loss_mlp": 1.00060558, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 2.9893059881571316, + "language_loss": 0.70737219, + "learning_rate": 1.2893372177522e-07, + "loss": 0.72987235, + "num_input_tokens_seen": 318582275, + "step": 14778, + "time_per_iteration": 2.5935721397399902 + }, + { + "auxiliary_loss_clip": 0.01164204, + "auxiliary_loss_mlp": 0.01100117, + "balance_loss_clip": 1.00191641, + "balance_loss_mlp": 1.00036299, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 1.5999981625224404, + "language_loss": 0.77487683, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79752004, + "num_input_tokens_seen": 318601230, + "step": 14779, + "time_per_iteration": 2.6080074310302734 + }, + { + "auxiliary_loss_clip": 0.01125414, + "auxiliary_loss_mlp": 0.01074511, + "balance_loss_clip": 1.00092554, + "balance_loss_mlp": 1.00050569, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.8891313484897634, + "language_loss": 0.56742489, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58942413, + "num_input_tokens_seen": 318645595, + "step": 14780, + "time_per_iteration": 2.997673988342285 + }, + { + "auxiliary_loss_clip": 0.01158036, + "auxiliary_loss_mlp": 0.01073888, + "balance_loss_clip": 1.00072598, + "balance_loss_mlp": 1.00026441, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7911651122171905, + "language_loss": 0.62440574, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64672506, + "num_input_tokens_seen": 318707850, + "step": 14781, + "time_per_iteration": 3.240304946899414 + }, + { + "auxiliary_loss_clip": 0.01043213, + "auxiliary_loss_mlp": 0.01075059, + "balance_loss_clip": 1.00075829, + "balance_loss_mlp": 1.00067282, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.9088528011222168, + "language_loss": 0.58158213, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60276484, + "num_input_tokens_seen": 318764915, + "step": 14782, + "time_per_iteration": 3.4041588306427 + }, + { + "auxiliary_loss_clip": 0.01164091, + "auxiliary_loss_mlp": 0.01099681, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00064158, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 1.7733755366282193, + "language_loss": 0.66261697, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.68525469, + "num_input_tokens_seen": 318785660, + "step": 14783, + "time_per_iteration": 2.798867702484131 + }, + { + "auxiliary_loss_clip": 0.01164311, + "auxiliary_loss_mlp": 0.01101317, + "balance_loss_clip": 1.00182676, + "balance_loss_mlp": 1.00051343, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.5368713014957929, + "language_loss": 0.77748859, + "learning_rate": 1.281095609023415e-07, + "loss": 0.80014491, + "num_input_tokens_seen": 318806080, + "step": 14784, + "time_per_iteration": 2.567453384399414 + }, + { + "auxiliary_loss_clip": 0.01133102, + "auxiliary_loss_mlp": 0.01101663, + "balance_loss_clip": 1.00194943, + "balance_loss_mlp": 1.00062156, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 2.3579638945561863, + "language_loss": 0.6109153, + "learning_rate": 1.279724491644565e-07, + "loss": 0.63326299, + "num_input_tokens_seen": 318826445, + "step": 14785, + "time_per_iteration": 2.6596109867095947 + }, + { + "auxiliary_loss_clip": 0.01116084, + "auxiliary_loss_mlp": 0.01100909, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00048673, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 2.1072523113737094, + "language_loss": 0.64921963, + "learning_rate": 1.278354084140445e-07, + "loss": 0.67138958, + "num_input_tokens_seen": 318843915, + "step": 14786, + "time_per_iteration": 2.754155397415161 + }, + { + "auxiliary_loss_clip": 0.01115516, + "auxiliary_loss_mlp": 0.007475, + "balance_loss_clip": 1.00161672, + "balance_loss_mlp": 1.00051117, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 2.4969184144526033, + "language_loss": 0.85393769, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87256783, + "num_input_tokens_seen": 318859670, + "step": 14787, + "time_per_iteration": 2.631091356277466 + }, + { + "auxiliary_loss_clip": 0.01130622, + "auxiliary_loss_mlp": 0.01100076, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.0005126, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 2.2077400469900286, + "language_loss": 0.70516109, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.72746801, + "num_input_tokens_seen": 318877855, + "step": 14788, + "time_per_iteration": 2.651480197906494 + }, + { + "auxiliary_loss_clip": 0.01163947, + "auxiliary_loss_mlp": 0.01099056, + "balance_loss_clip": 1.00189006, + "balance_loss_mlp": 1.00044608, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.7444221434099736, + "language_loss": 0.70136547, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72399551, + "num_input_tokens_seen": 318896045, + "step": 14789, + "time_per_iteration": 2.583056926727295 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01100206, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00035584, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 1.5402270855340676, + "language_loss": 0.70316112, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72565717, + "num_input_tokens_seen": 318915515, + "step": 14790, + "time_per_iteration": 2.724348783493042 + }, + { + "auxiliary_loss_clip": 0.01132048, + "auxiliary_loss_mlp": 0.0110053, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00058484, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 1.7065389323174933, + "language_loss": 0.72935206, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75167787, + "num_input_tokens_seen": 318934305, + "step": 14791, + "time_per_iteration": 2.6348228454589844 + }, + { + "auxiliary_loss_clip": 0.01119134, + "auxiliary_loss_mlp": 0.01100576, + "balance_loss_clip": 1.001737, + "balance_loss_mlp": 1.00067878, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 1.7913016173893548, + "language_loss": 0.73868716, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76088428, + "num_input_tokens_seen": 318953880, + "step": 14792, + "time_per_iteration": 4.471564769744873 + }, + { + "auxiliary_loss_clip": 0.01069709, + "auxiliary_loss_mlp": 0.01102253, + "balance_loss_clip": 1.00160289, + "balance_loss_mlp": 1.00049579, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 2.042205264992978, + "language_loss": 0.66367602, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68539566, + "num_input_tokens_seen": 318971395, + "step": 14793, + "time_per_iteration": 2.9390828609466553 + }, + { + "auxiliary_loss_clip": 0.01116307, + "auxiliary_loss_mlp": 0.01100283, + "balance_loss_clip": 1.00160646, + "balance_loss_mlp": 1.00062418, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.5826814973040162, + "language_loss": 0.71753228, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73969823, + "num_input_tokens_seen": 318990580, + "step": 14794, + "time_per_iteration": 2.717416763305664 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00058055, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 1.5229655412157332, + "language_loss": 0.75513244, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77762926, + "num_input_tokens_seen": 319010040, + "step": 14795, + "time_per_iteration": 2.653623580932617 + }, + { + "auxiliary_loss_clip": 0.01111828, + "auxiliary_loss_mlp": 0.01074382, + "balance_loss_clip": 1.00059688, + "balance_loss_mlp": 1.00037718, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7650261907516692, + "language_loss": 0.56037933, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58224142, + "num_input_tokens_seen": 319063860, + "step": 14796, + "time_per_iteration": 4.453073501586914 + }, + { + "auxiliary_loss_clip": 0.0116433, + "auxiliary_loss_mlp": 0.01101348, + "balance_loss_clip": 1.00194597, + "balance_loss_mlp": 1.00054502, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 2.756267462344018, + "language_loss": 0.70005345, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72271031, + "num_input_tokens_seen": 319082335, + "step": 14797, + "time_per_iteration": 2.598618984222412 + }, + { + "auxiliary_loss_clip": 0.01129336, + "auxiliary_loss_mlp": 0.01074239, + "balance_loss_clip": 1.00070727, + "balance_loss_mlp": 1.00023425, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.747200104549899, + "language_loss": 0.58068955, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60272527, + "num_input_tokens_seen": 319147075, + "step": 14798, + "time_per_iteration": 3.2461795806884766 + }, + { + "auxiliary_loss_clip": 0.01149514, + "auxiliary_loss_mlp": 0.01100681, + "balance_loss_clip": 1.00191665, + "balance_loss_mlp": 1.0004971, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.9417940144660544, + "language_loss": 0.79015619, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81265813, + "num_input_tokens_seen": 319166630, + "step": 14799, + "time_per_iteration": 2.6339468955993652 + }, + { + "auxiliary_loss_clip": 0.01141939, + "auxiliary_loss_mlp": 0.01074442, + "balance_loss_clip": 1.00075221, + "balance_loss_mlp": 1.00043702, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.8845418147995288, + "language_loss": 0.58135009, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60351384, + "num_input_tokens_seen": 319221865, + "step": 14800, + "time_per_iteration": 3.117027759552002 + }, + { + "auxiliary_loss_clip": 0.01148022, + "auxiliary_loss_mlp": 0.01100462, + "balance_loss_clip": 1.00182343, + "balance_loss_mlp": 1.00056481, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.9960768640523765, + "language_loss": 0.66292083, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68540567, + "num_input_tokens_seen": 319240710, + "step": 14801, + "time_per_iteration": 2.639976978302002 + }, + { + "auxiliary_loss_clip": 0.01119235, + "auxiliary_loss_mlp": 0.0110246, + "balance_loss_clip": 1.00201869, + "balance_loss_mlp": 1.00051188, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.746639441324306, + "language_loss": 0.75520957, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77742648, + "num_input_tokens_seen": 319256495, + "step": 14802, + "time_per_iteration": 2.6464478969573975 + }, + { + "auxiliary_loss_clip": 0.01147721, + "auxiliary_loss_mlp": 0.01100211, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00050449, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 1.7971690836819, + "language_loss": 0.73281473, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75529408, + "num_input_tokens_seen": 319273620, + "step": 14803, + "time_per_iteration": 2.6564888954162598 + }, + { + "auxiliary_loss_clip": 0.01133767, + "auxiliary_loss_mlp": 0.01100034, + "balance_loss_clip": 1.00198722, + "balance_loss_mlp": 1.00037479, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 2.858044266132251, + "language_loss": 0.71728885, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.73962682, + "num_input_tokens_seen": 319291720, + "step": 14804, + "time_per_iteration": 2.633713722229004 + }, + { + "auxiliary_loss_clip": 0.01147813, + "auxiliary_loss_mlp": 0.01100488, + "balance_loss_clip": 1.00176907, + "balance_loss_mlp": 1.00049484, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.6667329222555334, + "language_loss": 0.81289947, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83538252, + "num_input_tokens_seen": 319310380, + "step": 14805, + "time_per_iteration": 2.646925210952759 + }, + { + "auxiliary_loss_clip": 0.01147513, + "auxiliary_loss_mlp": 0.01100749, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.0005182, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 1.9261348942090686, + "language_loss": 0.67049819, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69298089, + "num_input_tokens_seen": 319331765, + "step": 14806, + "time_per_iteration": 2.6517961025238037 + }, + { + "auxiliary_loss_clip": 0.0113228, + "auxiliary_loss_mlp": 0.01099957, + "balance_loss_clip": 1.00173354, + "balance_loss_mlp": 1.00053644, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 1.9024854427807638, + "language_loss": 0.67538142, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.69770384, + "num_input_tokens_seen": 319349135, + "step": 14807, + "time_per_iteration": 2.6827893257141113 + }, + { + "auxiliary_loss_clip": 0.01130544, + "auxiliary_loss_mlp": 0.01099841, + "balance_loss_clip": 1.00167584, + "balance_loss_mlp": 1.00046861, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.9623603272588466, + "language_loss": 0.75443345, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77673727, + "num_input_tokens_seen": 319368410, + "step": 14808, + "time_per_iteration": 2.614426851272583 + }, + { + "auxiliary_loss_clip": 0.0111414, + "auxiliary_loss_mlp": 0.01100225, + "balance_loss_clip": 1.0016048, + "balance_loss_mlp": 1.00056601, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 3.4699767993479598, + "language_loss": 0.81379306, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83593667, + "num_input_tokens_seen": 319387535, + "step": 14809, + "time_per_iteration": 2.6880931854248047 + }, + { + "auxiliary_loss_clip": 0.01149383, + "auxiliary_loss_mlp": 0.01100132, + "balance_loss_clip": 1.00181508, + "balance_loss_mlp": 1.00033021, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.7615485106765765, + "language_loss": 0.68810201, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.71059716, + "num_input_tokens_seen": 319407210, + "step": 14810, + "time_per_iteration": 4.018455266952515 + }, + { + "auxiliary_loss_clip": 0.01116633, + "auxiliary_loss_mlp": 0.01100454, + "balance_loss_clip": 1.00162935, + "balance_loss_mlp": 1.00041366, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 1.9721048925778482, + "language_loss": 0.70149714, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.72366804, + "num_input_tokens_seen": 319425340, + "step": 14811, + "time_per_iteration": 4.1328511238098145 + }, + { + "auxiliary_loss_clip": 0.0111813, + "auxiliary_loss_mlp": 0.00747267, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00037825, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 2.4868045102492, + "language_loss": 0.65597594, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67462993, + "num_input_tokens_seen": 319448150, + "step": 14812, + "time_per_iteration": 2.949986219406128 + }, + { + "auxiliary_loss_clip": 0.01098872, + "auxiliary_loss_mlp": 0.01099897, + "balance_loss_clip": 1.00160599, + "balance_loss_mlp": 1.00052452, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.9344036767485058, + "language_loss": 0.68321049, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70519823, + "num_input_tokens_seen": 319466115, + "step": 14813, + "time_per_iteration": 2.7290091514587402 + }, + { + "auxiliary_loss_clip": 0.01134799, + "auxiliary_loss_mlp": 0.01103235, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00042868, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 2.1578376152480954, + "language_loss": 0.75713235, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.7795127, + "num_input_tokens_seen": 319485255, + "step": 14814, + "time_per_iteration": 2.654407501220703 + }, + { + "auxiliary_loss_clip": 0.01147543, + "auxiliary_loss_mlp": 0.01100808, + "balance_loss_clip": 1.0017221, + "balance_loss_mlp": 1.00043344, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 2.4624077373891176, + "language_loss": 0.74122858, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76371211, + "num_input_tokens_seen": 319501800, + "step": 14815, + "time_per_iteration": 2.624171257019043 + }, + { + "auxiliary_loss_clip": 0.01118104, + "auxiliary_loss_mlp": 0.01100207, + "balance_loss_clip": 1.00190389, + "balance_loss_mlp": 1.00050056, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 2.0312855304109547, + "language_loss": 0.75290596, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77508914, + "num_input_tokens_seen": 319520415, + "step": 14816, + "time_per_iteration": 2.6817941665649414 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.0110067, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00043941, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 1.80262972449387, + "language_loss": 0.77940857, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.80174434, + "num_input_tokens_seen": 319538410, + "step": 14817, + "time_per_iteration": 2.689663887023926 + }, + { + "auxiliary_loss_clip": 0.01112651, + "auxiliary_loss_mlp": 0.01074641, + "balance_loss_clip": 1.00069547, + "balance_loss_mlp": 1.00025463, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.7620709584662111, + "language_loss": 0.56486183, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58673471, + "num_input_tokens_seen": 319602565, + "step": 14818, + "time_per_iteration": 3.2851035594940186 + }, + { + "auxiliary_loss_clip": 0.01099603, + "auxiliary_loss_mlp": 0.01100851, + "balance_loss_clip": 1.00156116, + "balance_loss_mlp": 1.00052476, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.670162249902658, + "language_loss": 0.64343625, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.6654408, + "num_input_tokens_seen": 319624645, + "step": 14819, + "time_per_iteration": 2.793036699295044 + }, + { + "auxiliary_loss_clip": 0.01147784, + "auxiliary_loss_mlp": 0.01100473, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.00043213, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 2.501177618259093, + "language_loss": 0.78345567, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80593824, + "num_input_tokens_seen": 319644040, + "step": 14820, + "time_per_iteration": 2.698051691055298 + }, + { + "auxiliary_loss_clip": 0.01114877, + "auxiliary_loss_mlp": 0.00747268, + "balance_loss_clip": 1.0018326, + "balance_loss_mlp": 1.00047565, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.7030755665557875, + "language_loss": 0.76485956, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.783481, + "num_input_tokens_seen": 319663930, + "step": 14821, + "time_per_iteration": 2.744260787963867 + }, + { + "auxiliary_loss_clip": 0.01142006, + "auxiliary_loss_mlp": 0.00745422, + "balance_loss_clip": 1.00087547, + "balance_loss_mlp": 1.00023448, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.8003357772264657, + "language_loss": 0.59342444, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61229873, + "num_input_tokens_seen": 319721245, + "step": 14822, + "time_per_iteration": 3.078625440597534 + }, + { + "auxiliary_loss_clip": 0.01147937, + "auxiliary_loss_mlp": 0.0110037, + "balance_loss_clip": 1.00187182, + "balance_loss_mlp": 1.00056827, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 1.8165293732432652, + "language_loss": 0.68998206, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71246511, + "num_input_tokens_seen": 319741200, + "step": 14823, + "time_per_iteration": 2.6305997371673584 + }, + { + "auxiliary_loss_clip": 0.01149507, + "auxiliary_loss_mlp": 0.01099962, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00049424, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 2.715379349710834, + "language_loss": 0.69057679, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71307147, + "num_input_tokens_seen": 319759265, + "step": 14824, + "time_per_iteration": 2.7645926475524902 + }, + { + "auxiliary_loss_clip": 0.01101531, + "auxiliary_loss_mlp": 0.01100419, + "balance_loss_clip": 1.00161409, + "balance_loss_mlp": 1.00047374, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 2.885149830133668, + "language_loss": 0.70500481, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72702432, + "num_input_tokens_seen": 319777560, + "step": 14825, + "time_per_iteration": 2.7862889766693115 + }, + { + "auxiliary_loss_clip": 0.01132729, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00048172, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 2.6628653584793764, + "language_loss": 0.71188986, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.73422706, + "num_input_tokens_seen": 319794125, + "step": 14826, + "time_per_iteration": 2.66287899017334 + }, + { + "auxiliary_loss_clip": 0.01147841, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_clip": 1.00209117, + "balance_loss_mlp": 1.00045621, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 2.1761131300899135, + "language_loss": 0.74995005, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.77243876, + "num_input_tokens_seen": 319810310, + "step": 14827, + "time_per_iteration": 2.593381404876709 + }, + { + "auxiliary_loss_clip": 0.0114862, + "auxiliary_loss_mlp": 0.01099756, + "balance_loss_clip": 1.00172424, + "balance_loss_mlp": 1.00052583, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.7349607600090535, + "language_loss": 0.77781379, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80029762, + "num_input_tokens_seen": 319828505, + "step": 14828, + "time_per_iteration": 2.628357172012329 + }, + { + "auxiliary_loss_clip": 0.01114928, + "auxiliary_loss_mlp": 0.01100333, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00048304, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 1.6513627077659132, + "language_loss": 0.75347888, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77563143, + "num_input_tokens_seen": 319848680, + "step": 14829, + "time_per_iteration": 2.708158254623413 + }, + { + "auxiliary_loss_clip": 0.01164301, + "auxiliary_loss_mlp": 0.01100557, + "balance_loss_clip": 1.00193048, + "balance_loss_mlp": 1.00051618, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.753227118243575, + "language_loss": 0.84282851, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86547709, + "num_input_tokens_seen": 319868835, + "step": 14830, + "time_per_iteration": 4.017737150192261 + }, + { + "auxiliary_loss_clip": 0.01147432, + "auxiliary_loss_mlp": 0.01099638, + "balance_loss_clip": 1.00176144, + "balance_loss_mlp": 1.0005033, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.3850121075933879, + "language_loss": 0.74843168, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.7709024, + "num_input_tokens_seen": 319891585, + "step": 14831, + "time_per_iteration": 2.705615997314453 + }, + { + "auxiliary_loss_clip": 0.01149645, + "auxiliary_loss_mlp": 0.01100582, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00044644, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 5.867011722234556, + "language_loss": 0.73134887, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75385118, + "num_input_tokens_seen": 319910315, + "step": 14832, + "time_per_iteration": 2.597015142440796 + }, + { + "auxiliary_loss_clip": 0.01084807, + "auxiliary_loss_mlp": 0.0074746, + "balance_loss_clip": 1.00177264, + "balance_loss_mlp": 1.00045705, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 1.9723689322531994, + "language_loss": 0.66767329, + "learning_rate": 1.214746621848355e-07, + "loss": 0.68599594, + "num_input_tokens_seen": 319932275, + "step": 14833, + "time_per_iteration": 4.191941738128662 + }, + { + "auxiliary_loss_clip": 0.01149212, + "auxiliary_loss_mlp": 0.01101932, + "balance_loss_clip": 1.00192893, + "balance_loss_mlp": 1.00055659, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 2.0987181478362364, + "language_loss": 0.73828697, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.76079834, + "num_input_tokens_seen": 319955335, + "step": 14834, + "time_per_iteration": 2.744539737701416 + }, + { + "auxiliary_loss_clip": 0.01117874, + "auxiliary_loss_mlp": 0.01100138, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.00057483, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 1.8661936005569981, + "language_loss": 0.78820992, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81039, + "num_input_tokens_seen": 319973990, + "step": 14835, + "time_per_iteration": 2.7000412940979004 + }, + { + "auxiliary_loss_clip": 0.01163974, + "auxiliary_loss_mlp": 0.01099918, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00044954, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 2.7994402582777695, + "language_loss": 0.74204862, + "learning_rate": 1.210739940361689e-07, + "loss": 0.76468754, + "num_input_tokens_seen": 319995555, + "step": 14836, + "time_per_iteration": 2.6681175231933594 + }, + { + "auxiliary_loss_clip": 0.0113277, + "auxiliary_loss_mlp": 0.0109993, + "balance_loss_clip": 1.00172031, + "balance_loss_mlp": 1.00055718, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 2.422839978708976, + "language_loss": 0.68563455, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.70796156, + "num_input_tokens_seen": 320012385, + "step": 14837, + "time_per_iteration": 2.5980100631713867 + }, + { + "auxiliary_loss_clip": 0.01085674, + "auxiliary_loss_mlp": 0.01101082, + "balance_loss_clip": 1.00167608, + "balance_loss_mlp": 1.00046968, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.7009907665276074, + "language_loss": 0.67716992, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69903749, + "num_input_tokens_seen": 320032390, + "step": 14838, + "time_per_iteration": 2.7867369651794434 + }, + { + "auxiliary_loss_clip": 0.01149615, + "auxiliary_loss_mlp": 0.01100798, + "balance_loss_clip": 1.00186706, + "balance_loss_mlp": 1.00047112, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 1.917779075592067, + "language_loss": 0.76218486, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78468895, + "num_input_tokens_seen": 320052885, + "step": 14839, + "time_per_iteration": 2.6478705406188965 + }, + { + "auxiliary_loss_clip": 0.01112265, + "auxiliary_loss_mlp": 0.00745445, + "balance_loss_clip": 1.0006038, + "balance_loss_mlp": 1.00012994, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6784993745032089, + "language_loss": 0.49399614, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51257324, + "num_input_tokens_seen": 320113685, + "step": 14840, + "time_per_iteration": 3.280104637145996 + }, + { + "auxiliary_loss_clip": 0.01164462, + "auxiliary_loss_mlp": 0.01102541, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00059342, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.0206093615592455, + "language_loss": 0.64041388, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66308391, + "num_input_tokens_seen": 320130810, + "step": 14841, + "time_per_iteration": 2.600644826889038 + }, + { + "auxiliary_loss_clip": 0.01115513, + "auxiliary_loss_mlp": 0.00747303, + "balance_loss_clip": 1.00176883, + "balance_loss_mlp": 1.0004859, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 1.9475014029527251, + "language_loss": 0.68372494, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70235312, + "num_input_tokens_seen": 320152170, + "step": 14842, + "time_per_iteration": 2.7396445274353027 + }, + { + "auxiliary_loss_clip": 0.01164001, + "auxiliary_loss_mlp": 0.0109967, + "balance_loss_clip": 1.00187683, + "balance_loss_mlp": 1.00053596, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 2.0096353022970024, + "language_loss": 0.79876369, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.8214004, + "num_input_tokens_seen": 320172360, + "step": 14843, + "time_per_iteration": 2.664153575897217 + }, + { + "auxiliary_loss_clip": 0.01132956, + "auxiliary_loss_mlp": 0.01101915, + "balance_loss_clip": 1.00191295, + "balance_loss_mlp": 1.00053966, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 2.00795902570134, + "language_loss": 0.68233776, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.70468652, + "num_input_tokens_seen": 320192130, + "step": 14844, + "time_per_iteration": 2.627340316772461 + }, + { + "auxiliary_loss_clip": 0.01103036, + "auxiliary_loss_mlp": 0.01101521, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.00052691, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 1.9135229322713017, + "language_loss": 0.90827429, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93031985, + "num_input_tokens_seen": 320207760, + "step": 14845, + "time_per_iteration": 2.7049031257629395 + }, + { + "auxiliary_loss_clip": 0.01149624, + "auxiliary_loss_mlp": 0.01099508, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.00056386, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 3.651530057978736, + "language_loss": 0.72560328, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74809456, + "num_input_tokens_seen": 320225325, + "step": 14846, + "time_per_iteration": 2.5821309089660645 + }, + { + "auxiliary_loss_clip": 0.01118187, + "auxiliary_loss_mlp": 0.01101284, + "balance_loss_clip": 1.00171959, + "balance_loss_mlp": 1.00048089, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 2.0051470228522654, + "language_loss": 0.57033408, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.59252882, + "num_input_tokens_seen": 320247645, + "step": 14847, + "time_per_iteration": 2.939363479614258 + }, + { + "auxiliary_loss_clip": 0.01116126, + "auxiliary_loss_mlp": 0.01100023, + "balance_loss_clip": 1.00188696, + "balance_loss_mlp": 1.00050712, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 1.8607862936220865, + "language_loss": 0.76463574, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78679723, + "num_input_tokens_seen": 320266005, + "step": 14848, + "time_per_iteration": 4.0708324909210205 + }, + { + "auxiliary_loss_clip": 0.01086817, + "auxiliary_loss_mlp": 0.01099638, + "balance_loss_clip": 1.00175452, + "balance_loss_mlp": 1.00069439, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 1.8676441477924302, + "language_loss": 0.6972959, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71916044, + "num_input_tokens_seen": 320285555, + "step": 14849, + "time_per_iteration": 4.205574989318848 + }, + { + "auxiliary_loss_clip": 0.01147599, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_clip": 1.00195038, + "balance_loss_mlp": 1.00056338, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.8505050656985151, + "language_loss": 0.81064564, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83313531, + "num_input_tokens_seen": 320305395, + "step": 14850, + "time_per_iteration": 2.682281255722046 + }, + { + "auxiliary_loss_clip": 0.01134683, + "auxiliary_loss_mlp": 0.01100463, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.0006609, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.6810786947311336, + "language_loss": 0.74770409, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.77005559, + "num_input_tokens_seen": 320324220, + "step": 14851, + "time_per_iteration": 2.649296760559082 + }, + { + "auxiliary_loss_clip": 0.01132163, + "auxiliary_loss_mlp": 0.01100254, + "balance_loss_clip": 1.00180948, + "balance_loss_mlp": 1.00050008, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.9523330713856217, + "language_loss": 0.78512275, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80744696, + "num_input_tokens_seen": 320347195, + "step": 14852, + "time_per_iteration": 2.721426486968994 + }, + { + "auxiliary_loss_clip": 0.01147827, + "auxiliary_loss_mlp": 0.01100131, + "balance_loss_clip": 1.00194287, + "balance_loss_mlp": 1.00047243, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.4408735092190408, + "language_loss": 0.69152379, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71400338, + "num_input_tokens_seen": 320366850, + "step": 14853, + "time_per_iteration": 2.6725504398345947 + }, + { + "auxiliary_loss_clip": 0.01099079, + "auxiliary_loss_mlp": 0.01101366, + "balance_loss_clip": 1.00156891, + "balance_loss_mlp": 1.00051486, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.7369381355506377, + "language_loss": 0.67043382, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69243824, + "num_input_tokens_seen": 320388895, + "step": 14854, + "time_per_iteration": 2.8685266971588135 + }, + { + "auxiliary_loss_clip": 0.01134384, + "auxiliary_loss_mlp": 0.0110059, + "balance_loss_clip": 1.00176835, + "balance_loss_mlp": 1.00050163, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.8258981511962324, + "language_loss": 0.7483511, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.77070081, + "num_input_tokens_seen": 320408520, + "step": 14855, + "time_per_iteration": 2.7112982273101807 + }, + { + "auxiliary_loss_clip": 0.01130729, + "auxiliary_loss_mlp": 0.01099919, + "balance_loss_clip": 1.001647, + "balance_loss_mlp": 1.00054646, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 1.8110277049315082, + "language_loss": 0.64103842, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66334486, + "num_input_tokens_seen": 320427400, + "step": 14856, + "time_per_iteration": 2.681589126586914 + }, + { + "auxiliary_loss_clip": 0.01164083, + "auxiliary_loss_mlp": 0.01099355, + "balance_loss_clip": 1.00185585, + "balance_loss_mlp": 1.00045907, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.7023346696335224, + "language_loss": 0.66220653, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68484092, + "num_input_tokens_seen": 320447570, + "step": 14857, + "time_per_iteration": 2.6374945640563965 + }, + { + "auxiliary_loss_clip": 0.01102801, + "auxiliary_loss_mlp": 0.01100866, + "balance_loss_clip": 1.00177705, + "balance_loss_mlp": 1.00063479, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.6211709142057176, + "language_loss": 0.75354391, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77558059, + "num_input_tokens_seen": 320464405, + "step": 14858, + "time_per_iteration": 2.713721513748169 + }, + { + "auxiliary_loss_clip": 0.01147476, + "auxiliary_loss_mlp": 0.01101114, + "balance_loss_clip": 1.00184441, + "balance_loss_mlp": 1.00040591, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.6841899728845982, + "language_loss": 0.69719785, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71968377, + "num_input_tokens_seen": 320485525, + "step": 14859, + "time_per_iteration": 2.681873321533203 + }, + { + "auxiliary_loss_clip": 0.01099712, + "auxiliary_loss_mlp": 0.01099379, + "balance_loss_clip": 1.0016315, + "balance_loss_mlp": 1.00067353, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 3.5017804035897115, + "language_loss": 0.75705373, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77904463, + "num_input_tokens_seen": 320506725, + "step": 14860, + "time_per_iteration": 2.7715303897857666 + }, + { + "auxiliary_loss_clip": 0.01135017, + "auxiliary_loss_mlp": 0.01101613, + "balance_loss_clip": 1.00192928, + "balance_loss_mlp": 1.00052321, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 1.7719637222221853, + "language_loss": 0.57817626, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.60054255, + "num_input_tokens_seen": 320525425, + "step": 14861, + "time_per_iteration": 2.663759469985962 + }, + { + "auxiliary_loss_clip": 0.01132993, + "auxiliary_loss_mlp": 0.01100596, + "balance_loss_clip": 1.0016309, + "balance_loss_mlp": 1.00046062, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.8265644440691604, + "language_loss": 0.63416523, + "learning_rate": 1.176284122190685e-07, + "loss": 0.65650111, + "num_input_tokens_seen": 320543010, + "step": 14862, + "time_per_iteration": 2.662430763244629 + }, + { + "auxiliary_loss_clip": 0.01147278, + "auxiliary_loss_mlp": 0.01100123, + "balance_loss_clip": 1.00180173, + "balance_loss_mlp": 1.00046444, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 1.6818403875888048, + "language_loss": 0.77422178, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.79669577, + "num_input_tokens_seen": 320562180, + "step": 14863, + "time_per_iteration": 2.6254494190216064 + }, + { + "auxiliary_loss_clip": 0.01133606, + "auxiliary_loss_mlp": 0.01099236, + "balance_loss_clip": 1.00171995, + "balance_loss_mlp": 1.00043523, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 3.833565417595555, + "language_loss": 0.70929635, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.73162472, + "num_input_tokens_seen": 320580395, + "step": 14864, + "time_per_iteration": 2.6607820987701416 + }, + { + "auxiliary_loss_clip": 0.01149969, + "auxiliary_loss_mlp": 0.01102265, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.0006032, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 1.8557109802961087, + "language_loss": 0.75150418, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.77402651, + "num_input_tokens_seen": 320599505, + "step": 14865, + "time_per_iteration": 2.5929458141326904 + }, + { + "auxiliary_loss_clip": 0.01116076, + "auxiliary_loss_mlp": 0.01099881, + "balance_loss_clip": 1.00159991, + "balance_loss_mlp": 1.00046015, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 2.048760643806439, + "language_loss": 0.72020954, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.74236912, + "num_input_tokens_seen": 320619825, + "step": 14866, + "time_per_iteration": 2.719896078109741 + }, + { + "auxiliary_loss_clip": 0.01147729, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00048649, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 1.8305258673898812, + "language_loss": 0.84047043, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86296248, + "num_input_tokens_seen": 320638515, + "step": 14867, + "time_per_iteration": 2.661379098892212 + }, + { + "auxiliary_loss_clip": 0.01148017, + "auxiliary_loss_mlp": 0.01099756, + "balance_loss_clip": 1.00184274, + "balance_loss_mlp": 1.00066912, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 2.888138487739847, + "language_loss": 0.80916661, + "learning_rate": 1.168401272009567e-07, + "loss": 0.8316443, + "num_input_tokens_seen": 320659430, + "step": 14868, + "time_per_iteration": 4.081812143325806 + }, + { + "auxiliary_loss_clip": 0.01133549, + "auxiliary_loss_mlp": 0.01100266, + "balance_loss_clip": 1.00198889, + "balance_loss_mlp": 1.00055933, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 1.6955839139721742, + "language_loss": 0.77222407, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79456222, + "num_input_tokens_seen": 320679295, + "step": 14869, + "time_per_iteration": 2.665137767791748 + }, + { + "auxiliary_loss_clip": 0.01147513, + "auxiliary_loss_mlp": 0.00747303, + "balance_loss_clip": 1.00189114, + "balance_loss_mlp": 1.0004679, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 2.1002818054080707, + "language_loss": 0.65330982, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67225802, + "num_input_tokens_seen": 320697535, + "step": 14870, + "time_per_iteration": 2.6564364433288574 + }, + { + "auxiliary_loss_clip": 0.0109828, + "auxiliary_loss_mlp": 0.01074329, + "balance_loss_clip": 1.00092411, + "balance_loss_mlp": 1.0007056, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.7958614121160366, + "language_loss": 0.55971605, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.58144224, + "num_input_tokens_seen": 320758635, + "step": 14871, + "time_per_iteration": 4.780661106109619 + }, + { + "auxiliary_loss_clip": 0.01147685, + "auxiliary_loss_mlp": 0.011001, + "balance_loss_clip": 1.00181162, + "balance_loss_mlp": 1.00077438, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 2.0649801955572373, + "language_loss": 0.76546502, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78794277, + "num_input_tokens_seen": 320777175, + "step": 14872, + "time_per_iteration": 2.645200490951538 + }, + { + "auxiliary_loss_clip": 0.01147198, + "auxiliary_loss_mlp": 0.01099637, + "balance_loss_clip": 1.00184774, + "balance_loss_mlp": 1.00050282, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.5924636668992638, + "language_loss": 0.66871309, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.69118142, + "num_input_tokens_seen": 320797670, + "step": 14873, + "time_per_iteration": 2.6518516540527344 + }, + { + "auxiliary_loss_clip": 0.01164166, + "auxiliary_loss_mlp": 0.01100583, + "balance_loss_clip": 1.00190783, + "balance_loss_mlp": 1.00054264, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.5382478215955022, + "language_loss": 0.59426033, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61690784, + "num_input_tokens_seen": 320817410, + "step": 14874, + "time_per_iteration": 2.6506175994873047 + }, + { + "auxiliary_loss_clip": 0.01114214, + "auxiliary_loss_mlp": 0.01101172, + "balance_loss_clip": 1.00173342, + "balance_loss_mlp": 1.0004642, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 15.206253020697037, + "language_loss": 0.75490987, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.77706373, + "num_input_tokens_seen": 320836745, + "step": 14875, + "time_per_iteration": 2.7229082584381104 + }, + { + "auxiliary_loss_clip": 0.01116498, + "auxiliary_loss_mlp": 0.01102222, + "balance_loss_clip": 1.00182736, + "balance_loss_mlp": 1.0004648, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 1.7197380872145893, + "language_loss": 0.77417368, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79636085, + "num_input_tokens_seen": 320853305, + "step": 14876, + "time_per_iteration": 2.727571964263916 + }, + { + "auxiliary_loss_clip": 0.01147546, + "auxiliary_loss_mlp": 0.01100233, + "balance_loss_clip": 1.00184548, + "balance_loss_mlp": 1.00043058, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 1.7049059449242774, + "language_loss": 0.78861201, + "learning_rate": 1.156625201573287e-07, + "loss": 0.81108975, + "num_input_tokens_seen": 320872885, + "step": 14877, + "time_per_iteration": 2.5883595943450928 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01100472, + "balance_loss_clip": 1.00164115, + "balance_loss_mlp": 1.00047934, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 2.107535908678487, + "language_loss": 0.75086153, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77289343, + "num_input_tokens_seen": 320889755, + "step": 14878, + "time_per_iteration": 2.7265803813934326 + }, + { + "auxiliary_loss_clip": 0.01149292, + "auxiliary_loss_mlp": 0.01101096, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00038791, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 2.2550385493351532, + "language_loss": 0.76026034, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.7827642, + "num_input_tokens_seen": 320907860, + "step": 14879, + "time_per_iteration": 2.60671067237854 + }, + { + "auxiliary_loss_clip": 0.01116152, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_clip": 1.00179076, + "balance_loss_mlp": 1.00054693, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 2.3756286689990316, + "language_loss": 0.74562919, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.76780128, + "num_input_tokens_seen": 320925825, + "step": 14880, + "time_per_iteration": 2.7660791873931885 + }, + { + "auxiliary_loss_clip": 0.0114968, + "auxiliary_loss_mlp": 0.01100735, + "balance_loss_clip": 1.00186062, + "balance_loss_mlp": 1.00050378, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.8684593282689708, + "language_loss": 0.82919955, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.8517037, + "num_input_tokens_seen": 320946165, + "step": 14881, + "time_per_iteration": 2.6350319385528564 + }, + { + "auxiliary_loss_clip": 0.01117382, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.00173211, + "balance_loss_mlp": 1.00051129, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 2.5267956961203017, + "language_loss": 0.67761874, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69626606, + "num_input_tokens_seen": 320969330, + "step": 14882, + "time_per_iteration": 2.7884509563446045 + }, + { + "auxiliary_loss_clip": 0.01134902, + "auxiliary_loss_mlp": 0.01101791, + "balance_loss_clip": 1.00194848, + "balance_loss_mlp": 1.00060606, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 2.342917491656565, + "language_loss": 0.75300193, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77536893, + "num_input_tokens_seen": 320985055, + "step": 14883, + "time_per_iteration": 2.624295234680176 + }, + { + "auxiliary_loss_clip": 0.01130199, + "auxiliary_loss_mlp": 0.01098816, + "balance_loss_clip": 1.00174141, + "balance_loss_mlp": 1.00049233, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.8296181161762455, + "language_loss": 0.72586578, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74815595, + "num_input_tokens_seen": 321004720, + "step": 14884, + "time_per_iteration": 2.7210724353790283 + }, + { + "auxiliary_loss_clip": 0.01132874, + "auxiliary_loss_mlp": 0.01099079, + "balance_loss_clip": 1.00170016, + "balance_loss_mlp": 1.00051701, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 2.2872163751533874, + "language_loss": 0.75955224, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.7818718, + "num_input_tokens_seen": 321022350, + "step": 14885, + "time_per_iteration": 4.0633158683776855 + }, + { + "auxiliary_loss_clip": 0.01131123, + "auxiliary_loss_mlp": 0.01102112, + "balance_loss_clip": 1.00168097, + "balance_loss_mlp": 1.00045037, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 1.7639582471438164, + "language_loss": 0.81737924, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.83971155, + "num_input_tokens_seen": 321040450, + "step": 14886, + "time_per_iteration": 2.6470155715942383 + }, + { + "auxiliary_loss_clip": 0.0108506, + "auxiliary_loss_mlp": 0.01100161, + "balance_loss_clip": 1.00157928, + "balance_loss_mlp": 1.0004065, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.8548054222537076, + "language_loss": 0.63759643, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65944862, + "num_input_tokens_seen": 321063970, + "step": 14887, + "time_per_iteration": 4.296842575073242 + }, + { + "auxiliary_loss_clip": 0.01134559, + "auxiliary_loss_mlp": 0.01101226, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.00056612, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 3.8419857805765325, + "language_loss": 0.60474885, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.62710667, + "num_input_tokens_seen": 321083840, + "step": 14888, + "time_per_iteration": 2.6620047092437744 + }, + { + "auxiliary_loss_clip": 0.01164247, + "auxiliary_loss_mlp": 0.01101614, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00052428, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 2.8385398953420933, + "language_loss": 0.70140165, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.72406018, + "num_input_tokens_seen": 321104165, + "step": 14889, + "time_per_iteration": 2.6109399795532227 + }, + { + "auxiliary_loss_clip": 0.01147673, + "auxiliary_loss_mlp": 0.00747408, + "balance_loss_clip": 1.00173271, + "balance_loss_mlp": 1.00047112, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.366442642327065, + "language_loss": 0.71415311, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.73310387, + "num_input_tokens_seen": 321117290, + "step": 14890, + "time_per_iteration": 2.61704421043396 + }, + { + "auxiliary_loss_clip": 0.01055084, + "auxiliary_loss_mlp": 0.00747262, + "balance_loss_clip": 1.00137925, + "balance_loss_mlp": 1.00040269, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.7167373709397582, + "language_loss": 0.75681365, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.77483714, + "num_input_tokens_seen": 321137115, + "step": 14891, + "time_per_iteration": 3.111274242401123 + }, + { + "auxiliary_loss_clip": 0.01082922, + "auxiliary_loss_mlp": 0.01101239, + "balance_loss_clip": 1.00149441, + "balance_loss_mlp": 1.00053155, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 1.7109503562980206, + "language_loss": 0.76732576, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.7891674, + "num_input_tokens_seen": 321154490, + "step": 14892, + "time_per_iteration": 3.416425943374634 + }, + { + "auxiliary_loss_clip": 0.01147856, + "auxiliary_loss_mlp": 0.01100761, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00052929, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 3.146108622702354, + "language_loss": 0.81547391, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83796006, + "num_input_tokens_seen": 321175625, + "step": 14893, + "time_per_iteration": 2.6895992755889893 + }, + { + "auxiliary_loss_clip": 0.01116239, + "auxiliary_loss_mlp": 0.01099584, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00054514, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 1.9339424449449056, + "language_loss": 0.74699759, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.76915586, + "num_input_tokens_seen": 321193895, + "step": 14894, + "time_per_iteration": 2.677917242050171 + }, + { + "auxiliary_loss_clip": 0.01147841, + "auxiliary_loss_mlp": 0.01102046, + "balance_loss_clip": 1.00176322, + "balance_loss_mlp": 1.00048006, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.9448473549136307, + "language_loss": 0.66604859, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.68854737, + "num_input_tokens_seen": 321211610, + "step": 14895, + "time_per_iteration": 2.6229052543640137 + }, + { + "auxiliary_loss_clip": 0.01147508, + "auxiliary_loss_mlp": 0.01101823, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00035214, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.7425578755955764, + "language_loss": 0.66995394, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.6924473, + "num_input_tokens_seen": 321229805, + "step": 14896, + "time_per_iteration": 2.5724594593048096 + }, + { + "auxiliary_loss_clip": 0.0114792, + "auxiliary_loss_mlp": 0.01099969, + "balance_loss_clip": 1.00183475, + "balance_loss_mlp": 1.00045323, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 2.861811065793624, + "language_loss": 0.75813878, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.78061759, + "num_input_tokens_seen": 321247165, + "step": 14897, + "time_per_iteration": 2.659169912338257 + }, + { + "auxiliary_loss_clip": 0.01098199, + "auxiliary_loss_mlp": 0.00745336, + "balance_loss_clip": 1.00064111, + "balance_loss_mlp": 1.00001705, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7521136365373244, + "language_loss": 0.5534997, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57193506, + "num_input_tokens_seen": 321308425, + "step": 14898, + "time_per_iteration": 3.3233211040496826 + }, + { + "auxiliary_loss_clip": 0.01164189, + "auxiliary_loss_mlp": 0.00747415, + "balance_loss_clip": 1.00190103, + "balance_loss_mlp": 1.00046492, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 2.0161597999925145, + "language_loss": 0.70356411, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72268015, + "num_input_tokens_seen": 321329295, + "step": 14899, + "time_per_iteration": 2.6672463417053223 + }, + { + "auxiliary_loss_clip": 0.01083306, + "auxiliary_loss_mlp": 0.01101201, + "balance_loss_clip": 1.00160384, + "balance_loss_mlp": 1.00054026, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 1.7264363273566905, + "language_loss": 0.74079084, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.76263595, + "num_input_tokens_seen": 321347580, + "step": 14900, + "time_per_iteration": 2.744093179702759 + }, + { + "auxiliary_loss_clip": 0.01097721, + "auxiliary_loss_mlp": 0.01074625, + "balance_loss_clip": 1.00065756, + "balance_loss_mlp": 1.00023806, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7836326305136483, + "language_loss": 0.61817372, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63989711, + "num_input_tokens_seen": 321407820, + "step": 14901, + "time_per_iteration": 3.27081561088562 + }, + { + "auxiliary_loss_clip": 0.01147403, + "auxiliary_loss_mlp": 0.01100528, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.0003922, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 2.002282438475764, + "language_loss": 0.70576012, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72823942, + "num_input_tokens_seen": 321426745, + "step": 14902, + "time_per_iteration": 2.6624906063079834 + }, + { + "auxiliary_loss_clip": 0.0113296, + "auxiliary_loss_mlp": 0.01100891, + "balance_loss_clip": 1.00171351, + "balance_loss_mlp": 1.00056481, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.6771924550784345, + "language_loss": 0.78255248, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80489099, + "num_input_tokens_seen": 321446165, + "step": 14903, + "time_per_iteration": 2.6880829334259033 + }, + { + "auxiliary_loss_clip": 0.01133595, + "auxiliary_loss_mlp": 0.01100546, + "balance_loss_clip": 1.00187683, + "balance_loss_mlp": 1.00045836, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 2.139602766212296, + "language_loss": 0.73049384, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75283527, + "num_input_tokens_seen": 321465285, + "step": 14904, + "time_per_iteration": 2.6920523643493652 + }, + { + "auxiliary_loss_clip": 0.01147511, + "auxiliary_loss_mlp": 0.0110099, + "balance_loss_clip": 1.00181198, + "balance_loss_mlp": 1.0004251, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 2.256949944458602, + "language_loss": 0.7469089, + "learning_rate": 1.12035883275166e-07, + "loss": 0.76939392, + "num_input_tokens_seen": 321483670, + "step": 14905, + "time_per_iteration": 4.343943357467651 + }, + { + "auxiliary_loss_clip": 0.01149425, + "auxiliary_loss_mlp": 0.01098933, + "balance_loss_clip": 1.00175321, + "balance_loss_mlp": 1.00041795, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 2.985923607055751, + "language_loss": 0.76051009, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78299367, + "num_input_tokens_seen": 321501190, + "step": 14906, + "time_per_iteration": 2.6504616737365723 + }, + { + "auxiliary_loss_clip": 0.01147437, + "auxiliary_loss_mlp": 0.01100042, + "balance_loss_clip": 1.00177598, + "balance_loss_mlp": 1.00052559, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.7058099146902428, + "language_loss": 0.74184871, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76432347, + "num_input_tokens_seen": 321518540, + "step": 14907, + "time_per_iteration": 2.5848915576934814 + }, + { + "auxiliary_loss_clip": 0.01149477, + "auxiliary_loss_mlp": 0.01099982, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00051379, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 1.8223473181483094, + "language_loss": 0.8316263, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85412097, + "num_input_tokens_seen": 321536555, + "step": 14908, + "time_per_iteration": 4.298490524291992 + }, + { + "auxiliary_loss_clip": 0.01130884, + "auxiliary_loss_mlp": 0.01102008, + "balance_loss_clip": 1.00172317, + "balance_loss_mlp": 1.00053668, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 1.7213909880896887, + "language_loss": 0.70163292, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72396183, + "num_input_tokens_seen": 321557655, + "step": 14909, + "time_per_iteration": 2.725215435028076 + }, + { + "auxiliary_loss_clip": 0.01115131, + "auxiliary_loss_mlp": 0.01100918, + "balance_loss_clip": 1.00173211, + "balance_loss_mlp": 1.00059199, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 4.296764726909653, + "language_loss": 0.72070056, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74286103, + "num_input_tokens_seen": 321576160, + "step": 14910, + "time_per_iteration": 2.7046420574188232 + }, + { + "auxiliary_loss_clip": 0.01147438, + "auxiliary_loss_mlp": 0.01101149, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00048876, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 1.8456365940630501, + "language_loss": 0.63742381, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65990967, + "num_input_tokens_seen": 321596205, + "step": 14911, + "time_per_iteration": 2.6461801528930664 + }, + { + "auxiliary_loss_clip": 0.01133464, + "auxiliary_loss_mlp": 0.00747531, + "balance_loss_clip": 1.00204861, + "balance_loss_mlp": 1.00059962, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 2.499769045997651, + "language_loss": 0.75036013, + "learning_rate": 1.111379898520437e-07, + "loss": 0.76917005, + "num_input_tokens_seen": 321614800, + "step": 14912, + "time_per_iteration": 2.7333545684814453 + }, + { + "auxiliary_loss_clip": 0.01134002, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_clip": 1.00171041, + "balance_loss_mlp": 1.00050712, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 2.1928626440098102, + "language_loss": 0.81864023, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.84098524, + "num_input_tokens_seen": 321633445, + "step": 14913, + "time_per_iteration": 2.647826910018921 + }, + { + "auxiliary_loss_clip": 0.01147679, + "auxiliary_loss_mlp": 0.01101419, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00061584, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 2.312469580503481, + "language_loss": 0.60962832, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.6321193, + "num_input_tokens_seen": 321650890, + "step": 14914, + "time_per_iteration": 2.6162617206573486 + }, + { + "auxiliary_loss_clip": 0.0112764, + "auxiliary_loss_mlp": 0.01074373, + "balance_loss_clip": 1.00089335, + "balance_loss_mlp": 1.0003674, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.7136345742669341, + "language_loss": 0.55048466, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57250476, + "num_input_tokens_seen": 321710960, + "step": 14915, + "time_per_iteration": 3.297675848007202 + }, + { + "auxiliary_loss_clip": 0.01115732, + "auxiliary_loss_mlp": 0.01100025, + "balance_loss_clip": 1.00163162, + "balance_loss_mlp": 1.00046182, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.7139827795713638, + "language_loss": 0.71683908, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73899662, + "num_input_tokens_seen": 321733290, + "step": 14916, + "time_per_iteration": 2.8311917781829834 + }, + { + "auxiliary_loss_clip": 0.01130936, + "auxiliary_loss_mlp": 0.01100538, + "balance_loss_clip": 1.00175524, + "balance_loss_mlp": 1.00068796, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 2.0703351552749982, + "language_loss": 0.78032809, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.80264288, + "num_input_tokens_seen": 321753120, + "step": 14917, + "time_per_iteration": 2.6794111728668213 + }, + { + "auxiliary_loss_clip": 0.01147805, + "auxiliary_loss_mlp": 0.01101961, + "balance_loss_clip": 1.00194955, + "balance_loss_mlp": 1.00068069, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 2.096549751914277, + "language_loss": 0.68426561, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70676327, + "num_input_tokens_seen": 321772840, + "step": 14918, + "time_per_iteration": 2.774005889892578 + }, + { + "auxiliary_loss_clip": 0.01097697, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_clip": 1.0015161, + "balance_loss_mlp": 1.00058651, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.8843595661346144, + "language_loss": 0.83540845, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85738921, + "num_input_tokens_seen": 321791020, + "step": 14919, + "time_per_iteration": 2.72000789642334 + }, + { + "auxiliary_loss_clip": 0.01164222, + "auxiliary_loss_mlp": 0.00747517, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00047696, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 3.0296493250021537, + "language_loss": 0.72212052, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.74123794, + "num_input_tokens_seen": 321810075, + "step": 14920, + "time_per_iteration": 2.519760847091675 + }, + { + "auxiliary_loss_clip": 0.01149567, + "auxiliary_loss_mlp": 0.01101565, + "balance_loss_clip": 1.00198078, + "balance_loss_mlp": 1.00057137, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.435671181867371, + "language_loss": 0.91261899, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.93513036, + "num_input_tokens_seen": 321822635, + "step": 14921, + "time_per_iteration": 2.567286252975464 + }, + { + "auxiliary_loss_clip": 0.0108573, + "auxiliary_loss_mlp": 0.01100443, + "balance_loss_clip": 1.00158775, + "balance_loss_mlp": 1.00045002, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.671726102995391, + "language_loss": 0.73991656, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.76177835, + "num_input_tokens_seen": 321841130, + "step": 14922, + "time_per_iteration": 2.7559890747070312 + }, + { + "auxiliary_loss_clip": 0.01100389, + "auxiliary_loss_mlp": 0.01100025, + "balance_loss_clip": 1.00151825, + "balance_loss_mlp": 1.00050914, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 1.681001644717551, + "language_loss": 0.70159668, + "learning_rate": 1.097341060694219e-07, + "loss": 0.7236008, + "num_input_tokens_seen": 321859855, + "step": 14923, + "time_per_iteration": 4.132267951965332 + }, + { + "auxiliary_loss_clip": 0.01130537, + "auxiliary_loss_mlp": 0.01100698, + "balance_loss_clip": 1.00166607, + "balance_loss_mlp": 1.00046706, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.595729733489317, + "language_loss": 0.70528936, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72760171, + "num_input_tokens_seen": 321877990, + "step": 14924, + "time_per_iteration": 2.6122326850891113 + }, + { + "auxiliary_loss_clip": 0.01149507, + "auxiliary_loss_mlp": 0.01100212, + "balance_loss_clip": 1.0018307, + "balance_loss_mlp": 1.00055289, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.6137040814870571, + "language_loss": 0.7180112, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74050832, + "num_input_tokens_seen": 321898120, + "step": 14925, + "time_per_iteration": 3.95405912399292 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.00747432, + "balance_loss_clip": 1.00185847, + "balance_loss_mlp": 1.00054526, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 3.925777298087029, + "language_loss": 0.82552373, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84434575, + "num_input_tokens_seen": 321918140, + "step": 14926, + "time_per_iteration": 2.7200498580932617 + }, + { + "auxiliary_loss_clip": 0.01100745, + "auxiliary_loss_mlp": 0.01100282, + "balance_loss_clip": 1.00177562, + "balance_loss_mlp": 1.00047958, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.4144482377361371, + "language_loss": 0.79165459, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81366491, + "num_input_tokens_seen": 321938580, + "step": 14927, + "time_per_iteration": 2.7617223262786865 + }, + { + "auxiliary_loss_clip": 0.01132729, + "auxiliary_loss_mlp": 0.01100152, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00049305, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.5230949945392007, + "language_loss": 0.66609645, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68842518, + "num_input_tokens_seen": 321961135, + "step": 14928, + "time_per_iteration": 2.813159465789795 + }, + { + "auxiliary_loss_clip": 0.01130431, + "auxiliary_loss_mlp": 0.01100541, + "balance_loss_clip": 1.00183785, + "balance_loss_mlp": 1.0005486, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 1.909778249601659, + "language_loss": 0.70758426, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.72989392, + "num_input_tokens_seen": 321980945, + "step": 14929, + "time_per_iteration": 2.753265857696533 + }, + { + "auxiliary_loss_clip": 0.0113196, + "auxiliary_loss_mlp": 0.01100463, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.0004704, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.7492879103823047, + "language_loss": 0.67690521, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.69922942, + "num_input_tokens_seen": 322000350, + "step": 14930, + "time_per_iteration": 2.6993839740753174 + }, + { + "auxiliary_loss_clip": 0.01132979, + "auxiliary_loss_mlp": 0.01100662, + "balance_loss_clip": 1.00174499, + "balance_loss_mlp": 1.00057352, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 1.8892896175192486, + "language_loss": 0.75073421, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.77307063, + "num_input_tokens_seen": 322018980, + "step": 14931, + "time_per_iteration": 2.6977627277374268 + }, + { + "auxiliary_loss_clip": 0.01148722, + "auxiliary_loss_mlp": 0.010997, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00056541, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 1.653333774547444, + "language_loss": 0.633973, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65645719, + "num_input_tokens_seen": 322037675, + "step": 14932, + "time_per_iteration": 2.6080172061920166 + }, + { + "auxiliary_loss_clip": 0.0114736, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.00045681, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.6978487849918282, + "language_loss": 0.71688437, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73935914, + "num_input_tokens_seen": 322055130, + "step": 14933, + "time_per_iteration": 2.6183533668518066 + }, + { + "auxiliary_loss_clip": 0.01100976, + "auxiliary_loss_mlp": 0.01101312, + "balance_loss_clip": 1.00172496, + "balance_loss_mlp": 1.00065136, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.5189642689004117, + "language_loss": 0.74744427, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76946718, + "num_input_tokens_seen": 322074850, + "step": 14934, + "time_per_iteration": 2.723325252532959 + }, + { + "auxiliary_loss_clip": 0.01118213, + "auxiliary_loss_mlp": 0.01099909, + "balance_loss_clip": 1.00199366, + "balance_loss_mlp": 1.00053561, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.8650056081438349, + "language_loss": 0.60693944, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62912065, + "num_input_tokens_seen": 322093315, + "step": 14935, + "time_per_iteration": 2.7097039222717285 + }, + { + "auxiliary_loss_clip": 0.01113733, + "auxiliary_loss_mlp": 0.01099835, + "balance_loss_clip": 1.00159502, + "balance_loss_mlp": 1.00041461, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 4.135587322951995, + "language_loss": 0.76629025, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.78842592, + "num_input_tokens_seen": 322112555, + "step": 14936, + "time_per_iteration": 2.6867384910583496 + }, + { + "auxiliary_loss_clip": 0.01133369, + "auxiliary_loss_mlp": 0.01100141, + "balance_loss_clip": 1.00181091, + "balance_loss_mlp": 1.00048184, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.6703935667765704, + "language_loss": 0.73694289, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.75927794, + "num_input_tokens_seen": 322130440, + "step": 14937, + "time_per_iteration": 2.7254958152770996 + }, + { + "auxiliary_loss_clip": 0.01112503, + "auxiliary_loss_mlp": 0.01074539, + "balance_loss_clip": 1.00080085, + "balance_loss_mlp": 1.00053394, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.838094366059958, + "language_loss": 0.63499868, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65686905, + "num_input_tokens_seen": 322187295, + "step": 14938, + "time_per_iteration": 3.1515164375305176 + }, + { + "auxiliary_loss_clip": 0.01132716, + "auxiliary_loss_mlp": 0.01099742, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00051188, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 2.296208926265925, + "language_loss": 0.80492759, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82725215, + "num_input_tokens_seen": 322202965, + "step": 14939, + "time_per_iteration": 2.6877129077911377 + }, + { + "auxiliary_loss_clip": 0.01108256, + "auxiliary_loss_mlp": 0.01073986, + "balance_loss_clip": 1.00072479, + "balance_loss_mlp": 1.00036192, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.7292548982847438, + "language_loss": 0.52845049, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.55027288, + "num_input_tokens_seen": 322269490, + "step": 14940, + "time_per_iteration": 3.468411684036255 + }, + { + "auxiliary_loss_clip": 0.01164174, + "auxiliary_loss_mlp": 0.01100991, + "balance_loss_clip": 1.0018599, + "balance_loss_mlp": 1.0004735, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 2.353094059127908, + "language_loss": 0.778844, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.80149567, + "num_input_tokens_seen": 322288060, + "step": 14941, + "time_per_iteration": 3.2230119705200195 + }, + { + "auxiliary_loss_clip": 0.01149122, + "auxiliary_loss_mlp": 0.0110142, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00061655, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 3.9852074164678193, + "language_loss": 0.73178375, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75428915, + "num_input_tokens_seen": 322307930, + "step": 14942, + "time_per_iteration": 2.676854133605957 + }, + { + "auxiliary_loss_clip": 0.01134437, + "auxiliary_loss_mlp": 0.01101359, + "balance_loss_clip": 1.00173926, + "balance_loss_mlp": 1.00055599, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.560340010852149, + "language_loss": 0.79656816, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.8189261, + "num_input_tokens_seen": 322326155, + "step": 14943, + "time_per_iteration": 4.22105073928833 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01101101, + "balance_loss_clip": 1.00178933, + "balance_loss_mlp": 1.00039291, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.466648433770476, + "language_loss": 0.71198374, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.7343272, + "num_input_tokens_seen": 322345850, + "step": 14944, + "time_per_iteration": 2.666649580001831 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01101694, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00050914, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 2.4561696546764944, + "language_loss": 0.75681847, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.77904022, + "num_input_tokens_seen": 322364715, + "step": 14945, + "time_per_iteration": 2.6872105598449707 + }, + { + "auxiliary_loss_clip": 0.01164395, + "auxiliary_loss_mlp": 0.01102084, + "balance_loss_clip": 1.00187254, + "balance_loss_mlp": 1.00046968, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 2.426882547373968, + "language_loss": 0.73487175, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75753659, + "num_input_tokens_seen": 322383570, + "step": 14946, + "time_per_iteration": 4.516413688659668 + }, + { + "auxiliary_loss_clip": 0.01101704, + "auxiliary_loss_mlp": 0.01101198, + "balance_loss_clip": 1.00165057, + "balance_loss_mlp": 1.00053763, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 1.9625905531918992, + "language_loss": 0.64567304, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66770208, + "num_input_tokens_seen": 322401375, + "step": 14947, + "time_per_iteration": 2.706932544708252 + }, + { + "auxiliary_loss_clip": 0.01130651, + "auxiliary_loss_mlp": 0.01100114, + "balance_loss_clip": 1.00179541, + "balance_loss_mlp": 1.00055003, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 1.800387174510975, + "language_loss": 0.69554162, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.71784931, + "num_input_tokens_seen": 322421890, + "step": 14948, + "time_per_iteration": 2.683281660079956 + }, + { + "auxiliary_loss_clip": 0.01115947, + "auxiliary_loss_mlp": 0.01099392, + "balance_loss_clip": 1.00169635, + "balance_loss_mlp": 1.00044858, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 1.7380176930547282, + "language_loss": 0.7462157, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76836908, + "num_input_tokens_seen": 322445730, + "step": 14949, + "time_per_iteration": 2.8697335720062256 + }, + { + "auxiliary_loss_clip": 0.0109948, + "auxiliary_loss_mlp": 0.01101817, + "balance_loss_clip": 1.00163639, + "balance_loss_mlp": 1.00053644, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.7224176003280336, + "language_loss": 0.7569921, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.77900505, + "num_input_tokens_seen": 322464595, + "step": 14950, + "time_per_iteration": 2.8533310890197754 + }, + { + "auxiliary_loss_clip": 0.01134596, + "auxiliary_loss_mlp": 0.01100446, + "balance_loss_clip": 1.00198674, + "balance_loss_mlp": 1.00054812, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 2.878325314957327, + "language_loss": 0.66639191, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.68874228, + "num_input_tokens_seen": 322483305, + "step": 14951, + "time_per_iteration": 2.651538372039795 + }, + { + "auxiliary_loss_clip": 0.01164349, + "auxiliary_loss_mlp": 0.01101662, + "balance_loss_clip": 1.00182331, + "balance_loss_mlp": 1.00047708, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 1.8867180036851883, + "language_loss": 0.74213552, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76479566, + "num_input_tokens_seen": 322501905, + "step": 14952, + "time_per_iteration": 2.5977883338928223 + }, + { + "auxiliary_loss_clip": 0.01164173, + "auxiliary_loss_mlp": 0.01100667, + "balance_loss_clip": 1.00189912, + "balance_loss_mlp": 1.0005312, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.2204247594214284, + "language_loss": 0.56600565, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.58865404, + "num_input_tokens_seen": 322518135, + "step": 14953, + "time_per_iteration": 2.5304901599884033 + }, + { + "auxiliary_loss_clip": 0.01132374, + "auxiliary_loss_mlp": 0.0110104, + "balance_loss_clip": 1.00184941, + "balance_loss_mlp": 1.00042748, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 2.2758725111258857, + "language_loss": 0.81840909, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.84074318, + "num_input_tokens_seen": 322537905, + "step": 14954, + "time_per_iteration": 2.664727210998535 + }, + { + "auxiliary_loss_clip": 0.01164035, + "auxiliary_loss_mlp": 0.01099863, + "balance_loss_clip": 1.00195181, + "balance_loss_mlp": 1.00049019, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 3.06661098179819, + "language_loss": 0.60231435, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.62495327, + "num_input_tokens_seen": 322557945, + "step": 14955, + "time_per_iteration": 2.657243251800537 + }, + { + "auxiliary_loss_clip": 0.01148549, + "auxiliary_loss_mlp": 0.01100201, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00044703, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.0993106827173085, + "language_loss": 0.54975772, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.57224518, + "num_input_tokens_seen": 322575765, + "step": 14956, + "time_per_iteration": 2.609194040298462 + }, + { + "auxiliary_loss_clip": 0.01100967, + "auxiliary_loss_mlp": 0.01100854, + "balance_loss_clip": 1.0016638, + "balance_loss_mlp": 1.00043201, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 1.7320409087250446, + "language_loss": 0.79959583, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.82161403, + "num_input_tokens_seen": 322595665, + "step": 14957, + "time_per_iteration": 2.8206560611724854 + }, + { + "auxiliary_loss_clip": 0.0116436, + "auxiliary_loss_mlp": 0.01101023, + "balance_loss_clip": 1.00192297, + "balance_loss_mlp": 1.00050616, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 1.6383065170796405, + "language_loss": 0.78521752, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80787134, + "num_input_tokens_seen": 322614755, + "step": 14958, + "time_per_iteration": 2.5920395851135254 + }, + { + "auxiliary_loss_clip": 0.01070401, + "auxiliary_loss_mlp": 0.01100118, + "balance_loss_clip": 1.00173068, + "balance_loss_mlp": 1.00050724, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.4142763807241314, + "language_loss": 0.74844432, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.77014947, + "num_input_tokens_seen": 322633425, + "step": 14959, + "time_per_iteration": 3.0205423831939697 + }, + { + "auxiliary_loss_clip": 0.01164046, + "auxiliary_loss_mlp": 0.01099703, + "balance_loss_clip": 1.00178754, + "balance_loss_mlp": 1.00042546, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 1.9805479207484977, + "language_loss": 0.68427259, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.70691007, + "num_input_tokens_seen": 322652065, + "step": 14960, + "time_per_iteration": 3.2407515048980713 + }, + { + "auxiliary_loss_clip": 0.01130136, + "auxiliary_loss_mlp": 0.01099803, + "balance_loss_clip": 1.00187409, + "balance_loss_mlp": 1.00047815, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.6380667932136646, + "language_loss": 0.65950137, + "learning_rate": 1.049510991294591e-07, + "loss": 0.68180072, + "num_input_tokens_seen": 322673275, + "step": 14961, + "time_per_iteration": 4.104921817779541 + }, + { + "auxiliary_loss_clip": 0.01132898, + "auxiliary_loss_mlp": 0.01098931, + "balance_loss_clip": 1.00172281, + "balance_loss_mlp": 1.00046372, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.4892994533092978, + "language_loss": 0.83088529, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85320354, + "num_input_tokens_seen": 322693375, + "step": 14962, + "time_per_iteration": 4.3085408210754395 + }, + { + "auxiliary_loss_clip": 0.01130888, + "auxiliary_loss_mlp": 0.01101464, + "balance_loss_clip": 1.00186551, + "balance_loss_mlp": 1.00042224, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 1.9061233907796649, + "language_loss": 0.76409161, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78641516, + "num_input_tokens_seen": 322712615, + "step": 14963, + "time_per_iteration": 2.746948480606079 + }, + { + "auxiliary_loss_clip": 0.01081479, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_clip": 1.00085938, + "balance_loss_mlp": 1.00032258, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.7717746738387086, + "language_loss": 0.57490456, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59646261, + "num_input_tokens_seen": 322766855, + "step": 14964, + "time_per_iteration": 3.2766215801239014 + }, + { + "auxiliary_loss_clip": 0.01147733, + "auxiliary_loss_mlp": 0.01102745, + "balance_loss_clip": 1.00194156, + "balance_loss_mlp": 1.00051117, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 3.477486158313771, + "language_loss": 0.67602909, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69853389, + "num_input_tokens_seen": 322781130, + "step": 14965, + "time_per_iteration": 2.6179683208465576 + }, + { + "auxiliary_loss_clip": 0.01164134, + "auxiliary_loss_mlp": 0.01100761, + "balance_loss_clip": 1.00185561, + "balance_loss_mlp": 1.00043476, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 2.435221176859381, + "language_loss": 0.71952909, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.74217808, + "num_input_tokens_seen": 322800310, + "step": 14966, + "time_per_iteration": 2.5335097312927246 + }, + { + "auxiliary_loss_clip": 0.0111774, + "auxiliary_loss_mlp": 0.01101422, + "balance_loss_clip": 1.00180364, + "balance_loss_mlp": 1.00052297, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 2.6343093519381338, + "language_loss": 0.7345677, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75675929, + "num_input_tokens_seen": 322820955, + "step": 14967, + "time_per_iteration": 2.7655253410339355 + }, + { + "auxiliary_loss_clip": 0.01101771, + "auxiliary_loss_mlp": 0.00747333, + "balance_loss_clip": 1.00154507, + "balance_loss_mlp": 1.00044811, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 2.8691947967916813, + "language_loss": 0.72241271, + "learning_rate": 1.040813291960323e-07, + "loss": 0.74090374, + "num_input_tokens_seen": 322838780, + "step": 14968, + "time_per_iteration": 2.700266122817993 + }, + { + "auxiliary_loss_clip": 0.01147829, + "auxiliary_loss_mlp": 0.01101269, + "balance_loss_clip": 1.00179708, + "balance_loss_mlp": 1.000561, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 1.9631280301511693, + "language_loss": 0.71362072, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.7361117, + "num_input_tokens_seen": 322856710, + "step": 14969, + "time_per_iteration": 2.6345314979553223 + }, + { + "auxiliary_loss_clip": 0.01164263, + "auxiliary_loss_mlp": 0.01100654, + "balance_loss_clip": 1.00194287, + "balance_loss_mlp": 1.00042272, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 2.0751442192763294, + "language_loss": 0.75846612, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78111529, + "num_input_tokens_seen": 322876070, + "step": 14970, + "time_per_iteration": 2.6059625148773193 + }, + { + "auxiliary_loss_clip": 0.01149689, + "auxiliary_loss_mlp": 0.01100572, + "balance_loss_clip": 1.00191426, + "balance_loss_mlp": 1.00048375, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.7282662289690385, + "language_loss": 0.73515451, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75765717, + "num_input_tokens_seen": 322895095, + "step": 14971, + "time_per_iteration": 2.639848470687866 + }, + { + "auxiliary_loss_clip": 0.01114444, + "auxiliary_loss_mlp": 0.01099694, + "balance_loss_clip": 1.00168967, + "balance_loss_mlp": 1.00036848, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 2.0834632137652958, + "language_loss": 0.81768197, + "learning_rate": 1.035858993572476e-07, + "loss": 0.83982337, + "num_input_tokens_seen": 322911845, + "step": 14972, + "time_per_iteration": 2.7406272888183594 + }, + { + "auxiliary_loss_clip": 0.01133799, + "auxiliary_loss_mlp": 0.01101596, + "balance_loss_clip": 1.00196218, + "balance_loss_mlp": 1.00055373, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 2.587120706004952, + "language_loss": 0.8176142, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83996809, + "num_input_tokens_seen": 322928170, + "step": 14973, + "time_per_iteration": 2.7175261974334717 + }, + { + "auxiliary_loss_clip": 0.01164153, + "auxiliary_loss_mlp": 0.01100465, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.00066316, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 1.7816875661043365, + "language_loss": 0.58416218, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60680842, + "num_input_tokens_seen": 322948165, + "step": 14974, + "time_per_iteration": 2.6696250438690186 + }, + { + "auxiliary_loss_clip": 0.01164326, + "auxiliary_loss_mlp": 0.01101383, + "balance_loss_clip": 1.00203609, + "balance_loss_mlp": 1.00057995, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.7871928609861767, + "language_loss": 0.634781, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.6574381, + "num_input_tokens_seen": 322968880, + "step": 14975, + "time_per_iteration": 2.5963587760925293 + }, + { + "auxiliary_loss_clip": 0.01147684, + "auxiliary_loss_mlp": 0.01101346, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.00054264, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 2.459200489633587, + "language_loss": 0.73393083, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75642115, + "num_input_tokens_seen": 322989395, + "step": 14976, + "time_per_iteration": 2.6554927825927734 + }, + { + "auxiliary_loss_clip": 0.01148225, + "auxiliary_loss_mlp": 0.01101095, + "balance_loss_clip": 1.00203753, + "balance_loss_mlp": 1.00057769, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.834945072828489, + "language_loss": 0.6975702, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.72006339, + "num_input_tokens_seen": 323009060, + "step": 14977, + "time_per_iteration": 2.668431520462036 + }, + { + "auxiliary_loss_clip": 0.01133167, + "auxiliary_loss_mlp": 0.00747429, + "balance_loss_clip": 1.00177336, + "balance_loss_mlp": 1.0004344, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 6.067149332231083, + "language_loss": 0.65088552, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.66969144, + "num_input_tokens_seen": 323027530, + "step": 14978, + "time_per_iteration": 2.704169273376465 + }, + { + "auxiliary_loss_clip": 0.01118424, + "auxiliary_loss_mlp": 0.01101726, + "balance_loss_clip": 1.00184739, + "balance_loss_mlp": 1.00054073, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.845027110373284, + "language_loss": 0.7873193, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.80952084, + "num_input_tokens_seen": 323045370, + "step": 14979, + "time_per_iteration": 2.66243839263916 + }, + { + "auxiliary_loss_clip": 0.01126796, + "auxiliary_loss_mlp": 0.01074193, + "balance_loss_clip": 1.0007019, + "balance_loss_mlp": 1.00018787, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7210523962400248, + "language_loss": 0.53569782, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55770779, + "num_input_tokens_seen": 323105660, + "step": 14980, + "time_per_iteration": 4.588686466217041 + }, + { + "auxiliary_loss_clip": 0.01147636, + "auxiliary_loss_mlp": 0.01101468, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.0005213, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 1.8118001934707706, + "language_loss": 0.822694, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84518504, + "num_input_tokens_seen": 323126365, + "step": 14981, + "time_per_iteration": 2.7946417331695557 + }, + { + "auxiliary_loss_clip": 0.01099915, + "auxiliary_loss_mlp": 0.01099692, + "balance_loss_clip": 1.00167716, + "balance_loss_mlp": 1.00055766, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 1.4544018552478033, + "language_loss": 0.81479001, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83678603, + "num_input_tokens_seen": 323145655, + "step": 14982, + "time_per_iteration": 2.76224684715271 + }, + { + "auxiliary_loss_clip": 0.01134494, + "auxiliary_loss_mlp": 0.01101047, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00067306, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 1.707085846092877, + "language_loss": 0.71293974, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73529512, + "num_input_tokens_seen": 323164540, + "step": 14983, + "time_per_iteration": 4.109454154968262 + }, + { + "auxiliary_loss_clip": 0.01147547, + "auxiliary_loss_mlp": 0.01099736, + "balance_loss_clip": 1.00193441, + "balance_loss_mlp": 1.00045824, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.691438063921413, + "language_loss": 0.74909669, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77156955, + "num_input_tokens_seen": 323186960, + "step": 14984, + "time_per_iteration": 2.6535189151763916 + }, + { + "auxiliary_loss_clip": 0.01164027, + "auxiliary_loss_mlp": 0.01100111, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00059509, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.9151977832780187, + "language_loss": 0.70462084, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72726226, + "num_input_tokens_seen": 323206135, + "step": 14985, + "time_per_iteration": 2.547639846801758 + }, + { + "auxiliary_loss_clip": 0.01130727, + "auxiliary_loss_mlp": 0.01100852, + "balance_loss_clip": 1.00168967, + "balance_loss_mlp": 1.00047779, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 2.2121804906392453, + "language_loss": 0.70376158, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.72607738, + "num_input_tokens_seen": 323225980, + "step": 14986, + "time_per_iteration": 2.6491341590881348 + }, + { + "auxiliary_loss_clip": 0.011496, + "auxiliary_loss_mlp": 0.01100123, + "balance_loss_clip": 1.00175953, + "balance_loss_mlp": 1.00055981, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 2.2472036865438696, + "language_loss": 0.76683354, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.78933078, + "num_input_tokens_seen": 323243700, + "step": 14987, + "time_per_iteration": 2.5624547004699707 + }, + { + "auxiliary_loss_clip": 0.01147754, + "auxiliary_loss_mlp": 0.01099663, + "balance_loss_clip": 1.00180209, + "balance_loss_mlp": 1.0005759, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 2.2001611295293135, + "language_loss": 0.73734051, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.75981474, + "num_input_tokens_seen": 323261535, + "step": 14988, + "time_per_iteration": 2.6515650749206543 + }, + { + "auxiliary_loss_clip": 0.01131423, + "auxiliary_loss_mlp": 0.01101275, + "balance_loss_clip": 1.00185513, + "balance_loss_mlp": 1.00042391, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 6.383149489817297, + "language_loss": 0.6910677, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71339464, + "num_input_tokens_seen": 323281855, + "step": 14989, + "time_per_iteration": 2.6817893981933594 + }, + { + "auxiliary_loss_clip": 0.01164252, + "auxiliary_loss_mlp": 0.01100588, + "balance_loss_clip": 1.0019232, + "balance_loss_mlp": 1.00054789, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 1.7743118471512558, + "language_loss": 0.79716653, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.81981492, + "num_input_tokens_seen": 323299505, + "step": 14990, + "time_per_iteration": 2.5902578830718994 + }, + { + "auxiliary_loss_clip": 0.0110244, + "auxiliary_loss_mlp": 0.01101313, + "balance_loss_clip": 1.00165176, + "balance_loss_mlp": 1.00050938, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 1.843661751418772, + "language_loss": 0.78025985, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.80229735, + "num_input_tokens_seen": 323318365, + "step": 14991, + "time_per_iteration": 2.7298130989074707 + }, + { + "auxiliary_loss_clip": 0.01110023, + "auxiliary_loss_mlp": 0.00745599, + "balance_loss_clip": 1.00059414, + "balance_loss_mlp": 1.00039947, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.7774069490783703, + "language_loss": 0.60201919, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62057543, + "num_input_tokens_seen": 323371835, + "step": 14992, + "time_per_iteration": 3.1490488052368164 + }, + { + "auxiliary_loss_clip": 0.01147396, + "auxiliary_loss_mlp": 0.01099946, + "balance_loss_clip": 1.00180209, + "balance_loss_mlp": 1.00057304, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 2.0978581217424215, + "language_loss": 0.8286339, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.85110736, + "num_input_tokens_seen": 323388495, + "step": 14993, + "time_per_iteration": 2.6721913814544678 + }, + { + "auxiliary_loss_clip": 0.01164286, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00055647, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 2.1843443847455437, + "language_loss": 0.73347247, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75612372, + "num_input_tokens_seen": 323405280, + "step": 14994, + "time_per_iteration": 2.5659966468811035 + }, + { + "auxiliary_loss_clip": 0.01146837, + "auxiliary_loss_mlp": 0.01099805, + "balance_loss_clip": 1.00180554, + "balance_loss_mlp": 1.00043249, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 2.2622813589594677, + "language_loss": 0.64527476, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66774118, + "num_input_tokens_seen": 323425310, + "step": 14995, + "time_per_iteration": 2.701042890548706 + }, + { + "auxiliary_loss_clip": 0.01120017, + "auxiliary_loss_mlp": 0.0110029, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00063133, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 2.2096332060599058, + "language_loss": 0.66702318, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68922621, + "num_input_tokens_seen": 323447805, + "step": 14996, + "time_per_iteration": 2.7581210136413574 + }, + { + "auxiliary_loss_clip": 0.01149468, + "auxiliary_loss_mlp": 0.01099461, + "balance_loss_clip": 1.00176311, + "balance_loss_mlp": 1.00037432, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 2.2646610949647905, + "language_loss": 0.65975833, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.68224764, + "num_input_tokens_seen": 323467150, + "step": 14997, + "time_per_iteration": 2.6698696613311768 + }, + { + "auxiliary_loss_clip": 0.01134576, + "auxiliary_loss_mlp": 0.01100302, + "balance_loss_clip": 1.00186872, + "balance_loss_mlp": 1.00059557, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 1.9609370690152876, + "language_loss": 0.77239966, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79474843, + "num_input_tokens_seen": 323484250, + "step": 14998, + "time_per_iteration": 4.096213102340698 + }, + { + "auxiliary_loss_clip": 0.01164189, + "auxiliary_loss_mlp": 0.01099885, + "balance_loss_clip": 1.00181556, + "balance_loss_mlp": 1.00046468, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.7504394506119458, + "language_loss": 0.75465763, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77729845, + "num_input_tokens_seen": 323502910, + "step": 14999, + "time_per_iteration": 2.573694944381714 + }, + { + "auxiliary_loss_clip": 0.01085054, + "auxiliary_loss_mlp": 0.0110012, + "balance_loss_clip": 1.00175488, + "balance_loss_mlp": 1.00041306, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.666892677339746, + "language_loss": 0.75757587, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.77942765, + "num_input_tokens_seen": 323521820, + "step": 15000, + "time_per_iteration": 4.032242059707642 + }, + { + "auxiliary_loss_clip": 0.01147718, + "auxiliary_loss_mlp": 0.0109999, + "balance_loss_clip": 1.00180471, + "balance_loss_mlp": 1.00037897, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.4049252595174115, + "language_loss": 0.80945152, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.83192861, + "num_input_tokens_seen": 323543200, + "step": 15001, + "time_per_iteration": 2.8821098804473877 + }, + { + "auxiliary_loss_clip": 0.01115379, + "auxiliary_loss_mlp": 0.01099444, + "balance_loss_clip": 1.00176692, + "balance_loss_mlp": 1.00050032, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.4855956880860317, + "language_loss": 0.7844156, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80656385, + "num_input_tokens_seen": 323563075, + "step": 15002, + "time_per_iteration": 2.8292596340179443 + }, + { + "auxiliary_loss_clip": 0.01132598, + "auxiliary_loss_mlp": 0.01101225, + "balance_loss_clip": 1.00186217, + "balance_loss_mlp": 1.00061202, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 2.766684207856287, + "language_loss": 0.6782499, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70058817, + "num_input_tokens_seen": 323579065, + "step": 15003, + "time_per_iteration": 2.6964492797851562 + }, + { + "auxiliary_loss_clip": 0.01132854, + "auxiliary_loss_mlp": 0.01101255, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00059485, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 1.6570307668617323, + "language_loss": 0.85837376, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88071483, + "num_input_tokens_seen": 323594835, + "step": 15004, + "time_per_iteration": 2.614434003829956 + }, + { + "auxiliary_loss_clip": 0.0111799, + "auxiliary_loss_mlp": 0.01100906, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00048375, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 2.1766561551752255, + "language_loss": 0.7271198, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74930871, + "num_input_tokens_seen": 323611475, + "step": 15005, + "time_per_iteration": 2.7968811988830566 + }, + { + "auxiliary_loss_clip": 0.01149094, + "auxiliary_loss_mlp": 0.01101401, + "balance_loss_clip": 1.00175977, + "balance_loss_mlp": 1.00059724, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 1.9660736163572323, + "language_loss": 0.70902401, + "learning_rate": 9.942123117037748e-08, + "loss": 0.731529, + "num_input_tokens_seen": 323629730, + "step": 15006, + "time_per_iteration": 2.6189959049224854 + }, + { + "auxiliary_loss_clip": 0.01132893, + "auxiliary_loss_mlp": 0.01100913, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00039542, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 34.482285798086586, + "language_loss": 0.84635425, + "learning_rate": 9.930000126732618e-08, + "loss": 0.86869228, + "num_input_tokens_seen": 323646000, + "step": 15007, + "time_per_iteration": 2.61738920211792 + }, + { + "auxiliary_loss_clip": 0.01134484, + "auxiliary_loss_mlp": 0.01100263, + "balance_loss_clip": 1.00185776, + "balance_loss_mlp": 1.00046086, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.6596050828301114, + "language_loss": 0.78392851, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80627596, + "num_input_tokens_seen": 323667250, + "step": 15008, + "time_per_iteration": 2.676471471786499 + }, + { + "auxiliary_loss_clip": 0.01116172, + "auxiliary_loss_mlp": 0.01099546, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00055444, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.6082122997120338, + "language_loss": 0.73558009, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75773734, + "num_input_tokens_seen": 323687150, + "step": 15009, + "time_per_iteration": 2.700418472290039 + }, + { + "auxiliary_loss_clip": 0.01164095, + "auxiliary_loss_mlp": 0.01100799, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00047255, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.743550538774897, + "language_loss": 0.73367631, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75632524, + "num_input_tokens_seen": 323703660, + "step": 15010, + "time_per_iteration": 2.5429956912994385 + }, + { + "auxiliary_loss_clip": 0.01131088, + "auxiliary_loss_mlp": 0.01101203, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00049496, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 1.828783795604297, + "language_loss": 0.74148327, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76380622, + "num_input_tokens_seen": 323722060, + "step": 15011, + "time_per_iteration": 2.6591968536376953 + }, + { + "auxiliary_loss_clip": 0.01149637, + "auxiliary_loss_mlp": 0.01101715, + "balance_loss_clip": 1.0017817, + "balance_loss_mlp": 1.00043488, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 1.9852497523187762, + "language_loss": 0.73449612, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75700963, + "num_input_tokens_seen": 323740645, + "step": 15012, + "time_per_iteration": 2.6010732650756836 + }, + { + "auxiliary_loss_clip": 0.01115526, + "auxiliary_loss_mlp": 0.01099754, + "balance_loss_clip": 1.00173497, + "balance_loss_mlp": 1.0007149, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.6901203610031739, + "language_loss": 0.69124305, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71339589, + "num_input_tokens_seen": 323758905, + "step": 15013, + "time_per_iteration": 2.7620508670806885 + }, + { + "auxiliary_loss_clip": 0.01147288, + "auxiliary_loss_mlp": 0.01099463, + "balance_loss_clip": 1.00179148, + "balance_loss_mlp": 1.00056672, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.4175325476226746, + "language_loss": 0.7315954, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75406289, + "num_input_tokens_seen": 323780595, + "step": 15014, + "time_per_iteration": 2.6916022300720215 + }, + { + "auxiliary_loss_clip": 0.01163994, + "auxiliary_loss_mlp": 0.01100689, + "balance_loss_clip": 1.00175762, + "balance_loss_mlp": 1.00050604, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 2.178964760807997, + "language_loss": 0.72172618, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74437302, + "num_input_tokens_seen": 323798160, + "step": 15015, + "time_per_iteration": 2.662001371383667 + }, + { + "auxiliary_loss_clip": 0.0113291, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_clip": 1.00178766, + "balance_loss_mlp": 1.00039268, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 3.1453304667447055, + "language_loss": 0.68796724, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71030068, + "num_input_tokens_seen": 323816810, + "step": 15016, + "time_per_iteration": 2.6587841510772705 + }, + { + "auxiliary_loss_clip": 0.01164208, + "auxiliary_loss_mlp": 0.01098904, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.00058007, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 1.805347600356898, + "language_loss": 0.70492494, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72755611, + "num_input_tokens_seen": 323836900, + "step": 15017, + "time_per_iteration": 2.6381781101226807 + }, + { + "auxiliary_loss_clip": 0.01130531, + "auxiliary_loss_mlp": 0.01100842, + "balance_loss_clip": 1.00202751, + "balance_loss_mlp": 1.00065827, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 2.0131627129220795, + "language_loss": 0.69582891, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71814263, + "num_input_tokens_seen": 323855325, + "step": 15018, + "time_per_iteration": 4.049628019332886 + }, + { + "auxiliary_loss_clip": 0.01146929, + "auxiliary_loss_mlp": 0.01100382, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.00053239, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 1.8856660453644516, + "language_loss": 0.69378525, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71625835, + "num_input_tokens_seen": 323875650, + "step": 15019, + "time_per_iteration": 2.7031636238098145 + }, + { + "auxiliary_loss_clip": 0.01164034, + "auxiliary_loss_mlp": 0.01099012, + "balance_loss_clip": 1.00181067, + "balance_loss_mlp": 1.0005933, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 1.9192059057007673, + "language_loss": 0.72020447, + "learning_rate": 9.773057299808951e-08, + "loss": 0.74283493, + "num_input_tokens_seen": 323892920, + "step": 15020, + "time_per_iteration": 4.075527906417847 + }, + { + "auxiliary_loss_clip": 0.01149506, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_clip": 1.00191522, + "balance_loss_mlp": 1.00055599, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.5214038939705234, + "language_loss": 0.74169075, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76419413, + "num_input_tokens_seen": 323913835, + "step": 15021, + "time_per_iteration": 2.670092821121216 + }, + { + "auxiliary_loss_clip": 0.01164229, + "auxiliary_loss_mlp": 0.01101309, + "balance_loss_clip": 1.00188744, + "balance_loss_mlp": 1.00040984, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 2.1574163842096907, + "language_loss": 0.7258935, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74854887, + "num_input_tokens_seen": 323933440, + "step": 15022, + "time_per_iteration": 2.5773916244506836 + }, + { + "auxiliary_loss_clip": 0.01117103, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_clip": 1.00175357, + "balance_loss_mlp": 1.00043416, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 1.9160859616016375, + "language_loss": 0.73033088, + "learning_rate": 9.737012810001943e-08, + "loss": 0.75248569, + "num_input_tokens_seen": 323954090, + "step": 15023, + "time_per_iteration": 2.739647150039673 + }, + { + "auxiliary_loss_clip": 0.0114755, + "auxiliary_loss_mlp": 0.0110052, + "balance_loss_clip": 1.00184226, + "balance_loss_mlp": 1.00047994, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 1.845514469691361, + "language_loss": 0.82367808, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84615874, + "num_input_tokens_seen": 323974040, + "step": 15024, + "time_per_iteration": 2.6314010620117188 + }, + { + "auxiliary_loss_clip": 0.01147548, + "auxiliary_loss_mlp": 0.01100569, + "balance_loss_clip": 1.00184321, + "balance_loss_mlp": 1.00038552, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.5302519940320256, + "language_loss": 0.69497633, + "learning_rate": 9.713019223328966e-08, + "loss": 0.71745753, + "num_input_tokens_seen": 323996125, + "step": 15025, + "time_per_iteration": 2.685870409011841 + }, + { + "auxiliary_loss_clip": 0.01114213, + "auxiliary_loss_mlp": 0.01099965, + "balance_loss_clip": 1.00160742, + "balance_loss_mlp": 1.0006398, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 2.094662745118937, + "language_loss": 0.76564217, + "learning_rate": 9.70103325331717e-08, + "loss": 0.78778398, + "num_input_tokens_seen": 324017645, + "step": 15026, + "time_per_iteration": 2.7605535984039307 + }, + { + "auxiliary_loss_clip": 0.01147513, + "auxiliary_loss_mlp": 0.01100178, + "balance_loss_clip": 1.00195861, + "balance_loss_mlp": 1.00051928, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 1.8894272497148699, + "language_loss": 0.68511891, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70759583, + "num_input_tokens_seen": 324036875, + "step": 15027, + "time_per_iteration": 2.6253654956817627 + }, + { + "auxiliary_loss_clip": 0.01102455, + "auxiliary_loss_mlp": 0.01099, + "balance_loss_clip": 1.00176024, + "balance_loss_mlp": 1.00058079, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 1.6049677088949246, + "language_loss": 0.75552595, + "learning_rate": 9.677082962215477e-08, + "loss": 0.77754045, + "num_input_tokens_seen": 324057045, + "step": 15028, + "time_per_iteration": 2.721283197402954 + }, + { + "auxiliary_loss_clip": 0.01100398, + "auxiliary_loss_mlp": 0.01100206, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.0005945, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 2.034125759106996, + "language_loss": 0.69343942, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71544552, + "num_input_tokens_seen": 324079735, + "step": 15029, + "time_per_iteration": 2.811875104904175 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.01101485, + "balance_loss_clip": 1.00179744, + "balance_loss_mlp": 1.00049055, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 2.3007471053878636, + "language_loss": 0.73637915, + "learning_rate": 9.653161539369858e-08, + "loss": 0.75887227, + "num_input_tokens_seen": 324097785, + "step": 15030, + "time_per_iteration": 2.631527900695801 + }, + { + "auxiliary_loss_clip": 0.011488, + "auxiliary_loss_mlp": 0.01099835, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.0005095, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 1.904546359406543, + "language_loss": 0.6823222, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70480853, + "num_input_tokens_seen": 324121625, + "step": 15031, + "time_per_iteration": 2.8457372188568115 + }, + { + "auxiliary_loss_clip": 0.01131028, + "auxiliary_loss_mlp": 0.01099265, + "balance_loss_clip": 1.00168753, + "balance_loss_mlp": 1.00036871, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.48262796006458, + "language_loss": 0.76601911, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78832203, + "num_input_tokens_seen": 324142535, + "step": 15032, + "time_per_iteration": 2.7459471225738525 + }, + { + "auxiliary_loss_clip": 0.011643, + "auxiliary_loss_mlp": 0.01100895, + "balance_loss_clip": 1.00190699, + "balance_loss_mlp": 1.00052035, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 2.8372133982005527, + "language_loss": 0.75399363, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77664554, + "num_input_tokens_seen": 324159610, + "step": 15033, + "time_per_iteration": 2.6012449264526367 + }, + { + "auxiliary_loss_clip": 0.01119658, + "auxiliary_loss_mlp": 0.01099989, + "balance_loss_clip": 1.00187564, + "balance_loss_mlp": 1.00066376, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.6169612328859115, + "language_loss": 0.73799419, + "learning_rate": 9.605405312956105e-08, + "loss": 0.76019067, + "num_input_tokens_seen": 324182510, + "step": 15034, + "time_per_iteration": 2.7962796688079834 + }, + { + "auxiliary_loss_clip": 0.01118024, + "auxiliary_loss_mlp": 0.01100476, + "balance_loss_clip": 1.00183964, + "balance_loss_mlp": 1.00048304, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 1.5834818680701146, + "language_loss": 0.63500434, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65718937, + "num_input_tokens_seen": 324200555, + "step": 15035, + "time_per_iteration": 4.134243011474609 + }, + { + "auxiliary_loss_clip": 0.01164264, + "auxiliary_loss_mlp": 0.01101028, + "balance_loss_clip": 1.00199461, + "balance_loss_mlp": 1.00060594, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 2.3027590951615884, + "language_loss": 0.62413341, + "learning_rate": 9.581570516631643e-08, + "loss": 0.64678633, + "num_input_tokens_seen": 324220255, + "step": 15036, + "time_per_iteration": 2.637266159057617 + }, + { + "auxiliary_loss_clip": 0.01098443, + "auxiliary_loss_mlp": 0.01098846, + "balance_loss_clip": 1.00182033, + "balance_loss_mlp": 1.00042677, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 2.0278960156204056, + "language_loss": 0.82119113, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84316403, + "num_input_tokens_seen": 324237855, + "step": 15037, + "time_per_iteration": 2.828427791595459 + }, + { + "auxiliary_loss_clip": 0.01164204, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_clip": 1.00184071, + "balance_loss_mlp": 1.00050402, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 2.195291534039806, + "language_loss": 0.67455351, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69720769, + "num_input_tokens_seen": 324257050, + "step": 15038, + "time_per_iteration": 3.9914681911468506 + }, + { + "auxiliary_loss_clip": 0.01132064, + "auxiliary_loss_mlp": 0.01099965, + "balance_loss_clip": 1.00175762, + "balance_loss_mlp": 1.00054479, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 2.5258766006492444, + "language_loss": 0.74989212, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77221245, + "num_input_tokens_seen": 324275510, + "step": 15039, + "time_per_iteration": 2.6527395248413086 + }, + { + "auxiliary_loss_clip": 0.01130939, + "auxiliary_loss_mlp": 0.01100762, + "balance_loss_clip": 1.00181603, + "balance_loss_mlp": 1.0005312, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 1.5685931177957557, + "language_loss": 0.7027787, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72509569, + "num_input_tokens_seen": 324295150, + "step": 15040, + "time_per_iteration": 2.655698537826538 + }, + { + "auxiliary_loss_clip": 0.01117305, + "auxiliary_loss_mlp": 0.01100407, + "balance_loss_clip": 1.00177121, + "balance_loss_mlp": 1.00036669, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.7433640160767983, + "language_loss": 0.67605209, + "learning_rate": 9.522109895720709e-08, + "loss": 0.69822919, + "num_input_tokens_seen": 324313855, + "step": 15041, + "time_per_iteration": 2.689405679702759 + }, + { + "auxiliary_loss_clip": 0.01147402, + "auxiliary_loss_mlp": 0.01100096, + "balance_loss_clip": 1.00171816, + "balance_loss_mlp": 1.00043726, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 2.390289829500412, + "language_loss": 0.57322502, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59569997, + "num_input_tokens_seen": 324338465, + "step": 15042, + "time_per_iteration": 2.709688663482666 + }, + { + "auxiliary_loss_clip": 0.01126607, + "auxiliary_loss_mlp": 0.00745366, + "balance_loss_clip": 1.00070095, + "balance_loss_mlp": 1.00008631, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7771240111687109, + "language_loss": 0.56949973, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58821946, + "num_input_tokens_seen": 324398740, + "step": 15043, + "time_per_iteration": 3.147469997406006 + }, + { + "auxiliary_loss_clip": 0.01130882, + "auxiliary_loss_mlp": 0.01100865, + "balance_loss_clip": 1.00170898, + "balance_loss_mlp": 1.00044346, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 2.1357465430593754, + "language_loss": 0.70010197, + "learning_rate": 9.486520194855274e-08, + "loss": 0.72241944, + "num_input_tokens_seen": 324417335, + "step": 15044, + "time_per_iteration": 2.625084161758423 + }, + { + "auxiliary_loss_clip": 0.01132025, + "auxiliary_loss_mlp": 0.01100796, + "balance_loss_clip": 1.00175071, + "balance_loss_mlp": 1.00061262, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 2.513690174243579, + "language_loss": 0.69487822, + "learning_rate": 9.474671409214407e-08, + "loss": 0.71720642, + "num_input_tokens_seen": 324433240, + "step": 15045, + "time_per_iteration": 2.5889084339141846 + }, + { + "auxiliary_loss_clip": 0.01115986, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_clip": 1.00180435, + "balance_loss_mlp": 1.0005964, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 2.2134068011606245, + "language_loss": 0.66090953, + "learning_rate": 9.462829848313081e-08, + "loss": 0.68308383, + "num_input_tokens_seen": 324452675, + "step": 15046, + "time_per_iteration": 2.706071138381958 + }, + { + "auxiliary_loss_clip": 0.0111421, + "auxiliary_loss_mlp": 0.01101109, + "balance_loss_clip": 1.00165057, + "balance_loss_mlp": 1.00078285, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 2.1828883895998104, + "language_loss": 0.62001556, + "learning_rate": 9.450995512600379e-08, + "loss": 0.64216876, + "num_input_tokens_seen": 324467865, + "step": 15047, + "time_per_iteration": 2.677293539047241 + }, + { + "auxiliary_loss_clip": 0.01164243, + "auxiliary_loss_mlp": 0.00747374, + "balance_loss_clip": 1.0019989, + "balance_loss_mlp": 1.0005101, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.768193232154135, + "language_loss": 0.71307254, + "learning_rate": 9.439168402525032e-08, + "loss": 0.7321887, + "num_input_tokens_seen": 324490430, + "step": 15048, + "time_per_iteration": 2.6993632316589355 + }, + { + "auxiliary_loss_clip": 0.01147634, + "auxiliary_loss_mlp": 0.01100068, + "balance_loss_clip": 1.00175261, + "balance_loss_mlp": 1.00055242, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 2.0711489328602988, + "language_loss": 0.75197923, + "learning_rate": 9.427348518535483e-08, + "loss": 0.7744562, + "num_input_tokens_seen": 324506620, + "step": 15049, + "time_per_iteration": 2.666297674179077 + }, + { + "auxiliary_loss_clip": 0.01147413, + "auxiliary_loss_mlp": 0.01099207, + "balance_loss_clip": 1.00191772, + "balance_loss_mlp": 1.00054932, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 1.8768883448131428, + "language_loss": 0.7563864, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77885258, + "num_input_tokens_seen": 324525505, + "step": 15050, + "time_per_iteration": 2.6637628078460693 + }, + { + "auxiliary_loss_clip": 0.011643, + "auxiliary_loss_mlp": 0.00747295, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.00042784, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.9195321810293944, + "language_loss": 0.81533241, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83444834, + "num_input_tokens_seen": 324544415, + "step": 15051, + "time_per_iteration": 2.5882585048675537 + }, + { + "auxiliary_loss_clip": 0.01147337, + "auxiliary_loss_mlp": 0.01100347, + "balance_loss_clip": 1.0017966, + "balance_loss_mlp": 1.00049758, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 2.7750370567253837, + "language_loss": 0.89167273, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91414952, + "num_input_tokens_seen": 324562555, + "step": 15052, + "time_per_iteration": 2.7252233028411865 + }, + { + "auxiliary_loss_clip": 0.0114968, + "auxiliary_loss_mlp": 0.01100957, + "balance_loss_clip": 1.00191498, + "balance_loss_mlp": 1.00043941, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 2.3191927835663595, + "language_loss": 0.77369219, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79619861, + "num_input_tokens_seen": 324580865, + "step": 15053, + "time_per_iteration": 2.5979161262512207 + }, + { + "auxiliary_loss_clip": 0.01147325, + "auxiliary_loss_mlp": 0.0110056, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00051963, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 1.6273423195761414, + "language_loss": 0.73111314, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75359201, + "num_input_tokens_seen": 324600665, + "step": 15054, + "time_per_iteration": 2.668889045715332 + }, + { + "auxiliary_loss_clip": 0.01101389, + "auxiliary_loss_mlp": 0.01100324, + "balance_loss_clip": 1.00170684, + "balance_loss_mlp": 1.00047481, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 1.8853405954209095, + "language_loss": 0.83287084, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85488796, + "num_input_tokens_seen": 324618145, + "step": 15055, + "time_per_iteration": 2.782491683959961 + }, + { + "auxiliary_loss_clip": 0.01146812, + "auxiliary_loss_mlp": 0.01100082, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00056648, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.6261158076187339, + "language_loss": 0.85200214, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87447119, + "num_input_tokens_seen": 324638165, + "step": 15056, + "time_per_iteration": 4.107299327850342 + }, + { + "auxiliary_loss_clip": 0.01133134, + "auxiliary_loss_mlp": 0.01100776, + "balance_loss_clip": 1.00192654, + "balance_loss_mlp": 1.00040209, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.8857111188599487, + "language_loss": 0.71823704, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74057615, + "num_input_tokens_seen": 324658560, + "step": 15057, + "time_per_iteration": 2.721376895904541 + }, + { + "auxiliary_loss_clip": 0.0114954, + "auxiliary_loss_mlp": 0.01100206, + "balance_loss_clip": 1.00187504, + "balance_loss_mlp": 1.0005945, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 1.6078777731336131, + "language_loss": 0.81058598, + "learning_rate": 9.321294810356418e-08, + "loss": 0.83308339, + "num_input_tokens_seen": 324679185, + "step": 15058, + "time_per_iteration": 4.05943489074707 + }, + { + "auxiliary_loss_clip": 0.01140958, + "auxiliary_loss_mlp": 0.0107386, + "balance_loss_clip": 1.0006268, + "balance_loss_mlp": 1.00023675, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6717697879903601, + "language_loss": 0.51409078, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53623891, + "num_input_tokens_seen": 324744830, + "step": 15059, + "time_per_iteration": 3.2501070499420166 + }, + { + "auxiliary_loss_clip": 0.01100378, + "auxiliary_loss_mlp": 0.01100451, + "balance_loss_clip": 1.00161457, + "balance_loss_mlp": 1.00041032, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 2.69052622217043, + "language_loss": 0.67269158, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69469988, + "num_input_tokens_seen": 324762905, + "step": 15060, + "time_per_iteration": 2.7400999069213867 + }, + { + "auxiliary_loss_clip": 0.01131088, + "auxiliary_loss_mlp": 0.01101226, + "balance_loss_clip": 1.00173354, + "balance_loss_mlp": 1.00061297, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 1.8991795915369911, + "language_loss": 0.64524376, + "learning_rate": 9.286073708230357e-08, + "loss": 0.6675669, + "num_input_tokens_seen": 324781905, + "step": 15061, + "time_per_iteration": 2.6451210975646973 + }, + { + "auxiliary_loss_clip": 0.01132676, + "auxiliary_loss_mlp": 0.01101098, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00058079, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 2.3618422344315917, + "language_loss": 0.7181685, + "learning_rate": 9.274347804044058e-08, + "loss": 0.74050617, + "num_input_tokens_seen": 324799260, + "step": 15062, + "time_per_iteration": 2.639514446258545 + }, + { + "auxiliary_loss_clip": 0.01163992, + "auxiliary_loss_mlp": 0.01098772, + "balance_loss_clip": 1.00179887, + "balance_loss_mlp": 1.00040054, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 1.7061299486797554, + "language_loss": 0.70855153, + "learning_rate": 9.2626291321936e-08, + "loss": 0.73117912, + "num_input_tokens_seen": 324817800, + "step": 15063, + "time_per_iteration": 2.5683095455169678 + }, + { + "auxiliary_loss_clip": 0.01116798, + "auxiliary_loss_mlp": 0.01099244, + "balance_loss_clip": 1.00185966, + "balance_loss_mlp": 1.00049067, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 1.7370069524590084, + "language_loss": 0.72263122, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74479169, + "num_input_tokens_seen": 324838445, + "step": 15064, + "time_per_iteration": 2.7218854427337646 + }, + { + "auxiliary_loss_clip": 0.01148634, + "auxiliary_loss_mlp": 0.01100579, + "balance_loss_clip": 1.00169849, + "balance_loss_mlp": 1.00063372, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 2.288555212104709, + "language_loss": 0.69488621, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71737838, + "num_input_tokens_seen": 324859895, + "step": 15065, + "time_per_iteration": 2.659574508666992 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01100445, + "balance_loss_clip": 1.00213027, + "balance_loss_mlp": 1.00064301, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.481298453389111, + "language_loss": 0.63022971, + "learning_rate": 9.227516515099743e-08, + "loss": 0.65257215, + "num_input_tokens_seen": 324879580, + "step": 15066, + "time_per_iteration": 2.651632785797119 + }, + { + "auxiliary_loss_clip": 0.01068537, + "auxiliary_loss_mlp": 0.01101649, + "balance_loss_clip": 1.0016048, + "balance_loss_mlp": 1.00055981, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 1.8883630123331243, + "language_loss": 0.79846108, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82016289, + "num_input_tokens_seen": 324898950, + "step": 15067, + "time_per_iteration": 2.8085427284240723 + }, + { + "auxiliary_loss_clip": 0.0113077, + "auxiliary_loss_mlp": 0.01101476, + "balance_loss_clip": 1.00177872, + "balance_loss_mlp": 1.00048196, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.5799642012171973, + "language_loss": 0.69936723, + "learning_rate": 9.204144273522563e-08, + "loss": 0.7216897, + "num_input_tokens_seen": 324917455, + "step": 15068, + "time_per_iteration": 2.6327807903289795 + }, + { + "auxiliary_loss_clip": 0.0116405, + "auxiliary_loss_mlp": 0.01099513, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.0004735, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 2.818223885331837, + "language_loss": 0.85416138, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87679696, + "num_input_tokens_seen": 324934495, + "step": 15069, + "time_per_iteration": 2.6811885833740234 + }, + { + "auxiliary_loss_clip": 0.01149522, + "auxiliary_loss_mlp": 0.01101207, + "balance_loss_clip": 1.00173521, + "balance_loss_mlp": 1.00049925, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 3.2669979634744815, + "language_loss": 0.59380078, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61630809, + "num_input_tokens_seen": 324953230, + "step": 15070, + "time_per_iteration": 2.636265516281128 + }, + { + "auxiliary_loss_clip": 0.01114468, + "auxiliary_loss_mlp": 0.01101264, + "balance_loss_clip": 1.00154305, + "balance_loss_mlp": 1.00036502, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 2.1612633614933903, + "language_loss": 0.81990159, + "learning_rate": 9.169140174747724e-08, + "loss": 0.8420589, + "num_input_tokens_seen": 324969880, + "step": 15071, + "time_per_iteration": 2.710977792739868 + }, + { + "auxiliary_loss_clip": 0.01164408, + "auxiliary_loss_mlp": 0.01101524, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00052953, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.8402698347583888, + "language_loss": 0.61783803, + "learning_rate": 9.157486613883758e-08, + "loss": 0.64049733, + "num_input_tokens_seen": 324987005, + "step": 15072, + "time_per_iteration": 2.533935785293579 + }, + { + "auxiliary_loss_clip": 0.01132936, + "auxiliary_loss_mlp": 0.01099671, + "balance_loss_clip": 1.00178313, + "balance_loss_mlp": 1.00058436, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.911824064004169, + "language_loss": 0.73278546, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75511158, + "num_input_tokens_seen": 325010700, + "step": 15073, + "time_per_iteration": 4.338594913482666 + }, + { + "auxiliary_loss_clip": 0.01146908, + "auxiliary_loss_mlp": 0.01099846, + "balance_loss_clip": 1.00185549, + "balance_loss_mlp": 1.00047326, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 2.372967422228927, + "language_loss": 0.80933446, + "learning_rate": 9.134201202899161e-08, + "loss": 0.83180201, + "num_input_tokens_seen": 325028760, + "step": 15074, + "time_per_iteration": 2.6672725677490234 + }, + { + "auxiliary_loss_clip": 0.01080018, + "auxiliary_loss_mlp": 0.00745241, + "balance_loss_clip": 1.0008018, + "balance_loss_mlp": 1.00014782, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.7441465851763982, + "language_loss": 0.52356303, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54181564, + "num_input_tokens_seen": 325093545, + "step": 15075, + "time_per_iteration": 3.4098992347717285 + }, + { + "auxiliary_loss_clip": 0.0111054, + "auxiliary_loss_mlp": 0.01073892, + "balance_loss_clip": 1.00087655, + "balance_loss_mlp": 1.00026858, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7288832040321965, + "language_loss": 0.62074435, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64258868, + "num_input_tokens_seen": 325152295, + "step": 15076, + "time_per_iteration": 3.1812660694122314 + }, + { + "auxiliary_loss_clip": 0.01147277, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_clip": 1.00174654, + "balance_loss_mlp": 1.00066233, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 2.8620051836204015, + "language_loss": 0.82142031, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84390056, + "num_input_tokens_seen": 325169705, + "step": 15077, + "time_per_iteration": 3.91741943359375 + }, + { + "auxiliary_loss_clip": 0.01133952, + "auxiliary_loss_mlp": 0.00747145, + "balance_loss_clip": 1.00174582, + "balance_loss_mlp": 1.00042939, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 2.1024822491152633, + "language_loss": 0.84438705, + "learning_rate": 9.08771723625934e-08, + "loss": 0.86319804, + "num_input_tokens_seen": 325189175, + "step": 15078, + "time_per_iteration": 2.642477035522461 + }, + { + "auxiliary_loss_clip": 0.01147204, + "auxiliary_loss_mlp": 0.00747263, + "balance_loss_clip": 1.00179291, + "balance_loss_mlp": 1.00039577, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.701640234065193, + "language_loss": 0.654194, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67313862, + "num_input_tokens_seen": 325211020, + "step": 15079, + "time_per_iteration": 2.808000326156616 + }, + { + "auxiliary_loss_clip": 0.01066007, + "auxiliary_loss_mlp": 0.01100082, + "balance_loss_clip": 1.00157249, + "balance_loss_mlp": 1.00037503, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.664751200279501, + "language_loss": 0.70792186, + "learning_rate": 9.064518687654765e-08, + "loss": 0.72958279, + "num_input_tokens_seen": 325236970, + "step": 15080, + "time_per_iteration": 3.0011441707611084 + }, + { + "auxiliary_loss_clip": 0.01148009, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00046778, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 5.71624534002194, + "language_loss": 0.70781469, + "learning_rate": 9.052930273571547e-08, + "loss": 0.73031992, + "num_input_tokens_seen": 325252670, + "step": 15081, + "time_per_iteration": 2.598052501678467 + }, + { + "auxiliary_loss_clip": 0.01133013, + "auxiliary_loss_mlp": 0.01099504, + "balance_loss_clip": 1.00181675, + "balance_loss_mlp": 1.00051212, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 1.8710108958362222, + "language_loss": 0.74482787, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76715302, + "num_input_tokens_seen": 325273860, + "step": 15082, + "time_per_iteration": 2.6750948429107666 + }, + { + "auxiliary_loss_clip": 0.01113505, + "auxiliary_loss_mlp": 0.0109969, + "balance_loss_clip": 1.00163424, + "balance_loss_mlp": 1.00055552, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 1.744143814950938, + "language_loss": 0.78149974, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80363166, + "num_input_tokens_seen": 325294140, + "step": 15083, + "time_per_iteration": 2.705157518386841 + }, + { + "auxiliary_loss_clip": 0.01130447, + "auxiliary_loss_mlp": 0.00747164, + "balance_loss_clip": 1.00171709, + "balance_loss_mlp": 1.00043368, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.5338530512157564, + "language_loss": 0.69352114, + "learning_rate": 9.01820847747028e-08, + "loss": 0.71229732, + "num_input_tokens_seen": 325313130, + "step": 15084, + "time_per_iteration": 2.6734557151794434 + }, + { + "auxiliary_loss_clip": 0.01164133, + "auxiliary_loss_mlp": 0.01100298, + "balance_loss_clip": 1.00188935, + "balance_loss_mlp": 1.00044823, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 1.8109709147532491, + "language_loss": 0.66670346, + "learning_rate": 9.006649028948965e-08, + "loss": 0.6893478, + "num_input_tokens_seen": 325334880, + "step": 15085, + "time_per_iteration": 2.6047146320343018 + }, + { + "auxiliary_loss_clip": 0.01114038, + "auxiliary_loss_mlp": 0.01075125, + "balance_loss_clip": 1.00155091, + "balance_loss_mlp": 1.00035655, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7905751745282082, + "language_loss": 0.61322397, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63511562, + "num_input_tokens_seen": 325394175, + "step": 15086, + "time_per_iteration": 3.269747018814087 + }, + { + "auxiliary_loss_clip": 0.01149434, + "auxiliary_loss_mlp": 0.0110055, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00055778, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.6266889170125598, + "language_loss": 0.72242928, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74492908, + "num_input_tokens_seen": 325415020, + "step": 15087, + "time_per_iteration": 2.6530964374542236 + }, + { + "auxiliary_loss_clip": 0.01132269, + "auxiliary_loss_mlp": 0.01100292, + "balance_loss_clip": 1.00174475, + "balance_loss_mlp": 1.0004425, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 2.013676719212669, + "language_loss": 0.76715231, + "learning_rate": 8.972014140059058e-08, + "loss": 0.78947794, + "num_input_tokens_seen": 325433595, + "step": 15088, + "time_per_iteration": 2.6620535850524902 + }, + { + "auxiliary_loss_clip": 0.01133041, + "auxiliary_loss_mlp": 0.0110002, + "balance_loss_clip": 1.00179124, + "balance_loss_mlp": 1.00050378, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 1.952980801187182, + "language_loss": 0.73588061, + "learning_rate": 8.960483664113038e-08, + "loss": 0.7582112, + "num_input_tokens_seen": 325451605, + "step": 15089, + "time_per_iteration": 2.6697399616241455 + }, + { + "auxiliary_loss_clip": 0.01164032, + "auxiliary_loss_mlp": 0.01099822, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.00054455, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 2.001138498997971, + "language_loss": 0.75679672, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77943528, + "num_input_tokens_seen": 325470645, + "step": 15090, + "time_per_iteration": 2.615230083465576 + }, + { + "auxiliary_loss_clip": 0.01131671, + "auxiliary_loss_mlp": 0.01101279, + "balance_loss_clip": 1.00195813, + "balance_loss_mlp": 1.00057149, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.8876106196901907, + "language_loss": 0.77933967, + "learning_rate": 8.93744444537079e-08, + "loss": 0.80166912, + "num_input_tokens_seen": 325488070, + "step": 15091, + "time_per_iteration": 2.70930814743042 + }, + { + "auxiliary_loss_clip": 0.0113261, + "auxiliary_loss_mlp": 0.01098664, + "balance_loss_clip": 1.00170445, + "balance_loss_mlp": 1.00048339, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 1.7430212983649314, + "language_loss": 0.86115015, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88346291, + "num_input_tokens_seen": 325509285, + "step": 15092, + "time_per_iteration": 2.6908762454986572 + }, + { + "auxiliary_loss_clip": 0.01130156, + "auxiliary_loss_mlp": 0.01100841, + "balance_loss_clip": 1.00183237, + "balance_loss_mlp": 1.00051498, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.7387240824601342, + "language_loss": 0.78536189, + "learning_rate": 8.914434207073296e-08, + "loss": 0.80767184, + "num_input_tokens_seen": 325529360, + "step": 15093, + "time_per_iteration": 2.7074499130249023 + }, + { + "auxiliary_loss_clip": 0.01143661, + "auxiliary_loss_mlp": 0.01073795, + "balance_loss_clip": 1.0007093, + "balance_loss_mlp": 1.00017107, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7419594024535373, + "language_loss": 0.57012236, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59229684, + "num_input_tokens_seen": 325583565, + "step": 15094, + "time_per_iteration": 4.594263553619385 + }, + { + "auxiliary_loss_clip": 0.01149595, + "auxiliary_loss_mlp": 0.01101591, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.00059748, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 1.8989974582846207, + "language_loss": 0.7112624, + "learning_rate": 8.891452952710742e-08, + "loss": 0.7337743, + "num_input_tokens_seen": 325603690, + "step": 15095, + "time_per_iteration": 4.088422060012817 + }, + { + "auxiliary_loss_clip": 0.01118219, + "auxiliary_loss_mlp": 0.01100171, + "balance_loss_clip": 1.00179625, + "balance_loss_mlp": 1.00051165, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 4.472192961450114, + "language_loss": 0.7440663, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76625019, + "num_input_tokens_seen": 325622255, + "step": 15096, + "time_per_iteration": 2.778987169265747 + }, + { + "auxiliary_loss_clip": 0.01164315, + "auxiliary_loss_mlp": 0.01100936, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00051427, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 3.7441638747468486, + "language_loss": 0.57069081, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59334332, + "num_input_tokens_seen": 325640165, + "step": 15097, + "time_per_iteration": 2.643092155456543 + }, + { + "auxiliary_loss_clip": 0.01147862, + "auxiliary_loss_mlp": 0.01099985, + "balance_loss_clip": 1.00187635, + "balance_loss_mlp": 1.00032568, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.6004900936781872, + "language_loss": 0.79705757, + "learning_rate": 8.857035423668935e-08, + "loss": 0.81953603, + "num_input_tokens_seen": 325659455, + "step": 15098, + "time_per_iteration": 2.6220312118530273 + }, + { + "auxiliary_loss_clip": 0.01099559, + "auxiliary_loss_mlp": 0.00747379, + "balance_loss_clip": 1.00157714, + "balance_loss_mlp": 1.00046086, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 1.7941619495857888, + "language_loss": 0.66259217, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68106163, + "num_input_tokens_seen": 325678095, + "step": 15099, + "time_per_iteration": 2.779869318008423 + }, + { + "auxiliary_loss_clip": 0.01132869, + "auxiliary_loss_mlp": 0.01100826, + "balance_loss_clip": 1.00186682, + "balance_loss_mlp": 1.00064206, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 1.8499038583839045, + "language_loss": 0.70301044, + "learning_rate": 8.834126644384477e-08, + "loss": 0.7253474, + "num_input_tokens_seen": 325695825, + "step": 15100, + "time_per_iteration": 2.65470814704895 + }, + { + "auxiliary_loss_clip": 0.01141977, + "auxiliary_loss_mlp": 0.01074349, + "balance_loss_clip": 1.00079095, + "balance_loss_mlp": 1.00034392, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.6271043822184271, + "language_loss": 0.53408837, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55625165, + "num_input_tokens_seen": 325764515, + "step": 15101, + "time_per_iteration": 3.2269444465637207 + }, + { + "auxiliary_loss_clip": 0.01117976, + "auxiliary_loss_mlp": 0.01100785, + "balance_loss_clip": 1.00179052, + "balance_loss_mlp": 1.00036287, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.8313472538439288, + "language_loss": 0.68488151, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70706904, + "num_input_tokens_seen": 325783235, + "step": 15102, + "time_per_iteration": 2.6644177436828613 + }, + { + "auxiliary_loss_clip": 0.01149561, + "auxiliary_loss_mlp": 0.01099537, + "balance_loss_clip": 1.00202417, + "balance_loss_mlp": 1.00049782, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 2.77119129964752, + "language_loss": 0.7891897, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81168073, + "num_input_tokens_seen": 325800195, + "step": 15103, + "time_per_iteration": 2.6244471073150635 + }, + { + "auxiliary_loss_clip": 0.01132974, + "auxiliary_loss_mlp": 0.01099486, + "balance_loss_clip": 1.00181639, + "balance_loss_mlp": 1.00044692, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.7462413655232556, + "language_loss": 0.71481097, + "learning_rate": 8.78839607763413e-08, + "loss": 0.73713559, + "num_input_tokens_seen": 325820215, + "step": 15104, + "time_per_iteration": 2.6654763221740723 + }, + { + "auxiliary_loss_clip": 0.01132523, + "auxiliary_loss_mlp": 0.01099297, + "balance_loss_clip": 1.00171638, + "balance_loss_mlp": 1.00035298, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.8943576847531371, + "language_loss": 0.77422023, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79653847, + "num_input_tokens_seen": 325838415, + "step": 15105, + "time_per_iteration": 2.6995773315429688 + }, + { + "auxiliary_loss_clip": 0.0116404, + "auxiliary_loss_mlp": 0.00747366, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00051486, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 1.8749279330735806, + "language_loss": 0.74348652, + "learning_rate": 8.765574297104628e-08, + "loss": 0.76260054, + "num_input_tokens_seen": 325855580, + "step": 15106, + "time_per_iteration": 2.5720086097717285 + }, + { + "auxiliary_loss_clip": 0.0110357, + "auxiliary_loss_mlp": 0.01101687, + "balance_loss_clip": 1.00179935, + "balance_loss_mlp": 1.00059712, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.633516675478831, + "language_loss": 0.803891, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82594359, + "num_input_tokens_seen": 325874890, + "step": 15107, + "time_per_iteration": 2.7490408420562744 + }, + { + "auxiliary_loss_clip": 0.01125049, + "auxiliary_loss_mlp": 0.01074253, + "balance_loss_clip": 1.00099707, + "balance_loss_mlp": 1.00062931, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8145673561914464, + "language_loss": 0.59755164, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61954463, + "num_input_tokens_seen": 325935835, + "step": 15108, + "time_per_iteration": 3.17885684967041 + }, + { + "auxiliary_loss_clip": 0.0113222, + "auxiliary_loss_mlp": 0.01100162, + "balance_loss_clip": 1.00168085, + "balance_loss_mlp": 1.00031233, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.64809175933327, + "language_loss": 0.73406416, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75638795, + "num_input_tokens_seen": 325958035, + "step": 15109, + "time_per_iteration": 2.7801034450531006 + }, + { + "auxiliary_loss_clip": 0.01116008, + "auxiliary_loss_mlp": 0.01099478, + "balance_loss_clip": 1.001652, + "balance_loss_mlp": 1.00043905, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 2.7277835405530992, + "language_loss": 0.71432221, + "learning_rate": 8.720017759045073e-08, + "loss": 0.73647702, + "num_input_tokens_seen": 325979870, + "step": 15110, + "time_per_iteration": 2.732858180999756 + }, + { + "auxiliary_loss_clip": 0.01134605, + "auxiliary_loss_mlp": 0.01100577, + "balance_loss_clip": 1.00178337, + "balance_loss_mlp": 1.0005368, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 2.231001364353922, + "language_loss": 0.68925118, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71160299, + "num_input_tokens_seen": 325998245, + "step": 15111, + "time_per_iteration": 4.135199069976807 + }, + { + "auxiliary_loss_clip": 0.01124496, + "auxiliary_loss_mlp": 0.01073838, + "balance_loss_clip": 1.00080919, + "balance_loss_mlp": 1.00021446, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.7031967544732499, + "language_loss": 0.51715767, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53914106, + "num_input_tokens_seen": 326061770, + "step": 15112, + "time_per_iteration": 3.3167693614959717 + }, + { + "auxiliary_loss_clip": 0.01148995, + "auxiliary_loss_mlp": 0.01100332, + "balance_loss_clip": 1.00171816, + "balance_loss_mlp": 1.00052953, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 1.8923718336508022, + "language_loss": 0.70245349, + "learning_rate": 8.685926514226837e-08, + "loss": 0.72494668, + "num_input_tokens_seen": 326080945, + "step": 15113, + "time_per_iteration": 2.601152181625366 + }, + { + "auxiliary_loss_clip": 0.01147438, + "auxiliary_loss_mlp": 0.01099756, + "balance_loss_clip": 1.00187671, + "balance_loss_mlp": 1.00047874, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.3409666903732487, + "language_loss": 0.7913872, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81385911, + "num_input_tokens_seen": 326100630, + "step": 15114, + "time_per_iteration": 2.6906800270080566 + }, + { + "auxiliary_loss_clip": 0.01099276, + "auxiliary_loss_mlp": 0.01100775, + "balance_loss_clip": 1.00166833, + "balance_loss_mlp": 1.0004487, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 1.838999238293871, + "language_loss": 0.70169306, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72369355, + "num_input_tokens_seen": 326120145, + "step": 15115, + "time_per_iteration": 4.3132688999176025 + }, + { + "auxiliary_loss_clip": 0.01120623, + "auxiliary_loss_mlp": 0.01101495, + "balance_loss_clip": 1.00199223, + "balance_loss_mlp": 1.00040555, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 1.5700140301499381, + "language_loss": 0.65852398, + "learning_rate": 8.651900561246561e-08, + "loss": 0.68074512, + "num_input_tokens_seen": 326140715, + "step": 15116, + "time_per_iteration": 2.69865083694458 + }, + { + "auxiliary_loss_clip": 0.01164006, + "auxiliary_loss_mlp": 0.0110027, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00046849, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 1.6787801379939267, + "language_loss": 0.69468701, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71732986, + "num_input_tokens_seen": 326159130, + "step": 15117, + "time_per_iteration": 2.5460317134857178 + }, + { + "auxiliary_loss_clip": 0.01113701, + "auxiliary_loss_mlp": 0.010999, + "balance_loss_clip": 1.00152564, + "balance_loss_mlp": 1.00038362, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.58928437163059, + "language_loss": 0.74348825, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76562428, + "num_input_tokens_seen": 326181375, + "step": 15118, + "time_per_iteration": 2.739987850189209 + }, + { + "auxiliary_loss_clip": 0.01133304, + "auxiliary_loss_mlp": 0.01101965, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.000494, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 5.267130952684086, + "language_loss": 0.73233831, + "learning_rate": 8.617939911716554e-08, + "loss": 0.754691, + "num_input_tokens_seen": 326199740, + "step": 15119, + "time_per_iteration": 2.6116840839385986 + }, + { + "auxiliary_loss_clip": 0.01116343, + "auxiliary_loss_mlp": 0.01102734, + "balance_loss_clip": 1.00183344, + "balance_loss_mlp": 1.00059521, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.635923943310363, + "language_loss": 0.70969802, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73188877, + "num_input_tokens_seen": 326214350, + "step": 15120, + "time_per_iteration": 2.7170488834381104 + }, + { + "auxiliary_loss_clip": 0.01164225, + "auxiliary_loss_mlp": 0.00747335, + "balance_loss_clip": 1.00187981, + "balance_loss_mlp": 1.0005089, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 3.3467544999180263, + "language_loss": 0.66116613, + "learning_rate": 8.595335764115596e-08, + "loss": 0.68028176, + "num_input_tokens_seen": 326234580, + "step": 15121, + "time_per_iteration": 2.575441360473633 + }, + { + "auxiliary_loss_clip": 0.01148925, + "auxiliary_loss_mlp": 0.01100955, + "balance_loss_clip": 1.00176907, + "balance_loss_mlp": 1.00062799, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.9938882608644217, + "language_loss": 0.7042346, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72673339, + "num_input_tokens_seen": 326259080, + "step": 15122, + "time_per_iteration": 2.942782402038574 + }, + { + "auxiliary_loss_clip": 0.01101704, + "auxiliary_loss_mlp": 0.0109925, + "balance_loss_clip": 1.00154448, + "balance_loss_mlp": 1.00044894, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.2697308883687732, + "language_loss": 0.74620706, + "learning_rate": 8.572760648850575e-08, + "loss": 0.76821661, + "num_input_tokens_seen": 326280175, + "step": 15123, + "time_per_iteration": 2.7343027591705322 + }, + { + "auxiliary_loss_clip": 0.01147331, + "auxiliary_loss_mlp": 0.01099493, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00031054, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 2.3914684032839504, + "language_loss": 0.75824183, + "learning_rate": 8.561483979414253e-08, + "loss": 0.7807101, + "num_input_tokens_seen": 326297990, + "step": 15124, + "time_per_iteration": 2.689058780670166 + }, + { + "auxiliary_loss_clip": 0.0114972, + "auxiliary_loss_mlp": 0.01100543, + "balance_loss_clip": 1.00195503, + "balance_loss_mlp": 1.00050211, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 2.9306219999860983, + "language_loss": 0.72190481, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74440747, + "num_input_tokens_seen": 326316735, + "step": 15125, + "time_per_iteration": 2.5872890949249268 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.01100761, + "balance_loss_clip": 1.00206208, + "balance_loss_mlp": 1.00062561, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.6262857476098156, + "language_loss": 0.78769875, + "learning_rate": 8.538952419072143e-08, + "loss": 0.80987179, + "num_input_tokens_seen": 326334370, + "step": 15126, + "time_per_iteration": 2.659088134765625 + }, + { + "auxiliary_loss_clip": 0.01116839, + "auxiliary_loss_mlp": 0.01100261, + "balance_loss_clip": 1.00194037, + "balance_loss_mlp": 1.00045931, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 2.0317728122675303, + "language_loss": 0.75370157, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77587253, + "num_input_tokens_seen": 326353435, + "step": 15127, + "time_per_iteration": 2.692781448364258 + }, + { + "auxiliary_loss_clip": 0.01066496, + "auxiliary_loss_mlp": 0.01100914, + "balance_loss_clip": 1.00171423, + "balance_loss_mlp": 1.00063539, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.828038913827986, + "language_loss": 0.62297505, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64464915, + "num_input_tokens_seen": 326371810, + "step": 15128, + "time_per_iteration": 2.8198423385620117 + }, + { + "auxiliary_loss_clip": 0.01116187, + "auxiliary_loss_mlp": 0.0109906, + "balance_loss_clip": 1.00164807, + "balance_loss_mlp": 1.00040221, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 2.850955569033422, + "language_loss": 0.77003884, + "learning_rate": 8.505209531291013e-08, + "loss": 0.79219127, + "num_input_tokens_seen": 326391380, + "step": 15129, + "time_per_iteration": 2.710508108139038 + }, + { + "auxiliary_loss_clip": 0.01149521, + "auxiliary_loss_mlp": 0.01100073, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.00046206, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 2.018901215022985, + "language_loss": 0.83092457, + "learning_rate": 8.49397642446552e-08, + "loss": 0.8534205, + "num_input_tokens_seen": 326408800, + "step": 15130, + "time_per_iteration": 2.613417625427246 + }, + { + "auxiliary_loss_clip": 0.01132244, + "auxiliary_loss_mlp": 0.01100978, + "balance_loss_clip": 1.0018357, + "balance_loss_mlp": 1.00050819, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.9945601175766174, + "language_loss": 0.75022399, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77255613, + "num_input_tokens_seen": 326431565, + "step": 15131, + "time_per_iteration": 4.276426792144775 + }, + { + "auxiliary_loss_clip": 0.01133258, + "auxiliary_loss_mlp": 0.01100105, + "balance_loss_clip": 1.00199342, + "balance_loss_mlp": 1.00058889, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 2.257385812589421, + "language_loss": 0.59482622, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61715984, + "num_input_tokens_seen": 326451715, + "step": 15132, + "time_per_iteration": 2.762415885925293 + }, + { + "auxiliary_loss_clip": 0.01114978, + "auxiliary_loss_mlp": 0.01100799, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.00052023, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.4244743438006329, + "language_loss": 0.82358503, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84574288, + "num_input_tokens_seen": 326470855, + "step": 15133, + "time_per_iteration": 4.129878759384155 + }, + { + "auxiliary_loss_clip": 0.01133062, + "auxiliary_loss_mlp": 0.01100589, + "balance_loss_clip": 1.00182736, + "balance_loss_mlp": 1.000453, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.779049049557158, + "language_loss": 0.74147356, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76381004, + "num_input_tokens_seen": 326490480, + "step": 15134, + "time_per_iteration": 2.7079975605010986 + }, + { + "auxiliary_loss_clip": 0.01118666, + "auxiliary_loss_mlp": 0.01101568, + "balance_loss_clip": 1.00170803, + "balance_loss_mlp": 1.00047827, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 2.754378324336457, + "language_loss": 0.72767591, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74987829, + "num_input_tokens_seen": 326509445, + "step": 15135, + "time_per_iteration": 2.733671188354492 + }, + { + "auxiliary_loss_clip": 0.011474, + "auxiliary_loss_mlp": 0.01099911, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00053763, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.577148609736766, + "language_loss": 0.69926316, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72173631, + "num_input_tokens_seen": 326528380, + "step": 15136, + "time_per_iteration": 2.6099236011505127 + }, + { + "auxiliary_loss_clip": 0.01110001, + "auxiliary_loss_mlp": 0.01074276, + "balance_loss_clip": 1.00076318, + "balance_loss_mlp": 1.00027084, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8148845355096858, + "language_loss": 0.59278721, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61462998, + "num_input_tokens_seen": 326576940, + "step": 15137, + "time_per_iteration": 2.9796669483184814 + }, + { + "auxiliary_loss_clip": 0.01149518, + "auxiliary_loss_mlp": 0.01100043, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00062227, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 1.5654732494025096, + "language_loss": 0.82210183, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84459746, + "num_input_tokens_seen": 326596100, + "step": 15138, + "time_per_iteration": 2.595069646835327 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01099718, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00039291, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.450876611544506, + "language_loss": 0.81324792, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83571649, + "num_input_tokens_seen": 326615700, + "step": 15139, + "time_per_iteration": 2.7030129432678223 + }, + { + "auxiliary_loss_clip": 0.01113831, + "auxiliary_loss_mlp": 0.01099689, + "balance_loss_clip": 1.00159502, + "balance_loss_mlp": 1.00055444, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 1.6064707792832738, + "language_loss": 0.77676743, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79890263, + "num_input_tokens_seen": 326635905, + "step": 15140, + "time_per_iteration": 2.674119710922241 + }, + { + "auxiliary_loss_clip": 0.01164082, + "auxiliary_loss_mlp": 0.01099795, + "balance_loss_clip": 1.00184071, + "balance_loss_mlp": 1.00042188, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 1.767580433692072, + "language_loss": 0.66468674, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68732548, + "num_input_tokens_seen": 326661855, + "step": 15141, + "time_per_iteration": 2.710675001144409 + }, + { + "auxiliary_loss_clip": 0.01147037, + "auxiliary_loss_mlp": 0.01100334, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00057912, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.7686993422063806, + "language_loss": 0.74927264, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77174634, + "num_input_tokens_seen": 326679320, + "step": 15142, + "time_per_iteration": 2.658931255340576 + }, + { + "auxiliary_loss_clip": 0.01117347, + "auxiliary_loss_mlp": 0.01099722, + "balance_loss_clip": 1.00169897, + "balance_loss_mlp": 1.00054014, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.788221454022793, + "language_loss": 0.64170337, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66387409, + "num_input_tokens_seen": 326698110, + "step": 15143, + "time_per_iteration": 2.680745840072632 + }, + { + "auxiliary_loss_clip": 0.01164063, + "auxiliary_loss_mlp": 0.01100428, + "balance_loss_clip": 1.00178432, + "balance_loss_mlp": 1.00057805, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 1.87230957356832, + "language_loss": 0.61586583, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63851076, + "num_input_tokens_seen": 326718370, + "step": 15144, + "time_per_iteration": 2.6319477558135986 + }, + { + "auxiliary_loss_clip": 0.01117985, + "auxiliary_loss_mlp": 0.01098898, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00038302, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.8958661185405765, + "language_loss": 0.71032584, + "learning_rate": 8.326351491278382e-08, + "loss": 0.73249471, + "num_input_tokens_seen": 326738445, + "step": 15145, + "time_per_iteration": 2.736504077911377 + }, + { + "auxiliary_loss_clip": 0.01100311, + "auxiliary_loss_mlp": 0.01099004, + "balance_loss_clip": 1.00181687, + "balance_loss_mlp": 1.00044167, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 1.4842984729695288, + "language_loss": 0.70577002, + "learning_rate": 8.315234626222545e-08, + "loss": 0.72776318, + "num_input_tokens_seen": 326758855, + "step": 15146, + "time_per_iteration": 2.784665107727051 + }, + { + "auxiliary_loss_clip": 0.01132878, + "auxiliary_loss_mlp": 0.0110001, + "balance_loss_clip": 1.00167513, + "balance_loss_mlp": 1.00054145, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 2.043521615581892, + "language_loss": 0.72869968, + "learning_rate": 8.304125029872233e-08, + "loss": 0.75102854, + "num_input_tokens_seen": 326777140, + "step": 15147, + "time_per_iteration": 2.7064056396484375 + }, + { + "auxiliary_loss_clip": 0.01115782, + "auxiliary_loss_mlp": 0.01099834, + "balance_loss_clip": 1.00157356, + "balance_loss_mlp": 1.00046098, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 1.943092031841686, + "language_loss": 0.80078608, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82294226, + "num_input_tokens_seen": 326794070, + "step": 15148, + "time_per_iteration": 2.6805853843688965 + }, + { + "auxiliary_loss_clip": 0.01113985, + "auxiliary_loss_mlp": 0.01100383, + "balance_loss_clip": 1.00155056, + "balance_loss_mlp": 1.00053382, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 1.8206085940799017, + "language_loss": 0.67944461, + "learning_rate": 8.281927644972996e-08, + "loss": 0.70158827, + "num_input_tokens_seen": 326814695, + "step": 15149, + "time_per_iteration": 4.110770225524902 + }, + { + "auxiliary_loss_clip": 0.01164163, + "auxiliary_loss_mlp": 0.01099282, + "balance_loss_clip": 1.00195217, + "balance_loss_mlp": 1.00048089, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 1.7825355012947803, + "language_loss": 0.6300379, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65267235, + "num_input_tokens_seen": 326835295, + "step": 15150, + "time_per_iteration": 2.6289310455322266 + }, + { + "auxiliary_loss_clip": 0.01114682, + "auxiliary_loss_mlp": 0.01100711, + "balance_loss_clip": 1.00182259, + "balance_loss_mlp": 1.00057507, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 2.3373096028044316, + "language_loss": 0.72524554, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74739945, + "num_input_tokens_seen": 326853350, + "step": 15151, + "time_per_iteration": 2.650217056274414 + }, + { + "auxiliary_loss_clip": 0.01147368, + "auxiliary_loss_mlp": 0.01099596, + "balance_loss_clip": 1.00182009, + "balance_loss_mlp": 1.00036597, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.6381299036012806, + "language_loss": 0.64155924, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66402882, + "num_input_tokens_seen": 326873425, + "step": 15152, + "time_per_iteration": 4.150392770767212 + }, + { + "auxiliary_loss_clip": 0.01132846, + "auxiliary_loss_mlp": 0.00747307, + "balance_loss_clip": 1.00175512, + "balance_loss_mlp": 1.00046051, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 2.1515056241266017, + "language_loss": 0.73399973, + "learning_rate": 8.23762011815834e-08, + "loss": 0.7528013, + "num_input_tokens_seen": 326893455, + "step": 15153, + "time_per_iteration": 2.6714935302734375 + }, + { + "auxiliary_loss_clip": 0.01120149, + "auxiliary_loss_mlp": 0.01101432, + "balance_loss_clip": 1.00189233, + "balance_loss_mlp": 1.00062895, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 3.1021277480558354, + "language_loss": 0.72205317, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74426901, + "num_input_tokens_seen": 326910210, + "step": 15154, + "time_per_iteration": 2.6980111598968506 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01100331, + "balance_loss_clip": 1.00184071, + "balance_loss_mlp": 1.00052941, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 1.676817696509125, + "language_loss": 0.81962597, + "learning_rate": 8.215509982963564e-08, + "loss": 0.84195149, + "num_input_tokens_seen": 326929350, + "step": 15155, + "time_per_iteration": 2.7188527584075928 + }, + { + "auxiliary_loss_clip": 0.01147393, + "auxiliary_loss_mlp": 0.01100006, + "balance_loss_clip": 1.0018518, + "balance_loss_mlp": 1.00049043, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.5047525283279166, + "language_loss": 0.59636366, + "learning_rate": 8.204465823887252e-08, + "loss": 0.61883765, + "num_input_tokens_seen": 326949060, + "step": 15156, + "time_per_iteration": 2.5980491638183594 + }, + { + "auxiliary_loss_clip": 0.01147629, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_clip": 1.00176406, + "balance_loss_mlp": 1.00038934, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 2.2524397437228574, + "language_loss": 0.7449221, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76740462, + "num_input_tokens_seen": 326968950, + "step": 15157, + "time_per_iteration": 2.6912999153137207 + }, + { + "auxiliary_loss_clip": 0.01103343, + "auxiliary_loss_mlp": 0.01099968, + "balance_loss_clip": 1.00162268, + "balance_loss_mlp": 1.00054789, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 2.3014215739138644, + "language_loss": 0.59271413, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61474729, + "num_input_tokens_seen": 326989455, + "step": 15158, + "time_per_iteration": 2.801177978515625 + }, + { + "auxiliary_loss_clip": 0.01081229, + "auxiliary_loss_mlp": 0.01099774, + "balance_loss_clip": 1.00139523, + "balance_loss_mlp": 1.00054431, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.6167469577606668, + "language_loss": 0.67590201, + "learning_rate": 8.171376985767375e-08, + "loss": 0.69771206, + "num_input_tokens_seen": 327009640, + "step": 15159, + "time_per_iteration": 2.767425298690796 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01099691, + "balance_loss_clip": 1.00171089, + "balance_loss_mlp": 1.00036597, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 3.1730211017495336, + "language_loss": 0.7883389, + "learning_rate": 8.160361920824588e-08, + "loss": 0.81066245, + "num_input_tokens_seen": 327027690, + "step": 15160, + "time_per_iteration": 2.700883150100708 + }, + { + "auxiliary_loss_clip": 0.01164187, + "auxiliary_loss_mlp": 0.01101077, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.00036907, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.8939038993044168, + "language_loss": 0.68872225, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71137488, + "num_input_tokens_seen": 327045915, + "step": 15161, + "time_per_iteration": 2.506826162338257 + }, + { + "auxiliary_loss_clip": 0.01100733, + "auxiliary_loss_mlp": 0.01100453, + "balance_loss_clip": 1.00166225, + "balance_loss_mlp": 1.00045991, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.7514933566309567, + "language_loss": 0.76636112, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78837293, + "num_input_tokens_seen": 327066355, + "step": 15162, + "time_per_iteration": 2.7179813385009766 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01101484, + "balance_loss_clip": 1.00200081, + "balance_loss_mlp": 1.00048995, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 1.9713737986975681, + "language_loss": 0.66738904, + "learning_rate": 8.127360375135395e-08, + "loss": 0.68973279, + "num_input_tokens_seen": 327086735, + "step": 15163, + "time_per_iteration": 2.6220896244049072 + }, + { + "auxiliary_loss_clip": 0.01099492, + "auxiliary_loss_mlp": 0.01100456, + "balance_loss_clip": 1.00165665, + "balance_loss_mlp": 1.00046277, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.303978745680473, + "language_loss": 0.70664263, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72864211, + "num_input_tokens_seen": 327104035, + "step": 15164, + "time_per_iteration": 2.7063817977905273 + }, + { + "auxiliary_loss_clip": 0.01164008, + "auxiliary_loss_mlp": 0.01099226, + "balance_loss_clip": 1.00192451, + "balance_loss_mlp": 1.00047326, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 2.1224505044262654, + "language_loss": 0.75877643, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78140879, + "num_input_tokens_seen": 327124370, + "step": 15165, + "time_per_iteration": 2.5551795959472656 + }, + { + "auxiliary_loss_clip": 0.01149759, + "auxiliary_loss_mlp": 0.01101265, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00074792, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.0974533361553784, + "language_loss": 0.72051132, + "learning_rate": 8.094424311912074e-08, + "loss": 0.74302155, + "num_input_tokens_seen": 327140915, + "step": 15166, + "time_per_iteration": 2.7069857120513916 + }, + { + "auxiliary_loss_clip": 0.01099411, + "auxiliary_loss_mlp": 0.0110133, + "balance_loss_clip": 1.00171375, + "balance_loss_mlp": 1.00052691, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 2.458311212784387, + "language_loss": 0.73091054, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75291795, + "num_input_tokens_seen": 327158940, + "step": 15167, + "time_per_iteration": 2.6849472522735596 + }, + { + "auxiliary_loss_clip": 0.0112881, + "auxiliary_loss_mlp": 0.01074092, + "balance_loss_clip": 1.00071144, + "balance_loss_mlp": 1.00008714, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.771575537863966, + "language_loss": 0.65581894, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67784798, + "num_input_tokens_seen": 327217450, + "step": 15168, + "time_per_iteration": 3.184241771697998 + }, + { + "auxiliary_loss_clip": 0.01130985, + "auxiliary_loss_mlp": 0.0110013, + "balance_loss_clip": 1.00163388, + "balance_loss_mlp": 1.0005188, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 2.2986994824255165, + "language_loss": 0.7834425, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80575359, + "num_input_tokens_seen": 327233905, + "step": 15169, + "time_per_iteration": 4.159093618392944 + }, + { + "auxiliary_loss_clip": 0.01147394, + "auxiliary_loss_mlp": 0.01100166, + "balance_loss_clip": 1.0018369, + "balance_loss_mlp": 1.00045979, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 1.728762860833611, + "language_loss": 0.82070047, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84317601, + "num_input_tokens_seen": 327252430, + "step": 15170, + "time_per_iteration": 2.5852057933807373 + }, + { + "auxiliary_loss_clip": 0.01148663, + "auxiliary_loss_mlp": 0.01100945, + "balance_loss_clip": 1.00186718, + "balance_loss_mlp": 1.00047576, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 2.128394480905411, + "language_loss": 0.77420056, + "learning_rate": 8.039676420316799e-08, + "loss": 0.7966966, + "num_input_tokens_seen": 327269215, + "step": 15171, + "time_per_iteration": 4.526535987854004 + }, + { + "auxiliary_loss_clip": 0.01066896, + "auxiliary_loss_mlp": 0.01099898, + "balance_loss_clip": 1.00154614, + "balance_loss_mlp": 1.00052476, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 2.0997816827330866, + "language_loss": 0.6686188, + "learning_rate": 8.02874867780241e-08, + "loss": 0.69028676, + "num_input_tokens_seen": 327290320, + "step": 15172, + "time_per_iteration": 2.8576624393463135 + }, + { + "auxiliary_loss_clip": 0.01133248, + "auxiliary_loss_mlp": 0.01100969, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.00054681, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 2.1788943493447976, + "language_loss": 0.75114352, + "learning_rate": 8.017828214857103e-08, + "loss": 0.77348566, + "num_input_tokens_seen": 327310150, + "step": 15173, + "time_per_iteration": 2.6340994834899902 + }, + { + "auxiliary_loss_clip": 0.01132014, + "auxiliary_loss_mlp": 0.01101548, + "balance_loss_clip": 1.001773, + "balance_loss_mlp": 1.00055385, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.366335384131686, + "language_loss": 0.66036141, + "learning_rate": 8.00691503189499e-08, + "loss": 0.68269706, + "num_input_tokens_seen": 327326660, + "step": 15174, + "time_per_iteration": 2.701732873916626 + }, + { + "auxiliary_loss_clip": 0.01149516, + "auxiliary_loss_mlp": 0.01100487, + "balance_loss_clip": 1.00191772, + "balance_loss_mlp": 1.00058937, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.6451355892237967, + "language_loss": 0.74526846, + "learning_rate": 7.996009129329894e-08, + "loss": 0.76776844, + "num_input_tokens_seen": 327346700, + "step": 15175, + "time_per_iteration": 2.602189302444458 + }, + { + "auxiliary_loss_clip": 0.01141304, + "auxiliary_loss_mlp": 0.01073754, + "balance_loss_clip": 1.00067139, + "balance_loss_mlp": 1.00012994, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9614125257916172, + "language_loss": 0.583794, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60594463, + "num_input_tokens_seen": 327403050, + "step": 15176, + "time_per_iteration": 3.2297818660736084 + }, + { + "auxiliary_loss_clip": 0.01134996, + "auxiliary_loss_mlp": 0.0110085, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00061893, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 2.3247926882052448, + "language_loss": 0.65533864, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67769718, + "num_input_tokens_seen": 327422225, + "step": 15177, + "time_per_iteration": 2.627582550048828 + }, + { + "auxiliary_loss_clip": 0.01133058, + "auxiliary_loss_mlp": 0.01099958, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.00048935, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 2.3673141049618116, + "language_loss": 0.81361485, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83594495, + "num_input_tokens_seen": 327437025, + "step": 15178, + "time_per_iteration": 2.608154296875 + }, + { + "auxiliary_loss_clip": 0.01086088, + "auxiliary_loss_mlp": 0.01100411, + "balance_loss_clip": 1.00158548, + "balance_loss_mlp": 1.00046635, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 1.9960690812716215, + "language_loss": 0.78962159, + "learning_rate": 7.952458331306711e-08, + "loss": 0.8114866, + "num_input_tokens_seen": 327453915, + "step": 15179, + "time_per_iteration": 2.7089881896972656 + }, + { + "auxiliary_loss_clip": 0.0113069, + "auxiliary_loss_mlp": 0.0110007, + "balance_loss_clip": 1.00177598, + "balance_loss_mlp": 1.00060129, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 1.6776145036715917, + "language_loss": 0.67884541, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70115292, + "num_input_tokens_seen": 327474415, + "step": 15180, + "time_per_iteration": 2.6994199752807617 + }, + { + "auxiliary_loss_clip": 0.01149479, + "auxiliary_loss_mlp": 0.01099445, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00040603, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 2.493487941466375, + "language_loss": 0.74825096, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77074027, + "num_input_tokens_seen": 327492750, + "step": 15181, + "time_per_iteration": 2.597475528717041 + }, + { + "auxiliary_loss_clip": 0.01164435, + "auxiliary_loss_mlp": 0.01100303, + "balance_loss_clip": 1.0020082, + "balance_loss_mlp": 1.0005492, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 2.146990927435366, + "language_loss": 0.74679768, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76944506, + "num_input_tokens_seen": 327509470, + "step": 15182, + "time_per_iteration": 2.571093797683716 + }, + { + "auxiliary_loss_clip": 0.01164233, + "auxiliary_loss_mlp": 0.01100551, + "balance_loss_clip": 1.00187051, + "balance_loss_mlp": 1.00051045, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 1.46267939757444, + "language_loss": 0.76690567, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78955346, + "num_input_tokens_seen": 327530520, + "step": 15183, + "time_per_iteration": 2.5974719524383545 + }, + { + "auxiliary_loss_clip": 0.01147872, + "auxiliary_loss_mlp": 0.01101439, + "balance_loss_clip": 1.00185013, + "balance_loss_mlp": 1.00044453, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 1.9884341153592207, + "language_loss": 0.76386803, + "learning_rate": 7.898183692255256e-08, + "loss": 0.7863611, + "num_input_tokens_seen": 327546960, + "step": 15184, + "time_per_iteration": 2.552499294281006 + }, + { + "auxiliary_loss_clip": 0.01148128, + "auxiliary_loss_mlp": 0.01100308, + "balance_loss_clip": 1.00200367, + "balance_loss_mlp": 1.0004108, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 1.689221029402925, + "language_loss": 0.74431241, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76679683, + "num_input_tokens_seen": 327564830, + "step": 15185, + "time_per_iteration": 2.6345021724700928 + }, + { + "auxiliary_loss_clip": 0.0113074, + "auxiliary_loss_mlp": 0.0109986, + "balance_loss_clip": 1.00170946, + "balance_loss_mlp": 1.00039172, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 2.1313808112691204, + "language_loss": 0.68339741, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70570338, + "num_input_tokens_seen": 327583675, + "step": 15186, + "time_per_iteration": 2.6352529525756836 + }, + { + "auxiliary_loss_clip": 0.01133039, + "auxiliary_loss_mlp": 0.01101795, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 1.00051463, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 2.0862364880196975, + "language_loss": 0.77528816, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79763651, + "num_input_tokens_seen": 327602280, + "step": 15187, + "time_per_iteration": 2.6204612255096436 + }, + { + "auxiliary_loss_clip": 0.01164154, + "auxiliary_loss_mlp": 0.00747436, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00052977, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 2.296733241399604, + "language_loss": 0.65834051, + "learning_rate": 7.854895099902515e-08, + "loss": 0.67745644, + "num_input_tokens_seen": 327623515, + "step": 15188, + "time_per_iteration": 4.10697078704834 + }, + { + "auxiliary_loss_clip": 0.01067557, + "auxiliary_loss_mlp": 0.01099906, + "balance_loss_clip": 1.00149107, + "balance_loss_mlp": 1.00043774, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 1.9823428571388182, + "language_loss": 0.76093912, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78261375, + "num_input_tokens_seen": 327642875, + "step": 15189, + "time_per_iteration": 2.8000268936157227 + }, + { + "auxiliary_loss_clip": 0.0114946, + "auxiliary_loss_mlp": 0.01098422, + "balance_loss_clip": 1.00185943, + "balance_loss_mlp": 1.00047958, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 2.068833307081228, + "language_loss": 0.75695097, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77942979, + "num_input_tokens_seen": 327662450, + "step": 15190, + "time_per_iteration": 4.136358261108398 + }, + { + "auxiliary_loss_clip": 0.01142279, + "auxiliary_loss_mlp": 0.01073835, + "balance_loss_clip": 1.00084877, + "balance_loss_mlp": 1.00021172, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.6957908953215572, + "language_loss": 0.57349539, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59565651, + "num_input_tokens_seen": 327723845, + "step": 15191, + "time_per_iteration": 3.170745372772217 + }, + { + "auxiliary_loss_clip": 0.01164282, + "auxiliary_loss_mlp": 0.01100346, + "balance_loss_clip": 1.00186896, + "balance_loss_mlp": 1.00059152, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 1.7187826659526635, + "language_loss": 0.74206817, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76471442, + "num_input_tokens_seen": 327742590, + "step": 15192, + "time_per_iteration": 2.62412166595459 + }, + { + "auxiliary_loss_clip": 0.01147406, + "auxiliary_loss_mlp": 0.01099567, + "balance_loss_clip": 1.00184393, + "balance_loss_mlp": 1.00043297, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 1.611339976369723, + "language_loss": 0.69240826, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71487796, + "num_input_tokens_seen": 327764350, + "step": 15193, + "time_per_iteration": 2.7687230110168457 + }, + { + "auxiliary_loss_clip": 0.0114736, + "auxiliary_loss_mlp": 0.01099841, + "balance_loss_clip": 1.00191605, + "balance_loss_mlp": 1.00070703, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.5067354631212146, + "language_loss": 0.73496163, + "learning_rate": 7.790180804400215e-08, + "loss": 0.75743365, + "num_input_tokens_seen": 327783120, + "step": 15194, + "time_per_iteration": 2.6454718112945557 + }, + { + "auxiliary_loss_clip": 0.01100386, + "auxiliary_loss_mlp": 0.01101421, + "balance_loss_clip": 1.0016005, + "balance_loss_mlp": 1.00042737, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 2.8682966327224757, + "language_loss": 0.62295699, + "learning_rate": 7.779420596254383e-08, + "loss": 0.64497507, + "num_input_tokens_seen": 327801960, + "step": 15195, + "time_per_iteration": 2.7187047004699707 + }, + { + "auxiliary_loss_clip": 0.01149544, + "auxiliary_loss_mlp": 0.01100904, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.0004822, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.8126357183119362, + "language_loss": 0.71131408, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73381853, + "num_input_tokens_seen": 327823795, + "step": 15196, + "time_per_iteration": 2.6550116539001465 + }, + { + "auxiliary_loss_clip": 0.01132656, + "auxiliary_loss_mlp": 0.01099903, + "balance_loss_clip": 1.00187171, + "balance_loss_mlp": 1.0004822, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.5287639252360699, + "language_loss": 0.71406353, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73638904, + "num_input_tokens_seen": 327845175, + "step": 15197, + "time_per_iteration": 2.704352617263794 + }, + { + "auxiliary_loss_clip": 0.01132559, + "auxiliary_loss_mlp": 0.01100277, + "balance_loss_clip": 1.00166905, + "balance_loss_mlp": 1.00042689, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.919386002401134, + "language_loss": 0.77983773, + "learning_rate": 7.747183707589489e-08, + "loss": 0.80216604, + "num_input_tokens_seen": 327863150, + "step": 15198, + "time_per_iteration": 2.6656923294067383 + }, + { + "auxiliary_loss_clip": 0.01147628, + "auxiliary_loss_mlp": 0.01099829, + "balance_loss_clip": 1.0018512, + "balance_loss_mlp": 1.00045586, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.362664164542311, + "language_loss": 0.68125874, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70373327, + "num_input_tokens_seen": 327883445, + "step": 15199, + "time_per_iteration": 2.678330659866333 + }, + { + "auxiliary_loss_clip": 0.01149067, + "auxiliary_loss_mlp": 0.00747255, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.00045741, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.53682316214492, + "language_loss": 0.67640913, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69537234, + "num_input_tokens_seen": 327905745, + "step": 15200, + "time_per_iteration": 2.6469969749450684 + }, + { + "auxiliary_loss_clip": 0.01146849, + "auxiliary_loss_mlp": 0.01099442, + "balance_loss_clip": 1.00192833, + "balance_loss_mlp": 1.00054634, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.7861089758235593, + "language_loss": 0.71155632, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73401928, + "num_input_tokens_seen": 327925435, + "step": 15201, + "time_per_iteration": 2.602421760559082 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01099041, + "balance_loss_clip": 1.0017513, + "balance_loss_mlp": 1.00038362, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.7973750623405649, + "language_loss": 0.70409364, + "learning_rate": 7.704303254710165e-08, + "loss": 0.72656, + "num_input_tokens_seen": 327944145, + "step": 15202, + "time_per_iteration": 2.5375382900238037 + }, + { + "auxiliary_loss_clip": 0.01164067, + "auxiliary_loss_mlp": 0.01100351, + "balance_loss_clip": 1.00178409, + "balance_loss_mlp": 1.0005486, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 2.5427923620278143, + "language_loss": 0.66758049, + "learning_rate": 7.693601370155001e-08, + "loss": 0.69022465, + "num_input_tokens_seen": 327960565, + "step": 15203, + "time_per_iteration": 2.52264142036438 + }, + { + "auxiliary_loss_clip": 0.01149082, + "auxiliary_loss_mlp": 0.0110067, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00058234, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.7067230511780023, + "language_loss": 0.68831611, + "learning_rate": 7.682906777877751e-08, + "loss": 0.71081364, + "num_input_tokens_seen": 327981180, + "step": 15204, + "time_per_iteration": 2.6004321575164795 + }, + { + "auxiliary_loss_clip": 0.01148857, + "auxiliary_loss_mlp": 0.01100626, + "balance_loss_clip": 1.00168848, + "balance_loss_mlp": 1.0003953, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 2.4399727536681293, + "language_loss": 0.59662092, + "learning_rate": 7.672219478283915e-08, + "loss": 0.61911571, + "num_input_tokens_seen": 328001500, + "step": 15205, + "time_per_iteration": 2.594407320022583 + }, + { + "auxiliary_loss_clip": 0.01116338, + "auxiliary_loss_mlp": 0.01100639, + "balance_loss_clip": 1.00191474, + "balance_loss_mlp": 1.00055087, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.8028545854369953, + "language_loss": 0.8128472, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83501697, + "num_input_tokens_seen": 328023025, + "step": 15206, + "time_per_iteration": 2.718400478363037 + }, + { + "auxiliary_loss_clip": 0.01099919, + "auxiliary_loss_mlp": 0.0110033, + "balance_loss_clip": 1.0015974, + "balance_loss_mlp": 1.00038481, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 3.003553907997814, + "language_loss": 0.7368477, + "learning_rate": 7.650866758767382e-08, + "loss": 0.75885022, + "num_input_tokens_seen": 328041410, + "step": 15207, + "time_per_iteration": 4.2444748878479 + }, + { + "auxiliary_loss_clip": 0.01098784, + "auxiliary_loss_mlp": 0.01100015, + "balance_loss_clip": 1.00150681, + "balance_loss_mlp": 1.00049853, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 1.8642380037641941, + "language_loss": 0.73158139, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75356942, + "num_input_tokens_seen": 328060495, + "step": 15208, + "time_per_iteration": 4.226043701171875 + }, + { + "auxiliary_loss_clip": 0.01148165, + "auxiliary_loss_mlp": 0.0110039, + "balance_loss_clip": 1.00195289, + "balance_loss_mlp": 1.0003494, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.2023866148789923, + "language_loss": 0.86467063, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88715613, + "num_input_tokens_seen": 328076905, + "step": 15209, + "time_per_iteration": 2.6182000637054443 + }, + { + "auxiliary_loss_clip": 0.01132318, + "auxiliary_loss_mlp": 0.01099483, + "balance_loss_clip": 1.00180149, + "balance_loss_mlp": 1.00068283, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 2.2367100186376945, + "language_loss": 0.75377536, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77609336, + "num_input_tokens_seen": 328096960, + "step": 15210, + "time_per_iteration": 2.6704928874969482 + }, + { + "auxiliary_loss_clip": 0.0113281, + "auxiliary_loss_mlp": 0.01100439, + "balance_loss_clip": 1.0016464, + "balance_loss_mlp": 1.00049388, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 9.099718359014858, + "language_loss": 0.78354293, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80587542, + "num_input_tokens_seen": 328115445, + "step": 15211, + "time_per_iteration": 2.66304874420166 + }, + { + "auxiliary_loss_clip": 0.01149013, + "auxiliary_loss_mlp": 0.01100048, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.00048411, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 3.1519038698067487, + "language_loss": 0.82849908, + "learning_rate": 7.597612610270986e-08, + "loss": 0.8509897, + "num_input_tokens_seen": 328133965, + "step": 15212, + "time_per_iteration": 2.5609569549560547 + }, + { + "auxiliary_loss_clip": 0.01147767, + "auxiliary_loss_mlp": 0.01098964, + "balance_loss_clip": 1.00173759, + "balance_loss_mlp": 1.00040162, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.7766264918630073, + "language_loss": 0.83979344, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86226076, + "num_input_tokens_seen": 328151520, + "step": 15213, + "time_per_iteration": 2.5953876972198486 + }, + { + "auxiliary_loss_clip": 0.01148003, + "auxiliary_loss_mlp": 0.01099938, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.0004698, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 1.989649182228344, + "language_loss": 0.70941144, + "learning_rate": 7.576362019471894e-08, + "loss": 0.73189086, + "num_input_tokens_seen": 328171275, + "step": 15214, + "time_per_iteration": 2.5757641792297363 + }, + { + "auxiliary_loss_clip": 0.01147576, + "auxiliary_loss_mlp": 0.0110207, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00059915, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.595908347756576, + "language_loss": 0.62928963, + "learning_rate": 7.565747668956413e-08, + "loss": 0.65178609, + "num_input_tokens_seen": 328192115, + "step": 15215, + "time_per_iteration": 2.634644031524658 + }, + { + "auxiliary_loss_clip": 0.01117697, + "auxiliary_loss_mlp": 0.01100914, + "balance_loss_clip": 1.00171328, + "balance_loss_mlp": 1.00049186, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 3.7867636461116523, + "language_loss": 0.76560819, + "learning_rate": 7.555140615567058e-08, + "loss": 0.78779423, + "num_input_tokens_seen": 328208990, + "step": 15216, + "time_per_iteration": 2.632251024246216 + }, + { + "auxiliary_loss_clip": 0.01132667, + "auxiliary_loss_mlp": 0.01100093, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00057685, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 4.553411258880733, + "language_loss": 0.68410075, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70642835, + "num_input_tokens_seen": 328227840, + "step": 15217, + "time_per_iteration": 2.655941963195801 + }, + { + "auxiliary_loss_clip": 0.01147288, + "auxiliary_loss_mlp": 0.01099926, + "balance_loss_clip": 1.00184786, + "balance_loss_mlp": 1.00041056, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 2.0033282264606784, + "language_loss": 0.80022806, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82270026, + "num_input_tokens_seen": 328246250, + "step": 15218, + "time_per_iteration": 2.5424680709838867 + }, + { + "auxiliary_loss_clip": 0.01108933, + "auxiliary_loss_mlp": 0.01075066, + "balance_loss_clip": 1.00082946, + "balance_loss_mlp": 1.00067902, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8517812158893182, + "language_loss": 0.59337968, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61521965, + "num_input_tokens_seen": 328303625, + "step": 15219, + "time_per_iteration": 3.201301336288452 + }, + { + "auxiliary_loss_clip": 0.01149392, + "auxiliary_loss_mlp": 0.01100034, + "balance_loss_clip": 1.00187576, + "balance_loss_mlp": 1.00061321, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 1.6788190779766747, + "language_loss": 0.78390801, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80640221, + "num_input_tokens_seen": 328322135, + "step": 15220, + "time_per_iteration": 2.6501388549804688 + }, + { + "auxiliary_loss_clip": 0.01099609, + "auxiliary_loss_mlp": 0.01101067, + "balance_loss_clip": 1.00163877, + "balance_loss_mlp": 1.00055003, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 1.991066849191207, + "language_loss": 0.65820962, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68021643, + "num_input_tokens_seen": 328340750, + "step": 15221, + "time_per_iteration": 2.6822474002838135 + }, + { + "auxiliary_loss_clip": 0.01131001, + "auxiliary_loss_mlp": 0.01099886, + "balance_loss_clip": 1.00161409, + "balance_loss_mlp": 1.00056052, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 4.597994289073204, + "language_loss": 0.84453058, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86683953, + "num_input_tokens_seen": 328359995, + "step": 15222, + "time_per_iteration": 2.65090274810791 + }, + { + "auxiliary_loss_clip": 0.01127254, + "auxiliary_loss_mlp": 0.0107423, + "balance_loss_clip": 1.00090241, + "balance_loss_mlp": 1.00022447, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.7189673170631141, + "language_loss": 0.496443, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51845789, + "num_input_tokens_seen": 328426865, + "step": 15223, + "time_per_iteration": 3.231567859649658 + }, + { + "auxiliary_loss_clip": 0.0111466, + "auxiliary_loss_mlp": 0.01100749, + "balance_loss_clip": 1.00174773, + "balance_loss_mlp": 1.00070846, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 2.2205967306669514, + "language_loss": 0.72105497, + "learning_rate": 7.470546933201349e-08, + "loss": 0.743209, + "num_input_tokens_seen": 328445970, + "step": 15224, + "time_per_iteration": 2.7399380207061768 + }, + { + "auxiliary_loss_clip": 0.01147364, + "auxiliary_loss_mlp": 0.01100085, + "balance_loss_clip": 1.00175405, + "balance_loss_mlp": 1.00033069, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 3.0150372503289056, + "language_loss": 0.81255674, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83503127, + "num_input_tokens_seen": 328464585, + "step": 15225, + "time_per_iteration": 4.034222841262817 + }, + { + "auxiliary_loss_clip": 0.01164278, + "auxiliary_loss_mlp": 0.0110068, + "balance_loss_clip": 1.00193882, + "balance_loss_mlp": 1.00035334, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.6695698169758526, + "language_loss": 0.71205348, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73470306, + "num_input_tokens_seen": 328490155, + "step": 15226, + "time_per_iteration": 2.6631345748901367 + }, + { + "auxiliary_loss_clip": 0.01069551, + "auxiliary_loss_mlp": 0.01100329, + "balance_loss_clip": 1.00163722, + "balance_loss_mlp": 1.00043201, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 1.8023151374681745, + "language_loss": 0.74591553, + "learning_rate": 7.43894475344613e-08, + "loss": 0.76761436, + "num_input_tokens_seen": 328508275, + "step": 15227, + "time_per_iteration": 2.8363685607910156 + }, + { + "auxiliary_loss_clip": 0.01132653, + "auxiliary_loss_mlp": 0.01099625, + "balance_loss_clip": 1.00184107, + "balance_loss_mlp": 1.00053787, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.4717319256290438, + "language_loss": 0.74175495, + "learning_rate": 7.428425296864404e-08, + "loss": 0.76407766, + "num_input_tokens_seen": 328529425, + "step": 15228, + "time_per_iteration": 4.238850831985474 + }, + { + "auxiliary_loss_clip": 0.01116207, + "auxiliary_loss_mlp": 0.01098988, + "balance_loss_clip": 1.00154686, + "balance_loss_mlp": 1.00047374, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.625854964422259, + "language_loss": 0.71816343, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74031538, + "num_input_tokens_seen": 328550200, + "step": 15229, + "time_per_iteration": 2.695035934448242 + }, + { + "auxiliary_loss_clip": 0.01164219, + "auxiliary_loss_mlp": 0.01100468, + "balance_loss_clip": 1.00191236, + "balance_loss_mlp": 1.00052333, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 3.3303424429399215, + "language_loss": 0.83066994, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85331678, + "num_input_tokens_seen": 328568540, + "step": 15230, + "time_per_iteration": 2.5979299545288086 + }, + { + "auxiliary_loss_clip": 0.01102915, + "auxiliary_loss_mlp": 0.01099334, + "balance_loss_clip": 1.00164688, + "balance_loss_mlp": 1.00053382, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.7901278851860485, + "language_loss": 0.83584177, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85786426, + "num_input_tokens_seen": 328587300, + "step": 15231, + "time_per_iteration": 2.7785909175872803 + }, + { + "auxiliary_loss_clip": 0.01149279, + "auxiliary_loss_mlp": 0.01099819, + "balance_loss_clip": 1.0016439, + "balance_loss_mlp": 1.00035107, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.5228340894916714, + "language_loss": 0.72143346, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74392444, + "num_input_tokens_seen": 328610055, + "step": 15232, + "time_per_iteration": 2.655343770980835 + }, + { + "auxiliary_loss_clip": 0.01164114, + "auxiliary_loss_mlp": 0.01100096, + "balance_loss_clip": 1.00182605, + "balance_loss_mlp": 1.00043726, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 2.4444132701958075, + "language_loss": 0.67593443, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69857657, + "num_input_tokens_seen": 328626815, + "step": 15233, + "time_per_iteration": 2.60453462600708 + }, + { + "auxiliary_loss_clip": 0.01131037, + "auxiliary_loss_mlp": 0.01100912, + "balance_loss_clip": 1.00173998, + "balance_loss_mlp": 1.00048995, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 3.121050347319929, + "language_loss": 0.70292521, + "learning_rate": 7.365461920317861e-08, + "loss": 0.72524476, + "num_input_tokens_seen": 328643995, + "step": 15234, + "time_per_iteration": 2.7475054264068604 + }, + { + "auxiliary_loss_clip": 0.01130691, + "auxiliary_loss_mlp": 0.01101715, + "balance_loss_clip": 1.00177336, + "balance_loss_mlp": 1.00053036, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 1.6979826023101965, + "language_loss": 0.88244641, + "learning_rate": 7.354993588431391e-08, + "loss": 0.90477043, + "num_input_tokens_seen": 328659565, + "step": 15235, + "time_per_iteration": 2.6100962162017822 + }, + { + "auxiliary_loss_clip": 0.01087144, + "auxiliary_loss_mlp": 0.01101419, + "balance_loss_clip": 1.00176847, + "balance_loss_mlp": 1.00052011, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 2.572962280964595, + "language_loss": 0.77010906, + "learning_rate": 7.344532561662853e-08, + "loss": 0.79199463, + "num_input_tokens_seen": 328679045, + "step": 15236, + "time_per_iteration": 2.8051419258117676 + }, + { + "auxiliary_loss_clip": 0.0108106, + "auxiliary_loss_mlp": 0.01074569, + "balance_loss_clip": 1.00069547, + "balance_loss_mlp": 1.00018203, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6753977488830641, + "language_loss": 0.62213039, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64368665, + "num_input_tokens_seen": 328744565, + "step": 15237, + "time_per_iteration": 3.246345281600952 + }, + { + "auxiliary_loss_clip": 0.01164189, + "auxiliary_loss_mlp": 0.00747191, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.00048041, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 1.938986566916266, + "language_loss": 0.74819845, + "learning_rate": 7.323632425066151e-08, + "loss": 0.76731229, + "num_input_tokens_seen": 328762455, + "step": 15238, + "time_per_iteration": 2.5166866779327393 + }, + { + "auxiliary_loss_clip": 0.01164225, + "auxiliary_loss_mlp": 0.01100251, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00054455, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.8732834490762706, + "language_loss": 0.74949324, + "learning_rate": 7.313193316030464e-08, + "loss": 0.772138, + "num_input_tokens_seen": 328780320, + "step": 15239, + "time_per_iteration": 2.56083345413208 + }, + { + "auxiliary_loss_clip": 0.01117357, + "auxiliary_loss_mlp": 0.01100673, + "balance_loss_clip": 1.00165653, + "balance_loss_mlp": 1.00058508, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 2.1752785933329792, + "language_loss": 0.63631356, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65849376, + "num_input_tokens_seen": 328797570, + "step": 15240, + "time_per_iteration": 2.645418405532837 + }, + { + "auxiliary_loss_clip": 0.01132637, + "auxiliary_loss_mlp": 0.00747211, + "balance_loss_clip": 1.0017662, + "balance_loss_mlp": 1.00042963, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 2.0029072422559655, + "language_loss": 0.76617235, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78497088, + "num_input_tokens_seen": 328814075, + "step": 15241, + "time_per_iteration": 2.6400370597839355 + }, + { + "auxiliary_loss_clip": 0.01147279, + "auxiliary_loss_mlp": 0.01102709, + "balance_loss_clip": 1.00181973, + "balance_loss_mlp": 1.00042725, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.2399007842391785, + "language_loss": 0.67705667, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69955659, + "num_input_tokens_seen": 328831990, + "step": 15242, + "time_per_iteration": 2.5869739055633545 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.01100608, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00056744, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 1.7823942702093947, + "language_loss": 0.80629206, + "learning_rate": 7.271509950872334e-08, + "loss": 0.82879341, + "num_input_tokens_seen": 328849105, + "step": 15243, + "time_per_iteration": 2.564164876937866 + }, + { + "auxiliary_loss_clip": 0.01134402, + "auxiliary_loss_mlp": 0.01101309, + "balance_loss_clip": 1.00182378, + "balance_loss_mlp": 1.00055361, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.9270222054190222, + "language_loss": 0.8184101, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84076715, + "num_input_tokens_seen": 328866810, + "step": 15244, + "time_per_iteration": 2.612842321395874 + }, + { + "auxiliary_loss_clip": 0.01164202, + "auxiliary_loss_mlp": 0.01101245, + "balance_loss_clip": 1.00182688, + "balance_loss_mlp": 1.00063252, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 3.9175142386336503, + "language_loss": 0.71952289, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74217737, + "num_input_tokens_seen": 328885325, + "step": 15245, + "time_per_iteration": 4.145089864730835 + }, + { + "auxiliary_loss_clip": 0.01130239, + "auxiliary_loss_mlp": 0.01099734, + "balance_loss_clip": 1.00172126, + "balance_loss_mlp": 1.00055194, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.6356419944244893, + "language_loss": 0.75012189, + "learning_rate": 7.240324162598033e-08, + "loss": 0.7724216, + "num_input_tokens_seen": 328902655, + "step": 15246, + "time_per_iteration": 2.6321473121643066 + }, + { + "auxiliary_loss_clip": 0.0113304, + "auxiliary_loss_mlp": 0.01100637, + "balance_loss_clip": 1.00185072, + "balance_loss_mlp": 1.00050175, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 3.7008072360718454, + "language_loss": 0.75576651, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77810323, + "num_input_tokens_seen": 328918440, + "step": 15247, + "time_per_iteration": 3.965376377105713 + }, + { + "auxiliary_loss_clip": 0.01147618, + "auxiliary_loss_mlp": 0.01100519, + "balance_loss_clip": 1.00194514, + "balance_loss_mlp": 1.00047815, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.7178712203753304, + "language_loss": 0.75858074, + "learning_rate": 7.219570183756052e-08, + "loss": 0.78106213, + "num_input_tokens_seen": 328938055, + "step": 15248, + "time_per_iteration": 2.67238712310791 + }, + { + "auxiliary_loss_clip": 0.01149582, + "auxiliary_loss_mlp": 0.01101966, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00059092, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.4375298832420884, + "language_loss": 0.72756827, + "learning_rate": 7.209204159518178e-08, + "loss": 0.75008374, + "num_input_tokens_seen": 328957895, + "step": 15249, + "time_per_iteration": 2.6263058185577393 + }, + { + "auxiliary_loss_clip": 0.01101473, + "auxiliary_loss_mlp": 0.0110029, + "balance_loss_clip": 1.00189996, + "balance_loss_mlp": 1.00044024, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 1.9334976859190518, + "language_loss": 0.76093745, + "learning_rate": 7.198845445926616e-08, + "loss": 0.78295505, + "num_input_tokens_seen": 328971365, + "step": 15250, + "time_per_iteration": 2.71234393119812 + }, + { + "auxiliary_loss_clip": 0.01100926, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_clip": 1.0016768, + "balance_loss_mlp": 1.00050867, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.883911741042712, + "language_loss": 0.76000106, + "learning_rate": 7.188494043374138e-08, + "loss": 0.78201866, + "num_input_tokens_seen": 328990830, + "step": 15251, + "time_per_iteration": 2.7207159996032715 + }, + { + "auxiliary_loss_clip": 0.01133038, + "auxiliary_loss_mlp": 0.01100822, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00044799, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 2.495190419037789, + "language_loss": 0.79896879, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82130742, + "num_input_tokens_seen": 329008345, + "step": 15252, + "time_per_iteration": 2.719905376434326 + }, + { + "auxiliary_loss_clip": 0.0116416, + "auxiliary_loss_mlp": 0.01100441, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00049555, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.7216493428323736, + "language_loss": 0.77269876, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79534471, + "num_input_tokens_seen": 329027440, + "step": 15253, + "time_per_iteration": 2.613678455352783 + }, + { + "auxiliary_loss_clip": 0.01147684, + "auxiliary_loss_mlp": 0.01100813, + "balance_loss_clip": 1.00193501, + "balance_loss_mlp": 1.00048614, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 2.5255173355230744, + "language_loss": 0.73337585, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75586081, + "num_input_tokens_seen": 329046445, + "step": 15254, + "time_per_iteration": 2.5824763774871826 + }, + { + "auxiliary_loss_clip": 0.0111414, + "auxiliary_loss_mlp": 0.01100089, + "balance_loss_clip": 1.00161743, + "balance_loss_mlp": 1.00047803, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 2.304957131705181, + "language_loss": 0.79456854, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81671077, + "num_input_tokens_seen": 329065555, + "step": 15255, + "time_per_iteration": 2.691692352294922 + }, + { + "auxiliary_loss_clip": 0.0114749, + "auxiliary_loss_mlp": 0.01100744, + "balance_loss_clip": 1.00175381, + "balance_loss_mlp": 1.00056016, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 1.9939469784551371, + "language_loss": 0.68474096, + "learning_rate": 7.136846709927047e-08, + "loss": 0.7072233, + "num_input_tokens_seen": 329087515, + "step": 15256, + "time_per_iteration": 2.7169597148895264 + }, + { + "auxiliary_loss_clip": 0.01149164, + "auxiliary_loss_mlp": 0.01099765, + "balance_loss_clip": 1.00188971, + "balance_loss_mlp": 1.00053573, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 1.7452417759747103, + "language_loss": 0.83990467, + "learning_rate": 7.126539181842561e-08, + "loss": 0.86239392, + "num_input_tokens_seen": 329106820, + "step": 15257, + "time_per_iteration": 2.6044857501983643 + }, + { + "auxiliary_loss_clip": 0.01134691, + "auxiliary_loss_mlp": 0.01100326, + "balance_loss_clip": 1.00179851, + "balance_loss_mlp": 1.00057197, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.739849811160038, + "language_loss": 0.77495575, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79730588, + "num_input_tokens_seen": 329126515, + "step": 15258, + "time_per_iteration": 2.6725285053253174 + }, + { + "auxiliary_loss_clip": 0.01148034, + "auxiliary_loss_mlp": 0.01100368, + "balance_loss_clip": 1.00205362, + "balance_loss_mlp": 1.00051832, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 2.101338717803871, + "language_loss": 0.78598541, + "learning_rate": 7.105946067406999e-08, + "loss": 0.80846947, + "num_input_tokens_seen": 329142660, + "step": 15259, + "time_per_iteration": 2.593365430831909 + }, + { + "auxiliary_loss_clip": 0.01099971, + "auxiliary_loss_mlp": 0.01100125, + "balance_loss_clip": 1.00170696, + "balance_loss_mlp": 1.00051391, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.542402467947649, + "language_loss": 0.76358604, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78558695, + "num_input_tokens_seen": 329162575, + "step": 15260, + "time_per_iteration": 2.725210666656494 + }, + { + "auxiliary_loss_clip": 0.01102666, + "auxiliary_loss_mlp": 0.01099506, + "balance_loss_clip": 1.00161505, + "balance_loss_mlp": 1.00041914, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.6760910116198657, + "language_loss": 0.61303163, + "learning_rate": 7.085382211218637e-08, + "loss": 0.6350534, + "num_input_tokens_seen": 329182090, + "step": 15261, + "time_per_iteration": 2.740475654602051 + }, + { + "auxiliary_loss_clip": 0.01132716, + "auxiliary_loss_mlp": 0.01100677, + "balance_loss_clip": 1.00174499, + "balance_loss_mlp": 1.00058913, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.8530306118123234, + "language_loss": 0.73616403, + "learning_rate": 7.075111255942002e-08, + "loss": 0.75849795, + "num_input_tokens_seen": 329196535, + "step": 15262, + "time_per_iteration": 2.5911800861358643 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01100486, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.00054061, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.9542328032976348, + "language_loss": 0.77618903, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79883724, + "num_input_tokens_seen": 329215135, + "step": 15263, + "time_per_iteration": 3.989061117172241 + }, + { + "auxiliary_loss_clip": 0.01164396, + "auxiliary_loss_mlp": 0.01101189, + "balance_loss_clip": 1.00197434, + "balance_loss_mlp": 1.00052929, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 1.6567008483532548, + "language_loss": 0.7575897, + "learning_rate": 7.054591292971324e-08, + "loss": 0.78024554, + "num_input_tokens_seen": 329235150, + "step": 15264, + "time_per_iteration": 2.5939102172851562 + }, + { + "auxiliary_loss_clip": 0.01130843, + "auxiliary_loss_mlp": 0.01100532, + "balance_loss_clip": 1.00187683, + "balance_loss_mlp": 1.00058722, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 1.7301482581532335, + "language_loss": 0.83171344, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85402715, + "num_input_tokens_seen": 329254365, + "step": 15265, + "time_per_iteration": 2.6350772380828857 + }, + { + "auxiliary_loss_clip": 0.01164451, + "auxiliary_loss_mlp": 0.01101633, + "balance_loss_clip": 1.00198269, + "balance_loss_mlp": 1.00063872, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.6366878175059436, + "language_loss": 0.73179454, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75445539, + "num_input_tokens_seen": 329274385, + "step": 15266, + "time_per_iteration": 4.0504302978515625 + }, + { + "auxiliary_loss_clip": 0.01164113, + "auxiliary_loss_mlp": 0.01100557, + "balance_loss_clip": 1.00180817, + "balance_loss_mlp": 1.0003736, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.55614275836273, + "language_loss": 0.77862853, + "learning_rate": 7.023866223305486e-08, + "loss": 0.80127525, + "num_input_tokens_seen": 329292160, + "step": 15267, + "time_per_iteration": 2.547750234603882 + }, + { + "auxiliary_loss_clip": 0.01141584, + "auxiliary_loss_mlp": 0.0074538, + "balance_loss_clip": 1.00088882, + "balance_loss_mlp": 1.00016022, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7349920679480912, + "language_loss": 0.56260401, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58147365, + "num_input_tokens_seen": 329351870, + "step": 15268, + "time_per_iteration": 3.224452495574951 + }, + { + "auxiliary_loss_clip": 0.01164152, + "auxiliary_loss_mlp": 0.00747224, + "balance_loss_clip": 1.00190377, + "balance_loss_mlp": 1.00048923, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 1.6931169860857447, + "language_loss": 0.7648201, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78393382, + "num_input_tokens_seen": 329370930, + "step": 15269, + "time_per_iteration": 2.572293281555176 + }, + { + "auxiliary_loss_clip": 0.01115675, + "auxiliary_loss_mlp": 0.01100019, + "balance_loss_clip": 1.00167143, + "balance_loss_mlp": 1.00045562, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 1.661057208876028, + "language_loss": 0.72365844, + "learning_rate": 6.993207012706936e-08, + "loss": 0.7458154, + "num_input_tokens_seen": 329391275, + "step": 15270, + "time_per_iteration": 2.865459680557251 + }, + { + "auxiliary_loss_clip": 0.01163955, + "auxiliary_loss_mlp": 0.01099825, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.00059497, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.5364883963265359, + "language_loss": 0.80025768, + "learning_rate": 6.98300191299821e-08, + "loss": 0.82289541, + "num_input_tokens_seen": 329412775, + "step": 15271, + "time_per_iteration": 2.6396257877349854 + }, + { + "auxiliary_loss_clip": 0.01118721, + "auxiliary_loss_mlp": 0.01100528, + "balance_loss_clip": 1.00175059, + "balance_loss_mlp": 1.00058317, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 1.9331757909738658, + "language_loss": 0.72768837, + "learning_rate": 6.972804132513355e-08, + "loss": 0.74988079, + "num_input_tokens_seen": 329432440, + "step": 15272, + "time_per_iteration": 2.730604887008667 + }, + { + "auxiliary_loss_clip": 0.01131555, + "auxiliary_loss_mlp": 0.01099922, + "balance_loss_clip": 1.0018661, + "balance_loss_mlp": 1.00073946, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 2.217108535736839, + "language_loss": 0.72761303, + "learning_rate": 6.962613671639105e-08, + "loss": 0.74992776, + "num_input_tokens_seen": 329450605, + "step": 15273, + "time_per_iteration": 2.7302405834198 + }, + { + "auxiliary_loss_clip": 0.01113505, + "auxiliary_loss_mlp": 0.01098685, + "balance_loss_clip": 1.00162852, + "balance_loss_mlp": 1.00045693, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.8497450478203727, + "language_loss": 0.74212712, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76424903, + "num_input_tokens_seen": 329470550, + "step": 15274, + "time_per_iteration": 2.75150728225708 + }, + { + "auxiliary_loss_clip": 0.01149636, + "auxiliary_loss_mlp": 0.01100229, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00066531, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.9030644625698236, + "language_loss": 0.68773127, + "learning_rate": 6.942254710267902e-08, + "loss": 0.71022993, + "num_input_tokens_seen": 329489765, + "step": 15275, + "time_per_iteration": 2.6149284839630127 + }, + { + "auxiliary_loss_clip": 0.01147251, + "auxiliary_loss_mlp": 0.01100379, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00048172, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 4.661948725915319, + "language_loss": 0.72183561, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74431187, + "num_input_tokens_seen": 329507040, + "step": 15276, + "time_per_iteration": 2.5504090785980225 + }, + { + "auxiliary_loss_clip": 0.01130936, + "auxiliary_loss_mlp": 0.01100266, + "balance_loss_clip": 1.00186372, + "balance_loss_mlp": 1.00055909, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 1.6734434509705287, + "language_loss": 0.7337507, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75606275, + "num_input_tokens_seen": 329525540, + "step": 15277, + "time_per_iteration": 2.653193473815918 + }, + { + "auxiliary_loss_clip": 0.01111806, + "auxiliary_loss_mlp": 0.01074371, + "balance_loss_clip": 1.00062537, + "balance_loss_mlp": 1.00036585, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7205381566761506, + "language_loss": 0.59228653, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61414826, + "num_input_tokens_seen": 329592905, + "step": 15278, + "time_per_iteration": 3.3405582904815674 + }, + { + "auxiliary_loss_clip": 0.01113146, + "auxiliary_loss_mlp": 0.01099378, + "balance_loss_clip": 1.00150335, + "balance_loss_mlp": 1.00043488, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 2.05938439513367, + "language_loss": 0.645051, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66717625, + "num_input_tokens_seen": 329610150, + "step": 15279, + "time_per_iteration": 2.6181516647338867 + }, + { + "auxiliary_loss_clip": 0.01158043, + "auxiliary_loss_mlp": 0.00745276, + "balance_loss_clip": 1.00071836, + "balance_loss_mlp": 1.0001955, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 0.8552894792085584, + "language_loss": 0.60200548, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62103868, + "num_input_tokens_seen": 329673650, + "step": 15280, + "time_per_iteration": 3.1541171073913574 + }, + { + "auxiliary_loss_clip": 0.01132736, + "auxiliary_loss_mlp": 0.0110032, + "balance_loss_clip": 1.00168407, + "balance_loss_mlp": 1.00037503, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 2.2265367643966316, + "language_loss": 0.69663596, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71896648, + "num_input_tokens_seen": 329692520, + "step": 15281, + "time_per_iteration": 2.613748073577881 + }, + { + "auxiliary_loss_clip": 0.01130177, + "auxiliary_loss_mlp": 0.01100047, + "balance_loss_clip": 1.00170469, + "balance_loss_mlp": 1.00034022, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 5.019657810314166, + "language_loss": 0.84459817, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86690044, + "num_input_tokens_seen": 329713750, + "step": 15282, + "time_per_iteration": 2.67166805267334 + }, + { + "auxiliary_loss_clip": 0.01132698, + "auxiliary_loss_mlp": 0.01101038, + "balance_loss_clip": 1.00184238, + "balance_loss_mlp": 1.00061631, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.8093935041924665, + "language_loss": 0.60250688, + "learning_rate": 6.861111726356194e-08, + "loss": 0.62484419, + "num_input_tokens_seen": 329730960, + "step": 15283, + "time_per_iteration": 4.691791772842407 + }, + { + "auxiliary_loss_clip": 0.01147593, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00043464, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.6523247867130186, + "language_loss": 0.65709054, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67603993, + "num_input_tokens_seen": 329750975, + "step": 15284, + "time_per_iteration": 2.6507368087768555 + }, + { + "auxiliary_loss_clip": 0.01164018, + "auxiliary_loss_mlp": 0.01100603, + "balance_loss_clip": 1.00182652, + "balance_loss_mlp": 1.00051463, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 1.8940011062223625, + "language_loss": 0.73975611, + "learning_rate": 6.840899211156292e-08, + "loss": 0.7624023, + "num_input_tokens_seen": 329769645, + "step": 15285, + "time_per_iteration": 4.035360097885132 + }, + { + "auxiliary_loss_clip": 0.01163899, + "auxiliary_loss_mlp": 0.01099686, + "balance_loss_clip": 1.00178039, + "balance_loss_mlp": 1.00040829, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 2.076933386525186, + "language_loss": 0.71628666, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73892248, + "num_input_tokens_seen": 329788185, + "step": 15286, + "time_per_iteration": 2.5953776836395264 + }, + { + "auxiliary_loss_clip": 0.01164202, + "auxiliary_loss_mlp": 0.01100589, + "balance_loss_clip": 1.00191307, + "balance_loss_mlp": 1.00059617, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 2.0184846847248537, + "language_loss": 0.73691344, + "learning_rate": 6.820715994405945e-08, + "loss": 0.7595613, + "num_input_tokens_seen": 329806780, + "step": 15287, + "time_per_iteration": 2.5932929515838623 + }, + { + "auxiliary_loss_clip": 0.01164373, + "auxiliary_loss_mlp": 0.01100738, + "balance_loss_clip": 1.00202262, + "balance_loss_mlp": 1.00050712, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 1.7894291835811686, + "language_loss": 0.65193439, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67458552, + "num_input_tokens_seen": 329826350, + "step": 15288, + "time_per_iteration": 2.6038575172424316 + }, + { + "auxiliary_loss_clip": 0.01164305, + "auxiliary_loss_mlp": 0.01100062, + "balance_loss_clip": 1.00206923, + "balance_loss_mlp": 1.0005939, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 2.020876935464647, + "language_loss": 0.71506411, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73770779, + "num_input_tokens_seen": 329846160, + "step": 15289, + "time_per_iteration": 2.6441800594329834 + }, + { + "auxiliary_loss_clip": 0.01118529, + "auxiliary_loss_mlp": 0.01100736, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00059986, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 2.569440136456813, + "language_loss": 0.74461269, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76680535, + "num_input_tokens_seen": 329862020, + "step": 15290, + "time_per_iteration": 2.753042697906494 + }, + { + "auxiliary_loss_clip": 0.01102093, + "auxiliary_loss_mlp": 0.01099415, + "balance_loss_clip": 1.00168836, + "balance_loss_mlp": 1.00042331, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 2.387607223814381, + "language_loss": 0.72118855, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74320364, + "num_input_tokens_seen": 329880185, + "step": 15291, + "time_per_iteration": 2.7933380603790283 + }, + { + "auxiliary_loss_clip": 0.01132497, + "auxiliary_loss_mlp": 0.0109942, + "balance_loss_clip": 1.0017736, + "balance_loss_mlp": 1.0004282, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.6915763777090114, + "language_loss": 0.70841753, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73073667, + "num_input_tokens_seen": 329900255, + "step": 15292, + "time_per_iteration": 2.7151100635528564 + }, + { + "auxiliary_loss_clip": 0.01132839, + "auxiliary_loss_mlp": 0.01100483, + "balance_loss_clip": 1.00181305, + "balance_loss_mlp": 1.00053763, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 1.9705860430073565, + "language_loss": 0.72809893, + "learning_rate": 6.760342165443988e-08, + "loss": 0.75043213, + "num_input_tokens_seen": 329919095, + "step": 15293, + "time_per_iteration": 2.690246343612671 + }, + { + "auxiliary_loss_clip": 0.01164227, + "auxiliary_loss_mlp": 0.01100516, + "balance_loss_clip": 1.00199747, + "balance_loss_mlp": 1.00047588, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 1.9738268546055637, + "language_loss": 0.78217655, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80482399, + "num_input_tokens_seen": 329936505, + "step": 15294, + "time_per_iteration": 2.565666675567627 + }, + { + "auxiliary_loss_clip": 0.01133911, + "auxiliary_loss_mlp": 0.01100747, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00051641, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 1.6311370882844547, + "language_loss": 0.77332258, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79566914, + "num_input_tokens_seen": 329956795, + "step": 15295, + "time_per_iteration": 2.6883723735809326 + }, + { + "auxiliary_loss_clip": 0.01164092, + "auxiliary_loss_mlp": 0.01099544, + "balance_loss_clip": 1.002002, + "balance_loss_mlp": 1.00050473, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 4.145402548808905, + "language_loss": 0.71224666, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73488295, + "num_input_tokens_seen": 329977195, + "step": 15296, + "time_per_iteration": 2.700514316558838 + }, + { + "auxiliary_loss_clip": 0.01164024, + "auxiliary_loss_mlp": 0.01100114, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00064552, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.1989012876687, + "language_loss": 0.75349605, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77613747, + "num_input_tokens_seen": 329992095, + "step": 15297, + "time_per_iteration": 2.5265493392944336 + }, + { + "auxiliary_loss_clip": 0.01147775, + "auxiliary_loss_mlp": 0.00747347, + "balance_loss_clip": 1.00183523, + "balance_loss_mlp": 1.00037694, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 1.6839817737969112, + "language_loss": 0.73860359, + "learning_rate": 6.710232148647676e-08, + "loss": 0.75755477, + "num_input_tokens_seen": 330011490, + "step": 15298, + "time_per_iteration": 2.687899351119995 + }, + { + "auxiliary_loss_clip": 0.01131062, + "auxiliary_loss_mlp": 0.01100907, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.00058031, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.0686190112137113, + "language_loss": 0.79255348, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81487322, + "num_input_tokens_seen": 330027885, + "step": 15299, + "time_per_iteration": 2.6330132484436035 + }, + { + "auxiliary_loss_clip": 0.01117694, + "auxiliary_loss_mlp": 0.01100103, + "balance_loss_clip": 1.00182199, + "balance_loss_mlp": 1.00044405, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 2.262962894674223, + "language_loss": 0.64113927, + "learning_rate": 6.690239446242385e-08, + "loss": 0.6633172, + "num_input_tokens_seen": 330046230, + "step": 15300, + "time_per_iteration": 2.6552727222442627 + }, + { + "auxiliary_loss_clip": 0.01130374, + "auxiliary_loss_mlp": 0.00747106, + "balance_loss_clip": 1.00170135, + "balance_loss_mlp": 1.00038278, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 1.8261764701454901, + "language_loss": 0.69655347, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71532834, + "num_input_tokens_seen": 330065535, + "step": 15301, + "time_per_iteration": 4.192782878875732 + }, + { + "auxiliary_loss_clip": 0.01149551, + "auxiliary_loss_mlp": 0.01101518, + "balance_loss_clip": 1.0020268, + "balance_loss_mlp": 1.00052416, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 2.150550049453793, + "language_loss": 0.71072167, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73323238, + "num_input_tokens_seen": 330082920, + "step": 15302, + "time_per_iteration": 2.5779099464416504 + }, + { + "auxiliary_loss_clip": 0.01164262, + "auxiliary_loss_mlp": 0.01101437, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00053811, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.694021213343897, + "language_loss": 0.7674129, + "learning_rate": 6.660305371021579e-08, + "loss": 0.79006982, + "num_input_tokens_seen": 330101165, + "step": 15303, + "time_per_iteration": 4.079408884048462 + }, + { + "auxiliary_loss_clip": 0.01132524, + "auxiliary_loss_mlp": 0.01100922, + "balance_loss_clip": 1.00197506, + "balance_loss_mlp": 1.00059521, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 2.298507495915942, + "language_loss": 0.88118958, + "learning_rate": 6.650342008365006e-08, + "loss": 0.90352404, + "num_input_tokens_seen": 330118775, + "step": 15304, + "time_per_iteration": 2.7391529083251953 + }, + { + "auxiliary_loss_clip": 0.01085481, + "auxiliary_loss_mlp": 0.01101464, + "balance_loss_clip": 1.0017004, + "balance_loss_mlp": 1.00051761, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 1.9777060645944409, + "language_loss": 0.77292001, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79478949, + "num_input_tokens_seen": 330135570, + "step": 15305, + "time_per_iteration": 2.8366572856903076 + }, + { + "auxiliary_loss_clip": 0.01148038, + "auxiliary_loss_mlp": 0.01100269, + "balance_loss_clip": 1.00191534, + "balance_loss_mlp": 1.00061035, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 6.102374254576663, + "language_loss": 0.81308717, + "learning_rate": 6.630437278944501e-08, + "loss": 0.83557022, + "num_input_tokens_seen": 330152840, + "step": 15306, + "time_per_iteration": 3.076124668121338 + }, + { + "auxiliary_loss_clip": 0.01116012, + "auxiliary_loss_mlp": 0.01099988, + "balance_loss_clip": 1.00155985, + "balance_loss_mlp": 1.00051939, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 2.156531280241005, + "language_loss": 0.72240353, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74456352, + "num_input_tokens_seen": 330168605, + "step": 15307, + "time_per_iteration": 2.7241017818450928 + }, + { + "auxiliary_loss_clip": 0.01147627, + "auxiliary_loss_mlp": 0.01101334, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00043571, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 1.8863893835390964, + "language_loss": 0.7863211, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80881071, + "num_input_tokens_seen": 330186160, + "step": 15308, + "time_per_iteration": 2.6512088775634766 + }, + { + "auxiliary_loss_clip": 0.01132763, + "auxiliary_loss_mlp": 0.01100588, + "balance_loss_clip": 1.00173831, + "balance_loss_mlp": 1.00045204, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 2.150757880370249, + "language_loss": 0.78023005, + "learning_rate": 6.600635180204484e-08, + "loss": 0.80256349, + "num_input_tokens_seen": 330201780, + "step": 15309, + "time_per_iteration": 2.6587462425231934 + }, + { + "auxiliary_loss_clip": 0.01101807, + "auxiliary_loss_mlp": 0.01100317, + "balance_loss_clip": 1.00185394, + "balance_loss_mlp": 1.00051475, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 1.8753722486048792, + "language_loss": 0.66523015, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68725133, + "num_input_tokens_seen": 330219165, + "step": 15310, + "time_per_iteration": 2.6863884925842285 + }, + { + "auxiliary_loss_clip": 0.01084622, + "auxiliary_loss_mlp": 0.01100489, + "balance_loss_clip": 1.00164771, + "balance_loss_mlp": 1.00044847, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.6620747741364463, + "language_loss": 0.66205859, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68390971, + "num_input_tokens_seen": 330238975, + "step": 15311, + "time_per_iteration": 2.8245294094085693 + }, + { + "auxiliary_loss_clip": 0.0114789, + "auxiliary_loss_mlp": 0.01099998, + "balance_loss_clip": 1.00175858, + "balance_loss_mlp": 1.00052965, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.7311166380326872, + "language_loss": 0.76160169, + "learning_rate": 6.570899084972503e-08, + "loss": 0.78408051, + "num_input_tokens_seen": 330259755, + "step": 15312, + "time_per_iteration": 2.685072422027588 + }, + { + "auxiliary_loss_clip": 0.01147233, + "auxiliary_loss_mlp": 0.01099762, + "balance_loss_clip": 1.00177443, + "balance_loss_mlp": 1.00058019, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.742560923041089, + "language_loss": 0.79437286, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81684279, + "num_input_tokens_seen": 330277660, + "step": 15313, + "time_per_iteration": 2.6071293354034424 + }, + { + "auxiliary_loss_clip": 0.01149078, + "auxiliary_loss_mlp": 0.011, + "balance_loss_clip": 1.00187445, + "balance_loss_mlp": 1.00048447, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 1.9556092271608603, + "language_loss": 0.77940732, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80189812, + "num_input_tokens_seen": 330295455, + "step": 15314, + "time_per_iteration": 2.598747730255127 + }, + { + "auxiliary_loss_clip": 0.01132281, + "auxiliary_loss_mlp": 0.01100677, + "balance_loss_clip": 1.00171471, + "balance_loss_mlp": 1.00044632, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 1.941081841462759, + "language_loss": 0.79589176, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81822127, + "num_input_tokens_seen": 330315310, + "step": 15315, + "time_per_iteration": 2.685168981552124 + }, + { + "auxiliary_loss_clip": 0.01130515, + "auxiliary_loss_mlp": 0.01101379, + "balance_loss_clip": 1.00183058, + "balance_loss_mlp": 1.00048077, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.9259866428703247, + "language_loss": 0.76353097, + "learning_rate": 6.531353647657156e-08, + "loss": 0.78584993, + "num_input_tokens_seen": 330333260, + "step": 15316, + "time_per_iteration": 2.6144773960113525 + }, + { + "auxiliary_loss_clip": 0.01164241, + "auxiliary_loss_mlp": 0.01100193, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00058198, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.724278804874941, + "language_loss": 0.69658726, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71923161, + "num_input_tokens_seen": 330352465, + "step": 15317, + "time_per_iteration": 2.549936532974243 + }, + { + "auxiliary_loss_clip": 0.01147539, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_clip": 1.00186694, + "balance_loss_mlp": 1.00058579, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 2.0359451115234353, + "language_loss": 0.83710086, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85959488, + "num_input_tokens_seen": 330372685, + "step": 15318, + "time_per_iteration": 2.6711716651916504 + }, + { + "auxiliary_loss_clip": 0.01130549, + "auxiliary_loss_mlp": 0.01099576, + "balance_loss_clip": 1.00179088, + "balance_loss_mlp": 1.00048947, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 1.9458682557990508, + "language_loss": 0.85741723, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87971848, + "num_input_tokens_seen": 330388860, + "step": 15319, + "time_per_iteration": 2.637880802154541 + }, + { + "auxiliary_loss_clip": 0.01158004, + "auxiliary_loss_mlp": 0.01073884, + "balance_loss_clip": 1.00075364, + "balance_loss_mlp": 1.00026059, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7786420218348146, + "language_loss": 0.56221557, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58453441, + "num_input_tokens_seen": 330448735, + "step": 15320, + "time_per_iteration": 4.870944261550903 + }, + { + "auxiliary_loss_clip": 0.01099628, + "auxiliary_loss_mlp": 0.01102422, + "balance_loss_clip": 1.00164592, + "balance_loss_mlp": 1.00066447, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 2.1089067662681718, + "language_loss": 0.64265949, + "learning_rate": 6.482086921695384e-08, + "loss": 0.66468, + "num_input_tokens_seen": 330465600, + "step": 15321, + "time_per_iteration": 2.695256233215332 + }, + { + "auxiliary_loss_clip": 0.01117801, + "auxiliary_loss_mlp": 0.01099925, + "balance_loss_clip": 1.00181699, + "balance_loss_mlp": 1.00055265, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.3958228524136325, + "language_loss": 0.71452874, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73670602, + "num_input_tokens_seen": 330485770, + "step": 15322, + "time_per_iteration": 2.727567672729492 + }, + { + "auxiliary_loss_clip": 0.01097545, + "auxiliary_loss_mlp": 0.01100626, + "balance_loss_clip": 1.00155747, + "balance_loss_mlp": 1.00058603, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 1.790710941138034, + "language_loss": 0.69540638, + "learning_rate": 6.462431596227725e-08, + "loss": 0.71738815, + "num_input_tokens_seen": 330504255, + "step": 15323, + "time_per_iteration": 4.125779390335083 + }, + { + "auxiliary_loss_clip": 0.01132931, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_clip": 1.00170875, + "balance_loss_mlp": 1.00060058, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 1.7376893116849021, + "language_loss": 0.74820203, + "learning_rate": 6.452614941753597e-08, + "loss": 0.77054727, + "num_input_tokens_seen": 330520705, + "step": 15324, + "time_per_iteration": 2.627760887145996 + }, + { + "auxiliary_loss_clip": 0.01147519, + "auxiliary_loss_mlp": 0.0110093, + "balance_loss_clip": 1.00184345, + "balance_loss_mlp": 1.00069892, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 4.034825703452493, + "language_loss": 0.71066272, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73314714, + "num_input_tokens_seen": 330539245, + "step": 15325, + "time_per_iteration": 2.6287031173706055 + }, + { + "auxiliary_loss_clip": 0.01133027, + "auxiliary_loss_mlp": 0.0109938, + "balance_loss_clip": 1.00169277, + "balance_loss_mlp": 1.00057971, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.776734481033565, + "language_loss": 0.78389281, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80621684, + "num_input_tokens_seen": 330561815, + "step": 15326, + "time_per_iteration": 2.7548844814300537 + }, + { + "auxiliary_loss_clip": 0.01147579, + "auxiliary_loss_mlp": 0.01100832, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00050545, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 2.524898704152744, + "language_loss": 0.71915996, + "learning_rate": 6.42320901583635e-08, + "loss": 0.74164408, + "num_input_tokens_seen": 330579760, + "step": 15327, + "time_per_iteration": 2.582414388656616 + }, + { + "auxiliary_loss_clip": 0.01147166, + "auxiliary_loss_mlp": 0.01101549, + "balance_loss_clip": 1.00193036, + "balance_loss_mlp": 1.00065041, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 2.1011527707290494, + "language_loss": 0.77664751, + "learning_rate": 6.413421720937906e-08, + "loss": 0.79913467, + "num_input_tokens_seen": 330598545, + "step": 15328, + "time_per_iteration": 2.7658486366271973 + }, + { + "auxiliary_loss_clip": 0.01131976, + "auxiliary_loss_mlp": 0.01099253, + "balance_loss_clip": 1.00168514, + "balance_loss_mlp": 1.00050044, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 3.7825913382563714, + "language_loss": 0.71560454, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73791683, + "num_input_tokens_seen": 330616700, + "step": 15329, + "time_per_iteration": 2.709892749786377 + }, + { + "auxiliary_loss_clip": 0.01147379, + "auxiliary_loss_mlp": 0.01099111, + "balance_loss_clip": 1.00177741, + "balance_loss_mlp": 1.00040603, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 4.492507572059546, + "language_loss": 0.8662225, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88868737, + "num_input_tokens_seen": 330633355, + "step": 15330, + "time_per_iteration": 2.593428373336792 + }, + { + "auxiliary_loss_clip": 0.01119741, + "auxiliary_loss_mlp": 0.01100693, + "balance_loss_clip": 1.00189531, + "balance_loss_mlp": 1.00055718, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.1070305392893025, + "language_loss": 0.75759375, + "learning_rate": 6.384103882660397e-08, + "loss": 0.77979803, + "num_input_tokens_seen": 330651470, + "step": 15331, + "time_per_iteration": 2.718735456466675 + }, + { + "auxiliary_loss_clip": 0.01147407, + "auxiliary_loss_mlp": 0.01099945, + "balance_loss_clip": 1.00170207, + "balance_loss_mlp": 1.00047648, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.6484798390873647, + "language_loss": 0.75325608, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77572966, + "num_input_tokens_seen": 330669170, + "step": 15332, + "time_per_iteration": 2.646876573562622 + }, + { + "auxiliary_loss_clip": 0.01099901, + "auxiliary_loss_mlp": 0.01099553, + "balance_loss_clip": 1.00174665, + "balance_loss_mlp": 1.00046611, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 2.1226688081183167, + "language_loss": 0.74825561, + "learning_rate": 6.364595366195358e-08, + "loss": 0.7702502, + "num_input_tokens_seen": 330686635, + "step": 15333, + "time_per_iteration": 2.709735870361328 + }, + { + "auxiliary_loss_clip": 0.01141603, + "auxiliary_loss_mlp": 0.01074237, + "balance_loss_clip": 1.00090933, + "balance_loss_mlp": 1.00023162, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.7992874456110122, + "language_loss": 0.52902228, + "learning_rate": 6.354852121788879e-08, + "loss": 0.55118072, + "num_input_tokens_seen": 330749160, + "step": 15334, + "time_per_iteration": 3.170600175857544 + }, + { + "auxiliary_loss_clip": 0.01132852, + "auxiliary_loss_mlp": 0.01099813, + "balance_loss_clip": 1.00158238, + "balance_loss_mlp": 1.00039268, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 2.164085112572345, + "language_loss": 0.62406218, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64638877, + "num_input_tokens_seen": 330766840, + "step": 15335, + "time_per_iteration": 2.592886209487915 + }, + { + "auxiliary_loss_clip": 0.01103942, + "auxiliary_loss_mlp": 0.01100546, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.00041056, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 1.8913112791317972, + "language_loss": 0.71699047, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73903537, + "num_input_tokens_seen": 330785585, + "step": 15336, + "time_per_iteration": 2.7939369678497314 + }, + { + "auxiliary_loss_clip": 0.01131096, + "auxiliary_loss_mlp": 0.01099559, + "balance_loss_clip": 1.00175822, + "balance_loss_mlp": 1.00056767, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.9609750248436102, + "language_loss": 0.7163353, + "learning_rate": 6.325666448306433e-08, + "loss": 0.73864186, + "num_input_tokens_seen": 330800750, + "step": 15337, + "time_per_iteration": 2.678684949874878 + }, + { + "auxiliary_loss_clip": 0.01143584, + "auxiliary_loss_mlp": 0.01074046, + "balance_loss_clip": 1.0007385, + "balance_loss_mlp": 1.00004089, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8840961693582906, + "language_loss": 0.65311402, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67529035, + "num_input_tokens_seen": 330863640, + "step": 15338, + "time_per_iteration": 3.138404130935669 + }, + { + "auxiliary_loss_clip": 0.01148671, + "auxiliary_loss_mlp": 0.01099959, + "balance_loss_clip": 1.00175941, + "balance_loss_mlp": 1.0004909, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 2.746793496840049, + "language_loss": 0.67278993, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69527626, + "num_input_tokens_seen": 330884675, + "step": 15339, + "time_per_iteration": 4.16744327545166 + }, + { + "auxiliary_loss_clip": 0.0116429, + "auxiliary_loss_mlp": 0.01100413, + "balance_loss_clip": 1.00193882, + "balance_loss_mlp": 1.00046766, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 1.6896243158129907, + "language_loss": 0.72110951, + "learning_rate": 6.296546872173513e-08, + "loss": 0.74375653, + "num_input_tokens_seen": 330904125, + "step": 15340, + "time_per_iteration": 2.645350217819214 + }, + { + "auxiliary_loss_clip": 0.01116065, + "auxiliary_loss_mlp": 0.01101177, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00066006, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.6109696534843478, + "language_loss": 0.70256984, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72474229, + "num_input_tokens_seen": 330925140, + "step": 15341, + "time_per_iteration": 4.338291883468628 + }, + { + "auxiliary_loss_clip": 0.01098085, + "auxiliary_loss_mlp": 0.01099483, + "balance_loss_clip": 1.00161767, + "balance_loss_mlp": 1.00053895, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.612498242593529, + "language_loss": 0.66924083, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69121659, + "num_input_tokens_seen": 330946625, + "step": 15342, + "time_per_iteration": 2.8558053970336914 + }, + { + "auxiliary_loss_clip": 0.01098828, + "auxiliary_loss_mlp": 0.01100407, + "balance_loss_clip": 1.00157332, + "balance_loss_mlp": 1.00050998, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 2.0546303108823616, + "language_loss": 0.69440311, + "learning_rate": 6.26749340332815e-08, + "loss": 0.7163955, + "num_input_tokens_seen": 330967795, + "step": 15343, + "time_per_iteration": 2.9722554683685303 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01074324, + "balance_loss_clip": 1.00075829, + "balance_loss_mlp": 1.00031912, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7234110907493125, + "language_loss": 0.52005094, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54207009, + "num_input_tokens_seen": 331040850, + "step": 15344, + "time_per_iteration": 3.435955047607422 + }, + { + "auxiliary_loss_clip": 0.01164099, + "auxiliary_loss_mlp": 0.01099436, + "balance_loss_clip": 1.00200534, + "balance_loss_mlp": 1.00044477, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.5272876127268837, + "language_loss": 0.70191526, + "learning_rate": 6.248161155266162e-08, + "loss": 0.7245506, + "num_input_tokens_seen": 331060595, + "step": 15345, + "time_per_iteration": 2.611659049987793 + }, + { + "auxiliary_loss_clip": 0.01131418, + "auxiliary_loss_mlp": 0.01100141, + "balance_loss_clip": 1.00180674, + "balance_loss_mlp": 1.00072026, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 3.058554486128395, + "language_loss": 0.77476239, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79707795, + "num_input_tokens_seen": 331080195, + "step": 15346, + "time_per_iteration": 2.6820521354675293 + }, + { + "auxiliary_loss_clip": 0.01131049, + "auxiliary_loss_mlp": 0.01101691, + "balance_loss_clip": 1.00183356, + "balance_loss_mlp": 1.00055432, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 2.189254112037407, + "language_loss": 0.76461637, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78694373, + "num_input_tokens_seen": 331097645, + "step": 15347, + "time_per_iteration": 2.720144748687744 + }, + { + "auxiliary_loss_clip": 0.01147242, + "auxiliary_loss_mlp": 0.01099078, + "balance_loss_clip": 1.00179589, + "balance_loss_mlp": 1.00046837, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.7692789782915097, + "language_loss": 0.76741123, + "learning_rate": 6.219217887256367e-08, + "loss": 0.78987443, + "num_input_tokens_seen": 331116830, + "step": 15348, + "time_per_iteration": 2.702667713165283 + }, + { + "auxiliary_loss_clip": 0.01132337, + "auxiliary_loss_mlp": 0.01100767, + "balance_loss_clip": 1.00178027, + "balance_loss_mlp": 1.00053573, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 2.305686947551207, + "language_loss": 0.67655075, + "learning_rate": 6.209584827138959e-08, + "loss": 0.6988818, + "num_input_tokens_seen": 331137235, + "step": 15349, + "time_per_iteration": 2.9021189212799072 + }, + { + "auxiliary_loss_clip": 0.01117868, + "auxiliary_loss_mlp": 0.01101123, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.00041485, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 2.355672599356939, + "language_loss": 0.8691982, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89138812, + "num_input_tokens_seen": 331153155, + "step": 15350, + "time_per_iteration": 2.7780187129974365 + }, + { + "auxiliary_loss_clip": 0.01126601, + "auxiliary_loss_mlp": 0.01074204, + "balance_loss_clip": 1.00075316, + "balance_loss_mlp": 1.0001992, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.8790693488917205, + "language_loss": 0.60390329, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62591136, + "num_input_tokens_seen": 331214895, + "step": 15351, + "time_per_iteration": 3.23994779586792 + }, + { + "auxiliary_loss_clip": 0.01130972, + "auxiliary_loss_mlp": 0.01100956, + "balance_loss_clip": 1.00172281, + "balance_loss_mlp": 1.00043869, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 2.1422781658771926, + "language_loss": 0.77512556, + "learning_rate": 6.180729739558233e-08, + "loss": 0.79744482, + "num_input_tokens_seen": 331232185, + "step": 15352, + "time_per_iteration": 2.781108856201172 + }, + { + "auxiliary_loss_clip": 0.01117371, + "auxiliary_loss_mlp": 0.01101642, + "balance_loss_clip": 1.00172448, + "balance_loss_mlp": 1.00045681, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 22.183034606688043, + "language_loss": 0.59454429, + "learning_rate": 6.171126075837585e-08, + "loss": 0.61673439, + "num_input_tokens_seen": 331251065, + "step": 15353, + "time_per_iteration": 2.735684871673584 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01100298, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00049639, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.720747979672479, + "language_loss": 0.74833214, + "learning_rate": 6.161529762127293e-08, + "loss": 0.77067757, + "num_input_tokens_seen": 331269110, + "step": 15354, + "time_per_iteration": 2.7246479988098145 + }, + { + "auxiliary_loss_clip": 0.01164277, + "auxiliary_loss_mlp": 0.01100224, + "balance_loss_clip": 1.00188136, + "balance_loss_mlp": 1.00051761, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 2.152265329291056, + "language_loss": 0.65043616, + "learning_rate": 6.1519407987912e-08, + "loss": 0.67308122, + "num_input_tokens_seen": 331286555, + "step": 15355, + "time_per_iteration": 2.6515190601348877 + }, + { + "auxiliary_loss_clip": 0.01133681, + "auxiliary_loss_mlp": 0.01100316, + "balance_loss_clip": 1.00180686, + "balance_loss_mlp": 1.00056195, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.5429646746845278, + "language_loss": 0.74258745, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76492745, + "num_input_tokens_seen": 331307660, + "step": 15356, + "time_per_iteration": 2.803402900695801 + }, + { + "auxiliary_loss_clip": 0.01132952, + "auxiliary_loss_mlp": 0.01100757, + "balance_loss_clip": 1.00189269, + "balance_loss_mlp": 1.00057375, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 3.0084907998343806, + "language_loss": 0.60740823, + "learning_rate": 6.132784924695844e-08, + "loss": 0.62974536, + "num_input_tokens_seen": 331324885, + "step": 15357, + "time_per_iteration": 2.726698398590088 + }, + { + "auxiliary_loss_clip": 0.0111679, + "auxiliary_loss_mlp": 0.01100877, + "balance_loss_clip": 1.00170064, + "balance_loss_mlp": 1.00055075, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.4438530631473296, + "language_loss": 0.70035684, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72253352, + "num_input_tokens_seen": 331345885, + "step": 15358, + "time_per_iteration": 4.327995777130127 + }, + { + "auxiliary_loss_clip": 0.01164117, + "auxiliary_loss_mlp": 0.0110056, + "balance_loss_clip": 1.0017972, + "balance_loss_mlp": 1.00042462, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 1.9208291955070707, + "language_loss": 0.72943497, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75208175, + "num_input_tokens_seen": 331364320, + "step": 15359, + "time_per_iteration": 2.7318103313446045 + }, + { + "auxiliary_loss_clip": 0.01068657, + "auxiliary_loss_mlp": 0.01100583, + "balance_loss_clip": 1.00152659, + "balance_loss_mlp": 1.00054312, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 1.9230309171229951, + "language_loss": 0.64497197, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66666436, + "num_input_tokens_seen": 331384135, + "step": 15360, + "time_per_iteration": 2.922506809234619 + }, + { + "auxiliary_loss_clip": 0.01141946, + "auxiliary_loss_mlp": 0.00745325, + "balance_loss_clip": 1.00078869, + "balance_loss_mlp": 1.00019252, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7549551735425692, + "language_loss": 0.55051202, + "learning_rate": 6.094561396976083e-08, + "loss": 0.56938475, + "num_input_tokens_seen": 331440645, + "step": 15361, + "time_per_iteration": 4.638435363769531 + }, + { + "auxiliary_loss_clip": 0.01118047, + "auxiliary_loss_mlp": 0.0110084, + "balance_loss_clip": 1.00166976, + "balance_loss_mlp": 1.00041842, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 1.9594667008182953, + "language_loss": 0.70044148, + "learning_rate": 6.085023896425112e-08, + "loss": 0.72263038, + "num_input_tokens_seen": 331459580, + "step": 15362, + "time_per_iteration": 2.8437399864196777 + }, + { + "auxiliary_loss_clip": 0.01149757, + "auxiliary_loss_mlp": 0.01101629, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00044394, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.9744236357368203, + "language_loss": 0.75578082, + "learning_rate": 6.075493749149463e-08, + "loss": 0.77829468, + "num_input_tokens_seen": 331481560, + "step": 15363, + "time_per_iteration": 2.7179412841796875 + }, + { + "auxiliary_loss_clip": 0.0116408, + "auxiliary_loss_mlp": 0.01100862, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00053596, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 1.938540307356653, + "language_loss": 0.83506989, + "learning_rate": 6.065970955510514e-08, + "loss": 0.8577193, + "num_input_tokens_seen": 331499090, + "step": 15364, + "time_per_iteration": 2.6883809566497803 + }, + { + "auxiliary_loss_clip": 0.01115585, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_clip": 1.00171089, + "balance_loss_mlp": 1.00040364, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.5877399418481541, + "language_loss": 0.68120933, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70337009, + "num_input_tokens_seen": 331519420, + "step": 15365, + "time_per_iteration": 2.796811580657959 + }, + { + "auxiliary_loss_clip": 0.01164109, + "auxiliary_loss_mlp": 0.01100954, + "balance_loss_clip": 1.00193238, + "balance_loss_mlp": 1.00053239, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 2.121803170012301, + "language_loss": 0.62992358, + "learning_rate": 6.046947430586913e-08, + "loss": 0.65257418, + "num_input_tokens_seen": 331538720, + "step": 15366, + "time_per_iteration": 2.7017087936401367 + }, + { + "auxiliary_loss_clip": 0.01115968, + "auxiliary_loss_mlp": 0.01099506, + "balance_loss_clip": 1.00173426, + "balance_loss_mlp": 1.00051427, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 2.037667468643855, + "language_loss": 0.74284554, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76500022, + "num_input_tokens_seen": 331558505, + "step": 15367, + "time_per_iteration": 2.9406917095184326 + }, + { + "auxiliary_loss_clip": 0.01132412, + "auxiliary_loss_mlp": 0.00747179, + "balance_loss_clip": 1.00174093, + "balance_loss_mlp": 1.00040674, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 1.9749771024864462, + "language_loss": 0.64828098, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66707695, + "num_input_tokens_seen": 331578440, + "step": 15368, + "time_per_iteration": 2.7594423294067383 + }, + { + "auxiliary_loss_clip": 0.01149203, + "auxiliary_loss_mlp": 0.01101026, + "balance_loss_clip": 1.00183678, + "balance_loss_mlp": 1.00046098, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 2.0401370607331963, + "language_loss": 0.74375218, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76625448, + "num_input_tokens_seen": 331598945, + "step": 15369, + "time_per_iteration": 2.7266838550567627 + }, + { + "auxiliary_loss_clip": 0.01147702, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_clip": 1.00188613, + "balance_loss_mlp": 1.00049305, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 1.8681853170810336, + "language_loss": 0.76655489, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78905106, + "num_input_tokens_seen": 331616700, + "step": 15370, + "time_per_iteration": 2.860823392868042 + }, + { + "auxiliary_loss_clip": 0.01164235, + "auxiliary_loss_mlp": 0.01100335, + "balance_loss_clip": 1.00187886, + "balance_loss_mlp": 1.00048518, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 3.045088015380143, + "language_loss": 0.66931021, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69195586, + "num_input_tokens_seen": 331635625, + "step": 15371, + "time_per_iteration": 2.7212650775909424 + }, + { + "auxiliary_loss_clip": 0.01128681, + "auxiliary_loss_mlp": 0.01074189, + "balance_loss_clip": 1.00082242, + "balance_loss_mlp": 1.00018406, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7282242522219148, + "language_loss": 0.57688433, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59891301, + "num_input_tokens_seen": 331698595, + "step": 15372, + "time_per_iteration": 3.227999448776245 + }, + { + "auxiliary_loss_clip": 0.01163965, + "auxiliary_loss_mlp": 0.01099639, + "balance_loss_clip": 1.001863, + "balance_loss_mlp": 1.00055242, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 1.9904207720740674, + "language_loss": 0.69830072, + "learning_rate": 5.98059678590237e-08, + "loss": 0.72093678, + "num_input_tokens_seen": 331717975, + "step": 15373, + "time_per_iteration": 2.692002534866333 + }, + { + "auxiliary_loss_clip": 0.01149501, + "auxiliary_loss_mlp": 0.01100826, + "balance_loss_clip": 1.00199056, + "balance_loss_mlp": 1.0006901, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 3.03717519338528, + "language_loss": 0.74803197, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77053523, + "num_input_tokens_seen": 331737220, + "step": 15374, + "time_per_iteration": 2.7414989471435547 + }, + { + "auxiliary_loss_clip": 0.01117744, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_clip": 1.00169814, + "balance_loss_mlp": 1.00061917, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 1.788728849893748, + "language_loss": 0.64823979, + "learning_rate": 5.961705668581784e-08, + "loss": 0.67041957, + "num_input_tokens_seen": 331757300, + "step": 15375, + "time_per_iteration": 2.7238471508026123 + }, + { + "auxiliary_loss_clip": 0.01131015, + "auxiliary_loss_mlp": 0.01099833, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00050807, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 1.945374091135115, + "language_loss": 0.66510797, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68741643, + "num_input_tokens_seen": 331776995, + "step": 15376, + "time_per_iteration": 4.252937316894531 + }, + { + "auxiliary_loss_clip": 0.01157996, + "auxiliary_loss_mlp": 0.0107379, + "balance_loss_clip": 1.00071418, + "balance_loss_mlp": 1.00016618, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.648070636730256, + "language_loss": 0.61101818, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63333607, + "num_input_tokens_seen": 331845015, + "step": 15377, + "time_per_iteration": 3.175117015838623 + }, + { + "auxiliary_loss_clip": 0.01116187, + "auxiliary_loss_mlp": 0.01101112, + "balance_loss_clip": 1.00171888, + "balance_loss_mlp": 1.00059497, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 3.823718120441811, + "language_loss": 0.73976421, + "learning_rate": 5.933424178131341e-08, + "loss": 0.7619372, + "num_input_tokens_seen": 331862795, + "step": 15378, + "time_per_iteration": 2.7055811882019043 + }, + { + "auxiliary_loss_clip": 0.01164322, + "auxiliary_loss_mlp": 0.0110096, + "balance_loss_clip": 1.00199318, + "balance_loss_mlp": 1.00044274, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 1.8574273319643475, + "language_loss": 0.62264776, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64530057, + "num_input_tokens_seen": 331882535, + "step": 15379, + "time_per_iteration": 4.065632104873657 + }, + { + "auxiliary_loss_clip": 0.01067635, + "auxiliary_loss_mlp": 0.01099372, + "balance_loss_clip": 1.00177002, + "balance_loss_mlp": 1.0004282, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 2.3886965107398264, + "language_loss": 0.83700395, + "learning_rate": 5.914606645688591e-08, + "loss": 0.85867405, + "num_input_tokens_seen": 331899335, + "step": 15380, + "time_per_iteration": 2.875328302383423 + }, + { + "auxiliary_loss_clip": 0.01164204, + "auxiliary_loss_mlp": 0.01101473, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00047874, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.6435079063432447, + "language_loss": 0.73215878, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75481558, + "num_input_tokens_seen": 331919030, + "step": 15381, + "time_per_iteration": 2.614574432373047 + }, + { + "auxiliary_loss_clip": 0.01147748, + "auxiliary_loss_mlp": 0.01100964, + "balance_loss_clip": 1.00183845, + "balance_loss_mlp": 1.00044703, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.7998201115719863, + "language_loss": 0.78466111, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80714822, + "num_input_tokens_seen": 331936465, + "step": 15382, + "time_per_iteration": 2.663151979446411 + }, + { + "auxiliary_loss_clip": 0.0113047, + "auxiliary_loss_mlp": 0.01100508, + "balance_loss_clip": 1.00163245, + "balance_loss_mlp": 1.00056338, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.7813756746387257, + "language_loss": 0.75178289, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77409267, + "num_input_tokens_seen": 331954625, + "step": 15383, + "time_per_iteration": 2.6961681842803955 + }, + { + "auxiliary_loss_clip": 0.01132656, + "auxiliary_loss_mlp": 0.01100187, + "balance_loss_clip": 1.00164509, + "balance_loss_mlp": 1.00043225, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.9811369471968177, + "language_loss": 0.7549572, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77728558, + "num_input_tokens_seen": 331975865, + "step": 15384, + "time_per_iteration": 2.7355093955993652 + }, + { + "auxiliary_loss_clip": 0.01131261, + "auxiliary_loss_mlp": 0.01099773, + "balance_loss_clip": 1.00176358, + "balance_loss_mlp": 1.00054288, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 2.1805439768034893, + "language_loss": 0.66406024, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68637049, + "num_input_tokens_seen": 331992760, + "step": 15385, + "time_per_iteration": 2.682511568069458 + }, + { + "auxiliary_loss_clip": 0.01163993, + "auxiliary_loss_mlp": 0.01100547, + "balance_loss_clip": 1.00184119, + "balance_loss_mlp": 1.00045848, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 1.7739301437963049, + "language_loss": 0.80385512, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82650054, + "num_input_tokens_seen": 332011890, + "step": 15386, + "time_per_iteration": 2.6330041885375977 + }, + { + "auxiliary_loss_clip": 0.01149284, + "auxiliary_loss_mlp": 0.01100371, + "balance_loss_clip": 1.00193429, + "balance_loss_mlp": 1.00052154, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.816241451882787, + "language_loss": 0.75580448, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.778301, + "num_input_tokens_seen": 332029485, + "step": 15387, + "time_per_iteration": 2.580410957336426 + }, + { + "auxiliary_loss_clip": 0.01147315, + "auxiliary_loss_mlp": 0.0110007, + "balance_loss_clip": 1.00178587, + "balance_loss_mlp": 1.00069714, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 4.681277138955595, + "language_loss": 0.70113325, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72360706, + "num_input_tokens_seen": 332052970, + "step": 15388, + "time_per_iteration": 2.7195019721984863 + }, + { + "auxiliary_loss_clip": 0.01147029, + "auxiliary_loss_mlp": 0.01100368, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00047064, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 1.9557328307295705, + "language_loss": 0.82019389, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84266782, + "num_input_tokens_seen": 332070395, + "step": 15389, + "time_per_iteration": 2.598674774169922 + }, + { + "auxiliary_loss_clip": 0.01149882, + "auxiliary_loss_mlp": 0.01101904, + "balance_loss_clip": 1.0019002, + "balance_loss_mlp": 1.00048065, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 1.9104512598717371, + "language_loss": 0.78944755, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81196547, + "num_input_tokens_seen": 332090185, + "step": 15390, + "time_per_iteration": 2.649834156036377 + }, + { + "auxiliary_loss_clip": 0.01118146, + "auxiliary_loss_mlp": 0.01101157, + "balance_loss_clip": 1.00183523, + "balance_loss_mlp": 1.00054455, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 1.6729193277544507, + "language_loss": 0.75564218, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77783519, + "num_input_tokens_seen": 332109050, + "step": 15391, + "time_per_iteration": 2.6893651485443115 + }, + { + "auxiliary_loss_clip": 0.01134027, + "auxiliary_loss_mlp": 0.01100914, + "balance_loss_clip": 1.00178874, + "balance_loss_mlp": 1.00044441, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 2.3572076332602148, + "language_loss": 0.52366811, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54601753, + "num_input_tokens_seen": 332131180, + "step": 15392, + "time_per_iteration": 2.7999024391174316 + }, + { + "auxiliary_loss_clip": 0.01164168, + "auxiliary_loss_mlp": 0.01100223, + "balance_loss_clip": 1.00185871, + "balance_loss_mlp": 1.00046873, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.9336477260188778, + "language_loss": 0.76853281, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79117674, + "num_input_tokens_seen": 332149555, + "step": 15393, + "time_per_iteration": 2.6003053188323975 + }, + { + "auxiliary_loss_clip": 0.01133055, + "auxiliary_loss_mlp": 0.01100949, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00052714, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 2.4728555544387736, + "language_loss": 0.6919632, + "learning_rate": 5.783708368464357e-08, + "loss": 0.7143032, + "num_input_tokens_seen": 332165830, + "step": 15394, + "time_per_iteration": 2.6936709880828857 + }, + { + "auxiliary_loss_clip": 0.01164242, + "auxiliary_loss_mlp": 0.01099999, + "balance_loss_clip": 1.00192249, + "balance_loss_mlp": 1.00038791, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.6252164558053885, + "language_loss": 0.72909319, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.75173557, + "num_input_tokens_seen": 332185130, + "step": 15395, + "time_per_iteration": 4.141781806945801 + }, + { + "auxiliary_loss_clip": 0.01100047, + "auxiliary_loss_mlp": 0.01099132, + "balance_loss_clip": 1.00160837, + "balance_loss_mlp": 1.00037885, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 2.304613484241198, + "language_loss": 0.7156322, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73762399, + "num_input_tokens_seen": 332203695, + "step": 15396, + "time_per_iteration": 2.6810004711151123 + }, + { + "auxiliary_loss_clip": 0.01164126, + "auxiliary_loss_mlp": 0.01100481, + "balance_loss_clip": 1.0018487, + "balance_loss_mlp": 1.00048852, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.6113805823315939, + "language_loss": 0.87270063, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89534676, + "num_input_tokens_seen": 332224850, + "step": 15397, + "time_per_iteration": 2.573955774307251 + }, + { + "auxiliary_loss_clip": 0.0115797, + "auxiliary_loss_mlp": 0.01073783, + "balance_loss_clip": 1.00070179, + "balance_loss_mlp": 1.00015974, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.8187425684412719, + "language_loss": 0.55172533, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57404286, + "num_input_tokens_seen": 332278085, + "step": 15398, + "time_per_iteration": 2.9856653213500977 + }, + { + "auxiliary_loss_clip": 0.01132924, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_clip": 1.00177312, + "balance_loss_mlp": 1.00053024, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 3.2383169240866883, + "language_loss": 0.76478481, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78713977, + "num_input_tokens_seen": 332297875, + "step": 15399, + "time_per_iteration": 4.149944305419922 + }, + { + "auxiliary_loss_clip": 0.01132632, + "auxiliary_loss_mlp": 0.01098828, + "balance_loss_clip": 1.00177455, + "balance_loss_mlp": 1.00045657, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.6921128881444376, + "language_loss": 0.78211749, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80443215, + "num_input_tokens_seen": 332318500, + "step": 15400, + "time_per_iteration": 2.6703951358795166 + }, + { + "auxiliary_loss_clip": 0.01143819, + "auxiliary_loss_mlp": 0.01073855, + "balance_loss_clip": 1.00079513, + "balance_loss_mlp": 1.00023174, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.7198772395337127, + "language_loss": 0.51348186, + "learning_rate": 5.718800474673946e-08, + "loss": 0.5356586, + "num_input_tokens_seen": 332381980, + "step": 15401, + "time_per_iteration": 3.075155735015869 + }, + { + "auxiliary_loss_clip": 0.01147442, + "auxiliary_loss_mlp": 0.01099019, + "balance_loss_clip": 1.0018307, + "balance_loss_mlp": 1.00059986, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 2.7415399143978187, + "language_loss": 0.82663655, + "learning_rate": 5.709557384259378e-08, + "loss": 0.84910113, + "num_input_tokens_seen": 332399510, + "step": 15402, + "time_per_iteration": 2.6167449951171875 + }, + { + "auxiliary_loss_clip": 0.01157997, + "auxiliary_loss_mlp": 0.01073706, + "balance_loss_clip": 1.00073051, + "balance_loss_mlp": 1.00008249, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7369533770626354, + "language_loss": 0.51147044, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53378749, + "num_input_tokens_seen": 332459130, + "step": 15403, + "time_per_iteration": 3.1530463695526123 + }, + { + "auxiliary_loss_clip": 0.01124675, + "auxiliary_loss_mlp": 0.01074253, + "balance_loss_clip": 1.00080049, + "balance_loss_mlp": 1.00024748, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6797683110874359, + "language_loss": 0.58704591, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60903519, + "num_input_tokens_seen": 332526555, + "step": 15404, + "time_per_iteration": 3.1850037574768066 + }, + { + "auxiliary_loss_clip": 0.01131396, + "auxiliary_loss_mlp": 0.01100993, + "balance_loss_clip": 1.0017488, + "balance_loss_mlp": 1.00052297, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 1.9314510173580095, + "language_loss": 0.71679735, + "learning_rate": 5.681872319494596e-08, + "loss": 0.7391212, + "num_input_tokens_seen": 332544005, + "step": 15405, + "time_per_iteration": 2.6905357837677 + }, + { + "auxiliary_loss_clip": 0.01099517, + "auxiliary_loss_mlp": 0.01101191, + "balance_loss_clip": 1.00167704, + "balance_loss_mlp": 1.0005784, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.69549713145056, + "language_loss": 0.68731976, + "learning_rate": 5.672658701232458e-08, + "loss": 0.70932686, + "num_input_tokens_seen": 332563070, + "step": 15406, + "time_per_iteration": 2.7204530239105225 + }, + { + "auxiliary_loss_clip": 0.01101255, + "auxiliary_loss_mlp": 0.01100972, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.00050211, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 2.548681118981609, + "language_loss": 0.76192671, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78394896, + "num_input_tokens_seen": 332579620, + "step": 15407, + "time_per_iteration": 2.727079153060913 + }, + { + "auxiliary_loss_clip": 0.0111625, + "auxiliary_loss_mlp": 0.01102129, + "balance_loss_clip": 1.00168133, + "balance_loss_mlp": 1.00065827, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 1.7545815293664808, + "language_loss": 0.72571981, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74790353, + "num_input_tokens_seen": 332597795, + "step": 15408, + "time_per_iteration": 2.6583640575408936 + }, + { + "auxiliary_loss_clip": 0.01133515, + "auxiliary_loss_mlp": 0.01099124, + "balance_loss_clip": 1.0019387, + "balance_loss_mlp": 1.00056219, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 2.3102290633422937, + "language_loss": 0.68516791, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70749432, + "num_input_tokens_seen": 332620375, + "step": 15409, + "time_per_iteration": 2.847698450088501 + }, + { + "auxiliary_loss_clip": 0.01113272, + "auxiliary_loss_mlp": 0.01100811, + "balance_loss_clip": 1.00164175, + "balance_loss_mlp": 1.00048423, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 1.7326096899787733, + "language_loss": 0.75657248, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77871329, + "num_input_tokens_seen": 332639510, + "step": 15410, + "time_per_iteration": 2.7208969593048096 + }, + { + "auxiliary_loss_clip": 0.01098429, + "auxiliary_loss_mlp": 0.01100629, + "balance_loss_clip": 1.00157213, + "balance_loss_mlp": 1.00039744, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.6298352895967436, + "language_loss": 0.81827176, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84026229, + "num_input_tokens_seen": 332658350, + "step": 15411, + "time_per_iteration": 2.7012970447540283 + }, + { + "auxiliary_loss_clip": 0.01133109, + "auxiliary_loss_mlp": 0.0110081, + "balance_loss_clip": 1.00199449, + "balance_loss_mlp": 1.00048351, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 1.785068896143608, + "language_loss": 0.75377542, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77611464, + "num_input_tokens_seen": 332676715, + "step": 15412, + "time_per_iteration": 2.645434856414795 + }, + { + "auxiliary_loss_clip": 0.01164106, + "auxiliary_loss_mlp": 0.01100218, + "balance_loss_clip": 1.00180531, + "balance_loss_mlp": 1.00051165, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 1.6993724877196406, + "language_loss": 0.66947371, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69211692, + "num_input_tokens_seen": 332701470, + "step": 15413, + "time_per_iteration": 2.709641218185425 + }, + { + "auxiliary_loss_clip": 0.01084064, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_clip": 1.00158668, + "balance_loss_mlp": 1.00049186, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.8821631454047218, + "language_loss": 0.75764441, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.77948892, + "num_input_tokens_seen": 332719060, + "step": 15414, + "time_per_iteration": 4.239774465560913 + }, + { + "auxiliary_loss_clip": 0.01147688, + "auxiliary_loss_mlp": 0.01099601, + "balance_loss_clip": 1.00178242, + "balance_loss_mlp": 1.00046635, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 2.4710938649512673, + "language_loss": 0.8146053, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83707821, + "num_input_tokens_seen": 332736345, + "step": 15415, + "time_per_iteration": 2.660696029663086 + }, + { + "auxiliary_loss_clip": 0.01135054, + "auxiliary_loss_mlp": 0.01100536, + "balance_loss_clip": 1.00171614, + "balance_loss_mlp": 1.00049531, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.398262553527893, + "language_loss": 0.54251999, + "learning_rate": 5.580927866294671e-08, + "loss": 0.5648759, + "num_input_tokens_seen": 332756270, + "step": 15416, + "time_per_iteration": 2.720607280731201 + }, + { + "auxiliary_loss_clip": 0.0111612, + "auxiliary_loss_mlp": 0.01100373, + "balance_loss_clip": 1.00177717, + "balance_loss_mlp": 1.00057089, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.9743671521677149, + "language_loss": 0.71877629, + "learning_rate": 5.571795325221807e-08, + "loss": 0.74094117, + "num_input_tokens_seen": 332775185, + "step": 15417, + "time_per_iteration": 4.006597280502319 + }, + { + "auxiliary_loss_clip": 0.01147609, + "auxiliary_loss_mlp": 0.01100465, + "balance_loss_clip": 1.00174057, + "balance_loss_mlp": 1.00042462, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 1.954457027355145, + "language_loss": 0.75837094, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.78085172, + "num_input_tokens_seen": 332794320, + "step": 15418, + "time_per_iteration": 2.5523812770843506 + }, + { + "auxiliary_loss_clip": 0.01147416, + "auxiliary_loss_mlp": 0.01099761, + "balance_loss_clip": 1.00169647, + "balance_loss_mlp": 1.00038838, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.6012839094449942, + "language_loss": 0.76207817, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78454995, + "num_input_tokens_seen": 332818095, + "step": 15419, + "time_per_iteration": 2.6340315341949463 + }, + { + "auxiliary_loss_clip": 0.01163916, + "auxiliary_loss_mlp": 0.01098687, + "balance_loss_clip": 1.00178611, + "balance_loss_mlp": 1.00055349, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 1.7738833877563827, + "language_loss": 0.76233965, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.78496569, + "num_input_tokens_seen": 332839860, + "step": 15420, + "time_per_iteration": 2.5730783939361572 + }, + { + "auxiliary_loss_clip": 0.01147699, + "auxiliary_loss_mlp": 0.0110073, + "balance_loss_clip": 1.0017457, + "balance_loss_mlp": 1.00045121, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.5510962560688395, + "language_loss": 0.7675404, + "learning_rate": 5.535338891759389e-08, + "loss": 0.79002464, + "num_input_tokens_seen": 332861155, + "step": 15421, + "time_per_iteration": 2.6131110191345215 + }, + { + "auxiliary_loss_clip": 0.01133248, + "auxiliary_loss_mlp": 0.01100766, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00058246, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 1.9037445404278523, + "language_loss": 0.72984952, + "learning_rate": 5.526243217829041e-08, + "loss": 0.75218964, + "num_input_tokens_seen": 332881110, + "step": 15422, + "time_per_iteration": 2.6642489433288574 + }, + { + "auxiliary_loss_clip": 0.01149489, + "auxiliary_loss_mlp": 0.01101011, + "balance_loss_clip": 1.00188553, + "balance_loss_mlp": 1.00058949, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 1.901025400540999, + "language_loss": 0.76915181, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79165679, + "num_input_tokens_seen": 332899350, + "step": 15423, + "time_per_iteration": 2.641666889190674 + }, + { + "auxiliary_loss_clip": 0.01149606, + "auxiliary_loss_mlp": 0.0110092, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00040221, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 2.327401215251892, + "language_loss": 0.75437343, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77687877, + "num_input_tokens_seen": 332918105, + "step": 15424, + "time_per_iteration": 2.606471061706543 + }, + { + "auxiliary_loss_clip": 0.01143595, + "auxiliary_loss_mlp": 0.01073722, + "balance_loss_clip": 1.00070131, + "balance_loss_mlp": 1.00009882, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.7828154599661696, + "language_loss": 0.60689133, + "learning_rate": 5.499000444202351e-08, + "loss": 0.6290645, + "num_input_tokens_seen": 332969490, + "step": 15425, + "time_per_iteration": 2.9552385807037354 + }, + { + "auxiliary_loss_clip": 0.01130318, + "auxiliary_loss_mlp": 0.00747284, + "balance_loss_clip": 1.00181842, + "balance_loss_mlp": 1.00046229, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 1.361832492762918, + "language_loss": 0.70735121, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72612727, + "num_input_tokens_seen": 332988805, + "step": 15426, + "time_per_iteration": 2.740251302719116 + }, + { + "auxiliary_loss_clip": 0.01130793, + "auxiliary_loss_mlp": 0.01101218, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00046194, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 2.686795180338646, + "language_loss": 0.83065528, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85297537, + "num_input_tokens_seen": 333007960, + "step": 15427, + "time_per_iteration": 2.6485202312469482 + }, + { + "auxiliary_loss_clip": 0.01117534, + "auxiliary_loss_mlp": 0.0110038, + "balance_loss_clip": 1.00182784, + "balance_loss_mlp": 1.00057769, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.5823178754692748, + "language_loss": 0.77022326, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79240245, + "num_input_tokens_seen": 333026035, + "step": 15428, + "time_per_iteration": 2.7464840412139893 + }, + { + "auxiliary_loss_clip": 0.01116132, + "auxiliary_loss_mlp": 0.0110004, + "balance_loss_clip": 1.00167811, + "balance_loss_mlp": 1.00052404, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 4.852816043672225, + "language_loss": 0.74311769, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76527935, + "num_input_tokens_seen": 333045590, + "step": 15429, + "time_per_iteration": 2.7240242958068848 + }, + { + "auxiliary_loss_clip": 0.0111786, + "auxiliary_loss_mlp": 0.01099292, + "balance_loss_clip": 1.0016396, + "balance_loss_mlp": 1.00053895, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 2.500621536105439, + "language_loss": 0.75253606, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77470756, + "num_input_tokens_seen": 333063355, + "step": 15430, + "time_per_iteration": 2.6947779655456543 + }, + { + "auxiliary_loss_clip": 0.01149471, + "auxiliary_loss_mlp": 0.0110177, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.00053704, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.396497404319563, + "language_loss": 0.76329803, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78581047, + "num_input_tokens_seen": 333088045, + "step": 15431, + "time_per_iteration": 2.7571909427642822 + }, + { + "auxiliary_loss_clip": 0.01147476, + "auxiliary_loss_mlp": 0.01100003, + "balance_loss_clip": 1.00196958, + "balance_loss_mlp": 1.00058246, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 2.0547711794560812, + "language_loss": 0.70624793, + "learning_rate": 5.4356921308363e-08, + "loss": 0.72872275, + "num_input_tokens_seen": 333108005, + "step": 15432, + "time_per_iteration": 2.6644458770751953 + }, + { + "auxiliary_loss_clip": 0.01097844, + "auxiliary_loss_mlp": 0.01100812, + "balance_loss_clip": 1.00160265, + "balance_loss_mlp": 1.00048506, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.3131430022623487, + "language_loss": 0.81777465, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.8397612, + "num_input_tokens_seen": 333124335, + "step": 15433, + "time_per_iteration": 4.307461500167847 + }, + { + "auxiliary_loss_clip": 0.01163926, + "auxiliary_loss_mlp": 0.01098591, + "balance_loss_clip": 1.00194716, + "balance_loss_mlp": 1.00045836, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 2.0925972846864638, + "language_loss": 0.66167247, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68429768, + "num_input_tokens_seen": 333143995, + "step": 15434, + "time_per_iteration": 2.5897278785705566 + }, + { + "auxiliary_loss_clip": 0.01132449, + "auxiliary_loss_mlp": 0.01099239, + "balance_loss_clip": 1.00189579, + "balance_loss_mlp": 1.00043797, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.8197425260404403, + "language_loss": 0.68597406, + "learning_rate": 5.40867065815529e-08, + "loss": 0.70829093, + "num_input_tokens_seen": 333162805, + "step": 15435, + "time_per_iteration": 2.613234758377075 + }, + { + "auxiliary_loss_clip": 0.01164208, + "auxiliary_loss_mlp": 0.0110054, + "balance_loss_clip": 1.00192142, + "balance_loss_mlp": 1.00073755, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 2.053777361879021, + "language_loss": 0.7208885, + "learning_rate": 5.399678257985263e-08, + "loss": 0.743536, + "num_input_tokens_seen": 333175770, + "step": 15436, + "time_per_iteration": 3.908618688583374 + }, + { + "auxiliary_loss_clip": 0.01132409, + "auxiliary_loss_mlp": 0.01099391, + "balance_loss_clip": 1.00174403, + "balance_loss_mlp": 1.00049472, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 2.198482543060022, + "language_loss": 0.67009181, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69240981, + "num_input_tokens_seen": 333194775, + "step": 15437, + "time_per_iteration": 2.6329822540283203 + }, + { + "auxiliary_loss_clip": 0.01147541, + "auxiliary_loss_mlp": 0.01101244, + "balance_loss_clip": 1.00182867, + "balance_loss_mlp": 1.00044084, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 2.087463184277998, + "language_loss": 0.71201074, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73449862, + "num_input_tokens_seen": 333208920, + "step": 15438, + "time_per_iteration": 2.5672054290771484 + }, + { + "auxiliary_loss_clip": 0.01164198, + "auxiliary_loss_mlp": 0.0110087, + "balance_loss_clip": 1.00186729, + "balance_loss_mlp": 1.00044847, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 1.7786452881199846, + "language_loss": 0.64630771, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.66895831, + "num_input_tokens_seen": 333229350, + "step": 15439, + "time_per_iteration": 2.5365750789642334 + }, + { + "auxiliary_loss_clip": 0.01132706, + "auxiliary_loss_mlp": 0.01101114, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00059724, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 1.8161816929373855, + "language_loss": 0.70385188, + "learning_rate": 5.363782453347876e-08, + "loss": 0.72619009, + "num_input_tokens_seen": 333246125, + "step": 15440, + "time_per_iteration": 2.6700432300567627 + }, + { + "auxiliary_loss_clip": 0.01118445, + "auxiliary_loss_mlp": 0.00747372, + "balance_loss_clip": 1.00170135, + "balance_loss_mlp": 1.00043392, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.7781865693159078, + "language_loss": 0.76642287, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78508103, + "num_input_tokens_seen": 333263685, + "step": 15441, + "time_per_iteration": 2.6749067306518555 + }, + { + "auxiliary_loss_clip": 0.01149291, + "auxiliary_loss_mlp": 0.0109906, + "balance_loss_clip": 1.00190234, + "balance_loss_mlp": 1.00049746, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 1.7778803153896112, + "language_loss": 0.64123678, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66372025, + "num_input_tokens_seen": 333282435, + "step": 15442, + "time_per_iteration": 2.6245615482330322 + }, + { + "auxiliary_loss_clip": 0.01116981, + "auxiliary_loss_mlp": 0.0110118, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.00047243, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 1.878820352092604, + "language_loss": 0.80928034, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.83146191, + "num_input_tokens_seen": 333300400, + "step": 15443, + "time_per_iteration": 2.6850545406341553 + }, + { + "auxiliary_loss_clip": 0.01149171, + "auxiliary_loss_mlp": 0.00747368, + "balance_loss_clip": 1.00191379, + "balance_loss_mlp": 1.00055814, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 1.9793081341397762, + "language_loss": 0.65302253, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67198789, + "num_input_tokens_seen": 333318980, + "step": 15444, + "time_per_iteration": 2.623915910720825 + }, + { + "auxiliary_loss_clip": 0.01116515, + "auxiliary_loss_mlp": 0.01100496, + "balance_loss_clip": 1.00174046, + "balance_loss_mlp": 1.00055063, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 2.2933997850697363, + "language_loss": 0.73685116, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.75902128, + "num_input_tokens_seen": 333334135, + "step": 15445, + "time_per_iteration": 2.6240012645721436 + }, + { + "auxiliary_loss_clip": 0.01149851, + "auxiliary_loss_mlp": 0.01100139, + "balance_loss_clip": 1.00206232, + "balance_loss_mlp": 1.00057554, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.5967937716348972, + "language_loss": 0.71244395, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73494387, + "num_input_tokens_seen": 333353325, + "step": 15446, + "time_per_iteration": 2.6406941413879395 + }, + { + "auxiliary_loss_clip": 0.01085856, + "auxiliary_loss_mlp": 0.01101427, + "balance_loss_clip": 1.00157595, + "balance_loss_mlp": 1.00043249, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 2.205003460403391, + "language_loss": 0.69637752, + "learning_rate": 5.301248962337523e-08, + "loss": 0.71825039, + "num_input_tokens_seen": 333371110, + "step": 15447, + "time_per_iteration": 2.730698823928833 + }, + { + "auxiliary_loss_clip": 0.01163837, + "auxiliary_loss_mlp": 0.010987, + "balance_loss_clip": 1.00181735, + "balance_loss_mlp": 1.00042403, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 1.7177751458369306, + "language_loss": 0.72230053, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74492586, + "num_input_tokens_seen": 333391420, + "step": 15448, + "time_per_iteration": 2.6343674659729004 + }, + { + "auxiliary_loss_clip": 0.01164076, + "auxiliary_loss_mlp": 0.01100788, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00036645, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.7163687456479422, + "language_loss": 0.74206388, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76471251, + "num_input_tokens_seen": 333410365, + "step": 15449, + "time_per_iteration": 2.5612101554870605 + }, + { + "auxiliary_loss_clip": 0.01164036, + "auxiliary_loss_mlp": 0.00747338, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00048018, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 1.8724327731203216, + "language_loss": 0.67619026, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69530404, + "num_input_tokens_seen": 333430000, + "step": 15450, + "time_per_iteration": 2.658989667892456 + }, + { + "auxiliary_loss_clip": 0.01115746, + "auxiliary_loss_mlp": 0.01100729, + "balance_loss_clip": 1.0017519, + "balance_loss_mlp": 1.00040221, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 2.0041196456998738, + "language_loss": 0.71811271, + "learning_rate": 5.265677957368875e-08, + "loss": 0.74027747, + "num_input_tokens_seen": 333445800, + "step": 15451, + "time_per_iteration": 2.6698338985443115 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01101468, + "balance_loss_clip": 1.00183439, + "balance_loss_mlp": 1.00056946, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 2.7136541736564626, + "language_loss": 0.73332739, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75568676, + "num_input_tokens_seen": 333461550, + "step": 15452, + "time_per_iteration": 3.999066114425659 + }, + { + "auxiliary_loss_clip": 0.01132772, + "auxiliary_loss_mlp": 0.01099907, + "balance_loss_clip": 1.00182056, + "balance_loss_mlp": 1.00039101, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 1.8910261941393898, + "language_loss": 0.74248457, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76481134, + "num_input_tokens_seen": 333478835, + "step": 15453, + "time_per_iteration": 2.668038845062256 + }, + { + "auxiliary_loss_clip": 0.01109649, + "auxiliary_loss_mlp": 0.01074295, + "balance_loss_clip": 1.00072455, + "balance_loss_mlp": 1.00029016, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.839079786259633, + "language_loss": 0.60769844, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62953782, + "num_input_tokens_seen": 333535250, + "step": 15454, + "time_per_iteration": 3.126265525817871 + }, + { + "auxiliary_loss_clip": 0.01134657, + "auxiliary_loss_mlp": 0.01100734, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.00050318, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 2.423221106474269, + "language_loss": 0.69191313, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71426702, + "num_input_tokens_seen": 333553805, + "step": 15455, + "time_per_iteration": 3.9857168197631836 + }, + { + "auxiliary_loss_clip": 0.0111692, + "auxiliary_loss_mlp": 0.01100684, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.0005002, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 3.87683667592101, + "language_loss": 0.64015806, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66233414, + "num_input_tokens_seen": 333572800, + "step": 15456, + "time_per_iteration": 2.709650754928589 + }, + { + "auxiliary_loss_clip": 0.01101015, + "auxiliary_loss_mlp": 0.01101592, + "balance_loss_clip": 1.00174081, + "balance_loss_mlp": 1.00064564, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 2.2287862095614113, + "language_loss": 0.68271875, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70474482, + "num_input_tokens_seen": 333588520, + "step": 15457, + "time_per_iteration": 2.628641128540039 + }, + { + "auxiliary_loss_clip": 0.01130786, + "auxiliary_loss_mlp": 0.01100768, + "balance_loss_clip": 1.00173497, + "balance_loss_mlp": 1.0005368, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 2.820328347806983, + "language_loss": 0.81028199, + "learning_rate": 5.203713008885291e-08, + "loss": 0.83259749, + "num_input_tokens_seen": 333603435, + "step": 15458, + "time_per_iteration": 2.664733648300171 + }, + { + "auxiliary_loss_clip": 0.01148645, + "auxiliary_loss_mlp": 0.01100528, + "balance_loss_clip": 1.00183928, + "balance_loss_mlp": 1.00058246, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.6243184524145056, + "language_loss": 0.72128427, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74377596, + "num_input_tokens_seen": 333623305, + "step": 15459, + "time_per_iteration": 2.5777080059051514 + }, + { + "auxiliary_loss_clip": 0.01117719, + "auxiliary_loss_mlp": 0.01100517, + "balance_loss_clip": 1.00189102, + "balance_loss_mlp": 1.00057244, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 3.3084712769092284, + "language_loss": 0.58873546, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.61091781, + "num_input_tokens_seen": 333641205, + "step": 15460, + "time_per_iteration": 2.670926809310913 + }, + { + "auxiliary_loss_clip": 0.01101195, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_clip": 1.00170195, + "balance_loss_mlp": 1.00051928, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 1.7993896217336336, + "language_loss": 0.80459672, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82661617, + "num_input_tokens_seen": 333659615, + "step": 15461, + "time_per_iteration": 2.714463710784912 + }, + { + "auxiliary_loss_clip": 0.01131755, + "auxiliary_loss_mlp": 0.01099645, + "balance_loss_clip": 1.0016551, + "balance_loss_mlp": 1.00041509, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 11.03582764747619, + "language_loss": 0.7818507, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80416477, + "num_input_tokens_seen": 333678985, + "step": 15462, + "time_per_iteration": 2.7935171127319336 + }, + { + "auxiliary_loss_clip": 0.01119765, + "auxiliary_loss_mlp": 0.01100757, + "balance_loss_clip": 1.00183105, + "balance_loss_mlp": 1.00043035, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 2.1386150304341176, + "language_loss": 0.6259588, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64816397, + "num_input_tokens_seen": 333696410, + "step": 15463, + "time_per_iteration": 2.63108491897583 + }, + { + "auxiliary_loss_clip": 0.01134414, + "auxiliary_loss_mlp": 0.01100132, + "balance_loss_clip": 1.00178921, + "balance_loss_mlp": 1.00047338, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.4238347166125978, + "language_loss": 0.71027493, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73262048, + "num_input_tokens_seen": 333716615, + "step": 15464, + "time_per_iteration": 2.7021138668060303 + }, + { + "auxiliary_loss_clip": 0.0113512, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_clip": 1.00177503, + "balance_loss_mlp": 1.00041294, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 2.4903378964627674, + "language_loss": 0.77376986, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79612947, + "num_input_tokens_seen": 333732800, + "step": 15465, + "time_per_iteration": 2.6093218326568604 + }, + { + "auxiliary_loss_clip": 0.01076869, + "auxiliary_loss_mlp": 0.01075239, + "balance_loss_clip": 1.00055075, + "balance_loss_mlp": 1.00047064, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6993564059119713, + "language_loss": 0.56482995, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58635104, + "num_input_tokens_seen": 333799300, + "step": 15466, + "time_per_iteration": 3.571704864501953 + }, + { + "auxiliary_loss_clip": 0.01132753, + "auxiliary_loss_mlp": 0.01101695, + "balance_loss_clip": 1.00170636, + "balance_loss_mlp": 1.00060558, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 1.3978939314584922, + "language_loss": 0.7284053, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.75074983, + "num_input_tokens_seen": 333820360, + "step": 15467, + "time_per_iteration": 2.977787733078003 + }, + { + "auxiliary_loss_clip": 0.01133503, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_clip": 1.00195253, + "balance_loss_mlp": 1.00053406, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 2.0587391355547067, + "language_loss": 0.71763217, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.73997861, + "num_input_tokens_seen": 333840415, + "step": 15468, + "time_per_iteration": 2.629366397857666 + }, + { + "auxiliary_loss_clip": 0.01149266, + "auxiliary_loss_mlp": 0.01101128, + "balance_loss_clip": 1.00179696, + "balance_loss_mlp": 1.00042057, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 1.8022394891963942, + "language_loss": 0.75400746, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77651143, + "num_input_tokens_seen": 333859910, + "step": 15469, + "time_per_iteration": 2.5869367122650146 + }, + { + "auxiliary_loss_clip": 0.01130996, + "auxiliary_loss_mlp": 0.01100332, + "balance_loss_clip": 1.00169992, + "balance_loss_mlp": 1.00048208, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 1.817297904377605, + "language_loss": 0.75659227, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77890551, + "num_input_tokens_seen": 333880495, + "step": 15470, + "time_per_iteration": 4.472949981689453 + }, + { + "auxiliary_loss_clip": 0.01114456, + "auxiliary_loss_mlp": 0.01100002, + "balance_loss_clip": 1.00184011, + "balance_loss_mlp": 1.00048649, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 1.7689672509763499, + "language_loss": 0.74890971, + "learning_rate": 5.089595604367902e-08, + "loss": 0.77105427, + "num_input_tokens_seen": 333897640, + "step": 15471, + "time_per_iteration": 2.664486885070801 + }, + { + "auxiliary_loss_clip": 0.01147539, + "auxiliary_loss_mlp": 0.01099974, + "balance_loss_clip": 1.00170624, + "balance_loss_mlp": 1.00045776, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 4.522853235413726, + "language_loss": 0.69590104, + "learning_rate": 5.080869070341487e-08, + "loss": 0.71837616, + "num_input_tokens_seen": 333913670, + "step": 15472, + "time_per_iteration": 2.578519821166992 + }, + { + "auxiliary_loss_clip": 0.01132582, + "auxiliary_loss_mlp": 0.01098927, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.00050759, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.7219218192600816, + "language_loss": 0.88672984, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90904498, + "num_input_tokens_seen": 333934105, + "step": 15473, + "time_per_iteration": 2.6474902629852295 + }, + { + "auxiliary_loss_clip": 0.01131217, + "auxiliary_loss_mlp": 0.01101252, + "balance_loss_clip": 1.00171018, + "balance_loss_mlp": 1.00054443, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 1.8124153475841862, + "language_loss": 0.64548612, + "learning_rate": 5.063438176678203e-08, + "loss": 0.6678108, + "num_input_tokens_seen": 333953635, + "step": 15474, + "time_per_iteration": 4.326924800872803 + }, + { + "auxiliary_loss_clip": 0.01164234, + "auxiliary_loss_mlp": 0.0109977, + "balance_loss_clip": 1.00192606, + "balance_loss_mlp": 1.00063586, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 1.95480200643611, + "language_loss": 0.74403089, + "learning_rate": 5.054733817702339e-08, + "loss": 0.76667094, + "num_input_tokens_seen": 333971825, + "step": 15475, + "time_per_iteration": 2.575427293777466 + }, + { + "auxiliary_loss_clip": 0.01149565, + "auxiliary_loss_mlp": 0.01099839, + "balance_loss_clip": 1.00195742, + "balance_loss_mlp": 1.00041819, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 1.9941420132538366, + "language_loss": 0.66253656, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68503058, + "num_input_tokens_seen": 333990120, + "step": 15476, + "time_per_iteration": 2.645970582962036 + }, + { + "auxiliary_loss_clip": 0.01099142, + "auxiliary_loss_mlp": 0.01100627, + "balance_loss_clip": 1.0016979, + "balance_loss_mlp": 1.00063419, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 2.181170651428714, + "language_loss": 0.69149578, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.71349347, + "num_input_tokens_seen": 334007970, + "step": 15477, + "time_per_iteration": 2.7150983810424805 + }, + { + "auxiliary_loss_clip": 0.01132083, + "auxiliary_loss_mlp": 0.01099626, + "balance_loss_clip": 1.00185037, + "balance_loss_mlp": 1.00049114, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 2.3102521347491987, + "language_loss": 0.58464754, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60696465, + "num_input_tokens_seen": 334027120, + "step": 15478, + "time_per_iteration": 2.677152395248413 + }, + { + "auxiliary_loss_clip": 0.01131289, + "auxiliary_loss_mlp": 0.01101741, + "balance_loss_clip": 1.00172389, + "balance_loss_mlp": 1.00041366, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 7.024886710340833, + "language_loss": 0.78746438, + "learning_rate": 5.01999030853566e-08, + "loss": 0.80979466, + "num_input_tokens_seen": 334042785, + "step": 15479, + "time_per_iteration": 2.584496021270752 + }, + { + "auxiliary_loss_clip": 0.01164182, + "auxiliary_loss_mlp": 0.01099788, + "balance_loss_clip": 1.00186908, + "balance_loss_mlp": 1.00051045, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 1.6317764145779268, + "language_loss": 0.68646163, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70910132, + "num_input_tokens_seen": 334063480, + "step": 15480, + "time_per_iteration": 2.6205031871795654 + }, + { + "auxiliary_loss_clip": 0.01164203, + "auxiliary_loss_mlp": 0.01100278, + "balance_loss_clip": 1.00190949, + "balance_loss_mlp": 1.00057113, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.9329086863949023, + "language_loss": 0.67772359, + "learning_rate": 5.002662914604583e-08, + "loss": 0.7003684, + "num_input_tokens_seen": 334082005, + "step": 15481, + "time_per_iteration": 2.5268640518188477 + }, + { + "auxiliary_loss_clip": 0.01133942, + "auxiliary_loss_mlp": 0.01099816, + "balance_loss_clip": 1.00176597, + "balance_loss_mlp": 1.00030017, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 1.7617579413666724, + "language_loss": 0.74610943, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76844704, + "num_input_tokens_seen": 334101375, + "step": 15482, + "time_per_iteration": 2.652259588241577 + }, + { + "auxiliary_loss_clip": 0.0114881, + "auxiliary_loss_mlp": 0.01099257, + "balance_loss_clip": 1.00181341, + "balance_loss_mlp": 1.00040913, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 3.5663263358296646, + "language_loss": 0.80129105, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82377172, + "num_input_tokens_seen": 334119460, + "step": 15483, + "time_per_iteration": 2.576711654663086 + }, + { + "auxiliary_loss_clip": 0.01130979, + "auxiliary_loss_mlp": 0.01100639, + "balance_loss_clip": 1.00171661, + "balance_loss_mlp": 1.00050306, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 1.9933474695788607, + "language_loss": 0.74366486, + "learning_rate": 4.976727281916782e-08, + "loss": 0.76598108, + "num_input_tokens_seen": 334136065, + "step": 15484, + "time_per_iteration": 2.576235771179199 + }, + { + "auxiliary_loss_clip": 0.01130295, + "auxiliary_loss_mlp": 0.01100818, + "balance_loss_clip": 1.00183499, + "balance_loss_mlp": 1.0004437, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.459382106007302, + "language_loss": 0.76401138, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78632253, + "num_input_tokens_seen": 334153690, + "step": 15485, + "time_per_iteration": 2.5935323238372803 + }, + { + "auxiliary_loss_clip": 0.01102675, + "auxiliary_loss_mlp": 0.01100321, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.00051904, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 1.8545634326739733, + "language_loss": 0.78325403, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80528402, + "num_input_tokens_seen": 334171880, + "step": 15486, + "time_per_iteration": 2.721346855163574 + }, + { + "auxiliary_loss_clip": 0.0111414, + "auxiliary_loss_mlp": 0.01101743, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.0005579, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 1.8451355746032057, + "language_loss": 0.7671479, + "learning_rate": 4.950858206945674e-08, + "loss": 0.7893067, + "num_input_tokens_seen": 334190005, + "step": 15487, + "time_per_iteration": 2.669814109802246 + }, + { + "auxiliary_loss_clip": 0.01117694, + "auxiliary_loss_mlp": 0.01100233, + "balance_loss_clip": 1.00167847, + "balance_loss_mlp": 1.00043094, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 1.9911906018599188, + "language_loss": 0.66991031, + "learning_rate": 4.942249974085633e-08, + "loss": 0.69208962, + "num_input_tokens_seen": 334209545, + "step": 15488, + "time_per_iteration": 2.7648770809173584 + }, + { + "auxiliary_loss_clip": 0.01130428, + "auxiliary_loss_mlp": 0.0109976, + "balance_loss_clip": 1.00161636, + "balance_loss_mlp": 1.00038755, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 2.15538905490536, + "language_loss": 0.74842095, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77072281, + "num_input_tokens_seen": 334228900, + "step": 15489, + "time_per_iteration": 2.6347272396087646 + }, + { + "auxiliary_loss_clip": 0.01164213, + "auxiliary_loss_mlp": 0.0110196, + "balance_loss_clip": 1.00185382, + "balance_loss_mlp": 1.00068021, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 2.2631256078494144, + "language_loss": 0.80969155, + "learning_rate": 4.925055698519931e-08, + "loss": 0.83235323, + "num_input_tokens_seen": 334245500, + "step": 15490, + "time_per_iteration": 3.898074150085449 + }, + { + "auxiliary_loss_clip": 0.01097561, + "auxiliary_loss_mlp": 0.01101395, + "balance_loss_clip": 1.00163531, + "balance_loss_mlp": 1.00059175, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 2.078531139886863, + "language_loss": 0.71832085, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.74031037, + "num_input_tokens_seen": 334264370, + "step": 15491, + "time_per_iteration": 2.7026987075805664 + }, + { + "auxiliary_loss_clip": 0.01131613, + "auxiliary_loss_mlp": 0.00747145, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.0004257, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 1.8539281115394532, + "language_loss": 0.74483007, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76361763, + "num_input_tokens_seen": 334283905, + "step": 15492, + "time_per_iteration": 2.632370710372925 + }, + { + "auxiliary_loss_clip": 0.01143693, + "auxiliary_loss_mlp": 0.01073767, + "balance_loss_clip": 1.00073338, + "balance_loss_mlp": 1.00014353, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.710204518107757, + "language_loss": 0.53442073, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55659533, + "num_input_tokens_seen": 334339925, + "step": 15493, + "time_per_iteration": 4.375655651092529 + }, + { + "auxiliary_loss_clip": 0.01149128, + "auxiliary_loss_mlp": 0.01100502, + "balance_loss_clip": 1.00192511, + "balance_loss_mlp": 1.00055742, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.6575924271368305, + "language_loss": 0.70602649, + "learning_rate": 4.890755917128531e-08, + "loss": 0.72852278, + "num_input_tokens_seen": 334357225, + "step": 15494, + "time_per_iteration": 2.5641283988952637 + }, + { + "auxiliary_loss_clip": 0.01148118, + "auxiliary_loss_mlp": 0.01101513, + "balance_loss_clip": 1.00199401, + "balance_loss_mlp": 1.00042379, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.655883652305739, + "language_loss": 0.68416202, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70665824, + "num_input_tokens_seen": 334375945, + "step": 15495, + "time_per_iteration": 2.61942195892334 + }, + { + "auxiliary_loss_clip": 0.01163988, + "auxiliary_loss_mlp": 0.01099813, + "balance_loss_clip": 1.00178909, + "balance_loss_mlp": 1.00053596, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 1.9506574407370025, + "language_loss": 0.61565161, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63828969, + "num_input_tokens_seen": 334395310, + "step": 15496, + "time_per_iteration": 2.579294204711914 + }, + { + "auxiliary_loss_clip": 0.01147949, + "auxiliary_loss_mlp": 0.01100221, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00051379, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 2.006131462000222, + "language_loss": 0.77085829, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79333997, + "num_input_tokens_seen": 334416965, + "step": 15497, + "time_per_iteration": 2.654512643814087 + }, + { + "auxiliary_loss_clip": 0.01148081, + "auxiliary_loss_mlp": 0.00747229, + "balance_loss_clip": 1.00186157, + "balance_loss_mlp": 1.00041091, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.6896306836750514, + "language_loss": 0.66615009, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68510318, + "num_input_tokens_seen": 334435620, + "step": 15498, + "time_per_iteration": 2.5815813541412354 + }, + { + "auxiliary_loss_clip": 0.01131183, + "auxiliary_loss_mlp": 0.01100112, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.00054884, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 2.6994158097127015, + "language_loss": 0.80029774, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82261074, + "num_input_tokens_seen": 334456210, + "step": 15499, + "time_per_iteration": 2.634695053100586 + }, + { + "auxiliary_loss_clip": 0.01100682, + "auxiliary_loss_mlp": 0.01099954, + "balance_loss_clip": 1.00162053, + "balance_loss_mlp": 1.00062919, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.531024370397886, + "language_loss": 0.76678586, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.78879225, + "num_input_tokens_seen": 334475485, + "step": 15500, + "time_per_iteration": 2.706462860107422 + }, + { + "auxiliary_loss_clip": 0.01115321, + "auxiliary_loss_mlp": 0.01099127, + "balance_loss_clip": 1.00158894, + "balance_loss_mlp": 1.00046945, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 1.9144240519737385, + "language_loss": 0.7269116, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.7490561, + "num_input_tokens_seen": 334494740, + "step": 15501, + "time_per_iteration": 2.651268243789673 + }, + { + "auxiliary_loss_clip": 0.01164307, + "auxiliary_loss_mlp": 0.01101371, + "balance_loss_clip": 1.00187778, + "balance_loss_mlp": 1.00047207, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 1.9207912126900482, + "language_loss": 0.66349578, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68615258, + "num_input_tokens_seen": 334511910, + "step": 15502, + "time_per_iteration": 2.5321056842803955 + }, + { + "auxiliary_loss_clip": 0.0114765, + "auxiliary_loss_mlp": 0.00747519, + "balance_loss_clip": 1.00181222, + "balance_loss_mlp": 1.00060081, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.5949874136504276, + "language_loss": 0.65751618, + "learning_rate": 4.814014256446586e-08, + "loss": 0.6764679, + "num_input_tokens_seen": 334533150, + "step": 15503, + "time_per_iteration": 2.6238691806793213 + }, + { + "auxiliary_loss_clip": 0.01118219, + "auxiliary_loss_mlp": 0.01101555, + "balance_loss_clip": 1.00185001, + "balance_loss_mlp": 1.00056088, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.4907205448381988, + "language_loss": 0.75489718, + "learning_rate": 4.805524408317652e-08, + "loss": 0.7770949, + "num_input_tokens_seen": 334550940, + "step": 15504, + "time_per_iteration": 2.6355292797088623 + }, + { + "auxiliary_loss_clip": 0.01147437, + "auxiliary_loss_mlp": 0.00747381, + "balance_loss_clip": 1.00180221, + "balance_loss_mlp": 1.00052559, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 2.0207895381817687, + "language_loss": 0.71131337, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73026156, + "num_input_tokens_seen": 334570935, + "step": 15505, + "time_per_iteration": 2.590327262878418 + }, + { + "auxiliary_loss_clip": 0.01132578, + "auxiliary_loss_mlp": 0.01100814, + "balance_loss_clip": 1.00178039, + "balance_loss_mlp": 1.00048721, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 2.044462729895973, + "language_loss": 0.75338948, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77572334, + "num_input_tokens_seen": 334589315, + "step": 15506, + "time_per_iteration": 2.5968241691589355 + }, + { + "auxiliary_loss_clip": 0.01117333, + "auxiliary_loss_mlp": 0.01099383, + "balance_loss_clip": 1.00183654, + "balance_loss_mlp": 1.00058222, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 2.1322396737161196, + "language_loss": 0.83137655, + "learning_rate": 4.780099275981597e-08, + "loss": 0.8535437, + "num_input_tokens_seen": 334608990, + "step": 15507, + "time_per_iteration": 2.659053087234497 + }, + { + "auxiliary_loss_clip": 0.01164151, + "auxiliary_loss_mlp": 0.0109969, + "balance_loss_clip": 1.00185931, + "balance_loss_mlp": 1.00041199, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.724109655097626, + "language_loss": 0.67898643, + "learning_rate": 4.771639036957742e-08, + "loss": 0.70162481, + "num_input_tokens_seen": 334628655, + "step": 15508, + "time_per_iteration": 4.085558176040649 + }, + { + "auxiliary_loss_clip": 0.01118093, + "auxiliary_loss_mlp": 0.01099406, + "balance_loss_clip": 1.00174868, + "balance_loss_mlp": 1.00055766, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.5933011286973473, + "language_loss": 0.72430664, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.7464816, + "num_input_tokens_seen": 334648295, + "step": 15509, + "time_per_iteration": 2.714195489883423 + }, + { + "auxiliary_loss_clip": 0.01147647, + "auxiliary_loss_mlp": 0.01100392, + "balance_loss_clip": 1.00180769, + "balance_loss_mlp": 1.00039899, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 2.2351322353923306, + "language_loss": 0.74496055, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76744097, + "num_input_tokens_seen": 334666280, + "step": 15510, + "time_per_iteration": 2.550171375274658 + }, + { + "auxiliary_loss_clip": 0.01149532, + "auxiliary_loss_mlp": 0.01100547, + "balance_loss_clip": 1.00179982, + "balance_loss_mlp": 1.00050664, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 1.649254142591758, + "language_loss": 0.70215976, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72466052, + "num_input_tokens_seen": 334688830, + "step": 15511, + "time_per_iteration": 2.7840230464935303 + }, + { + "auxiliary_loss_clip": 0.01132813, + "auxiliary_loss_mlp": 0.01100291, + "balance_loss_clip": 1.0018084, + "balance_loss_mlp": 1.00063157, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 1.9562377814873297, + "language_loss": 0.78140163, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80373269, + "num_input_tokens_seen": 334705205, + "step": 15512, + "time_per_iteration": 4.047589063644409 + }, + { + "auxiliary_loss_clip": 0.011641, + "auxiliary_loss_mlp": 0.01100688, + "balance_loss_clip": 1.00188494, + "balance_loss_mlp": 1.00045681, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 1.4586527059727643, + "language_loss": 0.80749208, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.83013999, + "num_input_tokens_seen": 334723830, + "step": 15513, + "time_per_iteration": 2.5497469902038574 + }, + { + "auxiliary_loss_clip": 0.01131507, + "auxiliary_loss_mlp": 0.01101736, + "balance_loss_clip": 1.00191045, + "balance_loss_mlp": 1.00055099, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 2.377365054182132, + "language_loss": 0.79857832, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82091069, + "num_input_tokens_seen": 334740825, + "step": 15514, + "time_per_iteration": 2.564053773880005 + }, + { + "auxiliary_loss_clip": 0.01131424, + "auxiliary_loss_mlp": 0.01099955, + "balance_loss_clip": 1.00192165, + "balance_loss_mlp": 1.0005821, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 1.8872729957102934, + "language_loss": 0.71975857, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.74207234, + "num_input_tokens_seen": 334765825, + "step": 15515, + "time_per_iteration": 2.7828667163848877 + }, + { + "auxiliary_loss_clip": 0.01132999, + "auxiliary_loss_mlp": 0.01101242, + "balance_loss_clip": 1.00173569, + "balance_loss_mlp": 1.00053358, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.368831632203886, + "language_loss": 0.80902815, + "learning_rate": 4.704223662500806e-08, + "loss": 0.83137047, + "num_input_tokens_seen": 334782680, + "step": 15516, + "time_per_iteration": 2.6523642539978027 + }, + { + "auxiliary_loss_clip": 0.01119879, + "auxiliary_loss_mlp": 0.01100681, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00049734, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 3.049783419306664, + "language_loss": 0.80448824, + "learning_rate": 4.695830062703643e-08, + "loss": 0.82669383, + "num_input_tokens_seen": 334800160, + "step": 15517, + "time_per_iteration": 2.669857978820801 + }, + { + "auxiliary_loss_clip": 0.0113083, + "auxiliary_loss_mlp": 0.0110036, + "balance_loss_clip": 1.00168645, + "balance_loss_mlp": 1.0005579, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 2.100712519368552, + "language_loss": 0.74861872, + "learning_rate": 4.687443868860219e-08, + "loss": 0.77093059, + "num_input_tokens_seen": 334815840, + "step": 15518, + "time_per_iteration": 2.575455665588379 + }, + { + "auxiliary_loss_clip": 0.01132712, + "auxiliary_loss_mlp": 0.01100212, + "balance_loss_clip": 1.00172067, + "balance_loss_mlp": 1.00064814, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 2.1174742992369513, + "language_loss": 0.75700289, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77933216, + "num_input_tokens_seen": 334834735, + "step": 15519, + "time_per_iteration": 2.605239152908325 + }, + { + "auxiliary_loss_clip": 0.01085102, + "auxiliary_loss_mlp": 0.01099926, + "balance_loss_clip": 1.00162554, + "balance_loss_mlp": 1.00045824, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.741058872410936, + "language_loss": 0.82869112, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.85054135, + "num_input_tokens_seen": 334853490, + "step": 15520, + "time_per_iteration": 2.715740203857422 + }, + { + "auxiliary_loss_clip": 0.01149311, + "auxiliary_loss_mlp": 0.01099912, + "balance_loss_clip": 1.00181675, + "balance_loss_mlp": 1.00049186, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.7684728808784198, + "language_loss": 0.76108599, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78357816, + "num_input_tokens_seen": 334873675, + "step": 15521, + "time_per_iteration": 2.5623714923858643 + }, + { + "auxiliary_loss_clip": 0.01149674, + "auxiliary_loss_mlp": 0.01099966, + "balance_loss_clip": 1.00204444, + "balance_loss_mlp": 1.00040293, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 2.042459897102972, + "language_loss": 0.77769035, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.80018675, + "num_input_tokens_seen": 334890970, + "step": 15522, + "time_per_iteration": 2.5758886337280273 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.00747299, + "balance_loss_clip": 1.00173354, + "balance_loss_mlp": 1.00042462, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 2.0556512779184306, + "language_loss": 0.62874669, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.64738512, + "num_input_tokens_seen": 334906635, + "step": 15523, + "time_per_iteration": 2.628460645675659 + }, + { + "auxiliary_loss_clip": 0.01132657, + "auxiliary_loss_mlp": 0.01100054, + "balance_loss_clip": 1.00178909, + "balance_loss_mlp": 1.00049078, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 3.176321949877849, + "language_loss": 0.68353122, + "learning_rate": 4.63728224861577e-08, + "loss": 0.70585835, + "num_input_tokens_seen": 334926230, + "step": 15524, + "time_per_iteration": 2.658297300338745 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01101302, + "balance_loss_clip": 1.00174928, + "balance_loss_mlp": 1.00064206, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 1.7490450382103313, + "language_loss": 0.73795176, + "learning_rate": 4.628947905336589e-08, + "loss": 0.75997984, + "num_input_tokens_seen": 334946680, + "step": 15525, + "time_per_iteration": 2.7160398960113525 + }, + { + "auxiliary_loss_clip": 0.01101082, + "auxiliary_loss_mlp": 0.0109976, + "balance_loss_clip": 1.00160384, + "balance_loss_mlp": 1.00067282, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.7028179912903891, + "language_loss": 0.83672678, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.8587352, + "num_input_tokens_seen": 334964785, + "step": 15526, + "time_per_iteration": 2.7129666805267334 + }, + { + "auxiliary_loss_clip": 0.010989, + "auxiliary_loss_mlp": 0.0110019, + "balance_loss_clip": 1.00178742, + "balance_loss_mlp": 1.0005306, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 2.048462909485568, + "language_loss": 0.68507147, + "learning_rate": 4.61230144456366e-08, + "loss": 0.70706236, + "num_input_tokens_seen": 334982400, + "step": 15527, + "time_per_iteration": 2.6570231914520264 + }, + { + "auxiliary_loss_clip": 0.0116443, + "auxiliary_loss_mlp": 0.01101114, + "balance_loss_clip": 1.00198698, + "balance_loss_mlp": 1.00050163, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 1.830253438088087, + "language_loss": 0.65418601, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67684144, + "num_input_tokens_seen": 334999685, + "step": 15528, + "time_per_iteration": 3.964162588119507 + }, + { + "auxiliary_loss_clip": 0.01164205, + "auxiliary_loss_mlp": 0.01100804, + "balance_loss_clip": 1.00186384, + "balance_loss_mlp": 1.00047743, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 2.149362701996889, + "language_loss": 0.74735034, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.77000046, + "num_input_tokens_seen": 335019160, + "step": 15529, + "time_per_iteration": 2.521540880203247 + }, + { + "auxiliary_loss_clip": 0.01101492, + "auxiliary_loss_mlp": 0.01100008, + "balance_loss_clip": 1.00156164, + "balance_loss_mlp": 1.00058794, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.7474899242088666, + "language_loss": 0.63164169, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65365672, + "num_input_tokens_seen": 335037350, + "step": 15530, + "time_per_iteration": 4.067895174026489 + }, + { + "auxiliary_loss_clip": 0.01131223, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_clip": 1.00173998, + "balance_loss_mlp": 1.00050247, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 1.7333388791626396, + "language_loss": 0.72472334, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74702144, + "num_input_tokens_seen": 335056060, + "step": 15531, + "time_per_iteration": 2.606334686279297 + }, + { + "auxiliary_loss_clip": 0.01134313, + "auxiliary_loss_mlp": 0.01101156, + "balance_loss_clip": 1.00175834, + "balance_loss_mlp": 1.00044787, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 3.1961638813582147, + "language_loss": 0.71127343, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.73362803, + "num_input_tokens_seen": 335075410, + "step": 15532, + "time_per_iteration": 2.668630361557007 + }, + { + "auxiliary_loss_clip": 0.01164273, + "auxiliary_loss_mlp": 0.00747304, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00042629, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 2.144733733220671, + "language_loss": 0.73221725, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75133306, + "num_input_tokens_seen": 335095190, + "step": 15533, + "time_per_iteration": 2.570568561553955 + }, + { + "auxiliary_loss_clip": 0.0111527, + "auxiliary_loss_mlp": 0.01099276, + "balance_loss_clip": 1.00170922, + "balance_loss_mlp": 1.00042725, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.88300952343447, + "language_loss": 0.79766697, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81981242, + "num_input_tokens_seen": 335113825, + "step": 15534, + "time_per_iteration": 2.6290571689605713 + }, + { + "auxiliary_loss_clip": 0.01163848, + "auxiliary_loss_mlp": 0.01099524, + "balance_loss_clip": 1.00186718, + "balance_loss_mlp": 1.00058007, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 1.7068458436784244, + "language_loss": 0.7455461, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76817989, + "num_input_tokens_seen": 335136425, + "step": 15535, + "time_per_iteration": 2.5682971477508545 + }, + { + "auxiliary_loss_clip": 0.01148014, + "auxiliary_loss_mlp": 0.01101219, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.00051105, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 2.1380452494838367, + "language_loss": 0.77451932, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79701167, + "num_input_tokens_seen": 335157925, + "step": 15536, + "time_per_iteration": 2.6253445148468018 + }, + { + "auxiliary_loss_clip": 0.01117913, + "auxiliary_loss_mlp": 0.01099786, + "balance_loss_clip": 1.00171709, + "balance_loss_mlp": 1.00046134, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.4261511825096271, + "language_loss": 0.80840749, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.83058453, + "num_input_tokens_seen": 335177840, + "step": 15537, + "time_per_iteration": 2.663867950439453 + }, + { + "auxiliary_loss_clip": 0.01131985, + "auxiliary_loss_mlp": 0.0110128, + "balance_loss_clip": 1.00178182, + "balance_loss_mlp": 1.0005722, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 1.7276677444735469, + "language_loss": 0.77696615, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.79929882, + "num_input_tokens_seen": 335199470, + "step": 15538, + "time_per_iteration": 2.651580572128296 + }, + { + "auxiliary_loss_clip": 0.01130829, + "auxiliary_loss_mlp": 0.01099198, + "balance_loss_clip": 1.0016706, + "balance_loss_mlp": 1.00044513, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.5316519921747471, + "language_loss": 0.73429722, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75659752, + "num_input_tokens_seen": 335218885, + "step": 15539, + "time_per_iteration": 2.613542079925537 + }, + { + "auxiliary_loss_clip": 0.01101099, + "auxiliary_loss_mlp": 0.0109958, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00044537, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 1.7592294733118445, + "language_loss": 0.65241957, + "learning_rate": 4.504821951247373e-08, + "loss": 0.67442644, + "num_input_tokens_seen": 335239485, + "step": 15540, + "time_per_iteration": 2.8026175498962402 + }, + { + "auxiliary_loss_clip": 0.01147863, + "auxiliary_loss_mlp": 0.01100501, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.00036502, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.7143427461481366, + "language_loss": 0.76713061, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78961426, + "num_input_tokens_seen": 335258355, + "step": 15541, + "time_per_iteration": 2.577439785003662 + }, + { + "auxiliary_loss_clip": 0.01147861, + "auxiliary_loss_mlp": 0.01100919, + "balance_loss_clip": 1.0019294, + "balance_loss_mlp": 1.00049686, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 1.942900292181489, + "language_loss": 0.66933429, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.69182205, + "num_input_tokens_seen": 335276835, + "step": 15542, + "time_per_iteration": 2.6116881370544434 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.01100665, + "balance_loss_clip": 1.0017457, + "balance_loss_mlp": 1.00052929, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 1.8781445159858312, + "language_loss": 0.69529778, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71747029, + "num_input_tokens_seen": 335296220, + "step": 15543, + "time_per_iteration": 2.612994432449341 + }, + { + "auxiliary_loss_clip": 0.0114942, + "auxiliary_loss_mlp": 0.01101288, + "balance_loss_clip": 1.00180602, + "balance_loss_mlp": 1.00043678, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 1.9707607835706078, + "language_loss": 0.69435143, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71685863, + "num_input_tokens_seen": 335316335, + "step": 15544, + "time_per_iteration": 2.624305009841919 + }, + { + "auxiliary_loss_clip": 0.01149313, + "auxiliary_loss_mlp": 0.01101528, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 1.5634364143090256, + "language_loss": 0.77369535, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79620373, + "num_input_tokens_seen": 335335545, + "step": 15545, + "time_per_iteration": 2.596982717514038 + }, + { + "auxiliary_loss_clip": 0.01147639, + "auxiliary_loss_mlp": 0.01101226, + "balance_loss_clip": 1.0018189, + "balance_loss_mlp": 1.00042307, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.7248042375461161, + "language_loss": 0.69369018, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71617889, + "num_input_tokens_seen": 335355350, + "step": 15546, + "time_per_iteration": 4.20422887802124 + }, + { + "auxiliary_loss_clip": 0.01118118, + "auxiliary_loss_mlp": 0.0109913, + "balance_loss_clip": 1.00194168, + "balance_loss_mlp": 1.00047231, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 1.7569369357909541, + "language_loss": 0.82414067, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84631312, + "num_input_tokens_seen": 335375160, + "step": 15547, + "time_per_iteration": 2.698359966278076 + }, + { + "auxiliary_loss_clip": 0.01149135, + "auxiliary_loss_mlp": 0.01099855, + "balance_loss_clip": 1.00181186, + "balance_loss_mlp": 1.00048184, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 1.833161036129955, + "language_loss": 0.83559483, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85808474, + "num_input_tokens_seen": 335394080, + "step": 15548, + "time_per_iteration": 2.530879497528076 + }, + { + "auxiliary_loss_clip": 0.01147605, + "auxiliary_loss_mlp": 0.01101978, + "balance_loss_clip": 1.00182891, + "balance_loss_mlp": 1.00050676, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 1.7491348152914787, + "language_loss": 0.65617925, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.67867506, + "num_input_tokens_seen": 335414230, + "step": 15549, + "time_per_iteration": 2.701633930206299 + }, + { + "auxiliary_loss_clip": 0.01147476, + "auxiliary_loss_mlp": 0.01101969, + "balance_loss_clip": 1.00192368, + "balance_loss_mlp": 1.00059354, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 1.6590412715177374, + "language_loss": 0.79933792, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82183236, + "num_input_tokens_seen": 335432890, + "step": 15550, + "time_per_iteration": 3.969313859939575 + }, + { + "auxiliary_loss_clip": 0.01147545, + "auxiliary_loss_mlp": 0.0110002, + "balance_loss_clip": 1.00193453, + "balance_loss_mlp": 1.00055182, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.7077803025224343, + "language_loss": 0.7543155, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.7767911, + "num_input_tokens_seen": 335452085, + "step": 15551, + "time_per_iteration": 2.5688796043395996 + }, + { + "auxiliary_loss_clip": 0.01085682, + "auxiliary_loss_mlp": 0.01099186, + "balance_loss_clip": 1.00159824, + "balance_loss_mlp": 1.00057578, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.5474107362776623, + "language_loss": 0.73216808, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75401676, + "num_input_tokens_seen": 335472130, + "step": 15552, + "time_per_iteration": 2.8391470909118652 + }, + { + "auxiliary_loss_clip": 0.01101127, + "auxiliary_loss_mlp": 0.01100986, + "balance_loss_clip": 1.00166106, + "balance_loss_mlp": 1.00070739, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 1.6857787551348644, + "language_loss": 0.77207053, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79409164, + "num_input_tokens_seen": 335489970, + "step": 15553, + "time_per_iteration": 2.681013345718384 + }, + { + "auxiliary_loss_clip": 0.01118469, + "auxiliary_loss_mlp": 0.0110108, + "balance_loss_clip": 1.00164115, + "balance_loss_mlp": 1.00065792, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 2.8162047591343997, + "language_loss": 0.78432959, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80652505, + "num_input_tokens_seen": 335509125, + "step": 15554, + "time_per_iteration": 2.637005090713501 + }, + { + "auxiliary_loss_clip": 0.0113252, + "auxiliary_loss_mlp": 0.01099011, + "balance_loss_clip": 1.0017463, + "balance_loss_mlp": 1.00049591, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 1.7364290506031457, + "language_loss": 0.69174999, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71406525, + "num_input_tokens_seen": 335525620, + "step": 15555, + "time_per_iteration": 2.5698368549346924 + }, + { + "auxiliary_loss_clip": 0.01052195, + "auxiliary_loss_mlp": 0.01101097, + "balance_loss_clip": 1.00169384, + "balance_loss_mlp": 1.00067496, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.5908207436707915, + "language_loss": 0.75562811, + "learning_rate": 4.374259430715965e-08, + "loss": 0.777161, + "num_input_tokens_seen": 335547565, + "step": 15556, + "time_per_iteration": 2.8643643856048584 + }, + { + "auxiliary_loss_clip": 0.01132152, + "auxiliary_loss_mlp": 0.01100336, + "balance_loss_clip": 1.00170064, + "balance_loss_mlp": 1.00053418, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.5705687398576322, + "language_loss": 0.7226755, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74500042, + "num_input_tokens_seen": 335570285, + "step": 15557, + "time_per_iteration": 2.645829439163208 + }, + { + "auxiliary_loss_clip": 0.01164113, + "auxiliary_loss_mlp": 0.01100741, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00046206, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.5392165900982984, + "language_loss": 0.63099474, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65364325, + "num_input_tokens_seen": 335588600, + "step": 15558, + "time_per_iteration": 2.5293636322021484 + }, + { + "auxiliary_loss_clip": 0.01131049, + "auxiliary_loss_mlp": 0.01100363, + "balance_loss_clip": 1.00173807, + "balance_loss_mlp": 1.00046563, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 2.2012459478987147, + "language_loss": 0.73314756, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75546169, + "num_input_tokens_seen": 335606235, + "step": 15559, + "time_per_iteration": 2.5991032123565674 + }, + { + "auxiliary_loss_clip": 0.01101523, + "auxiliary_loss_mlp": 0.00747328, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00046515, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.6308498610146487, + "language_loss": 0.63688719, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65537572, + "num_input_tokens_seen": 335628240, + "step": 15560, + "time_per_iteration": 2.798513174057007 + }, + { + "auxiliary_loss_clip": 0.01070818, + "auxiliary_loss_mlp": 0.01102066, + "balance_loss_clip": 1.0016439, + "balance_loss_mlp": 1.00040448, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 2.0986453666709592, + "language_loss": 0.64400017, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.66572905, + "num_input_tokens_seen": 335643755, + "step": 15561, + "time_per_iteration": 2.7180674076080322 + }, + { + "auxiliary_loss_clip": 0.01164114, + "auxiliary_loss_mlp": 0.01100528, + "balance_loss_clip": 1.00189853, + "balance_loss_mlp": 1.00063086, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 2.1950212253482344, + "language_loss": 0.75627255, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77891898, + "num_input_tokens_seen": 335665160, + "step": 15562, + "time_per_iteration": 2.547917127609253 + }, + { + "auxiliary_loss_clip": 0.01140966, + "auxiliary_loss_mlp": 0.01073847, + "balance_loss_clip": 1.0006659, + "balance_loss_mlp": 1.00022316, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9482228767695263, + "language_loss": 0.6226778, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64482594, + "num_input_tokens_seen": 335715240, + "step": 15563, + "time_per_iteration": 2.9701013565063477 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01099694, + "balance_loss_clip": 1.00178218, + "balance_loss_mlp": 1.00055945, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.5502717359412983, + "language_loss": 0.78355014, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80554849, + "num_input_tokens_seen": 335734970, + "step": 15564, + "time_per_iteration": 2.7158868312835693 + }, + { + "auxiliary_loss_clip": 0.01164285, + "auxiliary_loss_mlp": 0.01101143, + "balance_loss_clip": 1.00184298, + "balance_loss_mlp": 1.00052989, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 1.8288667654348025, + "language_loss": 0.78100568, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80365992, + "num_input_tokens_seen": 335753435, + "step": 15565, + "time_per_iteration": 4.038430690765381 + }, + { + "auxiliary_loss_clip": 0.01147459, + "auxiliary_loss_mlp": 0.01099669, + "balance_loss_clip": 1.00182462, + "balance_loss_mlp": 1.00039172, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 2.418370885274374, + "language_loss": 0.72389466, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.7463659, + "num_input_tokens_seen": 335772105, + "step": 15566, + "time_per_iteration": 2.618347406387329 + }, + { + "auxiliary_loss_clip": 0.01116193, + "auxiliary_loss_mlp": 0.00747315, + "balance_loss_clip": 1.00165343, + "balance_loss_mlp": 1.00048268, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 3.7062332369324573, + "language_loss": 0.68270767, + "learning_rate": 4.285599216057889e-08, + "loss": 0.7013427, + "num_input_tokens_seen": 335789125, + "step": 15567, + "time_per_iteration": 2.670470714569092 + }, + { + "auxiliary_loss_clip": 0.01133327, + "auxiliary_loss_mlp": 0.01100927, + "balance_loss_clip": 1.00181663, + "balance_loss_mlp": 1.00050485, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 2.0771409724678382, + "language_loss": 0.62475365, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64709616, + "num_input_tokens_seen": 335810995, + "step": 15568, + "time_per_iteration": 4.036384582519531 + }, + { + "auxiliary_loss_clip": 0.0113457, + "auxiliary_loss_mlp": 0.01100722, + "balance_loss_clip": 1.00168967, + "balance_loss_mlp": 1.00068164, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.547062085861296, + "language_loss": 0.78744853, + "learning_rate": 4.269575644764556e-08, + "loss": 0.8098014, + "num_input_tokens_seen": 335830580, + "step": 15569, + "time_per_iteration": 2.593956708908081 + }, + { + "auxiliary_loss_clip": 0.01132279, + "auxiliary_loss_mlp": 0.01100917, + "balance_loss_clip": 1.00185442, + "balance_loss_mlp": 1.00059009, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.5979444483093976, + "language_loss": 0.69246447, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71479642, + "num_input_tokens_seen": 335846515, + "step": 15570, + "time_per_iteration": 2.616316556930542 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.0110027, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00046754, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 1.9907437962942793, + "language_loss": 0.78987205, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.81235349, + "num_input_tokens_seen": 335863350, + "step": 15571, + "time_per_iteration": 2.543750286102295 + }, + { + "auxiliary_loss_clip": 0.01118055, + "auxiliary_loss_mlp": 0.01100337, + "balance_loss_clip": 1.00174451, + "balance_loss_mlp": 1.00048733, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 2.0871475532709183, + "language_loss": 0.77640188, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79858577, + "num_input_tokens_seen": 335880510, + "step": 15572, + "time_per_iteration": 2.6305038928985596 + }, + { + "auxiliary_loss_clip": 0.01132121, + "auxiliary_loss_mlp": 0.01099147, + "balance_loss_clip": 1.00167346, + "balance_loss_mlp": 1.00053728, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 2.483308739035863, + "language_loss": 0.77888763, + "learning_rate": 4.237617570010688e-08, + "loss": 0.80120027, + "num_input_tokens_seen": 335899440, + "step": 15573, + "time_per_iteration": 2.694209337234497 + }, + { + "auxiliary_loss_clip": 0.01120152, + "auxiliary_loss_mlp": 0.01100315, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.0003221, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 1.8995237639748173, + "language_loss": 0.74673367, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76893836, + "num_input_tokens_seen": 335919540, + "step": 15574, + "time_per_iteration": 2.689305305480957 + }, + { + "auxiliary_loss_clip": 0.0109993, + "auxiliary_loss_mlp": 0.01100305, + "balance_loss_clip": 1.00174952, + "balance_loss_mlp": 1.00069404, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 2.6963088357153273, + "language_loss": 0.67972344, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70172584, + "num_input_tokens_seen": 335939665, + "step": 15575, + "time_per_iteration": 2.7186014652252197 + }, + { + "auxiliary_loss_clip": 0.01132694, + "auxiliary_loss_mlp": 0.01099651, + "balance_loss_clip": 1.00201201, + "balance_loss_mlp": 1.00056434, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 1.5113240805702044, + "language_loss": 0.65136051, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67368388, + "num_input_tokens_seen": 335958580, + "step": 15576, + "time_per_iteration": 2.587334156036377 + }, + { + "auxiliary_loss_clip": 0.01149421, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_clip": 1.001827, + "balance_loss_mlp": 1.0004611, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 3.1912898208371416, + "language_loss": 0.75898933, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.78149188, + "num_input_tokens_seen": 335974965, + "step": 15577, + "time_per_iteration": 2.6077706813812256 + }, + { + "auxiliary_loss_clip": 0.0110111, + "auxiliary_loss_mlp": 0.01100368, + "balance_loss_clip": 1.0016489, + "balance_loss_mlp": 1.00042343, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 1.9514163960906263, + "language_loss": 0.52405226, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.54606706, + "num_input_tokens_seen": 335996575, + "step": 15578, + "time_per_iteration": 2.7566919326782227 + }, + { + "auxiliary_loss_clip": 0.01086466, + "auxiliary_loss_mlp": 0.01099424, + "balance_loss_clip": 1.00165594, + "balance_loss_mlp": 1.00081444, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 2.143947319768241, + "language_loss": 0.70827752, + "learning_rate": 4.189903163783692e-08, + "loss": 0.73013639, + "num_input_tokens_seen": 336017265, + "step": 15579, + "time_per_iteration": 2.748558282852173 + }, + { + "auxiliary_loss_clip": 0.01132791, + "auxiliary_loss_mlp": 0.01100332, + "balance_loss_clip": 1.00179791, + "balance_loss_mlp": 1.00043416, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 2.320690142514656, + "language_loss": 0.76527399, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78760517, + "num_input_tokens_seen": 336035905, + "step": 15580, + "time_per_iteration": 2.6401162147521973 + }, + { + "auxiliary_loss_clip": 0.01149553, + "auxiliary_loss_mlp": 0.01101409, + "balance_loss_clip": 1.00192595, + "balance_loss_mlp": 1.00046301, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 1.9170383695627564, + "language_loss": 0.66409445, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68660408, + "num_input_tokens_seen": 336055585, + "step": 15581, + "time_per_iteration": 2.5898921489715576 + }, + { + "auxiliary_loss_clip": 0.01147911, + "auxiliary_loss_mlp": 0.01099915, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00039887, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.819895212608772, + "language_loss": 0.76788616, + "learning_rate": 4.166146195972042e-08, + "loss": 0.79036438, + "num_input_tokens_seen": 336076695, + "step": 15582, + "time_per_iteration": 2.6106362342834473 + }, + { + "auxiliary_loss_clip": 0.0107036, + "auxiliary_loss_mlp": 0.0110052, + "balance_loss_clip": 1.00164461, + "balance_loss_mlp": 1.00057459, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.6397438581047774, + "language_loss": 0.74019563, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.76190442, + "num_input_tokens_seen": 336094740, + "step": 15583, + "time_per_iteration": 4.378225803375244 + }, + { + "auxiliary_loss_clip": 0.01164413, + "auxiliary_loss_mlp": 0.01101027, + "balance_loss_clip": 1.00200176, + "balance_loss_mlp": 1.00060487, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.06365441835767, + "language_loss": 0.84221196, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.86486638, + "num_input_tokens_seen": 336113985, + "step": 15584, + "time_per_iteration": 2.5996456146240234 + }, + { + "auxiliary_loss_clip": 0.01147985, + "auxiliary_loss_mlp": 0.0074741, + "balance_loss_clip": 1.00183427, + "balance_loss_mlp": 1.00048983, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.4697309610804838, + "language_loss": 0.72354341, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.74249732, + "num_input_tokens_seen": 336136395, + "step": 15585, + "time_per_iteration": 2.732487916946411 + }, + { + "auxiliary_loss_clip": 0.01115824, + "auxiliary_loss_mlp": 0.01098856, + "balance_loss_clip": 1.00164497, + "balance_loss_mlp": 1.0003891, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 2.1909901167135075, + "language_loss": 0.80416274, + "learning_rate": 4.134574204836316e-08, + "loss": 0.8263095, + "num_input_tokens_seen": 336156345, + "step": 15586, + "time_per_iteration": 2.6963586807250977 + }, + { + "auxiliary_loss_clip": 0.01116118, + "auxiliary_loss_mlp": 0.01100803, + "balance_loss_clip": 1.00168562, + "balance_loss_mlp": 1.0005722, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.7514161268425186, + "language_loss": 0.76673496, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78890419, + "num_input_tokens_seen": 336176760, + "step": 15587, + "time_per_iteration": 4.0452187061309814 + }, + { + "auxiliary_loss_clip": 0.01132342, + "auxiliary_loss_mlp": 0.01101386, + "balance_loss_clip": 1.00171363, + "balance_loss_mlp": 1.00063014, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 1.7576783636007545, + "language_loss": 0.87461042, + "learning_rate": 4.118832771491387e-08, + "loss": 0.89694774, + "num_input_tokens_seen": 336193285, + "step": 15588, + "time_per_iteration": 2.6256659030914307 + }, + { + "auxiliary_loss_clip": 0.01164016, + "auxiliary_loss_mlp": 0.00747317, + "balance_loss_clip": 1.00191998, + "balance_loss_mlp": 1.0004921, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.618240322974355, + "language_loss": 0.78664893, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80576229, + "num_input_tokens_seen": 336211425, + "step": 15589, + "time_per_iteration": 2.5883593559265137 + }, + { + "auxiliary_loss_clip": 0.01163955, + "auxiliary_loss_mlp": 0.01099073, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00055885, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 2.103535242923024, + "language_loss": 0.77962947, + "learning_rate": 4.103121049480163e-08, + "loss": 0.8022598, + "num_input_tokens_seen": 336230205, + "step": 15590, + "time_per_iteration": 2.5493195056915283 + }, + { + "auxiliary_loss_clip": 0.01133622, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_clip": 1.00206113, + "balance_loss_mlp": 1.0006969, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 2.3502605638781544, + "language_loss": 0.71192276, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73427683, + "num_input_tokens_seen": 336252440, + "step": 15591, + "time_per_iteration": 2.667628288269043 + }, + { + "auxiliary_loss_clip": 0.01147878, + "auxiliary_loss_mlp": 0.00747382, + "balance_loss_clip": 1.00177705, + "balance_loss_mlp": 1.00044405, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 1.9901823221664634, + "language_loss": 0.53867429, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.55762684, + "num_input_tokens_seen": 336273845, + "step": 15592, + "time_per_iteration": 2.656867027282715 + }, + { + "auxiliary_loss_clip": 0.011479, + "auxiliary_loss_mlp": 0.01099561, + "balance_loss_clip": 1.00190914, + "balance_loss_mlp": 1.00047421, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 1.4850208777100724, + "language_loss": 0.67323279, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69570738, + "num_input_tokens_seen": 336292790, + "step": 15593, + "time_per_iteration": 2.6244213581085205 + }, + { + "auxiliary_loss_clip": 0.01132373, + "auxiliary_loss_mlp": 0.01100588, + "balance_loss_clip": 1.00178242, + "balance_loss_mlp": 1.00045204, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.904055570814138, + "language_loss": 0.74215126, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76448089, + "num_input_tokens_seen": 336312600, + "step": 15594, + "time_per_iteration": 2.6508514881134033 + }, + { + "auxiliary_loss_clip": 0.01149285, + "auxiliary_loss_mlp": 0.01100287, + "balance_loss_clip": 1.00179458, + "balance_loss_mlp": 1.00043714, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.6815460067287982, + "language_loss": 0.73315859, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75565428, + "num_input_tokens_seen": 336332770, + "step": 15595, + "time_per_iteration": 2.613807201385498 + }, + { + "auxiliary_loss_clip": 0.0113146, + "auxiliary_loss_mlp": 0.01100339, + "balance_loss_clip": 1.00194764, + "balance_loss_mlp": 1.00048947, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 1.9773070634651908, + "language_loss": 0.76331663, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78563464, + "num_input_tokens_seen": 336351445, + "step": 15596, + "time_per_iteration": 2.619668483734131 + }, + { + "auxiliary_loss_clip": 0.01131477, + "auxiliary_loss_mlp": 0.01100239, + "balance_loss_clip": 1.00192225, + "balance_loss_mlp": 1.00048435, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 2.162350521065008, + "language_loss": 0.78546149, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80777866, + "num_input_tokens_seen": 336368690, + "step": 15597, + "time_per_iteration": 2.5941665172576904 + }, + { + "auxiliary_loss_clip": 0.01164298, + "auxiliary_loss_mlp": 0.0110084, + "balance_loss_clip": 1.00193167, + "balance_loss_mlp": 1.0005132, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.4265601101958343, + "language_loss": 0.8113308, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83398211, + "num_input_tokens_seen": 336388165, + "step": 15598, + "time_per_iteration": 2.519552707672119 + }, + { + "auxiliary_loss_clip": 0.01114587, + "auxiliary_loss_mlp": 0.01102441, + "balance_loss_clip": 1.00162566, + "balance_loss_mlp": 1.00063682, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 2.041316289851166, + "language_loss": 0.62797481, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65014517, + "num_input_tokens_seen": 336406475, + "step": 15599, + "time_per_iteration": 2.7149078845977783 + }, + { + "auxiliary_loss_clip": 0.01116391, + "auxiliary_loss_mlp": 0.01100524, + "balance_loss_clip": 1.00181961, + "balance_loss_mlp": 1.00048375, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 1.8363450861206434, + "language_loss": 0.73239636, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75456554, + "num_input_tokens_seen": 336424690, + "step": 15600, + "time_per_iteration": 2.644378185272217 + }, + { + "auxiliary_loss_clip": 0.01131168, + "auxiliary_loss_mlp": 0.01100384, + "balance_loss_clip": 1.00178266, + "balance_loss_mlp": 1.00043857, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 3.2739383501798205, + "language_loss": 0.69000447, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71232003, + "num_input_tokens_seen": 336443055, + "step": 15601, + "time_per_iteration": 2.583082914352417 + }, + { + "auxiliary_loss_clip": 0.01143594, + "auxiliary_loss_mlp": 0.01073767, + "balance_loss_clip": 1.00071621, + "balance_loss_mlp": 1.00014329, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7548555116290172, + "language_loss": 0.5811795, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60335308, + "num_input_tokens_seen": 336510190, + "step": 15602, + "time_per_iteration": 3.3176567554473877 + }, + { + "auxiliary_loss_clip": 0.01070228, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_clip": 1.00177372, + "balance_loss_mlp": 1.00048351, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 2.337648580429108, + "language_loss": 0.72027957, + "learning_rate": 4.001719234324663e-08, + "loss": 0.74199569, + "num_input_tokens_seen": 336529250, + "step": 15603, + "time_per_iteration": 4.167659282684326 + }, + { + "auxiliary_loss_clip": 0.01163826, + "auxiliary_loss_mlp": 0.01098752, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.00047541, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.7343826968228386, + "language_loss": 0.76273525, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78536099, + "num_input_tokens_seen": 336548530, + "step": 15604, + "time_per_iteration": 2.5824151039123535 + }, + { + "auxiliary_loss_clip": 0.0113449, + "auxiliary_loss_mlp": 0.01101128, + "balance_loss_clip": 1.00187624, + "balance_loss_mlp": 1.00046754, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 2.361116457654365, + "language_loss": 0.65547299, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67782921, + "num_input_tokens_seen": 336568510, + "step": 15605, + "time_per_iteration": 2.629955768585205 + }, + { + "auxiliary_loss_clip": 0.01115454, + "auxiliary_loss_mlp": 0.00747499, + "balance_loss_clip": 1.00160551, + "balance_loss_mlp": 1.00048053, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 1.7572297465733249, + "language_loss": 0.67378819, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69241774, + "num_input_tokens_seen": 336592020, + "step": 15606, + "time_per_iteration": 4.30299973487854 + }, + { + "auxiliary_loss_clip": 0.01147249, + "auxiliary_loss_mlp": 0.01099043, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00043321, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.8044836891533196, + "language_loss": 0.7741338, + "learning_rate": 3.970771343058166e-08, + "loss": 0.79659671, + "num_input_tokens_seen": 336610010, + "step": 15607, + "time_per_iteration": 2.573972702026367 + }, + { + "auxiliary_loss_clip": 0.01149579, + "auxiliary_loss_mlp": 0.01100625, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.00039399, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 2.3294377768291037, + "language_loss": 0.8261925, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84869456, + "num_input_tokens_seen": 336628520, + "step": 15608, + "time_per_iteration": 2.6115732192993164 + }, + { + "auxiliary_loss_clip": 0.01147745, + "auxiliary_loss_mlp": 0.0110159, + "balance_loss_clip": 1.00202703, + "balance_loss_mlp": 1.0005486, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 1.7811755823524482, + "language_loss": 0.68774402, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.71023738, + "num_input_tokens_seen": 336647365, + "step": 15609, + "time_per_iteration": 2.5799591541290283 + }, + { + "auxiliary_loss_clip": 0.01115422, + "auxiliary_loss_mlp": 0.01100435, + "balance_loss_clip": 1.00165009, + "balance_loss_mlp": 1.00039434, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 3.4544307551849633, + "language_loss": 0.75159866, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77375722, + "num_input_tokens_seen": 336667165, + "step": 15610, + "time_per_iteration": 2.6936566829681396 + }, + { + "auxiliary_loss_clip": 0.01083116, + "auxiliary_loss_mlp": 0.01099784, + "balance_loss_clip": 1.00163281, + "balance_loss_mlp": 1.00045931, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 1.8967413845546242, + "language_loss": 0.7524038, + "learning_rate": 3.939942386953987e-08, + "loss": 0.77423275, + "num_input_tokens_seen": 336684130, + "step": 15611, + "time_per_iteration": 2.720888614654541 + }, + { + "auxiliary_loss_clip": 0.01113721, + "auxiliary_loss_mlp": 0.01099354, + "balance_loss_clip": 1.00171614, + "balance_loss_mlp": 1.00041068, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 1.74246709168696, + "language_loss": 0.66023993, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68237066, + "num_input_tokens_seen": 336701520, + "step": 15612, + "time_per_iteration": 2.6655328273773193 + }, + { + "auxiliary_loss_clip": 0.01147446, + "auxiliary_loss_mlp": 0.01099157, + "balance_loss_clip": 1.00185156, + "balance_loss_mlp": 1.00049949, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 2.0174901494543893, + "language_loss": 0.56954175, + "learning_rate": 3.924572515435742e-08, + "loss": 0.59200776, + "num_input_tokens_seen": 336720675, + "step": 15613, + "time_per_iteration": 2.584812879562378 + }, + { + "auxiliary_loss_clip": 0.0113082, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_clip": 1.00165415, + "balance_loss_mlp": 1.00071001, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 1.9521257099499774, + "language_loss": 0.70767891, + "learning_rate": 3.916898732330764e-08, + "loss": 0.72999465, + "num_input_tokens_seen": 336741005, + "step": 15614, + "time_per_iteration": 2.687610149383545 + }, + { + "auxiliary_loss_clip": 0.01147544, + "auxiliary_loss_mlp": 0.0110105, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00043714, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 3.194845222182951, + "language_loss": 0.81037211, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83285803, + "num_input_tokens_seen": 336757990, + "step": 15615, + "time_per_iteration": 2.5623562335968018 + }, + { + "auxiliary_loss_clip": 0.01132069, + "auxiliary_loss_mlp": 0.01100409, + "balance_loss_clip": 1.00181484, + "balance_loss_mlp": 1.00046349, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 17.5163698432888, + "language_loss": 0.7185052, + "learning_rate": 3.901573472884134e-08, + "loss": 0.74082994, + "num_input_tokens_seen": 336777705, + "step": 15616, + "time_per_iteration": 2.666062355041504 + }, + { + "auxiliary_loss_clip": 0.01164125, + "auxiliary_loss_mlp": 0.01100174, + "balance_loss_clip": 1.00193977, + "balance_loss_mlp": 1.0004673, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 2.7043708460177216, + "language_loss": 0.66107893, + "learning_rate": 3.89392199712355e-08, + "loss": 0.6837219, + "num_input_tokens_seen": 336798275, + "step": 15617, + "time_per_iteration": 2.5352816581726074 + }, + { + "auxiliary_loss_clip": 0.01147536, + "auxiliary_loss_mlp": 0.01100897, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.00047469, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 2.2739239982200043, + "language_loss": 0.73335934, + "learning_rate": 3.886277957725092e-08, + "loss": 0.7558437, + "num_input_tokens_seen": 336813835, + "step": 15618, + "time_per_iteration": 2.580402135848999 + }, + { + "auxiliary_loss_clip": 0.01164507, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.0004853, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 1.9062854114109902, + "language_loss": 0.70116538, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72382236, + "num_input_tokens_seen": 336832210, + "step": 15619, + "time_per_iteration": 2.513235092163086 + }, + { + "auxiliary_loss_clip": 0.01134832, + "auxiliary_loss_mlp": 0.01101145, + "balance_loss_clip": 1.00194216, + "balance_loss_mlp": 1.00053239, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.7349042206670167, + "language_loss": 0.7780934, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.80045319, + "num_input_tokens_seen": 336851380, + "step": 15620, + "time_per_iteration": 2.6543402671813965 + }, + { + "auxiliary_loss_clip": 0.01147281, + "auxiliary_loss_mlp": 0.01099346, + "balance_loss_clip": 1.00186336, + "balance_loss_mlp": 1.00049806, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 2.1538611130973737, + "language_loss": 0.73718762, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.75965393, + "num_input_tokens_seen": 336868525, + "step": 15621, + "time_per_iteration": 4.078829765319824 + }, + { + "auxiliary_loss_clip": 0.01118213, + "auxiliary_loss_mlp": 0.0110152, + "balance_loss_clip": 1.00181043, + "balance_loss_mlp": 1.00057316, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 3.175531689681213, + "language_loss": 0.65909231, + "learning_rate": 3.855776169545688e-08, + "loss": 0.68128961, + "num_input_tokens_seen": 336886200, + "step": 15622, + "time_per_iteration": 2.6154274940490723 + }, + { + "auxiliary_loss_clip": 0.01134391, + "auxiliary_loss_mlp": 0.01100065, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.00054932, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 2.59775784729195, + "language_loss": 0.71560907, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73795366, + "num_input_tokens_seen": 336905815, + "step": 15623, + "time_per_iteration": 2.620295286178589 + }, + { + "auxiliary_loss_clip": 0.01147523, + "auxiliary_loss_mlp": 0.01101495, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00054872, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 1.8999625769926194, + "language_loss": 0.72493196, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74742216, + "num_input_tokens_seen": 336928460, + "step": 15624, + "time_per_iteration": 2.719472885131836 + }, + { + "auxiliary_loss_clip": 0.01116054, + "auxiliary_loss_mlp": 0.01100062, + "balance_loss_clip": 1.00175929, + "balance_loss_mlp": 1.00054562, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 2.039512254884142, + "language_loss": 0.89263982, + "learning_rate": 3.832977924388614e-08, + "loss": 0.914801, + "num_input_tokens_seen": 336948320, + "step": 15625, + "time_per_iteration": 4.0873801708221436 + }, + { + "auxiliary_loss_clip": 0.01147353, + "auxiliary_loss_mlp": 0.01099922, + "balance_loss_clip": 1.00178123, + "balance_loss_mlp": 1.00054884, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 1.8690096071746056, + "language_loss": 0.83975279, + "learning_rate": 3.825393386298592e-08, + "loss": 0.86222553, + "num_input_tokens_seen": 336967670, + "step": 15626, + "time_per_iteration": 2.593708038330078 + }, + { + "auxiliary_loss_clip": 0.01129315, + "auxiliary_loss_mlp": 0.01074089, + "balance_loss_clip": 1.00072885, + "balance_loss_mlp": 1.00008345, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.7751353216591167, + "language_loss": 0.56127548, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58330953, + "num_input_tokens_seen": 337028395, + "step": 15627, + "time_per_iteration": 3.1906235218048096 + }, + { + "auxiliary_loss_clip": 0.01101154, + "auxiliary_loss_mlp": 0.01099337, + "balance_loss_clip": 1.00182486, + "balance_loss_mlp": 1.00048852, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 1.4794251317135614, + "language_loss": 0.70363104, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72563595, + "num_input_tokens_seen": 337048150, + "step": 15628, + "time_per_iteration": 2.71075439453125 + }, + { + "auxiliary_loss_clip": 0.01147541, + "auxiliary_loss_mlp": 0.01100235, + "balance_loss_clip": 1.00191951, + "balance_loss_mlp": 1.00052881, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 2.066085901223624, + "language_loss": 0.7545135, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77699125, + "num_input_tokens_seen": 337069315, + "step": 15629, + "time_per_iteration": 2.622243881225586 + }, + { + "auxiliary_loss_clip": 0.01085921, + "auxiliary_loss_mlp": 0.01099431, + "balance_loss_clip": 1.00168812, + "balance_loss_mlp": 1.00039172, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.766031062251393, + "language_loss": 0.74496144, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76681495, + "num_input_tokens_seen": 337087765, + "step": 15630, + "time_per_iteration": 2.7170605659484863 + }, + { + "auxiliary_loss_clip": 0.01133281, + "auxiliary_loss_mlp": 0.01098909, + "balance_loss_clip": 1.00190508, + "balance_loss_mlp": 1.00063276, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 1.8712357524758558, + "language_loss": 0.69342148, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71574336, + "num_input_tokens_seen": 337106265, + "step": 15631, + "time_per_iteration": 2.6009042263031006 + }, + { + "auxiliary_loss_clip": 0.01099439, + "auxiliary_loss_mlp": 0.01100386, + "balance_loss_clip": 1.00172603, + "balance_loss_mlp": 1.00063145, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 2.3803505662963538, + "language_loss": 0.75127017, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77326846, + "num_input_tokens_seen": 337126090, + "step": 15632, + "time_per_iteration": 2.757833242416382 + }, + { + "auxiliary_loss_clip": 0.01149834, + "auxiliary_loss_mlp": 0.01102008, + "balance_loss_clip": 1.00211525, + "balance_loss_mlp": 1.00044203, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 5.7461549859571805, + "language_loss": 0.74293065, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76544911, + "num_input_tokens_seen": 337145655, + "step": 15633, + "time_per_iteration": 2.5928895473480225 + }, + { + "auxiliary_loss_clip": 0.01164239, + "auxiliary_loss_mlp": 0.01101303, + "balance_loss_clip": 1.00192535, + "balance_loss_mlp": 1.00059485, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 1.774099885237169, + "language_loss": 0.72236162, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74501705, + "num_input_tokens_seen": 337164805, + "step": 15634, + "time_per_iteration": 2.56622052192688 + }, + { + "auxiliary_loss_clip": 0.01149446, + "auxiliary_loss_mlp": 0.0110096, + "balance_loss_clip": 1.00177956, + "balance_loss_mlp": 1.00044262, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 5.044974195948061, + "language_loss": 0.68872499, + "learning_rate": 3.75746733114144e-08, + "loss": 0.71122909, + "num_input_tokens_seen": 337182280, + "step": 15635, + "time_per_iteration": 2.5441346168518066 + }, + { + "auxiliary_loss_clip": 0.01096899, + "auxiliary_loss_mlp": 0.01098899, + "balance_loss_clip": 1.00165355, + "balance_loss_mlp": 1.00057471, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.6099977584036085, + "language_loss": 0.74249923, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76445723, + "num_input_tokens_seen": 337203495, + "step": 15636, + "time_per_iteration": 2.691020965576172 + }, + { + "auxiliary_loss_clip": 0.01147467, + "auxiliary_loss_mlp": 0.01100497, + "balance_loss_clip": 1.00185847, + "balance_loss_mlp": 1.00045609, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 4.177169473514206, + "language_loss": 0.83166134, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.854141, + "num_input_tokens_seen": 337220435, + "step": 15637, + "time_per_iteration": 2.532410144805908 + }, + { + "auxiliary_loss_clip": 0.01100701, + "auxiliary_loss_mlp": 0.01100146, + "balance_loss_clip": 1.00169599, + "balance_loss_mlp": 1.00048661, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 4.9735571795512294, + "language_loss": 0.68828082, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.71028924, + "num_input_tokens_seen": 337238095, + "step": 15638, + "time_per_iteration": 2.6618471145629883 + }, + { + "auxiliary_loss_clip": 0.01147474, + "auxiliary_loss_mlp": 0.01099445, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00064421, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.6756499225955919, + "language_loss": 0.84888536, + "learning_rate": 3.727471440859498e-08, + "loss": 0.87135452, + "num_input_tokens_seen": 337256645, + "step": 15639, + "time_per_iteration": 2.57387113571167 + }, + { + "auxiliary_loss_clip": 0.01132805, + "auxiliary_loss_mlp": 0.00747341, + "balance_loss_clip": 1.00165868, + "balance_loss_mlp": 1.00046539, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 2.719427908706747, + "language_loss": 0.78300965, + "learning_rate": 3.719991074263662e-08, + "loss": 0.8018111, + "num_input_tokens_seen": 337278360, + "step": 15640, + "time_per_iteration": 2.6794073581695557 + }, + { + "auxiliary_loss_clip": 0.01147582, + "auxiliary_loss_mlp": 0.01100906, + "balance_loss_clip": 1.00176477, + "balance_loss_mlp": 1.00043654, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.9549913584453213, + "language_loss": 0.74341547, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76590037, + "num_input_tokens_seen": 337302480, + "step": 15641, + "time_per_iteration": 4.055827856063843 + }, + { + "auxiliary_loss_clip": 0.01147493, + "auxiliary_loss_mlp": 0.01102041, + "balance_loss_clip": 1.00174737, + "balance_loss_mlp": 1.00047433, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 2.5878690134906535, + "language_loss": 0.82107902, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84357435, + "num_input_tokens_seen": 337316600, + "step": 15642, + "time_per_iteration": 2.5125279426574707 + }, + { + "auxiliary_loss_clip": 0.0114762, + "auxiliary_loss_mlp": 0.01100285, + "balance_loss_clip": 1.00182879, + "balance_loss_mlp": 1.00043488, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 1.9196071056160917, + "language_loss": 0.67872834, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70120734, + "num_input_tokens_seen": 337336895, + "step": 15643, + "time_per_iteration": 4.030256032943726 + }, + { + "auxiliary_loss_clip": 0.01149629, + "auxiliary_loss_mlp": 0.01101896, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00061607, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 8.668783406450773, + "language_loss": 0.76856029, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.79107553, + "num_input_tokens_seen": 337355105, + "step": 15644, + "time_per_iteration": 2.5654711723327637 + }, + { + "auxiliary_loss_clip": 0.01149718, + "auxiliary_loss_mlp": 0.01099999, + "balance_loss_clip": 1.00198901, + "balance_loss_mlp": 1.00057864, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.4731142395507988, + "language_loss": 0.67427742, + "learning_rate": 3.682700891311974e-08, + "loss": 0.6967746, + "num_input_tokens_seen": 337374905, + "step": 15645, + "time_per_iteration": 2.613234519958496 + }, + { + "auxiliary_loss_clip": 0.01133123, + "auxiliary_loss_mlp": 0.00747229, + "balance_loss_clip": 1.00187492, + "balance_loss_mlp": 1.00035942, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 1.5536483127842529, + "language_loss": 0.70247477, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72127831, + "num_input_tokens_seen": 337397130, + "step": 15646, + "time_per_iteration": 2.662822723388672 + }, + { + "auxiliary_loss_clip": 0.01149476, + "auxiliary_loss_mlp": 0.01100462, + "balance_loss_clip": 1.0019424, + "balance_loss_mlp": 1.00042176, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 2.575931516096493, + "language_loss": 0.74417162, + "learning_rate": 3.667836926755208e-08, + "loss": 0.766671, + "num_input_tokens_seen": 337418660, + "step": 15647, + "time_per_iteration": 2.583296775817871 + }, + { + "auxiliary_loss_clip": 0.01125268, + "auxiliary_loss_mlp": 0.01073864, + "balance_loss_clip": 1.00074863, + "balance_loss_mlp": 1.00024021, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.883137753650416, + "language_loss": 0.63537425, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65736556, + "num_input_tokens_seen": 337478055, + "step": 15648, + "time_per_iteration": 3.2777156829833984 + }, + { + "auxiliary_loss_clip": 0.01164004, + "auxiliary_loss_mlp": 0.01099801, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00057149, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.3194809266415555, + "language_loss": 0.66139394, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68403196, + "num_input_tokens_seen": 337499405, + "step": 15649, + "time_per_iteration": 2.5497257709503174 + }, + { + "auxiliary_loss_clip": 0.0111762, + "auxiliary_loss_mlp": 0.01100152, + "balance_loss_clip": 1.00167179, + "balance_loss_mlp": 1.00049281, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 1.9168911455143967, + "language_loss": 0.77648842, + "learning_rate": 3.645596817637586e-08, + "loss": 0.79866612, + "num_input_tokens_seen": 337517195, + "step": 15650, + "time_per_iteration": 2.6385257244110107 + }, + { + "auxiliary_loss_clip": 0.01100834, + "auxiliary_loss_mlp": 0.01099934, + "balance_loss_clip": 1.00177741, + "balance_loss_mlp": 1.00051343, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 2.458161172915852, + "language_loss": 0.74334896, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76535666, + "num_input_tokens_seen": 337535245, + "step": 15651, + "time_per_iteration": 2.747086763381958 + }, + { + "auxiliary_loss_clip": 0.01164005, + "auxiliary_loss_mlp": 0.01100443, + "balance_loss_clip": 1.0018518, + "balance_loss_mlp": 1.00054562, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 2.280601054634471, + "language_loss": 0.72319269, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74583715, + "num_input_tokens_seen": 337553040, + "step": 15652, + "time_per_iteration": 2.5380711555480957 + }, + { + "auxiliary_loss_clip": 0.01116603, + "auxiliary_loss_mlp": 0.01102723, + "balance_loss_clip": 1.00165224, + "balance_loss_mlp": 1.00058413, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 2.28861080422329, + "language_loss": 0.66302812, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68522137, + "num_input_tokens_seen": 337574580, + "step": 15653, + "time_per_iteration": 2.6977853775024414 + }, + { + "auxiliary_loss_clip": 0.01164276, + "auxiliary_loss_mlp": 0.01101435, + "balance_loss_clip": 1.00195515, + "balance_loss_mlp": 1.00044048, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 2.9710153040221536, + "language_loss": 0.77792937, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.80058646, + "num_input_tokens_seen": 337593010, + "step": 15654, + "time_per_iteration": 2.5689470767974854 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01101559, + "balance_loss_clip": 1.00170636, + "balance_loss_mlp": 1.00046945, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.56517036008431, + "language_loss": 0.70134193, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72385371, + "num_input_tokens_seen": 337616170, + "step": 15655, + "time_per_iteration": 2.711256742477417 + }, + { + "auxiliary_loss_clip": 0.01164184, + "auxiliary_loss_mlp": 0.01100606, + "balance_loss_clip": 1.00192213, + "balance_loss_mlp": 1.00056541, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 1.8528867566280351, + "language_loss": 0.72064877, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74329668, + "num_input_tokens_seen": 337635215, + "step": 15656, + "time_per_iteration": 2.512552261352539 + }, + { + "auxiliary_loss_clip": 0.01133151, + "auxiliary_loss_mlp": 0.01100042, + "balance_loss_clip": 1.00193381, + "balance_loss_mlp": 1.00047803, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 2.0381214783888804, + "language_loss": 0.78047824, + "learning_rate": 3.593963845018377e-08, + "loss": 0.80281019, + "num_input_tokens_seen": 337654195, + "step": 15657, + "time_per_iteration": 2.616814613342285 + }, + { + "auxiliary_loss_clip": 0.01117381, + "auxiliary_loss_mlp": 0.01100183, + "balance_loss_clip": 1.00160325, + "balance_loss_mlp": 1.00042844, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 1.9426280474561846, + "language_loss": 0.84337187, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86554754, + "num_input_tokens_seen": 337671810, + "step": 15658, + "time_per_iteration": 2.610790729522705 + }, + { + "auxiliary_loss_clip": 0.01164352, + "auxiliary_loss_mlp": 0.01101914, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00053871, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 1.8051823932635582, + "language_loss": 0.71010923, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.73277193, + "num_input_tokens_seen": 337689410, + "step": 15659, + "time_per_iteration": 4.030303955078125 + }, + { + "auxiliary_loss_clip": 0.01131063, + "auxiliary_loss_mlp": 0.01100474, + "balance_loss_clip": 1.00173259, + "balance_loss_mlp": 1.00062454, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.7756818432070731, + "language_loss": 0.79450631, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81682163, + "num_input_tokens_seen": 337709950, + "step": 15660, + "time_per_iteration": 2.6425952911376953 + }, + { + "auxiliary_loss_clip": 0.01115665, + "auxiliary_loss_mlp": 0.01098837, + "balance_loss_clip": 1.00166059, + "balance_loss_mlp": 1.0005132, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.490492751522667, + "language_loss": 0.6782198, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70036471, + "num_input_tokens_seen": 337731320, + "step": 15661, + "time_per_iteration": 2.7088522911071777 + }, + { + "auxiliary_loss_clip": 0.01149337, + "auxiliary_loss_mlp": 0.01099219, + "balance_loss_clip": 1.00180495, + "balance_loss_mlp": 1.00046611, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.0532774378488154, + "language_loss": 0.6676237, + "learning_rate": 3.557306576786434e-08, + "loss": 0.69010925, + "num_input_tokens_seen": 337747720, + "step": 15662, + "time_per_iteration": 3.9340403079986572 + }, + { + "auxiliary_loss_clip": 0.01125405, + "auxiliary_loss_mlp": 0.01074289, + "balance_loss_clip": 1.00095451, + "balance_loss_mlp": 1.00028348, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7633332702537639, + "language_loss": 0.59260321, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61460012, + "num_input_tokens_seen": 337806930, + "step": 15663, + "time_per_iteration": 3.289705753326416 + }, + { + "auxiliary_loss_clip": 0.01149642, + "auxiliary_loss_mlp": 0.01102129, + "balance_loss_clip": 1.00184, + "balance_loss_mlp": 1.00046766, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 1.9552050553833138, + "language_loss": 0.66706783, + "learning_rate": 3.542695811435914e-08, + "loss": 0.68958557, + "num_input_tokens_seen": 337828100, + "step": 15664, + "time_per_iteration": 2.7697336673736572 + }, + { + "auxiliary_loss_clip": 0.01130673, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_clip": 1.00176549, + "balance_loss_mlp": 1.00043416, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 2.2207136102960368, + "language_loss": 0.73406589, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75638258, + "num_input_tokens_seen": 337844805, + "step": 15665, + "time_per_iteration": 2.6269607543945312 + }, + { + "auxiliary_loss_clip": 0.01164167, + "auxiliary_loss_mlp": 0.01099632, + "balance_loss_clip": 1.00196934, + "balance_loss_mlp": 1.00045013, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 2.3801749671006625, + "language_loss": 0.63880444, + "learning_rate": 3.528114844807773e-08, + "loss": 0.6614424, + "num_input_tokens_seen": 337860490, + "step": 15666, + "time_per_iteration": 2.5388622283935547 + }, + { + "auxiliary_loss_clip": 0.01116332, + "auxiliary_loss_mlp": 0.01100321, + "balance_loss_clip": 1.00164151, + "balance_loss_mlp": 1.00056636, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 2.622006127930454, + "language_loss": 0.78798854, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81015509, + "num_input_tokens_seen": 337878360, + "step": 15667, + "time_per_iteration": 2.6543068885803223 + }, + { + "auxiliary_loss_clip": 0.0116399, + "auxiliary_loss_mlp": 0.01099334, + "balance_loss_clip": 1.00185382, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.679264136055703, + "language_loss": 0.75032133, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77295458, + "num_input_tokens_seen": 337895635, + "step": 15668, + "time_per_iteration": 2.568972110748291 + }, + { + "auxiliary_loss_clip": 0.01098156, + "auxiliary_loss_mlp": 0.01100475, + "balance_loss_clip": 1.00176418, + "balance_loss_mlp": 1.00053, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 3.155006027656575, + "language_loss": 0.58711696, + "learning_rate": 3.506299272306723e-08, + "loss": 0.6091032, + "num_input_tokens_seen": 337913940, + "step": 15669, + "time_per_iteration": 2.6865234375 + }, + { + "auxiliary_loss_clip": 0.0111751, + "auxiliary_loss_mlp": 0.01099056, + "balance_loss_clip": 1.00180387, + "balance_loss_mlp": 1.0004456, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.5876664943435668, + "language_loss": 0.7651366, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.78730214, + "num_input_tokens_seen": 337932015, + "step": 15670, + "time_per_iteration": 2.7032806873321533 + }, + { + "auxiliary_loss_clip": 0.01164121, + "auxiliary_loss_mlp": 0.01100483, + "balance_loss_clip": 1.00190794, + "balance_loss_mlp": 1.00053751, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 3.381038539708903, + "language_loss": 0.64856517, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67121124, + "num_input_tokens_seen": 337953345, + "step": 15671, + "time_per_iteration": 2.656277894973755 + }, + { + "auxiliary_loss_clip": 0.01132786, + "auxiliary_loss_mlp": 0.0110055, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00050926, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 1.6914453466943917, + "language_loss": 0.79477257, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81710589, + "num_input_tokens_seen": 337973685, + "step": 15672, + "time_per_iteration": 2.5928001403808594 + }, + { + "auxiliary_loss_clip": 0.01117559, + "auxiliary_loss_mlp": 0.01101768, + "balance_loss_clip": 1.00166273, + "balance_loss_mlp": 1.00058317, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.2155793146307605, + "language_loss": 0.7383725, + "learning_rate": 3.47731615843776e-08, + "loss": 0.76056576, + "num_input_tokens_seen": 337989175, + "step": 15673, + "time_per_iteration": 2.6258151531219482 + }, + { + "auxiliary_loss_clip": 0.01149624, + "auxiliary_loss_mlp": 0.0109968, + "balance_loss_clip": 1.00189602, + "balance_loss_mlp": 1.00045037, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.4892782418189785, + "language_loss": 0.70311451, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72560751, + "num_input_tokens_seen": 338011800, + "step": 15674, + "time_per_iteration": 2.655318260192871 + }, + { + "auxiliary_loss_clip": 0.01164107, + "auxiliary_loss_mlp": 0.01100408, + "balance_loss_clip": 1.00186801, + "balance_loss_mlp": 1.00046313, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 6.2292014183918285, + "language_loss": 0.8151179, + "learning_rate": 3.462869313364125e-08, + "loss": 0.83776307, + "num_input_tokens_seen": 338032120, + "step": 15675, + "time_per_iteration": 2.543853521347046 + }, + { + "auxiliary_loss_clip": 0.01132902, + "auxiliary_loss_mlp": 0.0110019, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00043571, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.7894884403279692, + "language_loss": 0.63098359, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.65331447, + "num_input_tokens_seen": 338051880, + "step": 15676, + "time_per_iteration": 2.6144819259643555 + }, + { + "auxiliary_loss_clip": 0.01133987, + "auxiliary_loss_mlp": 0.01100356, + "balance_loss_clip": 1.00176287, + "balance_loss_mlp": 1.00055385, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 1.7874151273247663, + "language_loss": 0.66887558, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69121897, + "num_input_tokens_seen": 338069665, + "step": 15677, + "time_per_iteration": 2.6192049980163574 + }, + { + "auxiliary_loss_clip": 0.01116156, + "auxiliary_loss_mlp": 0.01101348, + "balance_loss_clip": 1.00176835, + "balance_loss_mlp": 1.00059187, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 1.7997753214560657, + "language_loss": 0.64177883, + "learning_rate": 3.441254941744387e-08, + "loss": 0.6639539, + "num_input_tokens_seen": 338090490, + "step": 15678, + "time_per_iteration": 2.7520546913146973 + }, + { + "auxiliary_loss_clip": 0.01117439, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_clip": 1.00168872, + "balance_loss_mlp": 1.00046349, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.8377433832601138, + "language_loss": 0.74134588, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76352334, + "num_input_tokens_seen": 338109825, + "step": 15679, + "time_per_iteration": 4.081061601638794 + }, + { + "auxiliary_loss_clip": 0.01132328, + "auxiliary_loss_mlp": 0.01100811, + "balance_loss_clip": 1.00183511, + "balance_loss_mlp": 1.00067556, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 2.5340394339753303, + "language_loss": 0.77369589, + "learning_rate": 3.426882627845762e-08, + "loss": 0.7960273, + "num_input_tokens_seen": 338125790, + "step": 15680, + "time_per_iteration": 2.6312429904937744 + }, + { + "auxiliary_loss_clip": 0.01147888, + "auxiliary_loss_mlp": 0.01099555, + "balance_loss_clip": 1.00190496, + "balance_loss_mlp": 1.00056362, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 1.8512042676562233, + "language_loss": 0.75199819, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77447259, + "num_input_tokens_seen": 338145610, + "step": 15681, + "time_per_iteration": 4.009734392166138 + }, + { + "auxiliary_loss_clip": 0.01131655, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_clip": 1.0019716, + "balance_loss_mlp": 1.00067544, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 1.7841783799416278, + "language_loss": 0.65565228, + "learning_rate": 3.412540130236086e-08, + "loss": 0.67797506, + "num_input_tokens_seen": 338165960, + "step": 15682, + "time_per_iteration": 2.6640408039093018 + }, + { + "auxiliary_loss_clip": 0.01114819, + "auxiliary_loss_mlp": 0.01099494, + "balance_loss_clip": 1.00159526, + "balance_loss_mlp": 1.00050259, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 5.682822367064033, + "language_loss": 0.76423287, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78637594, + "num_input_tokens_seen": 338187215, + "step": 15683, + "time_per_iteration": 2.6817128658294678 + }, + { + "auxiliary_loss_clip": 0.01149626, + "auxiliary_loss_mlp": 0.01101318, + "balance_loss_clip": 1.00191259, + "balance_loss_mlp": 1.00065815, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 5.043554637868485, + "language_loss": 0.75248659, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77499598, + "num_input_tokens_seen": 338201825, + "step": 15684, + "time_per_iteration": 2.5879127979278564 + }, + { + "auxiliary_loss_clip": 0.01164165, + "auxiliary_loss_mlp": 0.01099553, + "balance_loss_clip": 1.00191629, + "balance_loss_mlp": 1.00032353, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.6889675736022811, + "language_loss": 0.76670289, + "learning_rate": 3.391082294121017e-08, + "loss": 0.78934008, + "num_input_tokens_seen": 338220865, + "step": 15685, + "time_per_iteration": 2.611473798751831 + }, + { + "auxiliary_loss_clip": 0.0114684, + "auxiliary_loss_mlp": 0.01099464, + "balance_loss_clip": 1.00183952, + "balance_loss_mlp": 1.00047243, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 1.817788040249593, + "language_loss": 0.75619495, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77865797, + "num_input_tokens_seen": 338240160, + "step": 15686, + "time_per_iteration": 2.6412386894226074 + }, + { + "auxiliary_loss_clip": 0.01147543, + "auxiliary_loss_mlp": 0.01101529, + "balance_loss_clip": 1.00183058, + "balance_loss_mlp": 1.00053537, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 3.05945444290017, + "language_loss": 0.80663717, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82912791, + "num_input_tokens_seen": 338259305, + "step": 15687, + "time_per_iteration": 2.6381735801696777 + }, + { + "auxiliary_loss_clip": 0.01130904, + "auxiliary_loss_mlp": 0.01101139, + "balance_loss_clip": 1.0017134, + "balance_loss_mlp": 1.0006218, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 2.1395755671746572, + "language_loss": 0.76053107, + "learning_rate": 3.369691556873011e-08, + "loss": 0.78285146, + "num_input_tokens_seen": 338274950, + "step": 15688, + "time_per_iteration": 2.60905122756958 + }, + { + "auxiliary_loss_clip": 0.01132424, + "auxiliary_loss_mlp": 0.01100059, + "balance_loss_clip": 1.00180054, + "balance_loss_mlp": 1.00044799, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.9378069261996396, + "language_loss": 0.68286681, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70519167, + "num_input_tokens_seen": 338295585, + "step": 15689, + "time_per_iteration": 2.752929210662842 + }, + { + "auxiliary_loss_clip": 0.01149417, + "auxiliary_loss_mlp": 0.01099395, + "balance_loss_clip": 1.00180113, + "balance_loss_mlp": 1.00059438, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 1.8505752290236228, + "language_loss": 0.80604231, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82853043, + "num_input_tokens_seen": 338314555, + "step": 15690, + "time_per_iteration": 2.6957449913024902 + }, + { + "auxiliary_loss_clip": 0.01147879, + "auxiliary_loss_mlp": 0.0110048, + "balance_loss_clip": 1.0018481, + "balance_loss_mlp": 1.00048697, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 1.8974195461872567, + "language_loss": 0.60178232, + "learning_rate": 3.348367925792317e-08, + "loss": 0.62426591, + "num_input_tokens_seen": 338336260, + "step": 15691, + "time_per_iteration": 2.7042481899261475 + }, + { + "auxiliary_loss_clip": 0.01119573, + "auxiliary_loss_mlp": 0.01100158, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00045085, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.4942107264608717, + "language_loss": 0.66423631, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68643355, + "num_input_tokens_seen": 338354680, + "step": 15692, + "time_per_iteration": 2.6735620498657227 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01100297, + "balance_loss_clip": 1.00172687, + "balance_loss_mlp": 1.00054216, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.766978612885444, + "language_loss": 0.75014317, + "learning_rate": 3.334189456537251e-08, + "loss": 0.77262288, + "num_input_tokens_seen": 338372490, + "step": 15693, + "time_per_iteration": 2.6088004112243652 + }, + { + "auxiliary_loss_clip": 0.01119191, + "auxiliary_loss_mlp": 0.01100371, + "balance_loss_clip": 1.00178146, + "balance_loss_mlp": 1.00047362, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 1.861890671380048, + "language_loss": 0.73137736, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75357294, + "num_input_tokens_seen": 338390870, + "step": 15694, + "time_per_iteration": 2.693040132522583 + }, + { + "auxiliary_loss_clip": 0.01111877, + "auxiliary_loss_mlp": 0.01073941, + "balance_loss_clip": 1.00084054, + "balance_loss_mlp": 1.00031745, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.6894515721503303, + "language_loss": 0.50579572, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52765381, + "num_input_tokens_seen": 338453075, + "step": 15695, + "time_per_iteration": 3.2570815086364746 + }, + { + "auxiliary_loss_clip": 0.01134788, + "auxiliary_loss_mlp": 0.01099767, + "balance_loss_clip": 1.00179136, + "balance_loss_mlp": 1.00063288, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 2.049342768182152, + "language_loss": 0.64935672, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67170227, + "num_input_tokens_seen": 338471770, + "step": 15696, + "time_per_iteration": 4.290968656539917 + }, + { + "auxiliary_loss_clip": 0.01147533, + "auxiliary_loss_mlp": 0.0110079, + "balance_loss_clip": 1.00183642, + "balance_loss_mlp": 1.00041556, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.8632656514901467, + "language_loss": 0.66664326, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68912649, + "num_input_tokens_seen": 338492190, + "step": 15697, + "time_per_iteration": 2.611658811569214 + }, + { + "auxiliary_loss_clip": 0.01112289, + "auxiliary_loss_mlp": 0.01074335, + "balance_loss_clip": 1.00084114, + "balance_loss_mlp": 1.00032949, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8452603878676969, + "language_loss": 0.63277352, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65463972, + "num_input_tokens_seen": 338552560, + "step": 15698, + "time_per_iteration": 3.181978702545166 + }, + { + "auxiliary_loss_clip": 0.01130826, + "auxiliary_loss_mlp": 0.01101164, + "balance_loss_clip": 1.00174737, + "balance_loss_mlp": 1.00064659, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 4.869783028812043, + "language_loss": 0.69788629, + "learning_rate": 3.291833039444092e-08, + "loss": 0.72020614, + "num_input_tokens_seen": 338571770, + "step": 15699, + "time_per_iteration": 2.63582706451416 + }, + { + "auxiliary_loss_clip": 0.01117988, + "auxiliary_loss_mlp": 0.01100396, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00045121, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 1.953732974022227, + "language_loss": 0.74550068, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76768458, + "num_input_tokens_seen": 338587310, + "step": 15700, + "time_per_iteration": 4.042866230010986 + }, + { + "auxiliary_loss_clip": 0.01070094, + "auxiliary_loss_mlp": 0.01100373, + "balance_loss_clip": 1.0017556, + "balance_loss_mlp": 1.00057089, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 2.5493122818781693, + "language_loss": 0.70569617, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72740078, + "num_input_tokens_seen": 338606235, + "step": 15701, + "time_per_iteration": 2.869523286819458 + }, + { + "auxiliary_loss_clip": 0.01100273, + "auxiliary_loss_mlp": 0.01101414, + "balance_loss_clip": 1.00158453, + "balance_loss_mlp": 1.00051486, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 3.0968330871513166, + "language_loss": 0.78179723, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.80381405, + "num_input_tokens_seen": 338624090, + "step": 15702, + "time_per_iteration": 2.719942808151245 + }, + { + "auxiliary_loss_clip": 0.01149386, + "auxiliary_loss_mlp": 0.0110139, + "balance_loss_clip": 1.00196123, + "balance_loss_mlp": 1.00049126, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 1.7222150217591043, + "language_loss": 0.6656872, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68819499, + "num_input_tokens_seen": 338643695, + "step": 15703, + "time_per_iteration": 2.6000187397003174 + }, + { + "auxiliary_loss_clip": 0.01147647, + "auxiliary_loss_mlp": 0.01101325, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00042653, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 1.688001592492585, + "language_loss": 0.73344743, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75593716, + "num_input_tokens_seen": 338664725, + "step": 15704, + "time_per_iteration": 2.6933610439300537 + }, + { + "auxiliary_loss_clip": 0.01147285, + "auxiliary_loss_mlp": 0.01100169, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00051022, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 1.7478581380368534, + "language_loss": 0.74515855, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76763314, + "num_input_tokens_seen": 338683990, + "step": 15705, + "time_per_iteration": 2.604736804962158 + }, + { + "auxiliary_loss_clip": 0.01132304, + "auxiliary_loss_mlp": 0.01099875, + "balance_loss_clip": 1.00181735, + "balance_loss_mlp": 1.00054944, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 1.9601923974644084, + "language_loss": 0.77104115, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79336292, + "num_input_tokens_seen": 338702025, + "step": 15706, + "time_per_iteration": 2.644775390625 + }, + { + "auxiliary_loss_clip": 0.01147264, + "auxiliary_loss_mlp": 0.01098914, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.00044739, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.501713848157348, + "language_loss": 0.69283795, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71529973, + "num_input_tokens_seen": 338720920, + "step": 15707, + "time_per_iteration": 2.599404811859131 + }, + { + "auxiliary_loss_clip": 0.01164027, + "auxiliary_loss_mlp": 0.01099948, + "balance_loss_clip": 1.00185859, + "balance_loss_mlp": 1.00038469, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 1.9830940230115799, + "language_loss": 0.69210076, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71474051, + "num_input_tokens_seen": 338739590, + "step": 15708, + "time_per_iteration": 2.568695545196533 + }, + { + "auxiliary_loss_clip": 0.0114958, + "auxiliary_loss_mlp": 0.01100046, + "balance_loss_clip": 1.00196862, + "balance_loss_mlp": 1.0004344, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 3.7524744578322005, + "language_loss": 0.70755541, + "learning_rate": 3.221835774749748e-08, + "loss": 0.73005164, + "num_input_tokens_seen": 338757240, + "step": 15709, + "time_per_iteration": 2.611560583114624 + }, + { + "auxiliary_loss_clip": 0.01101329, + "auxiliary_loss_mlp": 0.01099697, + "balance_loss_clip": 1.0016849, + "balance_loss_mlp": 1.00061035, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 2.0286038743995793, + "language_loss": 0.84724188, + "learning_rate": 3.214877084074774e-08, + "loss": 0.86925215, + "num_input_tokens_seen": 338773750, + "step": 15710, + "time_per_iteration": 2.768414258956909 + }, + { + "auxiliary_loss_clip": 0.0111849, + "auxiliary_loss_mlp": 0.01100822, + "balance_loss_clip": 1.00166941, + "balance_loss_mlp": 1.00059116, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.6024023666051923, + "language_loss": 0.71439385, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73658699, + "num_input_tokens_seen": 338792115, + "step": 15711, + "time_per_iteration": 2.7447779178619385 + }, + { + "auxiliary_loss_clip": 0.01149701, + "auxiliary_loss_mlp": 0.01101303, + "balance_loss_clip": 1.00201249, + "balance_loss_mlp": 1.00054765, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 2.4507079492044266, + "language_loss": 0.69347107, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71598113, + "num_input_tokens_seen": 338812480, + "step": 15712, + "time_per_iteration": 2.678593873977661 + }, + { + "auxiliary_loss_clip": 0.0114776, + "auxiliary_loss_mlp": 0.01101427, + "balance_loss_clip": 1.00186634, + "balance_loss_mlp": 1.00062394, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.3406214759242423, + "language_loss": 0.70775735, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.73024929, + "num_input_tokens_seen": 338829105, + "step": 15713, + "time_per_iteration": 2.5491795539855957 + }, + { + "auxiliary_loss_clip": 0.01132886, + "auxiliary_loss_mlp": 0.01100027, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00051141, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.559300376378392, + "language_loss": 0.76950002, + "learning_rate": 3.187116945125212e-08, + "loss": 0.79182911, + "num_input_tokens_seen": 338850670, + "step": 15714, + "time_per_iteration": 2.76041841506958 + }, + { + "auxiliary_loss_clip": 0.0111737, + "auxiliary_loss_mlp": 0.01100207, + "balance_loss_clip": 1.00162888, + "balance_loss_mlp": 1.0004046, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 19.87352869948353, + "language_loss": 0.67623746, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69841325, + "num_input_tokens_seen": 338867795, + "step": 15715, + "time_per_iteration": 2.6511988639831543 + }, + { + "auxiliary_loss_clip": 0.01113821, + "auxiliary_loss_mlp": 0.0110085, + "balance_loss_clip": 1.00171876, + "balance_loss_mlp": 1.00052321, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 2.077258408414731, + "language_loss": 0.7499249, + "learning_rate": 3.173281653583948e-08, + "loss": 0.7720716, + "num_input_tokens_seen": 338887205, + "step": 15716, + "time_per_iteration": 2.72941255569458 + }, + { + "auxiliary_loss_clip": 0.01132115, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00037384, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 2.5543226354035435, + "language_loss": 0.62448561, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64680755, + "num_input_tokens_seen": 338906130, + "step": 15717, + "time_per_iteration": 4.120644569396973 + }, + { + "auxiliary_loss_clip": 0.01147808, + "auxiliary_loss_mlp": 0.01100687, + "balance_loss_clip": 1.00184703, + "balance_loss_mlp": 1.00055158, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 2.199204533177549, + "language_loss": 0.7906543, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81313926, + "num_input_tokens_seen": 338923045, + "step": 15718, + "time_per_iteration": 2.570465087890625 + }, + { + "auxiliary_loss_clip": 0.01141654, + "auxiliary_loss_mlp": 0.01073938, + "balance_loss_clip": 1.00077891, + "balance_loss_mlp": 1.00031412, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.7070578198770785, + "language_loss": 0.57780135, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59995735, + "num_input_tokens_seen": 338987545, + "step": 15719, + "time_per_iteration": 4.581980228424072 + }, + { + "auxiliary_loss_clip": 0.0110347, + "auxiliary_loss_mlp": 0.00747379, + "balance_loss_clip": 1.00177312, + "balance_loss_mlp": 1.00045383, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.8341011410916366, + "language_loss": 0.75990355, + "learning_rate": 3.145700636861193e-08, + "loss": 0.77841204, + "num_input_tokens_seen": 339007830, + "step": 15720, + "time_per_iteration": 2.762805700302124 + }, + { + "auxiliary_loss_clip": 0.01147245, + "auxiliary_loss_mlp": 0.01099386, + "balance_loss_clip": 1.00164986, + "balance_loss_mlp": 1.00048995, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.8375356509571295, + "language_loss": 0.72798133, + "learning_rate": 3.138824043864452e-08, + "loss": 0.75044763, + "num_input_tokens_seen": 339028980, + "step": 15721, + "time_per_iteration": 2.63525390625 + }, + { + "auxiliary_loss_clip": 0.01102901, + "auxiliary_loss_mlp": 0.01100823, + "balance_loss_clip": 1.00167942, + "balance_loss_mlp": 1.0004487, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 1.8866574825100813, + "language_loss": 0.84931004, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87134731, + "num_input_tokens_seen": 339047950, + "step": 15722, + "time_per_iteration": 2.8250725269317627 + }, + { + "auxiliary_loss_clip": 0.01124359, + "auxiliary_loss_mlp": 0.01073739, + "balance_loss_clip": 1.00082636, + "balance_loss_mlp": 1.00011563, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.9807216952743992, + "language_loss": 0.64469975, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66668075, + "num_input_tokens_seen": 339104535, + "step": 15723, + "time_per_iteration": 3.127686023712158 + }, + { + "auxiliary_loss_clip": 0.01116594, + "auxiliary_loss_mlp": 0.0110168, + "balance_loss_clip": 1.00170231, + "balance_loss_mlp": 1.00049543, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 2.011439359356957, + "language_loss": 0.73046577, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75264847, + "num_input_tokens_seen": 339122050, + "step": 15724, + "time_per_iteration": 2.68882417678833 + }, + { + "auxiliary_loss_clip": 0.01115724, + "auxiliary_loss_mlp": 0.01100148, + "balance_loss_clip": 1.00162959, + "balance_loss_mlp": 1.00058413, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 4.596789207204549, + "language_loss": 0.84348798, + "learning_rate": 3.111392324436024e-08, + "loss": 0.86564666, + "num_input_tokens_seen": 339138940, + "step": 15725, + "time_per_iteration": 2.6673343181610107 + }, + { + "auxiliary_loss_clip": 0.01132399, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00055242, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 2.327618240270894, + "language_loss": 0.7125659, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73489106, + "num_input_tokens_seen": 339158245, + "step": 15726, + "time_per_iteration": 2.7042696475982666 + }, + { + "auxiliary_loss_clip": 0.01132735, + "auxiliary_loss_mlp": 0.01100357, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00050771, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 5.815447956929331, + "language_loss": 0.61045063, + "learning_rate": 3.097721259896735e-08, + "loss": 0.63278157, + "num_input_tokens_seen": 339178200, + "step": 15727, + "time_per_iteration": 2.6525468826293945 + }, + { + "auxiliary_loss_clip": 0.01149385, + "auxiliary_loss_mlp": 0.0109927, + "balance_loss_clip": 1.00185609, + "balance_loss_mlp": 1.00042176, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 2.2160422917820504, + "language_loss": 0.81843823, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.8409248, + "num_input_tokens_seen": 339193950, + "step": 15728, + "time_per_iteration": 2.580629348754883 + }, + { + "auxiliary_loss_clip": 0.0108133, + "auxiliary_loss_mlp": 0.01074247, + "balance_loss_clip": 1.00076067, + "balance_loss_mlp": 1.00024152, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7549840334698876, + "language_loss": 0.59089136, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61244714, + "num_input_tokens_seen": 339252330, + "step": 15729, + "time_per_iteration": 3.349630832672119 + }, + { + "auxiliary_loss_clip": 0.01164043, + "auxiliary_loss_mlp": 0.01099153, + "balance_loss_clip": 1.00179994, + "balance_loss_mlp": 1.00049579, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 2.9229696631468953, + "language_loss": 0.76389658, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78652853, + "num_input_tokens_seen": 339270325, + "step": 15730, + "time_per_iteration": 2.5383739471435547 + }, + { + "auxiliary_loss_clip": 0.01119772, + "auxiliary_loss_mlp": 0.0110111, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.00059223, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.4688856176518732, + "language_loss": 0.62390727, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64611608, + "num_input_tokens_seen": 339291980, + "step": 15731, + "time_per_iteration": 2.74957537651062 + }, + { + "auxiliary_loss_clip": 0.01148989, + "auxiliary_loss_mlp": 0.01100752, + "balance_loss_clip": 1.00181735, + "balance_loss_mlp": 1.0004257, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 1.8898666480602553, + "language_loss": 0.64095235, + "learning_rate": 3.063674267769589e-08, + "loss": 0.66344976, + "num_input_tokens_seen": 339311795, + "step": 15732, + "time_per_iteration": 2.627124309539795 + }, + { + "auxiliary_loss_clip": 0.01147998, + "auxiliary_loss_mlp": 0.01101817, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00034571, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 2.3754975154516993, + "language_loss": 0.84099412, + "learning_rate": 3.056887271848363e-08, + "loss": 0.86349225, + "num_input_tokens_seen": 339327745, + "step": 15733, + "time_per_iteration": 2.593832492828369 + }, + { + "auxiliary_loss_clip": 0.01149379, + "auxiliary_loss_mlp": 0.01099545, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00055385, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 1.6247516781923808, + "language_loss": 0.7223497, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74483895, + "num_input_tokens_seen": 339346445, + "step": 15734, + "time_per_iteration": 4.141058683395386 + }, + { + "auxiliary_loss_clip": 0.01147143, + "auxiliary_loss_mlp": 0.01099111, + "balance_loss_clip": 1.00166273, + "balance_loss_mlp": 1.00054932, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.4869774296885794, + "language_loss": 0.87088388, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89334631, + "num_input_tokens_seen": 339367945, + "step": 15735, + "time_per_iteration": 2.6149778366088867 + }, + { + "auxiliary_loss_clip": 0.01133572, + "auxiliary_loss_mlp": 0.0110057, + "balance_loss_clip": 1.00179911, + "balance_loss_mlp": 1.00043392, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 2.007727846383168, + "language_loss": 0.67617857, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69851995, + "num_input_tokens_seen": 339386060, + "step": 15736, + "time_per_iteration": 2.663623809814453 + }, + { + "auxiliary_loss_clip": 0.0109548, + "auxiliary_loss_mlp": 0.01074172, + "balance_loss_clip": 1.00097919, + "balance_loss_mlp": 1.00054836, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8632370815369155, + "language_loss": 0.65306413, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67476058, + "num_input_tokens_seen": 339446695, + "step": 15737, + "time_per_iteration": 4.6724443435668945 + }, + { + "auxiliary_loss_clip": 0.01141563, + "auxiliary_loss_mlp": 0.01074085, + "balance_loss_clip": 1.00087059, + "balance_loss_mlp": 1.00007975, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.8148386386141533, + "language_loss": 0.58851445, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.61067098, + "num_input_tokens_seen": 339510080, + "step": 15738, + "time_per_iteration": 3.1604061126708984 + }, + { + "auxiliary_loss_clip": 0.01149323, + "auxiliary_loss_mlp": 0.01099765, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00058341, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 1.5910809735365776, + "language_loss": 0.71624953, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73874044, + "num_input_tokens_seen": 339529335, + "step": 15739, + "time_per_iteration": 2.630171298980713 + }, + { + "auxiliary_loss_clip": 0.01149547, + "auxiliary_loss_mlp": 0.01100606, + "balance_loss_clip": 1.0018276, + "balance_loss_mlp": 1.00056589, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 2.330343394614088, + "language_loss": 0.64245403, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66495562, + "num_input_tokens_seen": 339548820, + "step": 15740, + "time_per_iteration": 2.6599020957946777 + }, + { + "auxiliary_loss_clip": 0.01130414, + "auxiliary_loss_mlp": 0.0109967, + "balance_loss_clip": 1.00175881, + "balance_loss_mlp": 1.00048757, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.9003197207854152, + "language_loss": 0.66481942, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.6871202, + "num_input_tokens_seen": 339566775, + "step": 15741, + "time_per_iteration": 2.675945997238159 + }, + { + "auxiliary_loss_clip": 0.01147483, + "auxiliary_loss_mlp": 0.01100333, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00043583, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 2.291210182989587, + "language_loss": 0.7546103, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.77708852, + "num_input_tokens_seen": 339581905, + "step": 15742, + "time_per_iteration": 2.6046760082244873 + }, + { + "auxiliary_loss_clip": 0.01149407, + "auxiliary_loss_mlp": 0.01099508, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00056458, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 2.186532918109784, + "language_loss": 0.7248137, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74730289, + "num_input_tokens_seen": 339599870, + "step": 15743, + "time_per_iteration": 2.6251397132873535 + }, + { + "auxiliary_loss_clip": 0.01116406, + "auxiliary_loss_mlp": 0.01101042, + "balance_loss_clip": 1.00177717, + "balance_loss_mlp": 1.00052524, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 2.1583731148891085, + "language_loss": 0.79923606, + "learning_rate": 2.982723267901943e-08, + "loss": 0.82141054, + "num_input_tokens_seen": 339620250, + "step": 15744, + "time_per_iteration": 2.6936774253845215 + }, + { + "auxiliary_loss_clip": 0.01130908, + "auxiliary_loss_mlp": 0.01100484, + "balance_loss_clip": 1.00172091, + "balance_loss_mlp": 1.00063431, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 1.7288450732266667, + "language_loss": 0.78008807, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80240202, + "num_input_tokens_seen": 339639900, + "step": 15745, + "time_per_iteration": 2.68381667137146 + }, + { + "auxiliary_loss_clip": 0.01132617, + "auxiliary_loss_mlp": 0.01100737, + "balance_loss_clip": 1.00173676, + "balance_loss_mlp": 1.00050592, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.8712993235593933, + "language_loss": 0.70251226, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.72484589, + "num_input_tokens_seen": 339658970, + "step": 15746, + "time_per_iteration": 2.652252197265625 + }, + { + "auxiliary_loss_clip": 0.01131448, + "auxiliary_loss_mlp": 0.01099911, + "balance_loss_clip": 1.00177109, + "balance_loss_mlp": 1.00034773, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 2.1376201269589075, + "language_loss": 0.56146276, + "learning_rate": 2.962653596305964e-08, + "loss": 0.5837763, + "num_input_tokens_seen": 339675600, + "step": 15747, + "time_per_iteration": 2.6368930339813232 + }, + { + "auxiliary_loss_clip": 0.01066752, + "auxiliary_loss_mlp": 0.0107485, + "balance_loss_clip": 1.00073814, + "balance_loss_mlp": 1.00084519, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6907694116763987, + "language_loss": 0.53249633, + "learning_rate": 2.955978648787871e-08, + "loss": 0.5539124, + "num_input_tokens_seen": 339744505, + "step": 15748, + "time_per_iteration": 3.9724254608154297 + }, + { + "auxiliary_loss_clip": 0.01132971, + "auxiliary_loss_mlp": 0.01100388, + "balance_loss_clip": 1.00184023, + "balance_loss_mlp": 1.00058627, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 1.6701575561679447, + "language_loss": 0.66623902, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68857259, + "num_input_tokens_seen": 339765810, + "step": 15749, + "time_per_iteration": 2.9103012084960938 + }, + { + "auxiliary_loss_clip": 0.01117491, + "auxiliary_loss_mlp": 0.0110082, + "balance_loss_clip": 1.00168324, + "balance_loss_mlp": 1.00054109, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 1.968365713565623, + "language_loss": 0.76048738, + "learning_rate": 2.942651169791621e-08, + "loss": 0.7826705, + "num_input_tokens_seen": 339784125, + "step": 15750, + "time_per_iteration": 2.7300474643707275 + }, + { + "auxiliary_loss_clip": 0.01147457, + "auxiliary_loss_mlp": 0.01100165, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.00041056, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 1.803392299977215, + "language_loss": 0.68083888, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70331514, + "num_input_tokens_seen": 339803450, + "step": 15751, + "time_per_iteration": 2.720348596572876 + }, + { + "auxiliary_loss_clip": 0.01117865, + "auxiliary_loss_mlp": 0.01100797, + "balance_loss_clip": 1.00182712, + "balance_loss_mlp": 1.00047016, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 1.686109050742337, + "language_loss": 0.65119225, + "learning_rate": 2.929353580532723e-08, + "loss": 0.67337888, + "num_input_tokens_seen": 339823215, + "step": 15752, + "time_per_iteration": 3.0583035945892334 + }, + { + "auxiliary_loss_clip": 0.01149095, + "auxiliary_loss_mlp": 0.01100399, + "balance_loss_clip": 1.00186646, + "balance_loss_mlp": 1.00035822, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 1.814473702706933, + "language_loss": 0.71659732, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73909223, + "num_input_tokens_seen": 339842230, + "step": 15753, + "time_per_iteration": 2.604236364364624 + }, + { + "auxiliary_loss_clip": 0.01164212, + "auxiliary_loss_mlp": 0.01100702, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.000471, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 2.099145331444552, + "language_loss": 0.70134693, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72399604, + "num_input_tokens_seen": 339861640, + "step": 15754, + "time_per_iteration": 4.244297504425049 + }, + { + "auxiliary_loss_clip": 0.01164261, + "auxiliary_loss_mlp": 0.01101278, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00047445, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.1713215265833763, + "language_loss": 0.78763211, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.81028748, + "num_input_tokens_seen": 339878210, + "step": 15755, + "time_per_iteration": 2.55557918548584 + }, + { + "auxiliary_loss_clip": 0.01116717, + "auxiliary_loss_mlp": 0.01102479, + "balance_loss_clip": 1.00178957, + "balance_loss_mlp": 1.00072145, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.466851512476368, + "language_loss": 0.75572419, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.77791619, + "num_input_tokens_seen": 339894255, + "step": 15756, + "time_per_iteration": 2.7001683712005615 + }, + { + "auxiliary_loss_clip": 0.01134901, + "auxiliary_loss_mlp": 0.0110002, + "balance_loss_clip": 1.00182009, + "balance_loss_mlp": 1.00055206, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 3.286615189809328, + "language_loss": 0.74961072, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.7719599, + "num_input_tokens_seen": 339912425, + "step": 15757, + "time_per_iteration": 4.123141288757324 + }, + { + "auxiliary_loss_clip": 0.01132361, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_clip": 1.00176203, + "balance_loss_mlp": 1.00058818, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.4104243396908873, + "language_loss": 0.79216826, + "learning_rate": 2.889640171327512e-08, + "loss": 0.81449962, + "num_input_tokens_seen": 339929635, + "step": 15758, + "time_per_iteration": 2.697232484817505 + }, + { + "auxiliary_loss_clip": 0.01133386, + "auxiliary_loss_mlp": 0.0074733, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00043011, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.948597582286106, + "language_loss": 0.7175231, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.73633027, + "num_input_tokens_seen": 339951200, + "step": 15759, + "time_per_iteration": 2.695984363555908 + }, + { + "auxiliary_loss_clip": 0.01147423, + "auxiliary_loss_mlp": 0.01099958, + "balance_loss_clip": 1.00198793, + "balance_loss_mlp": 1.00058556, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 2.7361397769068563, + "language_loss": 0.75562298, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.7780968, + "num_input_tokens_seen": 339971820, + "step": 15760, + "time_per_iteration": 2.665757417678833 + }, + { + "auxiliary_loss_clip": 0.01164213, + "auxiliary_loss_mlp": 0.00747334, + "balance_loss_clip": 1.00192475, + "balance_loss_mlp": 1.00043678, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 2.1096438902780714, + "language_loss": 0.72602153, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.74513704, + "num_input_tokens_seen": 339989420, + "step": 15761, + "time_per_iteration": 2.6290106773376465 + }, + { + "auxiliary_loss_clip": 0.01130763, + "auxiliary_loss_mlp": 0.011, + "balance_loss_clip": 1.00180602, + "balance_loss_mlp": 1.00062716, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.0461650114476893, + "language_loss": 0.71675628, + "learning_rate": 2.863314050734722e-08, + "loss": 0.73906392, + "num_input_tokens_seen": 340006690, + "step": 15762, + "time_per_iteration": 2.6786205768585205 + }, + { + "auxiliary_loss_clip": 0.011643, + "auxiliary_loss_mlp": 0.01102075, + "balance_loss_clip": 1.001845, + "balance_loss_mlp": 1.00050855, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 2.0878792785579208, + "language_loss": 0.66703099, + "learning_rate": 2.856751208570518e-08, + "loss": 0.6896947, + "num_input_tokens_seen": 340025480, + "step": 15763, + "time_per_iteration": 2.5478994846343994 + }, + { + "auxiliary_loss_clip": 0.01164111, + "auxiliary_loss_mlp": 0.01100275, + "balance_loss_clip": 1.00175643, + "balance_loss_mlp": 1.00056839, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.9935864228014901, + "language_loss": 0.70021355, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.72285742, + "num_input_tokens_seen": 340043785, + "step": 15764, + "time_per_iteration": 2.574390411376953 + }, + { + "auxiliary_loss_clip": 0.01148919, + "auxiliary_loss_mlp": 0.00747164, + "balance_loss_clip": 1.00197172, + "balance_loss_mlp": 1.00039244, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.6864988507128291, + "language_loss": 0.71200514, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.73096597, + "num_input_tokens_seen": 340064360, + "step": 15765, + "time_per_iteration": 2.647203207015991 + }, + { + "auxiliary_loss_clip": 0.01140856, + "auxiliary_loss_mlp": 0.01073778, + "balance_loss_clip": 1.00075555, + "balance_loss_mlp": 1.00015473, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.8066299119723739, + "language_loss": 0.59037572, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61252207, + "num_input_tokens_seen": 340114425, + "step": 15766, + "time_per_iteration": 2.936152696609497 + }, + { + "auxiliary_loss_clip": 0.01102317, + "auxiliary_loss_mlp": 0.01099833, + "balance_loss_clip": 1.00182569, + "balance_loss_mlp": 1.0006032, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 2.883264433986926, + "language_loss": 0.74254113, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76456261, + "num_input_tokens_seen": 340132200, + "step": 15767, + "time_per_iteration": 2.7997114658355713 + }, + { + "auxiliary_loss_clip": 0.01117296, + "auxiliary_loss_mlp": 0.01101254, + "balance_loss_clip": 1.0018115, + "balance_loss_mlp": 1.00049829, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 3.237983108899804, + "language_loss": 0.73158354, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.75376904, + "num_input_tokens_seen": 340149175, + "step": 15768, + "time_per_iteration": 2.6755166053771973 + }, + { + "auxiliary_loss_clip": 0.01094311, + "auxiliary_loss_mlp": 0.01074478, + "balance_loss_clip": 1.00074553, + "balance_loss_mlp": 1.00047255, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7309142273489003, + "language_loss": 0.55255657, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57424444, + "num_input_tokens_seen": 340208155, + "step": 15769, + "time_per_iteration": 3.3985915184020996 + }, + { + "auxiliary_loss_clip": 0.0110232, + "auxiliary_loss_mlp": 0.0109968, + "balance_loss_clip": 1.0017755, + "balance_loss_mlp": 1.00035465, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.3413121888169441, + "language_loss": 0.77385879, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79587877, + "num_input_tokens_seen": 340229275, + "step": 15770, + "time_per_iteration": 3.1309876441955566 + }, + { + "auxiliary_loss_clip": 0.01131394, + "auxiliary_loss_mlp": 0.01100166, + "balance_loss_clip": 1.00193024, + "balance_loss_mlp": 1.00045967, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 1.9780770289627363, + "language_loss": 0.80033612, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.82265174, + "num_input_tokens_seen": 340248920, + "step": 15771, + "time_per_iteration": 2.68697452545166 + }, + { + "auxiliary_loss_clip": 0.01117515, + "auxiliary_loss_mlp": 0.01099894, + "balance_loss_clip": 1.00179017, + "balance_loss_mlp": 1.00042546, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 1.8287526776672935, + "language_loss": 0.69656169, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71873575, + "num_input_tokens_seen": 340266775, + "step": 15772, + "time_per_iteration": 4.317840337753296 + }, + { + "auxiliary_loss_clip": 0.01149742, + "auxiliary_loss_mlp": 0.01099975, + "balance_loss_clip": 1.00193906, + "balance_loss_mlp": 1.00045943, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.502306592608672, + "language_loss": 0.73700047, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.75949764, + "num_input_tokens_seen": 340285295, + "step": 15773, + "time_per_iteration": 2.599500894546509 + }, + { + "auxiliary_loss_clip": 0.0111645, + "auxiliary_loss_mlp": 0.01100614, + "balance_loss_clip": 1.0016644, + "balance_loss_mlp": 1.00066864, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 2.364685558656804, + "language_loss": 0.62476569, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.6469363, + "num_input_tokens_seen": 340304265, + "step": 15774, + "time_per_iteration": 4.0866265296936035 + }, + { + "auxiliary_loss_clip": 0.01164098, + "auxiliary_loss_mlp": 0.01100635, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00059414, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 2.592244126562742, + "language_loss": 0.59417093, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61681825, + "num_input_tokens_seen": 340323690, + "step": 15775, + "time_per_iteration": 2.556427001953125 + }, + { + "auxiliary_loss_clip": 0.01130796, + "auxiliary_loss_mlp": 0.01100312, + "balance_loss_clip": 1.00181651, + "balance_loss_mlp": 1.00046182, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 2.835186701789329, + "language_loss": 0.61706758, + "learning_rate": 2.772114638584555e-08, + "loss": 0.63937867, + "num_input_tokens_seen": 340345830, + "step": 15776, + "time_per_iteration": 2.7427430152893066 + }, + { + "auxiliary_loss_clip": 0.01133076, + "auxiliary_loss_mlp": 0.01100545, + "balance_loss_clip": 1.0017122, + "balance_loss_mlp": 1.00050485, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 1.914875495032976, + "language_loss": 0.73298383, + "learning_rate": 2.765656478622458e-08, + "loss": 0.75532001, + "num_input_tokens_seen": 340365910, + "step": 15777, + "time_per_iteration": 2.6742794513702393 + }, + { + "auxiliary_loss_clip": 0.01149533, + "auxiliary_loss_mlp": 0.01102749, + "balance_loss_clip": 1.0019716, + "balance_loss_mlp": 1.00051475, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.2244747933176727, + "language_loss": 0.71704018, + "learning_rate": 2.759205797806441e-08, + "loss": 0.73956293, + "num_input_tokens_seen": 340383935, + "step": 15778, + "time_per_iteration": 2.606722354888916 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.00747206, + "balance_loss_clip": 1.00194693, + "balance_loss_mlp": 1.0004487, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 1.8347991581820657, + "language_loss": 0.70051724, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.71947491, + "num_input_tokens_seen": 340402760, + "step": 15779, + "time_per_iteration": 2.5763099193573 + }, + { + "auxiliary_loss_clip": 0.0116426, + "auxiliary_loss_mlp": 0.01100337, + "balance_loss_clip": 1.00193954, + "balance_loss_mlp": 1.00043964, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 2.1247425608090507, + "language_loss": 0.78954697, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.81219292, + "num_input_tokens_seen": 340422105, + "step": 15780, + "time_per_iteration": 2.5854921340942383 + }, + { + "auxiliary_loss_clip": 0.01131213, + "auxiliary_loss_mlp": 0.00747327, + "balance_loss_clip": 1.00188267, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 2.132461835792277, + "language_loss": 0.66141737, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68020278, + "num_input_tokens_seen": 340441160, + "step": 15781, + "time_per_iteration": 2.664517402648926 + }, + { + "auxiliary_loss_clip": 0.01163927, + "auxiliary_loss_mlp": 0.01099798, + "balance_loss_clip": 1.0018872, + "balance_loss_mlp": 1.00042546, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 1.9739545625787818, + "language_loss": 0.79816902, + "learning_rate": 2.733477870890999e-08, + "loss": 0.82080626, + "num_input_tokens_seen": 340458200, + "step": 15782, + "time_per_iteration": 2.647264003753662 + }, + { + "auxiliary_loss_clip": 0.01142264, + "auxiliary_loss_mlp": 0.01073799, + "balance_loss_clip": 1.00087452, + "balance_loss_mlp": 1.00017512, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.7098775061933786, + "language_loss": 0.59804356, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.62020421, + "num_input_tokens_seen": 340526420, + "step": 15783, + "time_per_iteration": 3.386564254760742 + }, + { + "auxiliary_loss_clip": 0.0114753, + "auxiliary_loss_mlp": 0.01101018, + "balance_loss_clip": 1.00173104, + "balance_loss_mlp": 1.00054848, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.5772771962813796, + "language_loss": 0.74028397, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76276946, + "num_input_tokens_seen": 340546325, + "step": 15784, + "time_per_iteration": 2.723858118057251 + }, + { + "auxiliary_loss_clip": 0.01103486, + "auxiliary_loss_mlp": 0.01100939, + "balance_loss_clip": 1.00170517, + "balance_loss_mlp": 1.0004698, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 1.7696812189775168, + "language_loss": 0.69655693, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71860111, + "num_input_tokens_seen": 340565145, + "step": 15785, + "time_per_iteration": 2.761660099029541 + }, + { + "auxiliary_loss_clip": 0.01164279, + "auxiliary_loss_mlp": 0.01100935, + "balance_loss_clip": 1.00189614, + "balance_loss_mlp": 1.00065613, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.668935329851534, + "language_loss": 0.75977433, + "learning_rate": 2.707869629830495e-08, + "loss": 0.78242642, + "num_input_tokens_seen": 340585465, + "step": 15786, + "time_per_iteration": 2.628411054611206 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01099973, + "balance_loss_clip": 1.00165522, + "balance_loss_mlp": 1.00055218, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 6.324777393309422, + "language_loss": 0.79186654, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81400698, + "num_input_tokens_seen": 340606010, + "step": 15787, + "time_per_iteration": 2.670654773712158 + }, + { + "auxiliary_loss_clip": 0.011487, + "auxiliary_loss_mlp": 0.01100252, + "balance_loss_clip": 1.00198507, + "balance_loss_mlp": 1.00045002, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.6567818953999158, + "language_loss": 0.76320601, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78569555, + "num_input_tokens_seen": 340626135, + "step": 15788, + "time_per_iteration": 2.6736555099487305 + }, + { + "auxiliary_loss_clip": 0.011476, + "auxiliary_loss_mlp": 0.01101638, + "balance_loss_clip": 1.00187993, + "balance_loss_mlp": 1.00054836, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 1.7963731160624294, + "language_loss": 0.71844411, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.74093652, + "num_input_tokens_seen": 340644870, + "step": 15789, + "time_per_iteration": 2.6279044151306152 + }, + { + "auxiliary_loss_clip": 0.01117107, + "auxiliary_loss_mlp": 0.01100515, + "balance_loss_clip": 1.00203371, + "balance_loss_mlp": 1.00052238, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 1.8264057779917655, + "language_loss": 0.73448622, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75666249, + "num_input_tokens_seen": 340663695, + "step": 15790, + "time_per_iteration": 2.6713459491729736 + }, + { + "auxiliary_loss_clip": 0.01116024, + "auxiliary_loss_mlp": 0.01101208, + "balance_loss_clip": 1.00178957, + "balance_loss_mlp": 1.00040436, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 1.978016416988687, + "language_loss": 0.77703238, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79920465, + "num_input_tokens_seen": 340682970, + "step": 15791, + "time_per_iteration": 4.046483993530273 + }, + { + "auxiliary_loss_clip": 0.01147781, + "auxiliary_loss_mlp": 0.01101354, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00055039, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 2.1092920086494735, + "language_loss": 0.73349607, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.75598741, + "num_input_tokens_seen": 340702275, + "step": 15792, + "time_per_iteration": 2.629098892211914 + }, + { + "auxiliary_loss_clip": 0.01149354, + "auxiliary_loss_mlp": 0.0110004, + "balance_loss_clip": 1.00181448, + "balance_loss_mlp": 1.00052452, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 2.0196521106911045, + "language_loss": 0.78204107, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80453503, + "num_input_tokens_seen": 340719060, + "step": 15793, + "time_per_iteration": 2.569448947906494 + }, + { + "auxiliary_loss_clip": 0.01132785, + "auxiliary_loss_mlp": 0.01099896, + "balance_loss_clip": 1.00186932, + "balance_loss_mlp": 1.00052273, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.9233792068908928, + "language_loss": 0.77587426, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79820108, + "num_input_tokens_seen": 340737815, + "step": 15794, + "time_per_iteration": 2.6709225177764893 + }, + { + "auxiliary_loss_clip": 0.0111566, + "auxiliary_loss_mlp": 0.00747424, + "balance_loss_clip": 1.0017091, + "balance_loss_mlp": 1.00053895, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.9494552644667913, + "language_loss": 0.61463141, + "learning_rate": 2.650688769211107e-08, + "loss": 0.63326222, + "num_input_tokens_seen": 340756035, + "step": 15795, + "time_per_iteration": 4.171872138977051 + }, + { + "auxiliary_loss_clip": 0.01147295, + "auxiliary_loss_mlp": 0.01100205, + "balance_loss_clip": 1.001858, + "balance_loss_mlp": 1.00054646, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.6673518438559334, + "language_loss": 0.78756893, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81004393, + "num_input_tokens_seen": 340775620, + "step": 15796, + "time_per_iteration": 2.645369291305542 + }, + { + "auxiliary_loss_clip": 0.01147475, + "auxiliary_loss_mlp": 0.01101652, + "balance_loss_clip": 1.00183702, + "balance_loss_mlp": 1.00046718, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 1.8898328711226888, + "language_loss": 0.75624144, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77873266, + "num_input_tokens_seen": 340794510, + "step": 15797, + "time_per_iteration": 2.5946714878082275 + }, + { + "auxiliary_loss_clip": 0.01118109, + "auxiliary_loss_mlp": 0.00747406, + "balance_loss_clip": 1.00188422, + "balance_loss_mlp": 1.00049639, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 2.3727900618942876, + "language_loss": 0.65781844, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.67647362, + "num_input_tokens_seen": 340812955, + "step": 15798, + "time_per_iteration": 2.7183473110198975 + }, + { + "auxiliary_loss_clip": 0.01147574, + "auxiliary_loss_mlp": 0.01100524, + "balance_loss_clip": 1.00185192, + "balance_loss_mlp": 1.00053096, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 2.0299413335087872, + "language_loss": 0.77376115, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79624212, + "num_input_tokens_seen": 340829200, + "step": 15799, + "time_per_iteration": 2.5782883167266846 + }, + { + "auxiliary_loss_clip": 0.01149472, + "auxiliary_loss_mlp": 0.01100264, + "balance_loss_clip": 1.00205696, + "balance_loss_mlp": 1.0005101, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 2.2893924908823506, + "language_loss": 0.70762831, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.73012567, + "num_input_tokens_seen": 340848035, + "step": 15800, + "time_per_iteration": 2.6143112182617188 + }, + { + "auxiliary_loss_clip": 0.0113277, + "auxiliary_loss_mlp": 0.01099802, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.00038123, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.6822769590854112, + "language_loss": 0.71755803, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.73988378, + "num_input_tokens_seen": 340870025, + "step": 15801, + "time_per_iteration": 2.6632895469665527 + }, + { + "auxiliary_loss_clip": 0.01149372, + "auxiliary_loss_mlp": 0.01099745, + "balance_loss_clip": 1.00186646, + "balance_loss_mlp": 1.00056267, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.4694218416919391, + "language_loss": 0.81121916, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83371031, + "num_input_tokens_seen": 340892290, + "step": 15802, + "time_per_iteration": 2.6760170459747314 + }, + { + "auxiliary_loss_clip": 0.0116434, + "auxiliary_loss_mlp": 0.01101243, + "balance_loss_clip": 1.00195539, + "balance_loss_mlp": 1.00043988, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 1.6167549628201114, + "language_loss": 0.67497379, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69762963, + "num_input_tokens_seen": 340912260, + "step": 15803, + "time_per_iteration": 2.6029980182647705 + }, + { + "auxiliary_loss_clip": 0.01132639, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_clip": 1.00185478, + "balance_loss_mlp": 1.00044632, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 1.5866589180616555, + "language_loss": 0.76157093, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78390455, + "num_input_tokens_seen": 340928930, + "step": 15804, + "time_per_iteration": 2.6726114749908447 + }, + { + "auxiliary_loss_clip": 0.0114996, + "auxiliary_loss_mlp": 0.01101621, + "balance_loss_clip": 1.00208545, + "balance_loss_mlp": 1.00053132, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 2.5640635844916395, + "language_loss": 0.73269439, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75521022, + "num_input_tokens_seen": 340946615, + "step": 15805, + "time_per_iteration": 2.6295690536499023 + }, + { + "auxiliary_loss_clip": 0.01132876, + "auxiliary_loss_mlp": 0.0110094, + "balance_loss_clip": 1.00197983, + "balance_loss_mlp": 1.00051785, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 2.060247480137863, + "language_loss": 0.8014757, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.8238138, + "num_input_tokens_seen": 340967545, + "step": 15806, + "time_per_iteration": 2.7359883785247803 + }, + { + "auxiliary_loss_clip": 0.01116267, + "auxiliary_loss_mlp": 0.01099963, + "balance_loss_clip": 1.00183868, + "balance_loss_mlp": 1.00049436, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 2.13293851832418, + "language_loss": 0.82189834, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84406066, + "num_input_tokens_seen": 340984955, + "step": 15807, + "time_per_iteration": 2.639137029647827 + }, + { + "auxiliary_loss_clip": 0.0114976, + "auxiliary_loss_mlp": 0.01099936, + "balance_loss_clip": 1.00193894, + "balance_loss_mlp": 1.0004679, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 2.031260939410171, + "language_loss": 0.71718085, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.73967785, + "num_input_tokens_seen": 341007300, + "step": 15808, + "time_per_iteration": 2.699612855911255 + }, + { + "auxiliary_loss_clip": 0.01148655, + "auxiliary_loss_mlp": 0.01100434, + "balance_loss_clip": 1.00182366, + "balance_loss_mlp": 1.00044107, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.618076790433492, + "language_loss": 0.6979413, + "learning_rate": 2.562945671948058e-08, + "loss": 0.72043216, + "num_input_tokens_seen": 341026695, + "step": 15809, + "time_per_iteration": 4.241078615188599 + }, + { + "auxiliary_loss_clip": 0.01134629, + "auxiliary_loss_mlp": 0.0110002, + "balance_loss_clip": 1.00179935, + "balance_loss_mlp": 1.0003612, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.4874899548151574, + "language_loss": 0.7544592, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.7768057, + "num_input_tokens_seen": 341047080, + "step": 15810, + "time_per_iteration": 2.721817970275879 + }, + { + "auxiliary_loss_clip": 0.01114101, + "auxiliary_loss_mlp": 0.01100788, + "balance_loss_clip": 1.00169945, + "balance_loss_mlp": 1.0007, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.5339384491746137, + "language_loss": 0.8009268, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.82307565, + "num_input_tokens_seen": 341067310, + "step": 15811, + "time_per_iteration": 2.7053420543670654 + }, + { + "auxiliary_loss_clip": 0.01135092, + "auxiliary_loss_mlp": 0.01100589, + "balance_loss_clip": 1.00179315, + "balance_loss_mlp": 1.00045276, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 2.007186936720197, + "language_loss": 0.6998806, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.72223747, + "num_input_tokens_seen": 341085110, + "step": 15812, + "time_per_iteration": 4.0528564453125 + }, + { + "auxiliary_loss_clip": 0.01115795, + "auxiliary_loss_mlp": 0.01100563, + "balance_loss_clip": 1.00179493, + "balance_loss_mlp": 1.00047445, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 1.6017725684898967, + "language_loss": 0.65758598, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67974961, + "num_input_tokens_seen": 341103190, + "step": 15813, + "time_per_iteration": 2.699261426925659 + }, + { + "auxiliary_loss_clip": 0.01147576, + "auxiliary_loss_mlp": 0.01100715, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00057888, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.4844773778261677, + "language_loss": 0.70373911, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72622204, + "num_input_tokens_seen": 341125695, + "step": 15814, + "time_per_iteration": 2.782093048095703 + }, + { + "auxiliary_loss_clip": 0.01147486, + "auxiliary_loss_mlp": 0.01099939, + "balance_loss_clip": 1.00182176, + "balance_loss_mlp": 1.00047112, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 1.9412643227759767, + "language_loss": 0.63215083, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65462506, + "num_input_tokens_seen": 341143930, + "step": 15815, + "time_per_iteration": 2.632833480834961 + }, + { + "auxiliary_loss_clip": 0.01130697, + "auxiliary_loss_mlp": 0.01101034, + "balance_loss_clip": 1.00175738, + "balance_loss_mlp": 1.00051701, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 1.9978878113915373, + "language_loss": 0.5877049, + "learning_rate": 2.519624364862061e-08, + "loss": 0.61002219, + "num_input_tokens_seen": 341164280, + "step": 15816, + "time_per_iteration": 2.728090524673462 + }, + { + "auxiliary_loss_clip": 0.01164107, + "auxiliary_loss_mlp": 0.01100845, + "balance_loss_clip": 1.00188148, + "balance_loss_mlp": 1.00061393, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.3681810673291452, + "language_loss": 0.73558605, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75823557, + "num_input_tokens_seen": 341183670, + "step": 15817, + "time_per_iteration": 2.6024038791656494 + }, + { + "auxiliary_loss_clip": 0.01130686, + "auxiliary_loss_mlp": 0.01100709, + "balance_loss_clip": 1.00175714, + "balance_loss_mlp": 1.00057268, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.5746006716169698, + "language_loss": 0.59800732, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62032127, + "num_input_tokens_seen": 341201900, + "step": 15818, + "time_per_iteration": 2.650026321411133 + }, + { + "auxiliary_loss_clip": 0.01164173, + "auxiliary_loss_mlp": 0.01101199, + "balance_loss_clip": 1.00193751, + "balance_loss_mlp": 1.00058615, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.990170040701488, + "language_loss": 0.69313216, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71578586, + "num_input_tokens_seen": 341218340, + "step": 15819, + "time_per_iteration": 2.6133687496185303 + }, + { + "auxiliary_loss_clip": 0.01101363, + "auxiliary_loss_mlp": 0.01100536, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.00044799, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 1.6552842505795395, + "language_loss": 0.74205226, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76407123, + "num_input_tokens_seen": 341235885, + "step": 15820, + "time_per_iteration": 2.7974612712860107 + }, + { + "auxiliary_loss_clip": 0.01132441, + "auxiliary_loss_mlp": 0.01101272, + "balance_loss_clip": 1.00187397, + "balance_loss_mlp": 1.00065923, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 1.8326633799640255, + "language_loss": 0.7865178, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80885494, + "num_input_tokens_seen": 341255280, + "step": 15821, + "time_per_iteration": 2.8192198276519775 + }, + { + "auxiliary_loss_clip": 0.01116274, + "auxiliary_loss_mlp": 0.01100245, + "balance_loss_clip": 1.00166249, + "balance_loss_mlp": 1.00053859, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.4768759330099588, + "language_loss": 0.71008176, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73224694, + "num_input_tokens_seen": 341279055, + "step": 15822, + "time_per_iteration": 2.870763063430786 + }, + { + "auxiliary_loss_clip": 0.01147566, + "auxiliary_loss_mlp": 0.01099848, + "balance_loss_clip": 1.00192881, + "balance_loss_mlp": 1.00057006, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.6612200633047591, + "language_loss": 0.65901816, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68149227, + "num_input_tokens_seen": 341298560, + "step": 15823, + "time_per_iteration": 2.681389331817627 + }, + { + "auxiliary_loss_clip": 0.01149764, + "auxiliary_loss_mlp": 0.01099181, + "balance_loss_clip": 1.00198948, + "balance_loss_mlp": 1.00057077, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 2.072747898132082, + "language_loss": 0.77523768, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79772717, + "num_input_tokens_seen": 341316650, + "step": 15824, + "time_per_iteration": 2.6523818969726562 + }, + { + "auxiliary_loss_clip": 0.01164362, + "auxiliary_loss_mlp": 0.01101706, + "balance_loss_clip": 1.00190187, + "balance_loss_mlp": 1.00052118, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 2.1998328990315223, + "language_loss": 0.73494446, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.7576052, + "num_input_tokens_seen": 341336185, + "step": 15825, + "time_per_iteration": 2.610367774963379 + }, + { + "auxiliary_loss_clip": 0.01141994, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.00092041, + "balance_loss_mlp": 1.00032067, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8346505926805542, + "language_loss": 0.53479528, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55694705, + "num_input_tokens_seen": 341395795, + "step": 15826, + "time_per_iteration": 3.091512441635132 + }, + { + "auxiliary_loss_clip": 0.01130883, + "auxiliary_loss_mlp": 0.01100784, + "balance_loss_clip": 1.00182271, + "balance_loss_mlp": 1.00074387, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 2.1322201715906757, + "language_loss": 0.7274009, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74971759, + "num_input_tokens_seen": 341415675, + "step": 15827, + "time_per_iteration": 2.6902236938476562 + }, + { + "auxiliary_loss_clip": 0.01130718, + "auxiliary_loss_mlp": 0.01100316, + "balance_loss_clip": 1.00176728, + "balance_loss_mlp": 1.00056171, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 2.1278700826384624, + "language_loss": 0.74380594, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.76611626, + "num_input_tokens_seen": 341432990, + "step": 15828, + "time_per_iteration": 2.706437110900879 + }, + { + "auxiliary_loss_clip": 0.01115034, + "auxiliary_loss_mlp": 0.01099031, + "balance_loss_clip": 1.00189781, + "balance_loss_mlp": 1.00070739, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.8019732878280756, + "language_loss": 0.72886872, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75100935, + "num_input_tokens_seen": 341454100, + "step": 15829, + "time_per_iteration": 4.116559743881226 + }, + { + "auxiliary_loss_clip": 0.01149341, + "auxiliary_loss_mlp": 0.01100366, + "balance_loss_clip": 1.00190163, + "balance_loss_mlp": 1.00042093, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 1.5960244489778976, + "language_loss": 0.61416245, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.63665956, + "num_input_tokens_seen": 341472955, + "step": 15830, + "time_per_iteration": 2.6657216548919678 + }, + { + "auxiliary_loss_clip": 0.01147842, + "auxiliary_loss_mlp": 0.01100048, + "balance_loss_clip": 1.00189233, + "balance_loss_mlp": 1.00043643, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 1.9968212765004678, + "language_loss": 0.73112118, + "learning_rate": 2.428028693179729e-08, + "loss": 0.7536, + "num_input_tokens_seen": 341490165, + "step": 15831, + "time_per_iteration": 2.589792013168335 + }, + { + "auxiliary_loss_clip": 0.01102147, + "auxiliary_loss_mlp": 0.0109868, + "balance_loss_clip": 1.00181198, + "balance_loss_mlp": 1.00040352, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 2.959631512144514, + "language_loss": 0.65328085, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67528915, + "num_input_tokens_seen": 341508055, + "step": 15832, + "time_per_iteration": 2.7224061489105225 + }, + { + "auxiliary_loss_clip": 0.01149162, + "auxiliary_loss_mlp": 0.01100067, + "balance_loss_clip": 1.00193739, + "balance_loss_mlp": 1.00059843, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 1.766873119351286, + "language_loss": 0.77683884, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.79933119, + "num_input_tokens_seen": 341526155, + "step": 15833, + "time_per_iteration": 4.006248235702515 + }, + { + "auxiliary_loss_clip": 0.01115844, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.00049615, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 2.133361150995914, + "language_loss": 0.7438848, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.76603758, + "num_input_tokens_seen": 341540450, + "step": 15834, + "time_per_iteration": 2.664440870285034 + }, + { + "auxiliary_loss_clip": 0.01147972, + "auxiliary_loss_mlp": 0.01101848, + "balance_loss_clip": 1.00181949, + "balance_loss_mlp": 1.00066292, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 1.986111349769443, + "language_loss": 0.76344121, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78593946, + "num_input_tokens_seen": 341557865, + "step": 15835, + "time_per_iteration": 2.655973196029663 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01100428, + "balance_loss_clip": 1.00181389, + "balance_loss_mlp": 1.00048327, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 2.0943922238638217, + "language_loss": 0.66455519, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68688834, + "num_input_tokens_seen": 341573890, + "step": 15836, + "time_per_iteration": 2.6425044536590576 + }, + { + "auxiliary_loss_clip": 0.01115872, + "auxiliary_loss_mlp": 0.01099881, + "balance_loss_clip": 1.00178206, + "balance_loss_mlp": 1.00046074, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.7157335858589229, + "language_loss": 0.70314252, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72530001, + "num_input_tokens_seen": 341593770, + "step": 15837, + "time_per_iteration": 2.7747952938079834 + }, + { + "auxiliary_loss_clip": 0.01163969, + "auxiliary_loss_mlp": 0.01100494, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00054908, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 1.9233208734805667, + "language_loss": 0.73346925, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75611389, + "num_input_tokens_seen": 341612065, + "step": 15838, + "time_per_iteration": 2.6044654846191406 + }, + { + "auxiliary_loss_clip": 0.01120174, + "auxiliary_loss_mlp": 0.01101019, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00050211, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.8743351337916476, + "language_loss": 0.78421056, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80642247, + "num_input_tokens_seen": 341631365, + "step": 15839, + "time_per_iteration": 2.722365617752075 + }, + { + "auxiliary_loss_clip": 0.01118267, + "auxiliary_loss_mlp": 0.01099713, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00053096, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.509742500193579, + "language_loss": 0.80500126, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82718098, + "num_input_tokens_seen": 341650300, + "step": 15840, + "time_per_iteration": 2.7070467472076416 + }, + { + "auxiliary_loss_clip": 0.01130479, + "auxiliary_loss_mlp": 0.01099236, + "balance_loss_clip": 1.00168991, + "balance_loss_mlp": 1.00053096, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 6.36984131821283, + "language_loss": 0.73109239, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.75338954, + "num_input_tokens_seen": 341667680, + "step": 15841, + "time_per_iteration": 2.619687795639038 + }, + { + "auxiliary_loss_clip": 0.01130388, + "auxiliary_loss_mlp": 0.01099427, + "balance_loss_clip": 1.00177336, + "balance_loss_mlp": 1.00038791, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 2.2654814289376732, + "language_loss": 0.79033613, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.81263429, + "num_input_tokens_seen": 341685760, + "step": 15842, + "time_per_iteration": 2.6504898071289062 + }, + { + "auxiliary_loss_clip": 0.01130845, + "auxiliary_loss_mlp": 0.01100548, + "balance_loss_clip": 1.00190699, + "balance_loss_mlp": 1.0006032, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 2.6940510538177826, + "language_loss": 0.72368407, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74599802, + "num_input_tokens_seen": 341705300, + "step": 15843, + "time_per_iteration": 2.6427927017211914 + }, + { + "auxiliary_loss_clip": 0.01131632, + "auxiliary_loss_mlp": 0.00747296, + "balance_loss_clip": 1.00193644, + "balance_loss_mlp": 1.00047255, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.705651285224808, + "language_loss": 0.78316212, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80195141, + "num_input_tokens_seen": 341724565, + "step": 15844, + "time_per_iteration": 2.6992909908294678 + }, + { + "auxiliary_loss_clip": 0.01116502, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_clip": 1.00169897, + "balance_loss_mlp": 1.00064671, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 3.6575139123192995, + "language_loss": 0.70027399, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72245115, + "num_input_tokens_seen": 341743605, + "step": 15845, + "time_per_iteration": 2.671376943588257 + }, + { + "auxiliary_loss_clip": 0.01116303, + "auxiliary_loss_mlp": 0.0110054, + "balance_loss_clip": 1.00169897, + "balance_loss_mlp": 1.00049996, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.964939760321021, + "language_loss": 0.75633669, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77850515, + "num_input_tokens_seen": 341763475, + "step": 15846, + "time_per_iteration": 2.7461307048797607 + }, + { + "auxiliary_loss_clip": 0.01116552, + "auxiliary_loss_mlp": 0.01099207, + "balance_loss_clip": 1.00164843, + "balance_loss_mlp": 1.00035918, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 2.949301929908595, + "language_loss": 0.78065872, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80281633, + "num_input_tokens_seen": 341781265, + "step": 15847, + "time_per_iteration": 4.258587598800659 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01100073, + "balance_loss_clip": 1.00166488, + "balance_loss_mlp": 1.00065231, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 4.1212184469658775, + "language_loss": 0.780038, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80220032, + "num_input_tokens_seen": 341798825, + "step": 15848, + "time_per_iteration": 2.736264944076538 + }, + { + "auxiliary_loss_clip": 0.01133131, + "auxiliary_loss_mlp": 0.01102117, + "balance_loss_clip": 1.00183225, + "balance_loss_mlp": 1.00055027, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.9180749989775387, + "language_loss": 0.72203809, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74439049, + "num_input_tokens_seen": 341819480, + "step": 15849, + "time_per_iteration": 2.699509859085083 + }, + { + "auxiliary_loss_clip": 0.01147497, + "auxiliary_loss_mlp": 0.01101227, + "balance_loss_clip": 1.00184965, + "balance_loss_mlp": 1.0006144, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 2.1800000844358176, + "language_loss": 0.75233436, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77482164, + "num_input_tokens_seen": 341838035, + "step": 15850, + "time_per_iteration": 3.910048723220825 + }, + { + "auxiliary_loss_clip": 0.01133292, + "auxiliary_loss_mlp": 0.01100714, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00057793, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.188794638904534, + "language_loss": 0.72565103, + "learning_rate": 2.308523444215482e-08, + "loss": 0.74799109, + "num_input_tokens_seen": 341855895, + "step": 15851, + "time_per_iteration": 2.634105920791626 + }, + { + "auxiliary_loss_clip": 0.01133031, + "auxiliary_loss_mlp": 0.01100302, + "balance_loss_clip": 1.00175977, + "balance_loss_mlp": 1.0004046, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 4.735720615922304, + "language_loss": 0.79547435, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.81780767, + "num_input_tokens_seen": 341875240, + "step": 15852, + "time_per_iteration": 2.6548402309417725 + }, + { + "auxiliary_loss_clip": 0.01149453, + "auxiliary_loss_mlp": 0.0110089, + "balance_loss_clip": 1.00187731, + "balance_loss_mlp": 1.00046813, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.5135799301247495, + "language_loss": 0.59586632, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61836976, + "num_input_tokens_seen": 341901020, + "step": 15853, + "time_per_iteration": 2.8652098178863525 + }, + { + "auxiliary_loss_clip": 0.01133608, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_clip": 1.0017215, + "balance_loss_mlp": 1.00045133, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.7286141097372723, + "language_loss": 0.72497565, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74729419, + "num_input_tokens_seen": 341919365, + "step": 15854, + "time_per_iteration": 2.672226667404175 + }, + { + "auxiliary_loss_clip": 0.01131015, + "auxiliary_loss_mlp": 0.01099827, + "balance_loss_clip": 1.00172877, + "balance_loss_mlp": 1.00045443, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 3.437461529333553, + "language_loss": 0.6760655, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69837391, + "num_input_tokens_seen": 341939985, + "step": 15855, + "time_per_iteration": 2.6815567016601562 + }, + { + "auxiliary_loss_clip": 0.01164253, + "auxiliary_loss_mlp": 0.01099678, + "balance_loss_clip": 1.00197268, + "balance_loss_mlp": 1.00059128, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 1.614532119272822, + "language_loss": 0.76235187, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78499115, + "num_input_tokens_seen": 341959255, + "step": 15856, + "time_per_iteration": 2.681065797805786 + }, + { + "auxiliary_loss_clip": 0.01134644, + "auxiliary_loss_mlp": 0.01100298, + "balance_loss_clip": 1.00188088, + "balance_loss_mlp": 1.00054383, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.8471037760205224, + "language_loss": 0.78107893, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80342841, + "num_input_tokens_seen": 341977205, + "step": 15857, + "time_per_iteration": 2.6855828762054443 + }, + { + "auxiliary_loss_clip": 0.0114288, + "auxiliary_loss_mlp": 0.01073659, + "balance_loss_clip": 1.00078869, + "balance_loss_mlp": 1.00003529, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.7265072561636502, + "language_loss": 0.62575543, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64792079, + "num_input_tokens_seen": 342038545, + "step": 15858, + "time_per_iteration": 3.191358804702759 + }, + { + "auxiliary_loss_clip": 0.01099103, + "auxiliary_loss_mlp": 0.01099642, + "balance_loss_clip": 1.0017066, + "balance_loss_mlp": 1.00055575, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.4859996997568468, + "language_loss": 0.56952339, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.59151077, + "num_input_tokens_seen": 342058195, + "step": 15859, + "time_per_iteration": 2.800861358642578 + }, + { + "auxiliary_loss_clip": 0.01163979, + "auxiliary_loss_mlp": 0.01099325, + "balance_loss_clip": 1.00193286, + "balance_loss_mlp": 1.00042892, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 2.583179406766835, + "language_loss": 0.8198036, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.84243667, + "num_input_tokens_seen": 342075025, + "step": 15860, + "time_per_iteration": 2.621511220932007 + }, + { + "auxiliary_loss_clip": 0.01097724, + "auxiliary_loss_mlp": 0.00747368, + "balance_loss_clip": 1.00157499, + "balance_loss_mlp": 1.00050354, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 1.9260164856170008, + "language_loss": 0.66674364, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68519449, + "num_input_tokens_seen": 342094595, + "step": 15861, + "time_per_iteration": 2.760758638381958 + }, + { + "auxiliary_loss_clip": 0.01149089, + "auxiliary_loss_mlp": 0.01101612, + "balance_loss_clip": 1.0018357, + "balance_loss_mlp": 1.00061846, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 1.933070762314824, + "language_loss": 0.65969992, + "learning_rate": 2.244073591573037e-08, + "loss": 0.68220693, + "num_input_tokens_seen": 342115970, + "step": 15862, + "time_per_iteration": 2.6992385387420654 + }, + { + "auxiliary_loss_clip": 0.01117779, + "auxiliary_loss_mlp": 0.01099588, + "balance_loss_clip": 1.00191832, + "balance_loss_mlp": 1.00054908, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.5262361119374053, + "language_loss": 0.67567819, + "learning_rate": 2.238259503179485e-08, + "loss": 0.6978519, + "num_input_tokens_seen": 342134080, + "step": 15863, + "time_per_iteration": 2.7058098316192627 + }, + { + "auxiliary_loss_clip": 0.01134261, + "auxiliary_loss_mlp": 0.01099705, + "balance_loss_clip": 1.0017786, + "balance_loss_mlp": 1.00047505, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 1.6561424288508766, + "language_loss": 0.78694898, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80928862, + "num_input_tokens_seen": 342154725, + "step": 15864, + "time_per_iteration": 2.7284462451934814 + }, + { + "auxiliary_loss_clip": 0.01115868, + "auxiliary_loss_mlp": 0.01099863, + "balance_loss_clip": 1.00174189, + "balance_loss_mlp": 1.00063312, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 1.876755898243604, + "language_loss": 0.59704965, + "learning_rate": 2.226653824047586e-08, + "loss": 0.61920696, + "num_input_tokens_seen": 342172275, + "step": 15865, + "time_per_iteration": 2.7571628093719482 + }, + { + "auxiliary_loss_clip": 0.0111617, + "auxiliary_loss_mlp": 0.01100212, + "balance_loss_clip": 1.00178051, + "balance_loss_mlp": 1.00041032, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 2.114052903886223, + "language_loss": 0.69883335, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.72099721, + "num_input_tokens_seen": 342190880, + "step": 15866, + "time_per_iteration": 4.167298793792725 + }, + { + "auxiliary_loss_clip": 0.01134058, + "auxiliary_loss_mlp": 0.01100418, + "balance_loss_clip": 1.00176346, + "balance_loss_mlp": 1.00047255, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 2.193805879831527, + "language_loss": 0.84969014, + "learning_rate": 2.215078143255855e-08, + "loss": 0.87203491, + "num_input_tokens_seen": 342208165, + "step": 15867, + "time_per_iteration": 2.6379761695861816 + }, + { + "auxiliary_loss_clip": 0.01141573, + "auxiliary_loss_mlp": 0.01074339, + "balance_loss_clip": 1.00085139, + "balance_loss_mlp": 1.00033438, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7466484482407643, + "language_loss": 0.61808407, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.64024317, + "num_input_tokens_seen": 342277110, + "step": 15868, + "time_per_iteration": 3.213395357131958 + }, + { + "auxiliary_loss_clip": 0.01114181, + "auxiliary_loss_mlp": 0.01099773, + "balance_loss_clip": 1.00162578, + "balance_loss_mlp": 1.0003525, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 2.8634044784179875, + "language_loss": 0.59974378, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62188333, + "num_input_tokens_seen": 342294695, + "step": 15869, + "time_per_iteration": 2.662166118621826 + }, + { + "auxiliary_loss_clip": 0.01120063, + "auxiliary_loss_mlp": 0.0074743, + "balance_loss_clip": 1.00201905, + "balance_loss_mlp": 1.00056076, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 3.2763839371026693, + "language_loss": 0.71139991, + "learning_rate": 2.197770872795579e-08, + "loss": 0.73007482, + "num_input_tokens_seen": 342314970, + "step": 15870, + "time_per_iteration": 2.7369282245635986 + }, + { + "auxiliary_loss_clip": 0.0111443, + "auxiliary_loss_mlp": 0.01099393, + "balance_loss_clip": 1.00158298, + "balance_loss_mlp": 1.00054455, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 3.302729734467264, + "language_loss": 0.76933342, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.79147166, + "num_input_tokens_seen": 342334255, + "step": 15871, + "time_per_iteration": 4.1983301639556885 + }, + { + "auxiliary_loss_clip": 0.01149466, + "auxiliary_loss_mlp": 0.01101478, + "balance_loss_clip": 1.001863, + "balance_loss_mlp": 1.00048399, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 4.064231925148351, + "language_loss": 0.58492422, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60743362, + "num_input_tokens_seen": 342354730, + "step": 15872, + "time_per_iteration": 2.6782515048980713 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01101698, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00060821, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 1.8007453995073894, + "language_loss": 0.74853742, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.7708832, + "num_input_tokens_seen": 342374565, + "step": 15873, + "time_per_iteration": 2.6521716117858887 + }, + { + "auxiliary_loss_clip": 0.01164268, + "auxiliary_loss_mlp": 0.01100908, + "balance_loss_clip": 1.00201237, + "balance_loss_mlp": 1.00048661, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.769343969258489, + "language_loss": 0.62168694, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64433873, + "num_input_tokens_seen": 342394590, + "step": 15874, + "time_per_iteration": 2.5801033973693848 + }, + { + "auxiliary_loss_clip": 0.01132135, + "auxiliary_loss_mlp": 0.01099877, + "balance_loss_clip": 1.00169635, + "balance_loss_mlp": 1.0005995, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 2.380191025418974, + "language_loss": 0.89501488, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91733503, + "num_input_tokens_seen": 342410445, + "step": 15875, + "time_per_iteration": 2.6759564876556396 + }, + { + "auxiliary_loss_clip": 0.01164307, + "auxiliary_loss_mlp": 0.01101177, + "balance_loss_clip": 1.00190735, + "balance_loss_mlp": 1.00061166, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 2.7064994805890468, + "language_loss": 0.67970669, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.70236158, + "num_input_tokens_seen": 342430970, + "step": 15876, + "time_per_iteration": 2.6217916011810303 + }, + { + "auxiliary_loss_clip": 0.01147353, + "auxiliary_loss_mlp": 0.01101798, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.0005182, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 2.3462518510052877, + "language_loss": 0.6959132, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.71840477, + "num_input_tokens_seen": 342449505, + "step": 15877, + "time_per_iteration": 2.7191617488861084 + }, + { + "auxiliary_loss_clip": 0.01116841, + "auxiliary_loss_mlp": 0.01100957, + "balance_loss_clip": 1.00172925, + "balance_loss_mlp": 1.00063026, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.6485290205798857, + "language_loss": 0.71140075, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.7335788, + "num_input_tokens_seen": 342470390, + "step": 15878, + "time_per_iteration": 2.6806530952453613 + }, + { + "auxiliary_loss_clip": 0.01164105, + "auxiliary_loss_mlp": 0.01100037, + "balance_loss_clip": 1.00187254, + "balance_loss_mlp": 1.00037801, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 2.3260883179854743, + "language_loss": 0.68404973, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70669115, + "num_input_tokens_seen": 342492560, + "step": 15879, + "time_per_iteration": 2.6178414821624756 + }, + { + "auxiliary_loss_clip": 0.01116433, + "auxiliary_loss_mlp": 0.0074721, + "balance_loss_clip": 1.00167632, + "balance_loss_mlp": 1.00036502, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 2.2312145982284064, + "language_loss": 0.85103059, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.86966699, + "num_input_tokens_seen": 342512315, + "step": 15880, + "time_per_iteration": 2.827806234359741 + }, + { + "auxiliary_loss_clip": 0.01086988, + "auxiliary_loss_mlp": 0.01100454, + "balance_loss_clip": 1.00155759, + "balance_loss_mlp": 1.0004611, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 2.0351424269327523, + "language_loss": 0.72214115, + "learning_rate": 2.134888478151753e-08, + "loss": 0.74401557, + "num_input_tokens_seen": 342533060, + "step": 15881, + "time_per_iteration": 2.935530424118042 + }, + { + "auxiliary_loss_clip": 0.01147898, + "auxiliary_loss_mlp": 0.01100037, + "balance_loss_clip": 1.00194001, + "balance_loss_mlp": 1.00061595, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 2.3914819142409267, + "language_loss": 0.71528333, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.73776269, + "num_input_tokens_seen": 342550830, + "step": 15882, + "time_per_iteration": 2.6055448055267334 + }, + { + "auxiliary_loss_clip": 0.01132297, + "auxiliary_loss_mlp": 0.01099968, + "balance_loss_clip": 1.001773, + "balance_loss_mlp": 1.00054789, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 1.8884937047470163, + "language_loss": 0.65463954, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.6769622, + "num_input_tokens_seen": 342575070, + "step": 15883, + "time_per_iteration": 2.989276885986328 + }, + { + "auxiliary_loss_clip": 0.01147488, + "auxiliary_loss_mlp": 0.01101144, + "balance_loss_clip": 1.00193191, + "balance_loss_mlp": 1.00057912, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 3.673808679975113, + "language_loss": 0.78013903, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.80262536, + "num_input_tokens_seen": 342592215, + "step": 15884, + "time_per_iteration": 2.6680474281311035 + }, + { + "auxiliary_loss_clip": 0.01164199, + "auxiliary_loss_mlp": 0.01101122, + "balance_loss_clip": 1.00182319, + "balance_loss_mlp": 1.00041437, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 1.8053419127569905, + "language_loss": 0.77886248, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.8015157, + "num_input_tokens_seen": 342610030, + "step": 15885, + "time_per_iteration": 4.231053113937378 + }, + { + "auxiliary_loss_clip": 0.01164236, + "auxiliary_loss_mlp": 0.0110045, + "balance_loss_clip": 1.00193322, + "balance_loss_mlp": 1.0006485, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 4.440053379350839, + "language_loss": 0.69759774, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72024465, + "num_input_tokens_seen": 342626475, + "step": 15886, + "time_per_iteration": 2.6223840713500977 + }, + { + "auxiliary_loss_clip": 0.01115888, + "auxiliary_loss_mlp": 0.01101977, + "balance_loss_clip": 1.00161505, + "balance_loss_mlp": 1.00050616, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 1.6206516862107356, + "language_loss": 0.72703952, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.74921817, + "num_input_tokens_seen": 342646645, + "step": 15887, + "time_per_iteration": 2.6798505783081055 + }, + { + "auxiliary_loss_clip": 0.01134666, + "auxiliary_loss_mlp": 0.01099423, + "balance_loss_clip": 1.00180912, + "balance_loss_mlp": 1.00033593, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 3.6075954023844665, + "language_loss": 0.56786495, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.59020585, + "num_input_tokens_seen": 342663615, + "step": 15888, + "time_per_iteration": 4.033256769180298 + }, + { + "auxiliary_loss_clip": 0.01141402, + "auxiliary_loss_mlp": 0.01074264, + "balance_loss_clip": 1.00065899, + "balance_loss_mlp": 1.00025892, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.7070680890032633, + "language_loss": 0.5791176, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.60127425, + "num_input_tokens_seen": 342728275, + "step": 15889, + "time_per_iteration": 3.189896821975708 + }, + { + "auxiliary_loss_clip": 0.01164241, + "auxiliary_loss_mlp": 0.01100906, + "balance_loss_clip": 1.00180984, + "balance_loss_mlp": 1.00048411, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.2929503755778555, + "language_loss": 0.66946876, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69212025, + "num_input_tokens_seen": 342748860, + "step": 15890, + "time_per_iteration": 2.595480442047119 + }, + { + "auxiliary_loss_clip": 0.01164281, + "auxiliary_loss_mlp": 0.01100042, + "balance_loss_clip": 1.0020237, + "balance_loss_mlp": 1.00066924, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.5943831314073804, + "language_loss": 0.74055898, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76320219, + "num_input_tokens_seen": 342769705, + "step": 15891, + "time_per_iteration": 2.5903139114379883 + }, + { + "auxiliary_loss_clip": 0.01115603, + "auxiliary_loss_mlp": 0.01099261, + "balance_loss_clip": 1.00173914, + "balance_loss_mlp": 1.00060296, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 1.9566728730799776, + "language_loss": 0.77914202, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80129075, + "num_input_tokens_seen": 342787000, + "step": 15892, + "time_per_iteration": 2.6438968181610107 + }, + { + "auxiliary_loss_clip": 0.01164002, + "auxiliary_loss_mlp": 0.01099966, + "balance_loss_clip": 1.00187838, + "balance_loss_mlp": 1.00040197, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.4373514601642352, + "language_loss": 0.69914699, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.72178668, + "num_input_tokens_seen": 342807795, + "step": 15893, + "time_per_iteration": 2.5601556301116943 + }, + { + "auxiliary_loss_clip": 0.01132021, + "auxiliary_loss_mlp": 0.00747236, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00047588, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 2.294472091512707, + "language_loss": 0.65838659, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.6771791, + "num_input_tokens_seen": 342825490, + "step": 15894, + "time_per_iteration": 2.62149977684021 + }, + { + "auxiliary_loss_clip": 0.01147548, + "auxiliary_loss_mlp": 0.01100897, + "balance_loss_clip": 1.00186205, + "balance_loss_mlp": 1.00057089, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 2.0351906542777383, + "language_loss": 0.81711823, + "learning_rate": 2.056169412853581e-08, + "loss": 0.83960271, + "num_input_tokens_seen": 342844965, + "step": 15895, + "time_per_iteration": 2.564440965652466 + }, + { + "auxiliary_loss_clip": 0.0113102, + "auxiliary_loss_mlp": 0.01101248, + "balance_loss_clip": 1.00166893, + "balance_loss_mlp": 1.00044501, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 2.0167588153506, + "language_loss": 0.72580898, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74813163, + "num_input_tokens_seen": 342865915, + "step": 15896, + "time_per_iteration": 2.6894443035125732 + }, + { + "auxiliary_loss_clip": 0.01164078, + "auxiliary_loss_mlp": 0.01100186, + "balance_loss_clip": 1.00187099, + "balance_loss_mlp": 1.0004797, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 1.9894358524296867, + "language_loss": 0.79696345, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81960607, + "num_input_tokens_seen": 342884000, + "step": 15897, + "time_per_iteration": 2.5201189517974854 + }, + { + "auxiliary_loss_clip": 0.01132819, + "auxiliary_loss_mlp": 0.01100972, + "balance_loss_clip": 1.00174689, + "balance_loss_mlp": 1.00040722, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 2.521438560628629, + "language_loss": 0.72671551, + "learning_rate": 2.03949242614303e-08, + "loss": 0.74905348, + "num_input_tokens_seen": 342903095, + "step": 15898, + "time_per_iteration": 2.7160725593566895 + }, + { + "auxiliary_loss_clip": 0.01110363, + "auxiliary_loss_mlp": 0.01074624, + "balance_loss_clip": 1.00066161, + "balance_loss_mlp": 1.0002377, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.8333114388902534, + "language_loss": 0.52317679, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54502666, + "num_input_tokens_seen": 342958155, + "step": 15899, + "time_per_iteration": 3.199962615966797 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01102194, + "balance_loss_clip": 1.00196815, + "balance_loss_mlp": 1.00053275, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.3442030137338277, + "language_loss": 0.68894243, + "learning_rate": 2.028411968062782e-08, + "loss": 0.71144104, + "num_input_tokens_seen": 342972500, + "step": 15900, + "time_per_iteration": 2.615978956222534 + }, + { + "auxiliary_loss_clip": 0.01148998, + "auxiliary_loss_mlp": 0.00747361, + "balance_loss_clip": 1.001809, + "balance_loss_mlp": 1.00049019, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 1.9832172044449918, + "language_loss": 0.82794416, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.84690773, + "num_input_tokens_seen": 342989035, + "step": 15901, + "time_per_iteration": 2.6160058975219727 + }, + { + "auxiliary_loss_clip": 0.01113442, + "auxiliary_loss_mlp": 0.01074108, + "balance_loss_clip": 1.00097573, + "balance_loss_mlp": 1.00048447, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.7174130012930581, + "language_loss": 0.54378355, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56565905, + "num_input_tokens_seen": 343051675, + "step": 15902, + "time_per_iteration": 3.2840142250061035 + }, + { + "auxiliary_loss_clip": 0.0113374, + "auxiliary_loss_mlp": 0.01098484, + "balance_loss_clip": 1.00175345, + "balance_loss_mlp": 1.000494, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 2.0409477077275917, + "language_loss": 0.85245252, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87477469, + "num_input_tokens_seen": 343068895, + "step": 15903, + "time_per_iteration": 2.6138248443603516 + }, + { + "auxiliary_loss_clip": 0.01132379, + "auxiliary_loss_mlp": 0.00747278, + "balance_loss_clip": 1.00172591, + "balance_loss_mlp": 1.0004797, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 2.243001685180964, + "language_loss": 0.80721778, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82601434, + "num_input_tokens_seen": 343087115, + "step": 15904, + "time_per_iteration": 4.070939540863037 + }, + { + "auxiliary_loss_clip": 0.01149707, + "auxiliary_loss_mlp": 0.01101198, + "balance_loss_clip": 1.00190258, + "balance_loss_mlp": 1.00053763, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 2.4118310852952884, + "language_loss": 0.59794843, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62045747, + "num_input_tokens_seen": 343105575, + "step": 15905, + "time_per_iteration": 2.6347696781158447 + }, + { + "auxiliary_loss_clip": 0.01147429, + "auxiliary_loss_mlp": 0.0110051, + "balance_loss_clip": 1.00179791, + "balance_loss_mlp": 1.00056481, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 1.9626896989028915, + "language_loss": 0.70377898, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72625828, + "num_input_tokens_seen": 343123025, + "step": 15906, + "time_per_iteration": 2.655125141143799 + }, + { + "auxiliary_loss_clip": 0.01102317, + "auxiliary_loss_mlp": 0.01100968, + "balance_loss_clip": 1.00194979, + "balance_loss_mlp": 1.00045085, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.8319587574689318, + "language_loss": 0.71219879, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.73423171, + "num_input_tokens_seen": 343141625, + "step": 15907, + "time_per_iteration": 2.671668767929077 + }, + { + "auxiliary_loss_clip": 0.01116583, + "auxiliary_loss_mlp": 0.01099527, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00058365, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 2.2453848725893737, + "language_loss": 0.699911, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72207212, + "num_input_tokens_seen": 343161300, + "step": 15908, + "time_per_iteration": 4.170385122299194 + }, + { + "auxiliary_loss_clip": 0.01130282, + "auxiliary_loss_mlp": 0.00747311, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.00045848, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 1.844300004406228, + "language_loss": 0.82953995, + "learning_rate": 1.978921532427802e-08, + "loss": 0.84831589, + "num_input_tokens_seen": 343177815, + "step": 15909, + "time_per_iteration": 2.6087749004364014 + }, + { + "auxiliary_loss_clip": 0.01147407, + "auxiliary_loss_mlp": 0.01100658, + "balance_loss_clip": 1.00176358, + "balance_loss_mlp": 1.0005219, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 2.644032205442303, + "language_loss": 0.67620617, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69868678, + "num_input_tokens_seen": 343198140, + "step": 15910, + "time_per_iteration": 2.684805154800415 + }, + { + "auxiliary_loss_clip": 0.01147703, + "auxiliary_loss_mlp": 0.01101266, + "balance_loss_clip": 1.00191545, + "balance_loss_mlp": 1.00060582, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.9635052095014696, + "language_loss": 0.74290138, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76539105, + "num_input_tokens_seen": 343218280, + "step": 15911, + "time_per_iteration": 2.634580135345459 + }, + { + "auxiliary_loss_clip": 0.01147806, + "auxiliary_loss_mlp": 0.01100324, + "balance_loss_clip": 1.00174773, + "balance_loss_mlp": 1.00037861, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 2.05993674543817, + "language_loss": 0.69146824, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71394956, + "num_input_tokens_seen": 343236850, + "step": 15912, + "time_per_iteration": 2.616290807723999 + }, + { + "auxiliary_loss_clip": 0.01134899, + "auxiliary_loss_mlp": 0.01101584, + "balance_loss_clip": 1.00194836, + "balance_loss_mlp": 1.00063741, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 3.141293382269817, + "language_loss": 0.72238064, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74474543, + "num_input_tokens_seen": 343253065, + "step": 15913, + "time_per_iteration": 2.6479856967926025 + }, + { + "auxiliary_loss_clip": 0.01066313, + "auxiliary_loss_mlp": 0.01100614, + "balance_loss_clip": 1.00149727, + "balance_loss_mlp": 1.0005734, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 2.041811469594779, + "language_loss": 0.73257726, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75424647, + "num_input_tokens_seen": 343270330, + "step": 15914, + "time_per_iteration": 2.7763500213623047 + }, + { + "auxiliary_loss_clip": 0.01164109, + "auxiliary_loss_mlp": 0.01100153, + "balance_loss_clip": 1.00188112, + "balance_loss_mlp": 1.00049376, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.5767495933612277, + "language_loss": 0.67154849, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.6941911, + "num_input_tokens_seen": 343289625, + "step": 15915, + "time_per_iteration": 2.5473828315734863 + }, + { + "auxiliary_loss_clip": 0.01147259, + "auxiliary_loss_mlp": 0.01100548, + "balance_loss_clip": 1.00184691, + "balance_loss_mlp": 1.00050783, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 2.253016041064518, + "language_loss": 0.64396143, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66643947, + "num_input_tokens_seen": 343309200, + "step": 15916, + "time_per_iteration": 2.6008126735687256 + }, + { + "auxiliary_loss_clip": 0.01163937, + "auxiliary_loss_mlp": 0.0109912, + "balance_loss_clip": 1.00191796, + "balance_loss_mlp": 1.00046229, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 2.15432407947112, + "language_loss": 0.80775821, + "learning_rate": 1.935440639853536e-08, + "loss": 0.83038878, + "num_input_tokens_seen": 343326270, + "step": 15917, + "time_per_iteration": 2.585848808288574 + }, + { + "auxiliary_loss_clip": 0.01132934, + "auxiliary_loss_mlp": 0.01099344, + "balance_loss_clip": 1.00185299, + "balance_loss_mlp": 1.00059128, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 1.8515357972818294, + "language_loss": 0.73069417, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.75301695, + "num_input_tokens_seen": 343344430, + "step": 15918, + "time_per_iteration": 2.6388022899627686 + }, + { + "auxiliary_loss_clip": 0.01112702, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_clip": 1.00066924, + "balance_loss_mlp": 1.00025296, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6330154997768374, + "language_loss": 0.53064674, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55251634, + "num_input_tokens_seen": 343416155, + "step": 15919, + "time_per_iteration": 3.391375780105591 + }, + { + "auxiliary_loss_clip": 0.01149722, + "auxiliary_loss_mlp": 0.01101094, + "balance_loss_clip": 1.00191343, + "balance_loss_mlp": 1.00048184, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 4.355874260487737, + "language_loss": 0.75188315, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77439129, + "num_input_tokens_seen": 343431715, + "step": 15920, + "time_per_iteration": 2.5997631549835205 + }, + { + "auxiliary_loss_clip": 0.01114575, + "auxiliary_loss_mlp": 0.01101035, + "balance_loss_clip": 1.0017134, + "balance_loss_mlp": 1.00056505, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 1.8938698799458815, + "language_loss": 0.79229546, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.81445158, + "num_input_tokens_seen": 343450425, + "step": 15921, + "time_per_iteration": 2.682831048965454 + }, + { + "auxiliary_loss_clip": 0.01149113, + "auxiliary_loss_mlp": 0.01101198, + "balance_loss_clip": 1.00176334, + "balance_loss_mlp": 1.00044227, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 2.283610513034022, + "language_loss": 0.51157612, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.53407919, + "num_input_tokens_seen": 343470445, + "step": 15922, + "time_per_iteration": 2.740435838699341 + }, + { + "auxiliary_loss_clip": 0.01101763, + "auxiliary_loss_mlp": 0.01100759, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00052798, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 1.9417648193524006, + "language_loss": 0.83895618, + "learning_rate": 1.903145411006557e-08, + "loss": 0.86098135, + "num_input_tokens_seen": 343485200, + "step": 15923, + "time_per_iteration": 4.305248498916626 + }, + { + "auxiliary_loss_clip": 0.01132591, + "auxiliary_loss_mlp": 0.0110005, + "balance_loss_clip": 1.00170064, + "balance_loss_mlp": 1.00058174, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.7907876954923034, + "language_loss": 0.75002617, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77235258, + "num_input_tokens_seen": 343505080, + "step": 15924, + "time_per_iteration": 2.718503475189209 + }, + { + "auxiliary_loss_clip": 0.01131033, + "auxiliary_loss_mlp": 0.01100365, + "balance_loss_clip": 1.0016855, + "balance_loss_mlp": 1.00056267, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 2.5289819962687208, + "language_loss": 0.85899818, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88131213, + "num_input_tokens_seen": 343523995, + "step": 15925, + "time_per_iteration": 2.7268428802490234 + }, + { + "auxiliary_loss_clip": 0.01117416, + "auxiliary_loss_mlp": 0.01101057, + "balance_loss_clip": 1.00179362, + "balance_loss_mlp": 1.00053942, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 1.8174469005491107, + "language_loss": 0.75690138, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77908611, + "num_input_tokens_seen": 343542015, + "step": 15926, + "time_per_iteration": 4.053712844848633 + }, + { + "auxiliary_loss_clip": 0.01131012, + "auxiliary_loss_mlp": 0.01099636, + "balance_loss_clip": 1.00169945, + "balance_loss_mlp": 1.00045395, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 1.9394398545763878, + "language_loss": 0.77504092, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79734743, + "num_input_tokens_seen": 343561680, + "step": 15927, + "time_per_iteration": 2.689008951187134 + }, + { + "auxiliary_loss_clip": 0.01102363, + "auxiliary_loss_mlp": 0.0110132, + "balance_loss_clip": 1.00167322, + "balance_loss_mlp": 1.00046885, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 2.0405174618973128, + "language_loss": 0.68718982, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.70922661, + "num_input_tokens_seen": 343585290, + "step": 15928, + "time_per_iteration": 2.7967896461486816 + }, + { + "auxiliary_loss_clip": 0.01132104, + "auxiliary_loss_mlp": 0.01100727, + "balance_loss_clip": 1.00188637, + "balance_loss_mlp": 1.00049603, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.6431707759356229, + "language_loss": 0.81854486, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84087324, + "num_input_tokens_seen": 343604045, + "step": 15929, + "time_per_iteration": 2.6557514667510986 + }, + { + "auxiliary_loss_clip": 0.01117951, + "auxiliary_loss_mlp": 0.01101161, + "balance_loss_clip": 1.00168443, + "balance_loss_mlp": 1.00054848, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.667552262199743, + "language_loss": 0.72358179, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74577284, + "num_input_tokens_seen": 343626595, + "step": 15930, + "time_per_iteration": 2.721769094467163 + }, + { + "auxiliary_loss_clip": 0.01085596, + "auxiliary_loss_mlp": 0.01100024, + "balance_loss_clip": 1.00171566, + "balance_loss_mlp": 1.00041318, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 2.808792756402841, + "language_loss": 0.62262309, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64447927, + "num_input_tokens_seen": 343646195, + "step": 15931, + "time_per_iteration": 2.7685482501983643 + }, + { + "auxiliary_loss_clip": 0.01164072, + "auxiliary_loss_mlp": 0.01099559, + "balance_loss_clip": 1.00196934, + "balance_loss_mlp": 1.00047195, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 3.161067024277854, + "language_loss": 0.69298267, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71561897, + "num_input_tokens_seen": 343663665, + "step": 15932, + "time_per_iteration": 2.557281970977783 + }, + { + "auxiliary_loss_clip": 0.01115772, + "auxiliary_loss_mlp": 0.01101277, + "balance_loss_clip": 1.00165749, + "balance_loss_mlp": 1.00075984, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 1.9698817449290098, + "language_loss": 0.75529706, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77746755, + "num_input_tokens_seen": 343682145, + "step": 15933, + "time_per_iteration": 2.7092466354370117 + }, + { + "auxiliary_loss_clip": 0.01097369, + "auxiliary_loss_mlp": 0.01074134, + "balance_loss_clip": 1.0007534, + "balance_loss_mlp": 1.00012875, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.6961858430973866, + "language_loss": 0.57298559, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59470063, + "num_input_tokens_seen": 343744685, + "step": 15934, + "time_per_iteration": 3.426133394241333 + }, + { + "auxiliary_loss_clip": 0.01157958, + "auxiliary_loss_mlp": 0.00745323, + "balance_loss_clip": 1.00074828, + "balance_loss_mlp": 1.00013673, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9024765536756051, + "language_loss": 0.65902489, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67805767, + "num_input_tokens_seen": 343801835, + "step": 15935, + "time_per_iteration": 3.072995185852051 + }, + { + "auxiliary_loss_clip": 0.01127744, + "auxiliary_loss_mlp": 0.01074705, + "balance_loss_clip": 1.00156379, + "balance_loss_mlp": 1.00031877, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7888458888615235, + "language_loss": 0.57018596, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59221047, + "num_input_tokens_seen": 343861515, + "step": 15936, + "time_per_iteration": 3.1880385875701904 + }, + { + "auxiliary_loss_clip": 0.01083039, + "auxiliary_loss_mlp": 0.01100401, + "balance_loss_clip": 1.00151944, + "balance_loss_mlp": 1.0005517, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 2.3548947929908466, + "language_loss": 0.78458118, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80641556, + "num_input_tokens_seen": 343881240, + "step": 15937, + "time_per_iteration": 2.8231353759765625 + }, + { + "auxiliary_loss_clip": 0.01149482, + "auxiliary_loss_mlp": 0.01100639, + "balance_loss_clip": 1.00196052, + "balance_loss_mlp": 1.00040746, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 1.6974560914434387, + "language_loss": 0.68443781, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70693898, + "num_input_tokens_seen": 343900885, + "step": 15938, + "time_per_iteration": 2.6168713569641113 + }, + { + "auxiliary_loss_clip": 0.0111537, + "auxiliary_loss_mlp": 0.01100234, + "balance_loss_clip": 1.00171113, + "balance_loss_mlp": 1.0005753, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 3.2345582858619157, + "language_loss": 0.66488731, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.68704331, + "num_input_tokens_seen": 343918460, + "step": 15939, + "time_per_iteration": 2.7155144214630127 + }, + { + "auxiliary_loss_clip": 0.01132967, + "auxiliary_loss_mlp": 0.01100138, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.00052667, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 1.4584506735377505, + "language_loss": 0.73934925, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.76168025, + "num_input_tokens_seen": 343938030, + "step": 15940, + "time_per_iteration": 2.693377733230591 + }, + { + "auxiliary_loss_clip": 0.01164311, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00045133, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 1.8459445899181224, + "language_loss": 0.72978544, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.7524392, + "num_input_tokens_seen": 343956635, + "step": 15941, + "time_per_iteration": 2.5746285915374756 + }, + { + "auxiliary_loss_clip": 0.01131104, + "auxiliary_loss_mlp": 0.0109942, + "balance_loss_clip": 1.00175595, + "balance_loss_mlp": 1.00061953, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 3.34310153602415, + "language_loss": 0.70896411, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73126942, + "num_input_tokens_seen": 343976625, + "step": 15942, + "time_per_iteration": 4.055248498916626 + }, + { + "auxiliary_loss_clip": 0.0116429, + "auxiliary_loss_mlp": 0.01101828, + "balance_loss_clip": 1.00192404, + "balance_loss_mlp": 1.00054753, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.5280332190117287, + "language_loss": 0.72203434, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74469554, + "num_input_tokens_seen": 343997790, + "step": 15943, + "time_per_iteration": 2.6817877292633057 + }, + { + "auxiliary_loss_clip": 0.01147587, + "auxiliary_loss_mlp": 0.01101645, + "balance_loss_clip": 1.0018214, + "balance_loss_mlp": 1.000651, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.7701391333409204, + "language_loss": 0.68205523, + "learning_rate": 1.792242006001965e-08, + "loss": 0.70454752, + "num_input_tokens_seen": 344016935, + "step": 15944, + "time_per_iteration": 2.714043140411377 + }, + { + "auxiliary_loss_clip": 0.0116414, + "auxiliary_loss_mlp": 0.01100629, + "balance_loss_clip": 1.00174713, + "balance_loss_mlp": 1.00068426, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 1.6487741209980802, + "language_loss": 0.66053599, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.68318367, + "num_input_tokens_seen": 344035590, + "step": 15945, + "time_per_iteration": 2.588946580886841 + }, + { + "auxiliary_loss_clip": 0.01062181, + "auxiliary_loss_mlp": 0.01073764, + "balance_loss_clip": 1.00073409, + "balance_loss_mlp": 1.00014055, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7358289403319337, + "language_loss": 0.61804217, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63940161, + "num_input_tokens_seen": 344100845, + "step": 15946, + "time_per_iteration": 4.925495147705078 + }, + { + "auxiliary_loss_clip": 0.01163955, + "auxiliary_loss_mlp": 0.01099884, + "balance_loss_clip": 1.0018574, + "balance_loss_mlp": 1.00055909, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 1.8005259497795163, + "language_loss": 0.75274765, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.7753861, + "num_input_tokens_seen": 344121780, + "step": 15947, + "time_per_iteration": 2.622252941131592 + }, + { + "auxiliary_loss_clip": 0.01132813, + "auxiliary_loss_mlp": 0.01100352, + "balance_loss_clip": 1.00173616, + "balance_loss_mlp": 1.00050223, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.331349381229642, + "language_loss": 0.7062366, + "learning_rate": 1.771493294473747e-08, + "loss": 0.72856826, + "num_input_tokens_seen": 344140150, + "step": 15948, + "time_per_iteration": 2.6326465606689453 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01100587, + "balance_loss_clip": 1.00176322, + "balance_loss_mlp": 1.00049853, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 3.2595737407159664, + "language_loss": 0.78514934, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80713826, + "num_input_tokens_seen": 344158200, + "step": 15949, + "time_per_iteration": 2.735887050628662 + }, + { + "auxiliary_loss_clip": 0.01164281, + "auxiliary_loss_mlp": 0.01101255, + "balance_loss_clip": 1.00191879, + "balance_loss_mlp": 1.00054729, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 1.8785073257863123, + "language_loss": 0.68706304, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70971847, + "num_input_tokens_seen": 344174720, + "step": 15950, + "time_per_iteration": 2.5468406677246094 + }, + { + "auxiliary_loss_clip": 0.01132989, + "auxiliary_loss_mlp": 0.01099896, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00047588, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 3.411163485354017, + "language_loss": 0.86109734, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88342619, + "num_input_tokens_seen": 344192580, + "step": 15951, + "time_per_iteration": 2.6625635623931885 + }, + { + "auxiliary_loss_clip": 0.01133549, + "auxiliary_loss_mlp": 0.01101388, + "balance_loss_clip": 1.00184429, + "balance_loss_mlp": 1.00058484, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.6252411105620985, + "language_loss": 0.80230761, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.82465702, + "num_input_tokens_seen": 344210345, + "step": 15952, + "time_per_iteration": 2.637014865875244 + }, + { + "auxiliary_loss_clip": 0.01147378, + "auxiliary_loss_mlp": 0.01100936, + "balance_loss_clip": 1.00189018, + "balance_loss_mlp": 1.00041842, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 1.6752935938423517, + "language_loss": 0.6973328, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.71981597, + "num_input_tokens_seen": 344229540, + "step": 15953, + "time_per_iteration": 2.6026241779327393 + }, + { + "auxiliary_loss_clip": 0.01082272, + "auxiliary_loss_mlp": 0.01099832, + "balance_loss_clip": 1.00160587, + "balance_loss_mlp": 1.00050735, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 3.2152597635719697, + "language_loss": 0.58080101, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.60262209, + "num_input_tokens_seen": 344247830, + "step": 15954, + "time_per_iteration": 2.7259604930877686 + }, + { + "auxiliary_loss_clip": 0.01147607, + "auxiliary_loss_mlp": 0.01101248, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00054026, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 2.1251217087674066, + "language_loss": 0.73829997, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.7607885, + "num_input_tokens_seen": 344267760, + "step": 15955, + "time_per_iteration": 2.6524691581726074 + }, + { + "auxiliary_loss_clip": 0.01130729, + "auxiliary_loss_mlp": 0.01100489, + "balance_loss_clip": 1.00176215, + "balance_loss_mlp": 1.00044847, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 1.806083852339068, + "language_loss": 0.62619317, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.64850527, + "num_input_tokens_seen": 344284905, + "step": 15956, + "time_per_iteration": 2.6398773193359375 + }, + { + "auxiliary_loss_clip": 0.01114468, + "auxiliary_loss_mlp": 0.01100952, + "balance_loss_clip": 1.00176716, + "balance_loss_mlp": 1.00053024, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.9031201901711057, + "language_loss": 0.60138595, + "learning_rate": 1.725248447997507e-08, + "loss": 0.6235401, + "num_input_tokens_seen": 344302025, + "step": 15957, + "time_per_iteration": 2.6891982555389404 + }, + { + "auxiliary_loss_clip": 0.0111811, + "auxiliary_loss_mlp": 0.01101142, + "balance_loss_clip": 1.00190639, + "balance_loss_mlp": 1.00043368, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 2.395483558922783, + "language_loss": 0.74322891, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76542145, + "num_input_tokens_seen": 344321935, + "step": 15958, + "time_per_iteration": 2.7877585887908936 + }, + { + "auxiliary_loss_clip": 0.01132483, + "auxiliary_loss_mlp": 0.00747273, + "balance_loss_clip": 1.00173819, + "balance_loss_mlp": 1.00045574, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.648624839095887, + "language_loss": 0.74500144, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76379901, + "num_input_tokens_seen": 344340405, + "step": 15959, + "time_per_iteration": 2.6807336807250977 + }, + { + "auxiliary_loss_clip": 0.01149857, + "auxiliary_loss_mlp": 0.01100461, + "balance_loss_clip": 1.00197387, + "balance_loss_mlp": 1.0005157, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 2.235568237959887, + "language_loss": 0.65335155, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.6758548, + "num_input_tokens_seen": 344359925, + "step": 15960, + "time_per_iteration": 2.6011228561401367 + }, + { + "auxiliary_loss_clip": 0.01164079, + "auxiliary_loss_mlp": 0.0109954, + "balance_loss_clip": 1.00187135, + "balance_loss_mlp": 1.00050151, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 2.0766577329097706, + "language_loss": 0.78076088, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80339706, + "num_input_tokens_seen": 344379100, + "step": 15961, + "time_per_iteration": 4.168184995651245 + }, + { + "auxiliary_loss_clip": 0.01101703, + "auxiliary_loss_mlp": 0.01099924, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00036025, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 2.1955981186088427, + "language_loss": 0.75857502, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78059125, + "num_input_tokens_seen": 344396895, + "step": 15962, + "time_per_iteration": 2.7090160846710205 + }, + { + "auxiliary_loss_clip": 0.01132003, + "auxiliary_loss_mlp": 0.01100889, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00051439, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 1.9065089191834281, + "language_loss": 0.7142204, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.73654932, + "num_input_tokens_seen": 344415115, + "step": 15963, + "time_per_iteration": 4.049335718154907 + }, + { + "auxiliary_loss_clip": 0.01131032, + "auxiliary_loss_mlp": 0.0109913, + "balance_loss_clip": 1.00178194, + "balance_loss_mlp": 1.00042462, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.4783709180767557, + "language_loss": 0.74054909, + "learning_rate": 1.689701268270527e-08, + "loss": 0.7628507, + "num_input_tokens_seen": 344435185, + "step": 15964, + "time_per_iteration": 2.674964189529419 + }, + { + "auxiliary_loss_clip": 0.01110027, + "auxiliary_loss_mlp": 0.01073679, + "balance_loss_clip": 1.00070894, + "balance_loss_mlp": 1.00005519, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.9001240785752144, + "language_loss": 0.57633895, + "learning_rate": 1.684653177987161e-08, + "loss": 0.598176, + "num_input_tokens_seen": 344488950, + "step": 15965, + "time_per_iteration": 3.1938977241516113 + }, + { + "auxiliary_loss_clip": 0.01164031, + "auxiliary_loss_mlp": 0.01100169, + "balance_loss_clip": 1.00179505, + "balance_loss_mlp": 1.0005573, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 1.6708054383352606, + "language_loss": 0.78765172, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.81029367, + "num_input_tokens_seen": 344506740, + "step": 15966, + "time_per_iteration": 2.5948801040649414 + }, + { + "auxiliary_loss_clip": 0.01132756, + "auxiliary_loss_mlp": 0.01100304, + "balance_loss_clip": 1.00169253, + "balance_loss_mlp": 1.00045466, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 1.8744348227343859, + "language_loss": 0.79504061, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81737125, + "num_input_tokens_seen": 344526670, + "step": 15967, + "time_per_iteration": 2.6390764713287354 + }, + { + "auxiliary_loss_clip": 0.01103919, + "auxiliary_loss_mlp": 0.01101158, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00054502, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 1.8683357003087808, + "language_loss": 0.80517399, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82722479, + "num_input_tokens_seen": 344541995, + "step": 15968, + "time_per_iteration": 2.7068657875061035 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01101542, + "balance_loss_clip": 1.00167084, + "balance_loss_mlp": 1.0006907, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.72094171175612, + "language_loss": 0.67323589, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69524705, + "num_input_tokens_seen": 344559980, + "step": 15969, + "time_per_iteration": 2.7395849227905273 + }, + { + "auxiliary_loss_clip": 0.01149596, + "auxiliary_loss_mlp": 0.01100192, + "balance_loss_clip": 1.00195098, + "balance_loss_mlp": 1.00067592, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 3.0269382488175096, + "language_loss": 0.79436398, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81686181, + "num_input_tokens_seen": 344577765, + "step": 15970, + "time_per_iteration": 2.6181693077087402 + }, + { + "auxiliary_loss_clip": 0.01146805, + "auxiliary_loss_mlp": 0.01100025, + "balance_loss_clip": 1.00187838, + "balance_loss_mlp": 1.0005095, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.5395807865270434, + "language_loss": 0.77237391, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79484218, + "num_input_tokens_seen": 344597650, + "step": 15971, + "time_per_iteration": 2.61895751953125 + }, + { + "auxiliary_loss_clip": 0.01132888, + "auxiliary_loss_mlp": 0.01100838, + "balance_loss_clip": 1.00174034, + "balance_loss_mlp": 1.00041604, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 2.417365775458235, + "language_loss": 0.67385626, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69619358, + "num_input_tokens_seen": 344613580, + "step": 15972, + "time_per_iteration": 2.606677293777466 + }, + { + "auxiliary_loss_clip": 0.01146909, + "auxiliary_loss_mlp": 0.00747381, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.0005002, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.2229903122687467, + "language_loss": 0.76286387, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78180677, + "num_input_tokens_seen": 344626910, + "step": 15973, + "time_per_iteration": 2.581333875656128 + }, + { + "auxiliary_loss_clip": 0.0110297, + "auxiliary_loss_mlp": 0.01100505, + "balance_loss_clip": 1.00177181, + "balance_loss_mlp": 1.00051224, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 13.95969672866399, + "language_loss": 0.69172251, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71375728, + "num_input_tokens_seen": 344644330, + "step": 15974, + "time_per_iteration": 2.7059342861175537 + }, + { + "auxiliary_loss_clip": 0.01147951, + "auxiliary_loss_mlp": 0.0109986, + "balance_loss_clip": 1.00177073, + "balance_loss_mlp": 1.00048721, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.7109553255653835, + "language_loss": 0.68017781, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.70265597, + "num_input_tokens_seen": 344663910, + "step": 15975, + "time_per_iteration": 2.5995984077453613 + }, + { + "auxiliary_loss_clip": 0.01164016, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_clip": 1.00191879, + "balance_loss_mlp": 1.00037336, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 2.203165563029249, + "language_loss": 0.55519271, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.57783031, + "num_input_tokens_seen": 344682320, + "step": 15976, + "time_per_iteration": 2.5812387466430664 + }, + { + "auxiliary_loss_clip": 0.01132806, + "auxiliary_loss_mlp": 0.01099274, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00052071, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 14.5750848882493, + "language_loss": 0.6846832, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70700407, + "num_input_tokens_seen": 344701355, + "step": 15977, + "time_per_iteration": 2.6895415782928467 + }, + { + "auxiliary_loss_clip": 0.01149018, + "auxiliary_loss_mlp": 0.01099966, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00064087, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 1.7110847016497581, + "language_loss": 0.81836498, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84085476, + "num_input_tokens_seen": 344717980, + "step": 15978, + "time_per_iteration": 2.6286144256591797 + }, + { + "auxiliary_loss_clip": 0.01149114, + "auxiliary_loss_mlp": 0.01101615, + "balance_loss_clip": 1.00183523, + "balance_loss_mlp": 1.00052524, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.6429507552578446, + "language_loss": 0.83013117, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85263854, + "num_input_tokens_seen": 344733480, + "step": 15979, + "time_per_iteration": 3.923318386077881 + }, + { + "auxiliary_loss_clip": 0.01164351, + "auxiliary_loss_mlp": 0.01101271, + "balance_loss_clip": 1.00192547, + "balance_loss_mlp": 1.00056314, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.4827619000907764, + "language_loss": 0.79926002, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82191634, + "num_input_tokens_seen": 344752130, + "step": 15980, + "time_per_iteration": 2.581691265106201 + }, + { + "auxiliary_loss_clip": 0.01149518, + "auxiliary_loss_mlp": 0.01100814, + "balance_loss_clip": 1.00177813, + "balance_loss_mlp": 1.00048745, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 1.9465855151994227, + "language_loss": 0.6841954, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70669866, + "num_input_tokens_seen": 344771195, + "step": 15981, + "time_per_iteration": 2.659240961074829 + }, + { + "auxiliary_loss_clip": 0.01147494, + "auxiliary_loss_mlp": 0.00747357, + "balance_loss_clip": 1.00174332, + "balance_loss_mlp": 1.00046301, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.620854345669848, + "language_loss": 0.69300634, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71195483, + "num_input_tokens_seen": 344793150, + "step": 15982, + "time_per_iteration": 2.6446897983551025 + }, + { + "auxiliary_loss_clip": 0.01108623, + "auxiliary_loss_mlp": 0.01074914, + "balance_loss_clip": 1.00076067, + "balance_loss_mlp": 1.00052726, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6704653199030124, + "language_loss": 0.53276098, + "learning_rate": 1.595073680563286e-08, + "loss": 0.5545963, + "num_input_tokens_seen": 344852855, + "step": 15983, + "time_per_iteration": 4.790234565734863 + }, + { + "auxiliary_loss_clip": 0.01164109, + "auxiliary_loss_mlp": 0.0110072, + "balance_loss_clip": 1.00183988, + "balance_loss_mlp": 1.0005362, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.248846576816389, + "language_loss": 0.67834306, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.70099139, + "num_input_tokens_seen": 344869830, + "step": 15984, + "time_per_iteration": 2.569547414779663 + }, + { + "auxiliary_loss_clip": 0.01115189, + "auxiliary_loss_mlp": 0.01098976, + "balance_loss_clip": 1.00166297, + "balance_loss_mlp": 1.00060475, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.5441627077724518, + "language_loss": 0.67267334, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.69481504, + "num_input_tokens_seen": 344888905, + "step": 15985, + "time_per_iteration": 2.6849777698516846 + }, + { + "auxiliary_loss_clip": 0.01164168, + "auxiliary_loss_mlp": 0.01100719, + "balance_loss_clip": 1.00196743, + "balance_loss_mlp": 1.00048828, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 1.8163417770753514, + "language_loss": 0.78741539, + "learning_rate": 1.580380726142283e-08, + "loss": 0.81006432, + "num_input_tokens_seen": 344907160, + "step": 15986, + "time_per_iteration": 2.5304172039031982 + }, + { + "auxiliary_loss_clip": 0.01098652, + "auxiliary_loss_mlp": 0.01100394, + "balance_loss_clip": 1.00188124, + "balance_loss_mlp": 1.00044918, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 1.962164999502799, + "language_loss": 0.64126968, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.66326022, + "num_input_tokens_seen": 344922400, + "step": 15987, + "time_per_iteration": 2.6829030513763428 + }, + { + "auxiliary_loss_clip": 0.01163904, + "auxiliary_loss_mlp": 0.01099741, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00055873, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 1.7729343990761872, + "language_loss": 0.6695863, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.69222271, + "num_input_tokens_seen": 344941910, + "step": 15988, + "time_per_iteration": 2.6256566047668457 + }, + { + "auxiliary_loss_clip": 0.01147508, + "auxiliary_loss_mlp": 0.01099886, + "balance_loss_clip": 1.00182664, + "balance_loss_mlp": 1.00060833, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 1.8668732318752244, + "language_loss": 0.75117218, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.77364612, + "num_input_tokens_seen": 344960020, + "step": 15989, + "time_per_iteration": 2.6719539165496826 + }, + { + "auxiliary_loss_clip": 0.01127738, + "auxiliary_loss_mlp": 0.01073928, + "balance_loss_clip": 1.00086188, + "balance_loss_mlp": 1.0003047, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 2.035684806694649, + "language_loss": 0.63095212, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65296882, + "num_input_tokens_seen": 345018290, + "step": 15990, + "time_per_iteration": 3.1337978839874268 + }, + { + "auxiliary_loss_clip": 0.01147427, + "auxiliary_loss_mlp": 0.0110064, + "balance_loss_clip": 1.00177526, + "balance_loss_mlp": 1.00050473, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 2.3137554775327183, + "language_loss": 0.77720225, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.79968297, + "num_input_tokens_seen": 345040235, + "step": 15991, + "time_per_iteration": 2.6082077026367188 + }, + { + "auxiliary_loss_clip": 0.01164247, + "auxiliary_loss_mlp": 0.01101977, + "balance_loss_clip": 1.0018537, + "balance_loss_mlp": 1.00060117, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.966224935783896, + "language_loss": 0.84704846, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.86971068, + "num_input_tokens_seen": 345054540, + "step": 15992, + "time_per_iteration": 2.5635986328125 + }, + { + "auxiliary_loss_clip": 0.0113199, + "auxiliary_loss_mlp": 0.01100576, + "balance_loss_clip": 1.00158668, + "balance_loss_mlp": 1.00053608, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 1.8565778928131655, + "language_loss": 0.72345805, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74578369, + "num_input_tokens_seen": 345074035, + "step": 15993, + "time_per_iteration": 2.6465559005737305 + }, + { + "auxiliary_loss_clip": 0.0111571, + "auxiliary_loss_mlp": 0.01101157, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00040126, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.6299592106124763, + "language_loss": 0.68380475, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70597339, + "num_input_tokens_seen": 345099270, + "step": 15994, + "time_per_iteration": 2.7725107669830322 + }, + { + "auxiliary_loss_clip": 0.01117623, + "auxiliary_loss_mlp": 0.01100083, + "balance_loss_clip": 1.00192976, + "balance_loss_mlp": 1.00051928, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 5.5690841642285065, + "language_loss": 0.84506381, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86724091, + "num_input_tokens_seen": 345116975, + "step": 15995, + "time_per_iteration": 2.7141520977020264 + }, + { + "auxiliary_loss_clip": 0.01148775, + "auxiliary_loss_mlp": 0.01101025, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.00050759, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.9257670987282984, + "language_loss": 0.75906086, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78155887, + "num_input_tokens_seen": 345133645, + "step": 15996, + "time_per_iteration": 2.612973928451538 + }, + { + "auxiliary_loss_clip": 0.01130716, + "auxiliary_loss_mlp": 0.01101014, + "balance_loss_clip": 1.00167429, + "balance_loss_mlp": 1.00040185, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 3.637492737326111, + "language_loss": 0.76628661, + "learning_rate": 1.52708595287494e-08, + "loss": 0.7886039, + "num_input_tokens_seen": 345150740, + "step": 15997, + "time_per_iteration": 2.6002731323242188 + }, + { + "auxiliary_loss_clip": 0.01164074, + "auxiliary_loss_mlp": 0.00747187, + "balance_loss_clip": 1.00189352, + "balance_loss_mlp": 1.00046885, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.526463644866256, + "language_loss": 0.67197895, + "learning_rate": 1.522286126505001e-08, + "loss": 0.6910916, + "num_input_tokens_seen": 345170365, + "step": 15998, + "time_per_iteration": 2.6045641899108887 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01100073, + "balance_loss_clip": 1.00177658, + "balance_loss_mlp": 1.00041437, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.5987679363581457, + "language_loss": 0.72701252, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.74935973, + "num_input_tokens_seen": 345188930, + "step": 15999, + "time_per_iteration": 4.1860692501068115 + }, + { + "auxiliary_loss_clip": 0.01132149, + "auxiliary_loss_mlp": 0.01098946, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00043106, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 2.0017240559389275, + "language_loss": 0.65419698, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67650789, + "num_input_tokens_seen": 345209615, + "step": 16000, + "time_per_iteration": 4.188874244689941 + }, + { + "auxiliary_loss_clip": 0.0111729, + "auxiliary_loss_mlp": 0.01099825, + "balance_loss_clip": 1.00167704, + "balance_loss_mlp": 1.00040448, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 1.9357534445926896, + "language_loss": 0.75519025, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.77736145, + "num_input_tokens_seen": 345229175, + "step": 16001, + "time_per_iteration": 2.644284725189209 + }, + { + "auxiliary_loss_clip": 0.01147276, + "auxiliary_loss_mlp": 0.0109974, + "balance_loss_clip": 1.00174761, + "balance_loss_mlp": 1.00050986, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 2.1021215022489903, + "language_loss": 0.68244839, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70491862, + "num_input_tokens_seen": 345247815, + "step": 16002, + "time_per_iteration": 2.5628182888031006 + }, + { + "auxiliary_loss_clip": 0.01147444, + "auxiliary_loss_mlp": 0.01100262, + "balance_loss_clip": 1.00188684, + "balance_loss_mlp": 1.00041223, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.2400207899134137, + "language_loss": 0.64794672, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.67042375, + "num_input_tokens_seen": 345269935, + "step": 16003, + "time_per_iteration": 2.6830341815948486 + }, + { + "auxiliary_loss_clip": 0.01097452, + "auxiliary_loss_mlp": 0.01100471, + "balance_loss_clip": 1.00181353, + "balance_loss_mlp": 1.00057316, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.7114236503338502, + "language_loss": 0.7546137, + "learning_rate": 1.493645226826512e-08, + "loss": 0.77659297, + "num_input_tokens_seen": 345288310, + "step": 16004, + "time_per_iteration": 2.67328143119812 + }, + { + "auxiliary_loss_clip": 0.01147559, + "auxiliary_loss_mlp": 0.01100349, + "balance_loss_clip": 1.00185108, + "balance_loss_mlp": 1.00054729, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 2.1453950773196238, + "language_loss": 0.79338777, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81586683, + "num_input_tokens_seen": 345306615, + "step": 16005, + "time_per_iteration": 2.5419232845306396 + }, + { + "auxiliary_loss_clip": 0.01147456, + "auxiliary_loss_mlp": 0.01100138, + "balance_loss_clip": 1.00186634, + "balance_loss_mlp": 1.00052679, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 2.178978239566372, + "language_loss": 0.6788103, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.7012862, + "num_input_tokens_seen": 345331935, + "step": 16006, + "time_per_iteration": 2.9202489852905273 + }, + { + "auxiliary_loss_clip": 0.01133015, + "auxiliary_loss_mlp": 0.0109867, + "balance_loss_clip": 1.00186789, + "balance_loss_mlp": 1.00063264, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.928402522548769, + "language_loss": 0.78356671, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80588353, + "num_input_tokens_seen": 345351510, + "step": 16007, + "time_per_iteration": 2.636484384536743 + }, + { + "auxiliary_loss_clip": 0.01164307, + "auxiliary_loss_mlp": 0.01101718, + "balance_loss_clip": 1.00196981, + "balance_loss_mlp": 1.00062871, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 1.985113554129135, + "language_loss": 0.677544, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.70020425, + "num_input_tokens_seen": 345367750, + "step": 16008, + "time_per_iteration": 2.549365997314453 + }, + { + "auxiliary_loss_clip": 0.01132278, + "auxiliary_loss_mlp": 0.01100883, + "balance_loss_clip": 1.00175846, + "balance_loss_mlp": 1.00050902, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.3035513699783703, + "language_loss": 0.73224866, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75458026, + "num_input_tokens_seen": 345384790, + "step": 16009, + "time_per_iteration": 2.6232564449310303 + }, + { + "auxiliary_loss_clip": 0.01147279, + "auxiliary_loss_mlp": 0.01099639, + "balance_loss_clip": 1.00179911, + "balance_loss_mlp": 1.00040901, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 3.3585737165704663, + "language_loss": 0.75800151, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.78047073, + "num_input_tokens_seen": 345403390, + "step": 16010, + "time_per_iteration": 2.5634403228759766 + }, + { + "auxiliary_loss_clip": 0.01147752, + "auxiliary_loss_mlp": 0.01101923, + "balance_loss_clip": 1.00196457, + "balance_loss_mlp": 1.00054729, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.938528718937776, + "language_loss": 0.69697702, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71947384, + "num_input_tokens_seen": 345418685, + "step": 16011, + "time_per_iteration": 2.6858255863189697 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01100413, + "balance_loss_clip": 1.00183368, + "balance_loss_mlp": 1.00056303, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.759995028491408, + "language_loss": 0.68679082, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70926881, + "num_input_tokens_seen": 345442380, + "step": 16012, + "time_per_iteration": 2.8418710231781006 + }, + { + "auxiliary_loss_clip": 0.01135036, + "auxiliary_loss_mlp": 0.0110217, + "balance_loss_clip": 1.0018816, + "balance_loss_mlp": 1.00060344, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 2.7260642010888865, + "language_loss": 0.72055036, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74292248, + "num_input_tokens_seen": 345463815, + "step": 16013, + "time_per_iteration": 2.705723524093628 + }, + { + "auxiliary_loss_clip": 0.01131916, + "auxiliary_loss_mlp": 0.01100783, + "balance_loss_clip": 1.00189924, + "balance_loss_mlp": 1.00045633, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.8832578462380662, + "language_loss": 0.63366199, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65598899, + "num_input_tokens_seen": 345484525, + "step": 16014, + "time_per_iteration": 2.7709782123565674 + }, + { + "auxiliary_loss_clip": 0.01130677, + "auxiliary_loss_mlp": 0.01098864, + "balance_loss_clip": 1.00178301, + "balance_loss_mlp": 1.00049257, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 2.0923445838345973, + "language_loss": 0.71788567, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.74018109, + "num_input_tokens_seen": 345508295, + "step": 16015, + "time_per_iteration": 2.8337290287017822 + }, + { + "auxiliary_loss_clip": 0.01118223, + "auxiliary_loss_mlp": 0.01100032, + "balance_loss_clip": 1.00185478, + "balance_loss_mlp": 1.00056398, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 1.7472389844216656, + "language_loss": 0.76998681, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79216933, + "num_input_tokens_seen": 345525155, + "step": 16016, + "time_per_iteration": 2.633805274963379 + }, + { + "auxiliary_loss_clip": 0.01157945, + "auxiliary_loss_mlp": 0.0107367, + "balance_loss_clip": 1.00073671, + "balance_loss_mlp": 1.00004625, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.8041308630375387, + "language_loss": 0.6308372, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.6531533, + "num_input_tokens_seen": 345578905, + "step": 16017, + "time_per_iteration": 4.424330949783325 + }, + { + "auxiliary_loss_clip": 0.01134335, + "auxiliary_loss_mlp": 0.01100639, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.00050354, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 2.1596646441665786, + "language_loss": 0.664244, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68659377, + "num_input_tokens_seen": 345598965, + "step": 16018, + "time_per_iteration": 2.7271249294281006 + }, + { + "auxiliary_loss_clip": 0.01098738, + "auxiliary_loss_mlp": 0.01100147, + "balance_loss_clip": 1.00178385, + "balance_loss_mlp": 1.00063062, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 2.169638875965086, + "language_loss": 0.79704845, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81903732, + "num_input_tokens_seen": 345617945, + "step": 16019, + "time_per_iteration": 2.668349504470825 + }, + { + "auxiliary_loss_clip": 0.01117726, + "auxiliary_loss_mlp": 0.01099563, + "balance_loss_clip": 1.00167727, + "balance_loss_mlp": 1.0004288, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.4973420661725645, + "language_loss": 0.71606594, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.73823887, + "num_input_tokens_seen": 345637920, + "step": 16020, + "time_per_iteration": 2.7436153888702393 + }, + { + "auxiliary_loss_clip": 0.01130662, + "auxiliary_loss_mlp": 0.01100692, + "balance_loss_clip": 1.0017432, + "balance_loss_mlp": 1.0004611, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 2.358034600308401, + "language_loss": 0.77046627, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.7927798, + "num_input_tokens_seen": 345656195, + "step": 16021, + "time_per_iteration": 4.0738372802734375 + }, + { + "auxiliary_loss_clip": 0.01116502, + "auxiliary_loss_mlp": 0.01102361, + "balance_loss_clip": 1.0016551, + "balance_loss_mlp": 1.00050902, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 2.2323038601052465, + "language_loss": 0.64751238, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.66970104, + "num_input_tokens_seen": 345676700, + "step": 16022, + "time_per_iteration": 2.673870325088501 + }, + { + "auxiliary_loss_clip": 0.01134808, + "auxiliary_loss_mlp": 0.01099585, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.00059354, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 1.9707345375603458, + "language_loss": 0.73030961, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75265354, + "num_input_tokens_seen": 345696725, + "step": 16023, + "time_per_iteration": 2.707953453063965 + }, + { + "auxiliary_loss_clip": 0.01149006, + "auxiliary_loss_mlp": 0.01099749, + "balance_loss_clip": 1.00180769, + "balance_loss_mlp": 1.00037622, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.6930950550893715, + "language_loss": 0.81496233, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83744985, + "num_input_tokens_seen": 345716245, + "step": 16024, + "time_per_iteration": 2.626633405685425 + }, + { + "auxiliary_loss_clip": 0.01147815, + "auxiliary_loss_mlp": 0.01100598, + "balance_loss_clip": 1.00180173, + "balance_loss_mlp": 1.00060523, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 2.645352817518345, + "language_loss": 0.81414425, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83662838, + "num_input_tokens_seen": 345739060, + "step": 16025, + "time_per_iteration": 2.6469759941101074 + }, + { + "auxiliary_loss_clip": 0.01149655, + "auxiliary_loss_mlp": 0.01100888, + "balance_loss_clip": 1.00183725, + "balance_loss_mlp": 1.00046575, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 1.8202996418722877, + "language_loss": 0.76410997, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78661537, + "num_input_tokens_seen": 345758325, + "step": 16026, + "time_per_iteration": 2.592216730117798 + }, + { + "auxiliary_loss_clip": 0.01117921, + "auxiliary_loss_mlp": 0.00747384, + "balance_loss_clip": 1.00172901, + "balance_loss_mlp": 1.00046611, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 1.8104456211274749, + "language_loss": 0.63343185, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65208495, + "num_input_tokens_seen": 345778530, + "step": 16027, + "time_per_iteration": 2.729369640350342 + }, + { + "auxiliary_loss_clip": 0.01149645, + "auxiliary_loss_mlp": 0.01101317, + "balance_loss_clip": 1.00197768, + "balance_loss_mlp": 1.00056183, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 1.8459581750716865, + "language_loss": 0.87267137, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89518106, + "num_input_tokens_seen": 345796535, + "step": 16028, + "time_per_iteration": 2.5424253940582275 + }, + { + "auxiliary_loss_clip": 0.01095045, + "auxiliary_loss_mlp": 0.01075746, + "balance_loss_clip": 1.0009743, + "balance_loss_mlp": 1.000597, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.6813757613532474, + "language_loss": 0.53186631, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.5535742, + "num_input_tokens_seen": 345859700, + "step": 16029, + "time_per_iteration": 3.247281789779663 + }, + { + "auxiliary_loss_clip": 0.0116416, + "auxiliary_loss_mlp": 0.01100965, + "balance_loss_clip": 1.00183046, + "balance_loss_mlp": 1.00035238, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 1.9480387705408184, + "language_loss": 0.73684311, + "learning_rate": 1.372666546129797e-08, + "loss": 0.75949436, + "num_input_tokens_seen": 345878760, + "step": 16030, + "time_per_iteration": 2.5421082973480225 + }, + { + "auxiliary_loss_clip": 0.01133962, + "auxiliary_loss_mlp": 0.01099908, + "balance_loss_clip": 1.00198865, + "balance_loss_mlp": 1.00053465, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 1.8947781503872847, + "language_loss": 0.66378999, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68612862, + "num_input_tokens_seen": 345900445, + "step": 16031, + "time_per_iteration": 2.672046422958374 + }, + { + "auxiliary_loss_clip": 0.01141314, + "auxiliary_loss_mlp": 0.00745406, + "balance_loss_clip": 1.00069249, + "balance_loss_mlp": 1.00023425, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8379404833117403, + "language_loss": 0.6071474, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62601459, + "num_input_tokens_seen": 345961020, + "step": 16032, + "time_per_iteration": 3.178758382797241 + }, + { + "auxiliary_loss_clip": 0.01147211, + "auxiliary_loss_mlp": 0.01099463, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00056732, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 1.7878988536963303, + "language_loss": 0.6633215, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68578821, + "num_input_tokens_seen": 345980210, + "step": 16033, + "time_per_iteration": 2.605977773666382 + }, + { + "auxiliary_loss_clip": 0.01084443, + "auxiliary_loss_mlp": 0.01100001, + "balance_loss_clip": 1.00176787, + "balance_loss_mlp": 1.00062847, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 6.194199314810491, + "language_loss": 0.6572367, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.6790812, + "num_input_tokens_seen": 345998280, + "step": 16034, + "time_per_iteration": 2.775313377380371 + }, + { + "auxiliary_loss_clip": 0.01118651, + "auxiliary_loss_mlp": 0.01100475, + "balance_loss_clip": 1.00183749, + "balance_loss_mlp": 1.00048208, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 3.4713657386929317, + "language_loss": 0.7451973, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.76738846, + "num_input_tokens_seen": 346015545, + "step": 16035, + "time_per_iteration": 2.664883613586426 + }, + { + "auxiliary_loss_clip": 0.01164443, + "auxiliary_loss_mlp": 0.01100342, + "balance_loss_clip": 1.00218797, + "balance_loss_mlp": 1.00049257, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 2.311997501694368, + "language_loss": 0.82269645, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84534431, + "num_input_tokens_seen": 346034055, + "step": 16036, + "time_per_iteration": 2.5853211879730225 + }, + { + "auxiliary_loss_clip": 0.01131563, + "auxiliary_loss_mlp": 0.01100664, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00052774, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 1.9509735864936482, + "language_loss": 0.69860041, + "learning_rate": 1.340965177371789e-08, + "loss": 0.72092271, + "num_input_tokens_seen": 346054130, + "step": 16037, + "time_per_iteration": 4.173454523086548 + }, + { + "auxiliary_loss_clip": 0.0116411, + "auxiliary_loss_mlp": 0.0110045, + "balance_loss_clip": 1.0018307, + "balance_loss_mlp": 1.00045705, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.8009106711897052, + "language_loss": 0.6276167, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65026224, + "num_input_tokens_seen": 346072990, + "step": 16038, + "time_per_iteration": 4.068692684173584 + }, + { + "auxiliary_loss_clip": 0.01118177, + "auxiliary_loss_mlp": 0.00747453, + "balance_loss_clip": 1.00175655, + "balance_loss_mlp": 1.00061822, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 1.7887452843976426, + "language_loss": 0.70721722, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.72587359, + "num_input_tokens_seen": 346093745, + "step": 16039, + "time_per_iteration": 2.7070906162261963 + }, + { + "auxiliary_loss_clip": 0.01099818, + "auxiliary_loss_mlp": 0.01100134, + "balance_loss_clip": 1.00170922, + "balance_loss_mlp": 1.00042748, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 2.1792712193195927, + "language_loss": 0.73038244, + "learning_rate": 1.327491870605657e-08, + "loss": 0.75238192, + "num_input_tokens_seen": 346110115, + "step": 16040, + "time_per_iteration": 2.740125894546509 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.0110127, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00051463, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 2.3847089096667484, + "language_loss": 0.73218304, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75468755, + "num_input_tokens_seen": 346127165, + "step": 16041, + "time_per_iteration": 2.5833215713500977 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01099519, + "balance_loss_clip": 1.00188291, + "balance_loss_mlp": 1.0004797, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 2.443420846248901, + "language_loss": 0.71938884, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.74171257, + "num_input_tokens_seen": 346145950, + "step": 16042, + "time_per_iteration": 2.7033233642578125 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.01100712, + "balance_loss_clip": 1.00174248, + "balance_loss_mlp": 1.00052822, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.8594175573868468, + "language_loss": 0.81467998, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83687109, + "num_input_tokens_seen": 346165005, + "step": 16043, + "time_per_iteration": 2.7478997707366943 + }, + { + "auxiliary_loss_clip": 0.01133085, + "auxiliary_loss_mlp": 0.01100205, + "balance_loss_clip": 1.00175476, + "balance_loss_mlp": 1.00059414, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.6156056087338917, + "language_loss": 0.71775603, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.74008894, + "num_input_tokens_seen": 346185095, + "step": 16044, + "time_per_iteration": 2.667957067489624 + }, + { + "auxiliary_loss_clip": 0.01132978, + "auxiliary_loss_mlp": 0.01099183, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 1.00062037, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 2.0217649657518875, + "language_loss": 0.70294774, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.72526938, + "num_input_tokens_seen": 346202580, + "step": 16045, + "time_per_iteration": 2.6063053607940674 + }, + { + "auxiliary_loss_clip": 0.0108564, + "auxiliary_loss_mlp": 0.01100597, + "balance_loss_clip": 1.00167775, + "balance_loss_mlp": 1.00046086, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 2.3862904495464123, + "language_loss": 0.74961489, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77147722, + "num_input_tokens_seen": 346219395, + "step": 16046, + "time_per_iteration": 2.7283310890197754 + }, + { + "auxiliary_loss_clip": 0.01148704, + "auxiliary_loss_mlp": 0.01100602, + "balance_loss_clip": 1.00183821, + "balance_loss_mlp": 1.00065708, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.5023559343238135, + "language_loss": 0.62545526, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64794832, + "num_input_tokens_seen": 346239715, + "step": 16047, + "time_per_iteration": 2.5926315784454346 + }, + { + "auxiliary_loss_clip": 0.01131141, + "auxiliary_loss_mlp": 0.01099972, + "balance_loss_clip": 1.00181115, + "balance_loss_mlp": 1.00050354, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.845985619865476, + "language_loss": 0.69136691, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.713678, + "num_input_tokens_seen": 346258500, + "step": 16048, + "time_per_iteration": 2.6553397178649902 + }, + { + "auxiliary_loss_clip": 0.01147461, + "auxiliary_loss_mlp": 0.01100721, + "balance_loss_clip": 1.00179839, + "balance_loss_mlp": 1.00049043, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 2.1674154631669276, + "language_loss": 0.63930655, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66178834, + "num_input_tokens_seen": 346279110, + "step": 16049, + "time_per_iteration": 2.6588387489318848 + }, + { + "auxiliary_loss_clip": 0.01147846, + "auxiliary_loss_mlp": 0.01100183, + "balance_loss_clip": 1.00197482, + "balance_loss_mlp": 1.00057149, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 1.8075583476084278, + "language_loss": 0.7114234, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73390365, + "num_input_tokens_seen": 346297860, + "step": 16050, + "time_per_iteration": 2.628551959991455 + }, + { + "auxiliary_loss_clip": 0.01149729, + "auxiliary_loss_mlp": 0.01101849, + "balance_loss_clip": 1.00188017, + "balance_loss_mlp": 1.00061607, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 1.9532692705312495, + "language_loss": 0.69856977, + "learning_rate": 1.278669873970606e-08, + "loss": 0.72108555, + "num_input_tokens_seen": 346319860, + "step": 16051, + "time_per_iteration": 2.811598062515259 + }, + { + "auxiliary_loss_clip": 0.01141296, + "auxiliary_loss_mlp": 0.01073786, + "balance_loss_clip": 1.00075114, + "balance_loss_mlp": 1.00016236, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.9075895705381344, + "language_loss": 0.59148657, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61363739, + "num_input_tokens_seen": 346379025, + "step": 16052, + "time_per_iteration": 3.228546142578125 + }, + { + "auxiliary_loss_clip": 0.01164047, + "auxiliary_loss_mlp": 0.0110056, + "balance_loss_clip": 1.00184059, + "balance_loss_mlp": 1.00028157, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 2.3977140150814655, + "language_loss": 0.74404204, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76668811, + "num_input_tokens_seen": 346402250, + "step": 16053, + "time_per_iteration": 2.645766496658325 + }, + { + "auxiliary_loss_clip": 0.01134326, + "auxiliary_loss_mlp": 0.0110058, + "balance_loss_clip": 1.00188279, + "balance_loss_mlp": 1.00058675, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 2.243491723758107, + "language_loss": 0.68671596, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70906496, + "num_input_tokens_seen": 346419555, + "step": 16054, + "time_per_iteration": 3.9623618125915527 + }, + { + "auxiliary_loss_clip": 0.01147882, + "auxiliary_loss_mlp": 0.0074734, + "balance_loss_clip": 1.00198436, + "balance_loss_mlp": 1.00048423, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.374719706212404, + "language_loss": 0.61938536, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.63833761, + "num_input_tokens_seen": 346441245, + "step": 16055, + "time_per_iteration": 2.6996026039123535 + }, + { + "auxiliary_loss_clip": 0.01115965, + "auxiliary_loss_mlp": 0.01099962, + "balance_loss_clip": 1.00182223, + "balance_loss_mlp": 1.00058877, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.8731941658402056, + "language_loss": 0.77142298, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.79358232, + "num_input_tokens_seen": 346460065, + "step": 16056, + "time_per_iteration": 2.721982479095459 + }, + { + "auxiliary_loss_clip": 0.01132688, + "auxiliary_loss_mlp": 0.01100341, + "balance_loss_clip": 1.00180745, + "balance_loss_mlp": 1.00049174, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.5523970610251656, + "language_loss": 0.7142908, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73662114, + "num_input_tokens_seen": 346478005, + "step": 16057, + "time_per_iteration": 2.735933303833008 + }, + { + "auxiliary_loss_clip": 0.01164187, + "auxiliary_loss_mlp": 0.01100544, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00050354, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 1.9119437532591916, + "language_loss": 0.72009027, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.74273759, + "num_input_tokens_seen": 346497575, + "step": 16058, + "time_per_iteration": 2.5642125606536865 + }, + { + "auxiliary_loss_clip": 0.011474, + "auxiliary_loss_mlp": 0.01100541, + "balance_loss_clip": 1.00176573, + "balance_loss_mlp": 1.00064361, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.5300910491262205, + "language_loss": 0.74142677, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76390612, + "num_input_tokens_seen": 346520000, + "step": 16059, + "time_per_iteration": 4.106688022613525 + }, + { + "auxiliary_loss_clip": 0.01132563, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_clip": 1.00179005, + "balance_loss_mlp": 1.00054729, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 2.0305858216201442, + "language_loss": 0.73934162, + "learning_rate": 1.239402791721722e-08, + "loss": 0.76167506, + "num_input_tokens_seen": 346541605, + "step": 16060, + "time_per_iteration": 2.809063673019409 + }, + { + "auxiliary_loss_clip": 0.0113164, + "auxiliary_loss_mlp": 0.01099289, + "balance_loss_clip": 1.00177419, + "balance_loss_mlp": 1.00039268, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 1.8350513661309096, + "language_loss": 0.76499391, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.78730315, + "num_input_tokens_seen": 346560955, + "step": 16061, + "time_per_iteration": 2.686046838760376 + }, + { + "auxiliary_loss_clip": 0.01126506, + "auxiliary_loss_mlp": 0.01074053, + "balance_loss_clip": 1.0008235, + "balance_loss_mlp": 1.00004816, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.726006268206466, + "language_loss": 0.6418227, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66382825, + "num_input_tokens_seen": 346621615, + "step": 16062, + "time_per_iteration": 3.21179461479187 + }, + { + "auxiliary_loss_clip": 0.01101056, + "auxiliary_loss_mlp": 0.01099486, + "balance_loss_clip": 1.00177884, + "balance_loss_mlp": 1.00049484, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.4998606400346772, + "language_loss": 0.93073869, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95274413, + "num_input_tokens_seen": 346637460, + "step": 16063, + "time_per_iteration": 2.7408785820007324 + }, + { + "auxiliary_loss_clip": 0.0114773, + "auxiliary_loss_mlp": 0.01100398, + "balance_loss_clip": 1.00190616, + "balance_loss_mlp": 1.00064373, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 2.0959924684737405, + "language_loss": 0.82111597, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84359723, + "num_input_tokens_seen": 346655625, + "step": 16064, + "time_per_iteration": 2.561217784881592 + }, + { + "auxiliary_loss_clip": 0.01149506, + "auxiliary_loss_mlp": 0.00747195, + "balance_loss_clip": 1.00213766, + "balance_loss_mlp": 1.00049281, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 2.1635141473191637, + "language_loss": 0.8422029, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86116982, + "num_input_tokens_seen": 346675220, + "step": 16065, + "time_per_iteration": 2.637383222579956 + }, + { + "auxiliary_loss_clip": 0.01130677, + "auxiliary_loss_mlp": 0.01100245, + "balance_loss_clip": 1.00172913, + "balance_loss_mlp": 1.00053871, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.9968044870202009, + "language_loss": 0.67451715, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.6968264, + "num_input_tokens_seen": 346694710, + "step": 16066, + "time_per_iteration": 2.622664213180542 + }, + { + "auxiliary_loss_clip": 0.01164076, + "auxiliary_loss_mlp": 0.01099921, + "balance_loss_clip": 1.00189328, + "balance_loss_mlp": 1.0005486, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 3.0735615429152934, + "language_loss": 0.82348967, + "learning_rate": 1.209283794752558e-08, + "loss": 0.8461296, + "num_input_tokens_seen": 346712645, + "step": 16067, + "time_per_iteration": 2.6114351749420166 + }, + { + "auxiliary_loss_clip": 0.0113208, + "auxiliary_loss_mlp": 0.0109974, + "balance_loss_clip": 1.00171256, + "balance_loss_mlp": 1.00055778, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 2.4850109658296224, + "language_loss": 0.68902171, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71133995, + "num_input_tokens_seen": 346732375, + "step": 16068, + "time_per_iteration": 2.6407270431518555 + }, + { + "auxiliary_loss_clip": 0.01149353, + "auxiliary_loss_mlp": 0.01098012, + "balance_loss_clip": 1.00187159, + "balance_loss_mlp": 1.00059414, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.769404098774762, + "language_loss": 0.67830658, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70078021, + "num_input_tokens_seen": 346750430, + "step": 16069, + "time_per_iteration": 2.615119457244873 + }, + { + "auxiliary_loss_clip": 0.01131396, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_clip": 1.00188899, + "balance_loss_mlp": 1.00048852, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 2.292259852925097, + "language_loss": 0.89105487, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91336882, + "num_input_tokens_seen": 346768455, + "step": 16070, + "time_per_iteration": 2.588043451309204 + }, + { + "auxiliary_loss_clip": 0.01164322, + "auxiliary_loss_mlp": 0.01100507, + "balance_loss_clip": 1.00200403, + "balance_loss_mlp": 1.00056195, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 1.820112542303442, + "language_loss": 0.77344263, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.79609096, + "num_input_tokens_seen": 346786530, + "step": 16071, + "time_per_iteration": 2.578125476837158 + }, + { + "auxiliary_loss_clip": 0.01134109, + "auxiliary_loss_mlp": 0.01100058, + "balance_loss_clip": 1.00183296, + "balance_loss_mlp": 1.00054216, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.8989749359949002, + "language_loss": 0.65969384, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.68203545, + "num_input_tokens_seen": 346804635, + "step": 16072, + "time_per_iteration": 2.5737645626068115 + }, + { + "auxiliary_loss_clip": 0.01149662, + "auxiliary_loss_mlp": 0.01100022, + "balance_loss_clip": 1.00192642, + "balance_loss_mlp": 1.00045788, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.6551276638187133, + "language_loss": 0.77353895, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79603577, + "num_input_tokens_seen": 346823070, + "step": 16073, + "time_per_iteration": 2.5958645343780518 + }, + { + "auxiliary_loss_clip": 0.01164269, + "auxiliary_loss_mlp": 0.01101119, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00050688, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 2.143626510096389, + "language_loss": 0.7612527, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.78390658, + "num_input_tokens_seen": 346841180, + "step": 16074, + "time_per_iteration": 2.493107557296753 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01100636, + "balance_loss_clip": 1.00181782, + "balance_loss_mlp": 1.00049996, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.530678373989922, + "language_loss": 0.75857377, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.78091216, + "num_input_tokens_seen": 346864250, + "step": 16075, + "time_per_iteration": 4.13752555847168 + }, + { + "auxiliary_loss_clip": 0.01116022, + "auxiliary_loss_mlp": 0.01100278, + "balance_loss_clip": 1.00186276, + "balance_loss_mlp": 1.00052381, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 2.06368179078072, + "language_loss": 0.79410994, + "learning_rate": 1.171102125547696e-08, + "loss": 0.81627297, + "num_input_tokens_seen": 346881955, + "step": 16076, + "time_per_iteration": 4.001877784729004 + }, + { + "auxiliary_loss_clip": 0.01131964, + "auxiliary_loss_mlp": 0.01100783, + "balance_loss_clip": 1.00184369, + "balance_loss_mlp": 1.00069499, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.8299466847602914, + "language_loss": 0.71910596, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74143338, + "num_input_tokens_seen": 346900445, + "step": 16077, + "time_per_iteration": 2.6484720706939697 + }, + { + "auxiliary_loss_clip": 0.01149457, + "auxiliary_loss_mlp": 0.01100854, + "balance_loss_clip": 1.00196338, + "balance_loss_mlp": 1.00052738, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 2.891648799267145, + "language_loss": 0.59693444, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61943752, + "num_input_tokens_seen": 346920135, + "step": 16078, + "time_per_iteration": 2.6275949478149414 + }, + { + "auxiliary_loss_clip": 0.01147917, + "auxiliary_loss_mlp": 0.01101085, + "balance_loss_clip": 1.00200129, + "balance_loss_mlp": 1.00056767, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 1.938332073726055, + "language_loss": 0.71743655, + "learning_rate": 1.158510609718899e-08, + "loss": 0.73992658, + "num_input_tokens_seen": 346940450, + "step": 16079, + "time_per_iteration": 2.684337615966797 + }, + { + "auxiliary_loss_clip": 0.01147184, + "auxiliary_loss_mlp": 0.01099505, + "balance_loss_clip": 1.00174642, + "balance_loss_mlp": 1.00041866, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 2.0145025845130555, + "language_loss": 0.72261751, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.7450844, + "num_input_tokens_seen": 346960935, + "step": 16080, + "time_per_iteration": 2.614199161529541 + }, + { + "auxiliary_loss_clip": 0.0113458, + "auxiliary_loss_mlp": 0.01100292, + "balance_loss_clip": 1.0018543, + "balance_loss_mlp": 1.0004425, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 2.048760739760021, + "language_loss": 0.74168622, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.76403499, + "num_input_tokens_seen": 346980100, + "step": 16081, + "time_per_iteration": 2.6429696083068848 + }, + { + "auxiliary_loss_clip": 0.01133074, + "auxiliary_loss_mlp": 0.01100346, + "balance_loss_clip": 1.00179064, + "balance_loss_mlp": 1.00040042, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 1.7711757263081447, + "language_loss": 0.67447567, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69680983, + "num_input_tokens_seen": 347001250, + "step": 16082, + "time_per_iteration": 2.7200286388397217 + }, + { + "auxiliary_loss_clip": 0.01119196, + "auxiliary_loss_mlp": 0.01099656, + "balance_loss_clip": 1.00174999, + "balance_loss_mlp": 1.0005213, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.6211707722561506, + "language_loss": 0.76584327, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78803182, + "num_input_tokens_seen": 347022975, + "step": 16083, + "time_per_iteration": 2.7492611408233643 + }, + { + "auxiliary_loss_clip": 0.01099797, + "auxiliary_loss_mlp": 0.0109996, + "balance_loss_clip": 1.00165796, + "balance_loss_mlp": 1.00058746, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 2.1502668353666583, + "language_loss": 0.79635853, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81835616, + "num_input_tokens_seen": 347038780, + "step": 16084, + "time_per_iteration": 2.707855224609375 + }, + { + "auxiliary_loss_clip": 0.01149076, + "auxiliary_loss_mlp": 0.01101161, + "balance_loss_clip": 1.00184155, + "balance_loss_mlp": 1.00035775, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.3227547448177734, + "language_loss": 0.67739487, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69989729, + "num_input_tokens_seen": 347056705, + "step": 16085, + "time_per_iteration": 2.5920205116271973 + }, + { + "auxiliary_loss_clip": 0.01130919, + "auxiliary_loss_mlp": 0.01101988, + "balance_loss_clip": 1.00182104, + "balance_loss_mlp": 1.00046968, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 1.8759969042466207, + "language_loss": 0.68856418, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.71089315, + "num_input_tokens_seen": 347075710, + "step": 16086, + "time_per_iteration": 2.6113498210906982 + }, + { + "auxiliary_loss_clip": 0.01149616, + "auxiliary_loss_mlp": 0.01100302, + "balance_loss_clip": 1.00194395, + "balance_loss_mlp": 1.00040436, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 1.63179114579605, + "language_loss": 0.78168947, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80418861, + "num_input_tokens_seen": 347092325, + "step": 16087, + "time_per_iteration": 2.553180694580078 + }, + { + "auxiliary_loss_clip": 0.01132331, + "auxiliary_loss_mlp": 0.01099787, + "balance_loss_clip": 1.00176299, + "balance_loss_mlp": 1.00041449, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 1.9681462628303097, + "language_loss": 0.71269035, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73501158, + "num_input_tokens_seen": 347110595, + "step": 16088, + "time_per_iteration": 2.5904815196990967 + }, + { + "auxiliary_loss_clip": 0.01164182, + "auxiliary_loss_mlp": 0.00747353, + "balance_loss_clip": 1.00203419, + "balance_loss_mlp": 1.00048304, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.4765557807620808, + "language_loss": 0.70424551, + "learning_rate": 1.117029020040916e-08, + "loss": 0.7233609, + "num_input_tokens_seen": 347131625, + "step": 16089, + "time_per_iteration": 2.6143391132354736 + }, + { + "auxiliary_loss_clip": 0.01164292, + "auxiliary_loss_mlp": 0.01102068, + "balance_loss_clip": 1.00194323, + "balance_loss_mlp": 1.00050199, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.708661889961833, + "language_loss": 0.75317931, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.77584291, + "num_input_tokens_seen": 347147910, + "step": 16090, + "time_per_iteration": 2.5190443992614746 + }, + { + "auxiliary_loss_clip": 0.01130758, + "auxiliary_loss_mlp": 0.011016, + "balance_loss_clip": 1.00170565, + "balance_loss_mlp": 1.00046301, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.6246875778784515, + "language_loss": 0.69095308, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.71327668, + "num_input_tokens_seen": 347168805, + "step": 16091, + "time_per_iteration": 2.700491428375244 + }, + { + "auxiliary_loss_clip": 0.01163929, + "auxiliary_loss_mlp": 0.01101263, + "balance_loss_clip": 1.00178611, + "balance_loss_mlp": 1.00055528, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 1.6798956943402863, + "language_loss": 0.77124846, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79390037, + "num_input_tokens_seen": 347189455, + "step": 16092, + "time_per_iteration": 3.965083122253418 + }, + { + "auxiliary_loss_clip": 0.01164165, + "auxiliary_loss_mlp": 0.01099807, + "balance_loss_clip": 1.00206041, + "balance_loss_mlp": 1.00048208, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 1.974693623477595, + "language_loss": 0.76614511, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.7887848, + "num_input_tokens_seen": 347206030, + "step": 16093, + "time_per_iteration": 2.5045368671417236 + }, + { + "auxiliary_loss_clip": 0.01130185, + "auxiliary_loss_mlp": 0.01100123, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00032127, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.3941834962899278, + "language_loss": 0.68973541, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71203852, + "num_input_tokens_seen": 347226250, + "step": 16094, + "time_per_iteration": 2.660555362701416 + }, + { + "auxiliary_loss_clip": 0.01147716, + "auxiliary_loss_mlp": 0.01101159, + "balance_loss_clip": 1.00182688, + "balance_loss_mlp": 1.00045049, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.6018314101018352, + "language_loss": 0.75670409, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.77919286, + "num_input_tokens_seen": 347247350, + "step": 16095, + "time_per_iteration": 2.586013078689575 + }, + { + "auxiliary_loss_clip": 0.01164428, + "auxiliary_loss_mlp": 0.01101274, + "balance_loss_clip": 1.00190985, + "balance_loss_mlp": 1.00066113, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 2.243772746373192, + "language_loss": 0.70213467, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72479171, + "num_input_tokens_seen": 347266870, + "step": 16096, + "time_per_iteration": 2.6394569873809814 + }, + { + "auxiliary_loss_clip": 0.01131949, + "auxiliary_loss_mlp": 0.01101936, + "balance_loss_clip": 1.00182438, + "balance_loss_mlp": 1.00056052, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 1.713428429872366, + "language_loss": 0.71716666, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.73950553, + "num_input_tokens_seen": 347290120, + "step": 16097, + "time_per_iteration": 4.183794260025024 + }, + { + "auxiliary_loss_clip": 0.01164174, + "auxiliary_loss_mlp": 0.01099909, + "balance_loss_clip": 1.00198746, + "balance_loss_mlp": 1.00048888, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 1.7723119315770952, + "language_loss": 0.77999282, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80263364, + "num_input_tokens_seen": 347308785, + "step": 16098, + "time_per_iteration": 2.553703546524048 + }, + { + "auxiliary_loss_clip": 0.01131701, + "auxiliary_loss_mlp": 0.01100216, + "balance_loss_clip": 1.00188541, + "balance_loss_mlp": 1.00046182, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 1.8294423828216824, + "language_loss": 0.90775865, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.93007791, + "num_input_tokens_seen": 347326375, + "step": 16099, + "time_per_iteration": 2.6283180713653564 + }, + { + "auxiliary_loss_clip": 0.01147538, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_clip": 1.00177443, + "balance_loss_mlp": 1.00037909, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 1.6966559233785716, + "language_loss": 0.6644578, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68694782, + "num_input_tokens_seen": 347348250, + "step": 16100, + "time_per_iteration": 2.699432373046875 + }, + { + "auxiliary_loss_clip": 0.01114637, + "auxiliary_loss_mlp": 0.01100562, + "balance_loss_clip": 1.00167918, + "balance_loss_mlp": 1.00056887, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.7084657279142972, + "language_loss": 0.73482752, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.75697947, + "num_input_tokens_seen": 347367400, + "step": 16101, + "time_per_iteration": 2.693147659301758 + }, + { + "auxiliary_loss_clip": 0.01130631, + "auxiliary_loss_mlp": 0.01100489, + "balance_loss_clip": 1.00174451, + "balance_loss_mlp": 1.00044882, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 1.6238830850607369, + "language_loss": 0.73235607, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75466728, + "num_input_tokens_seen": 347387600, + "step": 16102, + "time_per_iteration": 2.6295461654663086 + }, + { + "auxiliary_loss_clip": 0.01117245, + "auxiliary_loss_mlp": 0.01100676, + "balance_loss_clip": 1.00198472, + "balance_loss_mlp": 1.00044489, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 1.6782189241868406, + "language_loss": 0.7720865, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79426575, + "num_input_tokens_seen": 347406915, + "step": 16103, + "time_per_iteration": 2.7079710960388184 + }, + { + "auxiliary_loss_clip": 0.0113448, + "auxiliary_loss_mlp": 0.01100334, + "balance_loss_clip": 1.00190997, + "balance_loss_mlp": 1.00048399, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 2.0089232624020177, + "language_loss": 0.80165422, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82400227, + "num_input_tokens_seen": 347425140, + "step": 16104, + "time_per_iteration": 2.630782127380371 + }, + { + "auxiliary_loss_clip": 0.01149545, + "auxiliary_loss_mlp": 0.01099261, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00065064, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.449399596023816, + "language_loss": 0.77564859, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.79813671, + "num_input_tokens_seen": 347446350, + "step": 16105, + "time_per_iteration": 2.6376523971557617 + }, + { + "auxiliary_loss_clip": 0.01126855, + "auxiliary_loss_mlp": 0.0107375, + "balance_loss_clip": 1.00083244, + "balance_loss_mlp": 1.00012672, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.8214390380471128, + "language_loss": 0.56705743, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58906347, + "num_input_tokens_seen": 347510135, + "step": 16106, + "time_per_iteration": 3.2388856410980225 + }, + { + "auxiliary_loss_clip": 0.01114199, + "auxiliary_loss_mlp": 0.01074059, + "balance_loss_clip": 1.00073695, + "balance_loss_mlp": 1.00005364, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.8835653002309333, + "language_loss": 0.61592662, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63780916, + "num_input_tokens_seen": 347562505, + "step": 16107, + "time_per_iteration": 3.0681445598602295 + }, + { + "auxiliary_loss_clip": 0.01149112, + "auxiliary_loss_mlp": 0.01100733, + "balance_loss_clip": 1.00188875, + "balance_loss_mlp": 1.00059712, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.2699469297963115, + "language_loss": 0.73789632, + "learning_rate": 1.040291854638875e-08, + "loss": 0.76039481, + "num_input_tokens_seen": 347579150, + "step": 16108, + "time_per_iteration": 2.6071958541870117 + }, + { + "auxiliary_loss_clip": 0.01149747, + "auxiliary_loss_mlp": 0.01100873, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00045121, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 2.4997401431873616, + "language_loss": 0.57324934, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59575558, + "num_input_tokens_seen": 347596705, + "step": 16109, + "time_per_iteration": 2.571484327316284 + }, + { + "auxiliary_loss_clip": 0.0114363, + "auxiliary_loss_mlp": 0.01073824, + "balance_loss_clip": 1.00072694, + "balance_loss_mlp": 1.00020051, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6689371010575788, + "language_loss": 0.54207712, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56425166, + "num_input_tokens_seen": 347661870, + "step": 16110, + "time_per_iteration": 3.0957653522491455 + }, + { + "auxiliary_loss_clip": 0.01085178, + "auxiliary_loss_mlp": 0.01101677, + "balance_loss_clip": 1.00174856, + "balance_loss_mlp": 1.00058794, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 1.516386714754006, + "language_loss": 0.6257081, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64757663, + "num_input_tokens_seen": 347684295, + "step": 16111, + "time_per_iteration": 2.8309314250946045 + }, + { + "auxiliary_loss_clip": 0.01130842, + "auxiliary_loss_mlp": 0.01099325, + "balance_loss_clip": 1.00171936, + "balance_loss_mlp": 1.00052452, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 1.9643622094310074, + "language_loss": 0.74850631, + "learning_rate": 1.024483677309118e-08, + "loss": 0.77080792, + "num_input_tokens_seen": 347702585, + "step": 16112, + "time_per_iteration": 2.6019227504730225 + }, + { + "auxiliary_loss_clip": 0.0114845, + "auxiliary_loss_mlp": 0.01099172, + "balance_loss_clip": 1.00179672, + "balance_loss_mlp": 1.00041926, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 2.3609899529958476, + "language_loss": 0.66368806, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68616432, + "num_input_tokens_seen": 347721810, + "step": 16113, + "time_per_iteration": 5.566357612609863 + }, + { + "auxiliary_loss_clip": 0.01140937, + "auxiliary_loss_mlp": 0.01073837, + "balance_loss_clip": 1.0006485, + "balance_loss_mlp": 1.00021386, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.753280460785875, + "language_loss": 0.56513625, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58728403, + "num_input_tokens_seen": 347782330, + "step": 16114, + "time_per_iteration": 3.135098934173584 + }, + { + "auxiliary_loss_clip": 0.01132573, + "auxiliary_loss_mlp": 0.01100908, + "balance_loss_clip": 1.00190771, + "balance_loss_mlp": 1.00062919, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.862279402251131, + "language_loss": 0.82624555, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84858036, + "num_input_tokens_seen": 347794835, + "step": 16115, + "time_per_iteration": 2.544856548309326 + }, + { + "auxiliary_loss_clip": 0.01147299, + "auxiliary_loss_mlp": 0.01099185, + "balance_loss_clip": 1.00178456, + "balance_loss_mlp": 1.00062287, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.5659972220054201, + "language_loss": 0.71885169, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.7413165, + "num_input_tokens_seen": 347814320, + "step": 16116, + "time_per_iteration": 2.6524319648742676 + }, + { + "auxiliary_loss_clip": 0.011166, + "auxiliary_loss_mlp": 0.01099908, + "balance_loss_clip": 1.00175643, + "balance_loss_mlp": 1.00053537, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 2.13432072011619, + "language_loss": 0.75844777, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.78061277, + "num_input_tokens_seen": 347832125, + "step": 16117, + "time_per_iteration": 2.643346071243286 + }, + { + "auxiliary_loss_clip": 0.01164178, + "auxiliary_loss_mlp": 0.01100428, + "balance_loss_clip": 1.00183892, + "balance_loss_mlp": 1.0004828, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 1.9849364033708135, + "language_loss": 0.77332842, + "learning_rate": 1.000997769426548e-08, + "loss": 0.79597449, + "num_input_tokens_seen": 347850765, + "step": 16118, + "time_per_iteration": 2.6119091510772705 + }, + { + "auxiliary_loss_clip": 0.01135037, + "auxiliary_loss_mlp": 0.00747289, + "balance_loss_clip": 1.0020684, + "balance_loss_mlp": 1.00052488, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.8531015422080443, + "language_loss": 0.78265232, + "learning_rate": 9.971098618001272e-09, + "loss": 0.80147564, + "num_input_tokens_seen": 347870125, + "step": 16119, + "time_per_iteration": 2.6274819374084473 + }, + { + "auxiliary_loss_clip": 0.01100501, + "auxiliary_loss_mlp": 0.01099493, + "balance_loss_clip": 1.00168073, + "balance_loss_mlp": 1.00040674, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.631701301268099, + "language_loss": 0.75730705, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77930701, + "num_input_tokens_seen": 347890615, + "step": 16120, + "time_per_iteration": 2.7372524738311768 + }, + { + "auxiliary_loss_clip": 0.0114958, + "auxiliary_loss_mlp": 0.01099724, + "balance_loss_clip": 1.00190747, + "balance_loss_mlp": 1.00044632, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 1.9439976552922074, + "language_loss": 0.69803333, + "learning_rate": 9.89356685323095e-09, + "loss": 0.7205264, + "num_input_tokens_seen": 347908685, + "step": 16121, + "time_per_iteration": 2.5466115474700928 + }, + { + "auxiliary_loss_clip": 0.01146945, + "auxiliary_loss_mlp": 0.01100036, + "balance_loss_clip": 1.00183868, + "balance_loss_mlp": 1.00051999, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 1.7472074863374052, + "language_loss": 0.69043589, + "learning_rate": 9.854914167664486e-09, + "loss": 0.7129057, + "num_input_tokens_seen": 347926385, + "step": 16122, + "time_per_iteration": 2.6116950511932373 + }, + { + "auxiliary_loss_clip": 0.0111779, + "auxiliary_loss_mlp": 0.01100332, + "balance_loss_clip": 1.00181508, + "balance_loss_mlp": 1.00048208, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 1.7736118917107009, + "language_loss": 0.75463229, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77681351, + "num_input_tokens_seen": 347945290, + "step": 16123, + "time_per_iteration": 2.6520583629608154 + }, + { + "auxiliary_loss_clip": 0.01115601, + "auxiliary_loss_mlp": 0.01100256, + "balance_loss_clip": 1.00159335, + "balance_loss_mlp": 1.0004065, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.8378096012749316, + "language_loss": 0.74387872, + "learning_rate": 9.777835197497753e-09, + "loss": 0.76603734, + "num_input_tokens_seen": 347966330, + "step": 16124, + "time_per_iteration": 2.68916916847229 + }, + { + "auxiliary_loss_clip": 0.01147704, + "auxiliary_loss_mlp": 0.01101609, + "balance_loss_clip": 1.00183678, + "balance_loss_mlp": 1.0006144, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 1.9106938860402012, + "language_loss": 0.74598789, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76848102, + "num_input_tokens_seen": 347982590, + "step": 16125, + "time_per_iteration": 2.625340223312378 + }, + { + "auxiliary_loss_clip": 0.01141385, + "auxiliary_loss_mlp": 0.01074241, + "balance_loss_clip": 1.00065398, + "balance_loss_mlp": 1.0002358, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.8589638458959772, + "language_loss": 0.61480236, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63695872, + "num_input_tokens_seen": 348043310, + "step": 16126, + "time_per_iteration": 3.088778495788574 + }, + { + "auxiliary_loss_clip": 0.01147149, + "auxiliary_loss_mlp": 0.0109949, + "balance_loss_clip": 1.00180519, + "balance_loss_mlp": 1.00073731, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 1.9816679607431544, + "language_loss": 0.74687755, + "learning_rate": 9.662782766562738e-09, + "loss": 0.76934397, + "num_input_tokens_seen": 348062200, + "step": 16127, + "time_per_iteration": 2.580695390701294 + }, + { + "auxiliary_loss_clip": 0.0109835, + "auxiliary_loss_mlp": 0.01100705, + "balance_loss_clip": 1.00157261, + "balance_loss_mlp": 1.00056946, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.7072726514261913, + "language_loss": 0.69240844, + "learning_rate": 9.62458290188839e-09, + "loss": 0.71439892, + "num_input_tokens_seen": 348080685, + "step": 16128, + "time_per_iteration": 2.683849334716797 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.01100256, + "balance_loss_clip": 1.00177252, + "balance_loss_mlp": 1.00059664, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.5985016407528945, + "language_loss": 0.65122902, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67337191, + "num_input_tokens_seen": 348102500, + "step": 16129, + "time_per_iteration": 2.8161072731018066 + }, + { + "auxiliary_loss_clip": 0.0111534, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_clip": 1.00172555, + "balance_loss_mlp": 1.00048423, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 2.054990139612822, + "language_loss": 0.63177508, + "learning_rate": 9.548409599691166e-09, + "loss": 0.65394711, + "num_input_tokens_seen": 348122515, + "step": 16130, + "time_per_iteration": 2.6826634407043457 + }, + { + "auxiliary_loss_clip": 0.01149808, + "auxiliary_loss_mlp": 0.01101705, + "balance_loss_clip": 1.00185227, + "balance_loss_mlp": 1.00052023, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 2.2742061383422674, + "language_loss": 0.69674248, + "learning_rate": 9.510436165056867e-09, + "loss": 0.71925759, + "num_input_tokens_seen": 348138775, + "step": 16131, + "time_per_iteration": 3.9292545318603516 + }, + { + "auxiliary_loss_clip": 0.01164203, + "auxiliary_loss_mlp": 0.00747335, + "balance_loss_clip": 1.00186896, + "balance_loss_mlp": 1.00045788, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 2.0389090141953776, + "language_loss": 0.76644862, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78556395, + "num_input_tokens_seen": 348157115, + "step": 16132, + "time_per_iteration": 2.5613505840301514 + }, + { + "auxiliary_loss_clip": 0.01118482, + "auxiliary_loss_mlp": 0.01101407, + "balance_loss_clip": 1.00188982, + "balance_loss_mlp": 1.00069904, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 2.673604065819607, + "language_loss": 0.78602159, + "learning_rate": 9.434715735916477e-09, + "loss": 0.80822039, + "num_input_tokens_seen": 348173035, + "step": 16133, + "time_per_iteration": 2.624149799346924 + }, + { + "auxiliary_loss_clip": 0.01131141, + "auxiliary_loss_mlp": 0.01099658, + "balance_loss_clip": 1.00177801, + "balance_loss_mlp": 1.00061917, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.9092916918529077, + "language_loss": 0.64308339, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66539139, + "num_input_tokens_seen": 348192960, + "step": 16134, + "time_per_iteration": 2.6135551929473877 + }, + { + "auxiliary_loss_clip": 0.01132796, + "auxiliary_loss_mlp": 0.01100583, + "balance_loss_clip": 1.00170267, + "balance_loss_mlp": 1.00054312, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 2.214852054834249, + "language_loss": 0.80738515, + "learning_rate": 9.359297236513519e-09, + "loss": 0.82971895, + "num_input_tokens_seen": 348212805, + "step": 16135, + "time_per_iteration": 4.001470327377319 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01101625, + "balance_loss_clip": 1.00181019, + "balance_loss_mlp": 1.00053525, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 2.4383908221592727, + "language_loss": 0.73182404, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75431621, + "num_input_tokens_seen": 348232900, + "step": 16136, + "time_per_iteration": 2.618140935897827 + }, + { + "auxiliary_loss_clip": 0.01164066, + "auxiliary_loss_mlp": 0.01099677, + "balance_loss_clip": 1.0018729, + "balance_loss_mlp": 1.00049496, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.596910017319191, + "language_loss": 0.76200962, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78464705, + "num_input_tokens_seen": 348253065, + "step": 16137, + "time_per_iteration": 2.567229747772217 + }, + { + "auxiliary_loss_clip": 0.01096967, + "auxiliary_loss_mlp": 0.01075302, + "balance_loss_clip": 1.00152957, + "balance_loss_mlp": 1.00053358, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 0.7734839263969465, + "language_loss": 0.54950947, + "learning_rate": 9.246735630678015e-09, + "loss": 0.5712322, + "num_input_tokens_seen": 348316075, + "step": 16138, + "time_per_iteration": 3.416945219039917 + }, + { + "auxiliary_loss_clip": 0.01132778, + "auxiliary_loss_mlp": 0.01100493, + "balance_loss_clip": 1.00173557, + "balance_loss_mlp": 1.0005002, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 2.064642084650857, + "language_loss": 0.70954871, + "learning_rate": 9.209366072632007e-09, + "loss": 0.73188138, + "num_input_tokens_seen": 348337605, + "step": 16139, + "time_per_iteration": 2.736419677734375 + }, + { + "auxiliary_loss_clip": 0.01147584, + "auxiliary_loss_mlp": 0.01101093, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00048018, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 1.528370674089819, + "language_loss": 0.72350776, + "learning_rate": 9.172072005566134e-09, + "loss": 0.74599451, + "num_input_tokens_seen": 348359430, + "step": 16140, + "time_per_iteration": 2.620117425918579 + }, + { + "auxiliary_loss_clip": 0.0114761, + "auxiliary_loss_mlp": 0.00747343, + "balance_loss_clip": 1.0018481, + "balance_loss_mlp": 1.00051832, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.2853250875460143, + "language_loss": 0.6792804, + "learning_rate": 9.13485343089504e-09, + "loss": 0.69822991, + "num_input_tokens_seen": 348377890, + "step": 16141, + "time_per_iteration": 2.5904855728149414 + }, + { + "auxiliary_loss_clip": 0.01147125, + "auxiliary_loss_mlp": 0.01099809, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.00053167, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 2.6894950624683434, + "language_loss": 0.6839816, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70645094, + "num_input_tokens_seen": 348396550, + "step": 16142, + "time_per_iteration": 2.6147823333740234 + }, + { + "auxiliary_loss_clip": 0.0110181, + "auxiliary_loss_mlp": 0.01100476, + "balance_loss_clip": 1.00169182, + "balance_loss_mlp": 1.00062668, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.8878277273830177, + "language_loss": 0.55701536, + "learning_rate": 9.060642764378457e-09, + "loss": 0.5790382, + "num_input_tokens_seen": 348417120, + "step": 16143, + "time_per_iteration": 2.7585325241088867 + }, + { + "auxiliary_loss_clip": 0.01147106, + "auxiliary_loss_mlp": 0.01100371, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.0004741, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 2.218608506678491, + "language_loss": 0.67809427, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70056903, + "num_input_tokens_seen": 348437750, + "step": 16144, + "time_per_iteration": 2.6243436336517334 + }, + { + "auxiliary_loss_clip": 0.01147337, + "auxiliary_loss_mlp": 0.01099991, + "balance_loss_clip": 1.00183105, + "balance_loss_mlp": 1.00066578, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 1.9246414482517942, + "language_loss": 0.7202825, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74275577, + "num_input_tokens_seen": 348460935, + "step": 16145, + "time_per_iteration": 2.705242156982422 + }, + { + "auxiliary_loss_clip": 0.01130581, + "auxiliary_loss_mlp": 0.01100598, + "balance_loss_clip": 1.00165415, + "balance_loss_mlp": 1.00046277, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 7.577606009900411, + "language_loss": 0.79780245, + "learning_rate": 8.949892992753395e-09, + "loss": 0.82011425, + "num_input_tokens_seen": 348474480, + "step": 16146, + "time_per_iteration": 2.740368604660034 + }, + { + "auxiliary_loss_clip": 0.01108258, + "auxiliary_loss_mlp": 0.0107443, + "balance_loss_clip": 1.00094724, + "balance_loss_mlp": 1.00042486, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.758455492858206, + "language_loss": 0.54546928, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56729627, + "num_input_tokens_seen": 348541220, + "step": 16147, + "time_per_iteration": 3.2975101470947266 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.00747489, + "balance_loss_clip": 1.00184357, + "balance_loss_mlp": 1.00051761, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 3.372010344779089, + "language_loss": 0.6099633, + "learning_rate": 8.876437313434682e-09, + "loss": 0.62864316, + "num_input_tokens_seen": 348559230, + "step": 16148, + "time_per_iteration": 2.684494733810425 + }, + { + "auxiliary_loss_clip": 0.01115459, + "auxiliary_loss_mlp": 0.01099756, + "balance_loss_clip": 1.00162005, + "balance_loss_mlp": 1.00052667, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.9630829868841384, + "language_loss": 0.73581964, + "learning_rate": 8.839822728487155e-09, + "loss": 0.7579717, + "num_input_tokens_seen": 348577850, + "step": 16149, + "time_per_iteration": 2.6582226753234863 + }, + { + "auxiliary_loss_clip": 0.01147122, + "auxiliary_loss_mlp": 0.01100006, + "balance_loss_clip": 1.00161886, + "balance_loss_mlp": 1.00053763, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 2.5035166590463946, + "language_loss": 0.74906671, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77153796, + "num_input_tokens_seen": 348598345, + "step": 16150, + "time_per_iteration": 2.721264123916626 + }, + { + "auxiliary_loss_clip": 0.01130303, + "auxiliary_loss_mlp": 0.0110194, + "balance_loss_clip": 1.00171351, + "balance_loss_mlp": 1.00046921, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 1.9286186351736887, + "language_loss": 0.73752862, + "learning_rate": 8.766820074958214e-09, + "loss": 0.7598511, + "num_input_tokens_seen": 348616300, + "step": 16151, + "time_per_iteration": 5.4492881298065186 + }, + { + "auxiliary_loss_clip": 0.01147443, + "auxiliary_loss_mlp": 0.01099942, + "balance_loss_clip": 1.00187945, + "balance_loss_mlp": 1.0004741, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 1.847701235614736, + "language_loss": 0.74682915, + "learning_rate": 8.730432009145027e-09, + "loss": 0.76930302, + "num_input_tokens_seen": 348633845, + "step": 16152, + "time_per_iteration": 2.5526418685913086 + }, + { + "auxiliary_loss_clip": 0.01115396, + "auxiliary_loss_mlp": 0.01100761, + "balance_loss_clip": 1.00164533, + "balance_loss_mlp": 1.00043488, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 1.9299123504653044, + "language_loss": 0.67291653, + "learning_rate": 8.694119452473448e-09, + "loss": 0.69507813, + "num_input_tokens_seen": 348653070, + "step": 16153, + "time_per_iteration": 2.671628713607788 + }, + { + "auxiliary_loss_clip": 0.01084947, + "auxiliary_loss_mlp": 0.01100327, + "balance_loss_clip": 1.00169063, + "balance_loss_mlp": 1.00047755, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 1.9903506359847878, + "language_loss": 0.71014237, + "learning_rate": 8.65788240632037e-09, + "loss": 0.73199511, + "num_input_tokens_seen": 348672145, + "step": 16154, + "time_per_iteration": 2.7717130184173584 + }, + { + "auxiliary_loss_clip": 0.01085126, + "auxiliary_loss_mlp": 0.01100921, + "balance_loss_clip": 1.00165939, + "balance_loss_mlp": 1.00054634, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 1.8227331944714302, + "language_loss": 0.80702662, + "learning_rate": 8.621720872059812e-09, + "loss": 0.8288871, + "num_input_tokens_seen": 348690615, + "step": 16155, + "time_per_iteration": 2.738847017288208 + }, + { + "auxiliary_loss_clip": 0.01147941, + "auxiliary_loss_mlp": 0.00747483, + "balance_loss_clip": 1.00181806, + "balance_loss_mlp": 1.00050235, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 2.2157568707655417, + "language_loss": 0.67476857, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69372284, + "num_input_tokens_seen": 348708665, + "step": 16156, + "time_per_iteration": 2.5591094493865967 + }, + { + "auxiliary_loss_clip": 0.01147447, + "auxiliary_loss_mlp": 0.01101113, + "balance_loss_clip": 1.00174201, + "balance_loss_mlp": 1.00064349, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 3.801464762930424, + "language_loss": 0.9063912, + "learning_rate": 8.54962434469919e-09, + "loss": 0.92887682, + "num_input_tokens_seen": 348726105, + "step": 16157, + "time_per_iteration": 2.6195363998413086 + }, + { + "auxiliary_loss_clip": 0.01115112, + "auxiliary_loss_mlp": 0.00747339, + "balance_loss_clip": 1.001858, + "balance_loss_mlp": 1.00046158, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 3.472792257275511, + "language_loss": 0.72581232, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74443674, + "num_input_tokens_seen": 348743360, + "step": 16158, + "time_per_iteration": 2.653120517730713 + }, + { + "auxiliary_loss_clip": 0.01100769, + "auxiliary_loss_mlp": 0.01101054, + "balance_loss_clip": 1.00180686, + "balance_loss_mlp": 1.00067925, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 1.9698894209969597, + "language_loss": 0.596434, + "learning_rate": 8.477829881326836e-09, + "loss": 0.61845219, + "num_input_tokens_seen": 348759045, + "step": 16159, + "time_per_iteration": 2.725045680999756 + }, + { + "auxiliary_loss_clip": 0.01164001, + "auxiliary_loss_mlp": 0.01099515, + "balance_loss_clip": 1.00188863, + "balance_loss_mlp": 1.00052404, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.779491232182633, + "language_loss": 0.79305243, + "learning_rate": 8.44204592704112e-09, + "loss": 0.81568754, + "num_input_tokens_seen": 348779910, + "step": 16160, + "time_per_iteration": 2.601407289505005 + }, + { + "auxiliary_loss_clip": 0.01157955, + "auxiliary_loss_mlp": 0.01073629, + "balance_loss_clip": 1.00071716, + "balance_loss_mlp": 1.00000525, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7768446205492536, + "language_loss": 0.54268646, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56500232, + "num_input_tokens_seen": 348838995, + "step": 16161, + "time_per_iteration": 3.1187524795532227 + }, + { + "auxiliary_loss_clip": 0.0114726, + "auxiliary_loss_mlp": 0.00747135, + "balance_loss_clip": 1.00179338, + "balance_loss_mlp": 1.00048852, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 2.043705957286968, + "language_loss": 0.71737432, + "learning_rate": 8.3707045800554e-09, + "loss": 0.73631829, + "num_input_tokens_seen": 348858090, + "step": 16162, + "time_per_iteration": 2.601719856262207 + }, + { + "auxiliary_loss_clip": 0.01115989, + "auxiliary_loss_mlp": 0.01100726, + "balance_loss_clip": 1.00160122, + "balance_loss_mlp": 1.00059032, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.6128960905339995, + "language_loss": 0.78868663, + "learning_rate": 8.335147190060787e-09, + "loss": 0.81085384, + "num_input_tokens_seen": 348877885, + "step": 16163, + "time_per_iteration": 2.679837465286255 + }, + { + "auxiliary_loss_clip": 0.01132997, + "auxiliary_loss_mlp": 0.01100054, + "balance_loss_clip": 1.0016861, + "balance_loss_mlp": 1.00049019, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 1.7172542526371273, + "language_loss": 0.72823089, + "learning_rate": 8.299665324196903e-09, + "loss": 0.75056136, + "num_input_tokens_seen": 348897720, + "step": 16164, + "time_per_iteration": 2.6237220764160156 + }, + { + "auxiliary_loss_clip": 0.01084484, + "auxiliary_loss_mlp": 0.01102141, + "balance_loss_clip": 1.00182402, + "balance_loss_mlp": 1.00067055, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 2.041212204015347, + "language_loss": 0.84112394, + "learning_rate": 8.264258983809114e-09, + "loss": 0.8629902, + "num_input_tokens_seen": 348915410, + "step": 16165, + "time_per_iteration": 2.697843551635742 + }, + { + "auxiliary_loss_clip": 0.01117194, + "auxiliary_loss_mlp": 0.0109937, + "balance_loss_clip": 1.00183177, + "balance_loss_mlp": 1.00042641, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.6480792083014484, + "language_loss": 0.79095721, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81312287, + "num_input_tokens_seen": 348934335, + "step": 16166, + "time_per_iteration": 2.6761467456817627 + }, + { + "auxiliary_loss_clip": 0.01132302, + "auxiliary_loss_mlp": 0.01100108, + "balance_loss_clip": 1.00184846, + "balance_loss_mlp": 1.00040138, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 2.0883783131441054, + "language_loss": 0.70719844, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72952253, + "num_input_tokens_seen": 348952405, + "step": 16167, + "time_per_iteration": 2.602169990539551 + }, + { + "auxiliary_loss_clip": 0.01130939, + "auxiliary_loss_mlp": 0.01100807, + "balance_loss_clip": 1.00180221, + "balance_loss_mlp": 1.00062394, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.4580342403653324, + "language_loss": 0.75336444, + "learning_rate": 8.158493128915812e-09, + "loss": 0.77568197, + "num_input_tokens_seen": 348973580, + "step": 16168, + "time_per_iteration": 4.067339897155762 + }, + { + "auxiliary_loss_clip": 0.01101106, + "auxiliary_loss_mlp": 0.01100811, + "balance_loss_clip": 1.00167298, + "balance_loss_mlp": 1.00053251, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 2.3588429716839534, + "language_loss": 0.7336657, + "learning_rate": 8.123388903830797e-09, + "loss": 0.75568485, + "num_input_tokens_seen": 348992035, + "step": 16169, + "time_per_iteration": 2.6902828216552734 + }, + { + "auxiliary_loss_clip": 0.01117968, + "auxiliary_loss_mlp": 0.01101278, + "balance_loss_clip": 1.0017122, + "balance_loss_mlp": 1.00037897, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 2.219876528714776, + "language_loss": 0.57496333, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59715575, + "num_input_tokens_seen": 349013160, + "step": 16170, + "time_per_iteration": 2.68147349357605 + }, + { + "auxiliary_loss_clip": 0.01115113, + "auxiliary_loss_mlp": 0.01100709, + "balance_loss_clip": 1.00170422, + "balance_loss_mlp": 1.0003829, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1.9577311536468103, + "language_loss": 0.71780837, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73996663, + "num_input_tokens_seen": 349033485, + "step": 16171, + "time_per_iteration": 2.6621811389923096 + }, + { + "auxiliary_loss_clip": 0.01118292, + "auxiliary_loss_mlp": 0.01100486, + "balance_loss_clip": 1.00176883, + "balance_loss_mlp": 1.0006361, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 1.6419887830909263, + "language_loss": 0.68365055, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70583832, + "num_input_tokens_seen": 349051705, + "step": 16172, + "time_per_iteration": 3.9663848876953125 + }, + { + "auxiliary_loss_clip": 0.01149473, + "auxiliary_loss_mlp": 0.0109979, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00046468, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 1.8950126402234977, + "language_loss": 0.86073184, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88322437, + "num_input_tokens_seen": 349070825, + "step": 16173, + "time_per_iteration": 2.630281925201416 + }, + { + "auxiliary_loss_clip": 0.01101754, + "auxiliary_loss_mlp": 0.01101845, + "balance_loss_clip": 1.0018177, + "balance_loss_mlp": 1.00051665, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 1.9284392804585397, + "language_loss": 0.64343286, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66546881, + "num_input_tokens_seen": 349089730, + "step": 16174, + "time_per_iteration": 2.757122755050659 + }, + { + "auxiliary_loss_clip": 0.01149419, + "auxiliary_loss_mlp": 0.0110002, + "balance_loss_clip": 1.00194681, + "balance_loss_mlp": 1.00045669, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.515536667923071, + "language_loss": 0.78170371, + "learning_rate": 7.914349775085538e-09, + "loss": 0.80419809, + "num_input_tokens_seen": 349111315, + "step": 16175, + "time_per_iteration": 2.61133074760437 + }, + { + "auxiliary_loss_clip": 0.01147913, + "auxiliary_loss_mlp": 0.01100199, + "balance_loss_clip": 1.00194669, + "balance_loss_mlp": 1.00044477, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 3.318663251113089, + "language_loss": 0.56790066, + "learning_rate": 7.879774302919307e-09, + "loss": 0.59038174, + "num_input_tokens_seen": 349129495, + "step": 16176, + "time_per_iteration": 2.580136299133301 + }, + { + "auxiliary_loss_clip": 0.01131213, + "auxiliary_loss_mlp": 0.01100819, + "balance_loss_clip": 1.00195718, + "balance_loss_mlp": 1.00049233, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 3.165079346764369, + "language_loss": 0.72726053, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74958086, + "num_input_tokens_seen": 349148850, + "step": 16177, + "time_per_iteration": 2.6282389163970947 + }, + { + "auxiliary_loss_clip": 0.01132004, + "auxiliary_loss_mlp": 0.01100414, + "balance_loss_clip": 1.00164771, + "balance_loss_mlp": 1.0005641, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.9963024363359911, + "language_loss": 0.68406928, + "learning_rate": 7.810849984090984e-09, + "loss": 0.70639348, + "num_input_tokens_seen": 349167620, + "step": 16178, + "time_per_iteration": 2.652395009994507 + }, + { + "auxiliary_loss_clip": 0.01086871, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_clip": 1.00159335, + "balance_loss_mlp": 1.00054252, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 1.9994768261705036, + "language_loss": 0.67725247, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69913369, + "num_input_tokens_seen": 349185845, + "step": 16179, + "time_per_iteration": 2.797780990600586 + }, + { + "auxiliary_loss_clip": 0.01130116, + "auxiliary_loss_mlp": 0.00747311, + "balance_loss_clip": 1.00174558, + "balance_loss_mlp": 1.00046158, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 3.8101264962518715, + "language_loss": 0.77026486, + "learning_rate": 7.742227841308624e-09, + "loss": 0.78903908, + "num_input_tokens_seen": 349204525, + "step": 16180, + "time_per_iteration": 2.6413137912750244 + }, + { + "auxiliary_loss_clip": 0.01147586, + "auxiliary_loss_mlp": 0.01101198, + "balance_loss_clip": 1.00179863, + "balance_loss_mlp": 1.00048971, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 2.0048529008386167, + "language_loss": 0.76677036, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78925818, + "num_input_tokens_seen": 349228075, + "step": 16181, + "time_per_iteration": 2.6775968074798584 + }, + { + "auxiliary_loss_clip": 0.01164123, + "auxiliary_loss_mlp": 0.01099798, + "balance_loss_clip": 1.00184178, + "balance_loss_mlp": 1.00047278, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.4041268263418776, + "language_loss": 0.63412362, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65676284, + "num_input_tokens_seen": 349246990, + "step": 16182, + "time_per_iteration": 2.5317330360412598 + }, + { + "auxiliary_loss_clip": 0.01054494, + "auxiliary_loss_mlp": 0.01100145, + "balance_loss_clip": 1.00145507, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 2.5258303025652418, + "language_loss": 0.62235045, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64389682, + "num_input_tokens_seen": 349265890, + "step": 16183, + "time_per_iteration": 2.8294475078582764 + }, + { + "auxiliary_loss_clip": 0.01133856, + "auxiliary_loss_mlp": 0.01100286, + "balance_loss_clip": 1.00178885, + "balance_loss_mlp": 1.00048423, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 2.3031780618953612, + "language_loss": 0.78053939, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80288082, + "num_input_tokens_seen": 349285275, + "step": 16184, + "time_per_iteration": 2.644660234451294 + }, + { + "auxiliary_loss_clip": 0.01118363, + "auxiliary_loss_mlp": 0.01100198, + "balance_loss_clip": 1.00182295, + "balance_loss_mlp": 1.00053918, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.336417048798215, + "language_loss": 0.79670811, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81889373, + "num_input_tokens_seen": 349301515, + "step": 16185, + "time_per_iteration": 2.623311758041382 + }, + { + "auxiliary_loss_clip": 0.01118302, + "auxiliary_loss_mlp": 0.0109998, + "balance_loss_clip": 1.0018394, + "balance_loss_mlp": 1.000512, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 6.3791989461384855, + "language_loss": 0.77846152, + "learning_rate": 7.538174573094469e-09, + "loss": 0.8006444, + "num_input_tokens_seen": 349319590, + "step": 16186, + "time_per_iteration": 2.6264455318450928 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.0109986, + "balance_loss_clip": 1.00173402, + "balance_loss_mlp": 1.0005827, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.6482342387427718, + "language_loss": 0.65550011, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67781687, + "num_input_tokens_seen": 349339230, + "step": 16187, + "time_per_iteration": 2.6183724403381348 + }, + { + "auxiliary_loss_clip": 0.01134084, + "auxiliary_loss_mlp": 0.0110053, + "balance_loss_clip": 1.00181055, + "balance_loss_mlp": 1.00044191, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.701528765393223, + "language_loss": 0.80338788, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82573402, + "num_input_tokens_seen": 349361155, + "step": 16188, + "time_per_iteration": 4.221889019012451 + }, + { + "auxiliary_loss_clip": 0.01116645, + "auxiliary_loss_mlp": 0.01099474, + "balance_loss_clip": 1.00163281, + "balance_loss_mlp": 1.00043488, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 2.640944520336965, + "language_loss": 0.78522795, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80738914, + "num_input_tokens_seen": 349379335, + "step": 16189, + "time_per_iteration": 3.9013595581054688 + }, + { + "auxiliary_loss_clip": 0.0114927, + "auxiliary_loss_mlp": 0.01100082, + "balance_loss_clip": 1.00180221, + "balance_loss_mlp": 1.00047052, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 2.2460704068984847, + "language_loss": 0.51025081, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53274429, + "num_input_tokens_seen": 349401575, + "step": 16190, + "time_per_iteration": 2.716400623321533 + }, + { + "auxiliary_loss_clip": 0.01147836, + "auxiliary_loss_mlp": 0.01100562, + "balance_loss_clip": 1.0017736, + "balance_loss_mlp": 1.00042605, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 3.6084831273555062, + "language_loss": 0.8063733, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.8288573, + "num_input_tokens_seen": 349420650, + "step": 16191, + "time_per_iteration": 2.561093807220459 + }, + { + "auxiliary_loss_clip": 0.01149424, + "auxiliary_loss_mlp": 0.01100599, + "balance_loss_clip": 1.00190246, + "balance_loss_mlp": 1.00041604, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 2.3462032886311834, + "language_loss": 0.82909209, + "learning_rate": 7.336841261255111e-09, + "loss": 0.8515923, + "num_input_tokens_seen": 349436830, + "step": 16192, + "time_per_iteration": 2.5836915969848633 + }, + { + "auxiliary_loss_clip": 0.0108401, + "auxiliary_loss_mlp": 0.01101085, + "balance_loss_clip": 1.00171733, + "balance_loss_mlp": 1.00047255, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 2.0544804596228885, + "language_loss": 0.74971569, + "learning_rate": 7.303550168837658e-09, + "loss": 0.77156663, + "num_input_tokens_seen": 349454325, + "step": 16193, + "time_per_iteration": 2.71624493598938 + }, + { + "auxiliary_loss_clip": 0.0113096, + "auxiliary_loss_mlp": 0.01099266, + "balance_loss_clip": 1.00170207, + "balance_loss_mlp": 1.00056076, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 2.184553041867347, + "language_loss": 0.85275698, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87505925, + "num_input_tokens_seen": 349470230, + "step": 16194, + "time_per_iteration": 2.6564972400665283 + }, + { + "auxiliary_loss_clip": 0.01116109, + "auxiliary_loss_mlp": 0.01099867, + "balance_loss_clip": 1.00184011, + "balance_loss_mlp": 1.00054216, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 3.3144929303643407, + "language_loss": 0.75839949, + "learning_rate": 7.237194675009828e-09, + "loss": 0.78055918, + "num_input_tokens_seen": 349486250, + "step": 16195, + "time_per_iteration": 2.6946957111358643 + }, + { + "auxiliary_loss_clip": 0.01113705, + "auxiliary_loss_mlp": 0.01074007, + "balance_loss_clip": 1.00075269, + "balance_loss_mlp": 1.00000191, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7057218883918621, + "language_loss": 0.52396733, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54584444, + "num_input_tokens_seen": 349545865, + "step": 16196, + "time_per_iteration": 3.193864107131958 + }, + { + "auxiliary_loss_clip": 0.01131195, + "auxiliary_loss_mlp": 0.01099778, + "balance_loss_clip": 1.00177455, + "balance_loss_mlp": 1.00045323, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 2.3662598212409467, + "language_loss": 0.76351011, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78581989, + "num_input_tokens_seen": 349566080, + "step": 16197, + "time_per_iteration": 2.712587833404541 + }, + { + "auxiliary_loss_clip": 0.01164261, + "auxiliary_loss_mlp": 0.01101814, + "balance_loss_clip": 1.00187707, + "balance_loss_mlp": 1.0005337, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 2.6173211721729293, + "language_loss": 0.67872697, + "learning_rate": 7.13822818063492e-09, + "loss": 0.70138776, + "num_input_tokens_seen": 349585665, + "step": 16198, + "time_per_iteration": 2.5906691551208496 + }, + { + "auxiliary_loss_clip": 0.01164102, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_clip": 1.00183821, + "balance_loss_mlp": 1.00047135, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 2.123105587744479, + "language_loss": 0.77721369, + "learning_rate": 7.10539048654768e-09, + "loss": 0.79986513, + "num_input_tokens_seen": 349605125, + "step": 16199, + "time_per_iteration": 2.58766770362854 + }, + { + "auxiliary_loss_clip": 0.01132256, + "auxiliary_loss_mlp": 0.01100679, + "balance_loss_clip": 1.00171566, + "balance_loss_mlp": 1.00054312, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 1.7768657565176549, + "language_loss": 0.79622698, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81855631, + "num_input_tokens_seen": 349623360, + "step": 16200, + "time_per_iteration": 2.676567316055298 + }, + { + "auxiliary_loss_clip": 0.01083443, + "auxiliary_loss_mlp": 0.01101733, + "balance_loss_clip": 1.00161123, + "balance_loss_mlp": 1.00064349, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 2.3312311570887796, + "language_loss": 0.68756127, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70941299, + "num_input_tokens_seen": 349644390, + "step": 16201, + "time_per_iteration": 2.82383131980896 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.01100211, + "balance_loss_clip": 1.0016675, + "balance_loss_mlp": 1.00050473, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.4915094989916908, + "language_loss": 0.72648907, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74865264, + "num_input_tokens_seen": 349663200, + "step": 16202, + "time_per_iteration": 2.6975350379943848 + }, + { + "auxiliary_loss_clip": 0.01132561, + "auxiliary_loss_mlp": 0.0110039, + "balance_loss_clip": 1.00194073, + "balance_loss_mlp": 1.00054026, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 1.8016259659150835, + "language_loss": 0.73317575, + "learning_rate": 6.974795430241265e-09, + "loss": 0.7555052, + "num_input_tokens_seen": 349681975, + "step": 16203, + "time_per_iteration": 2.7069787979125977 + }, + { + "auxiliary_loss_clip": 0.0116422, + "auxiliary_loss_mlp": 0.01100164, + "balance_loss_clip": 1.00187588, + "balance_loss_mlp": 1.00050545, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.8570382847830713, + "language_loss": 0.77401674, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79666054, + "num_input_tokens_seen": 349701185, + "step": 16204, + "time_per_iteration": 2.5754289627075195 + }, + { + "auxiliary_loss_clip": 0.01132694, + "auxiliary_loss_mlp": 0.01101244, + "balance_loss_clip": 1.00175333, + "balance_loss_mlp": 1.0005362, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 2.062362764630806, + "language_loss": 0.79672396, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81906337, + "num_input_tokens_seen": 349720360, + "step": 16205, + "time_per_iteration": 2.7009382247924805 + }, + { + "auxiliary_loss_clip": 0.01164103, + "auxiliary_loss_mlp": 0.01099373, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.00047696, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.6660401885031684, + "language_loss": 0.74678677, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76942152, + "num_input_tokens_seen": 349741040, + "step": 16206, + "time_per_iteration": 4.065107345581055 + }, + { + "auxiliary_loss_clip": 0.0108605, + "auxiliary_loss_mlp": 0.01100441, + "balance_loss_clip": 1.00154829, + "balance_loss_mlp": 1.00049591, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.0097672871843963, + "language_loss": 0.83974987, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86161482, + "num_input_tokens_seen": 349758895, + "step": 16207, + "time_per_iteration": 2.746635675430298 + }, + { + "auxiliary_loss_clip": 0.01147696, + "auxiliary_loss_mlp": 0.01099684, + "balance_loss_clip": 1.00171649, + "balance_loss_mlp": 1.00054979, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.7517468323269099, + "language_loss": 0.70563483, + "learning_rate": 6.813252072591425e-09, + "loss": 0.72810858, + "num_input_tokens_seen": 349779740, + "step": 16208, + "time_per_iteration": 2.6397299766540527 + }, + { + "auxiliary_loss_clip": 0.01115845, + "auxiliary_loss_mlp": 0.01099372, + "balance_loss_clip": 1.00179315, + "balance_loss_mlp": 1.00047565, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 1.8447204054692286, + "language_loss": 0.77562177, + "learning_rate": 6.781170141698878e-09, + "loss": 0.7977739, + "num_input_tokens_seen": 349796820, + "step": 16209, + "time_per_iteration": 2.632235050201416 + }, + { + "auxiliary_loss_clip": 0.01115427, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.00164866, + "balance_loss_mlp": 1.00050664, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.7781552035222838, + "language_loss": 0.78802598, + "learning_rate": 6.749163793864144e-09, + "loss": 0.8066538, + "num_input_tokens_seen": 349816550, + "step": 16210, + "time_per_iteration": 4.0350682735443115 + }, + { + "auxiliary_loss_clip": 0.01134489, + "auxiliary_loss_mlp": 0.01100212, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00055277, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 2.2600012959352367, + "language_loss": 0.78007758, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80242461, + "num_input_tokens_seen": 349834350, + "step": 16211, + "time_per_iteration": 2.676203966140747 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.01101225, + "balance_loss_clip": 1.00176072, + "balance_loss_mlp": 1.00051713, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 2.1679184825605167, + "language_loss": 0.78034782, + "learning_rate": 6.685377852219787e-09, + "loss": 0.80252707, + "num_input_tokens_seen": 349853460, + "step": 16212, + "time_per_iteration": 2.676887273788452 + }, + { + "auxiliary_loss_clip": 0.01133224, + "auxiliary_loss_mlp": 0.01099404, + "balance_loss_clip": 1.00187755, + "balance_loss_mlp": 1.0005554, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.5164659738090964, + "language_loss": 0.79940766, + "learning_rate": 6.653598260829118e-09, + "loss": 0.82173395, + "num_input_tokens_seen": 349874830, + "step": 16213, + "time_per_iteration": 2.714968681335449 + }, + { + "auxiliary_loss_clip": 0.01098829, + "auxiliary_loss_mlp": 0.01100223, + "balance_loss_clip": 1.00157762, + "balance_loss_mlp": 1.00042093, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 8.3586461299198, + "language_loss": 0.66204214, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68403268, + "num_input_tokens_seen": 349893690, + "step": 16214, + "time_per_iteration": 2.7224717140197754 + }, + { + "auxiliary_loss_clip": 0.01132441, + "auxiliary_loss_mlp": 0.01099262, + "balance_loss_clip": 1.00179386, + "balance_loss_mlp": 1.00050914, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.6460778501422653, + "language_loss": 0.74167967, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76399678, + "num_input_tokens_seen": 349912480, + "step": 16215, + "time_per_iteration": 2.6518118381500244 + }, + { + "auxiliary_loss_clip": 0.01099486, + "auxiliary_loss_mlp": 0.01100086, + "balance_loss_clip": 1.00155902, + "balance_loss_mlp": 1.0005703, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.7171872727370576, + "language_loss": 0.67137104, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69336677, + "num_input_tokens_seen": 349932470, + "step": 16216, + "time_per_iteration": 2.8698816299438477 + }, + { + "auxiliary_loss_clip": 0.0108849, + "auxiliary_loss_mlp": 0.01101581, + "balance_loss_clip": 1.0016464, + "balance_loss_mlp": 1.00049114, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 1.9618538559199652, + "language_loss": 0.71777093, + "learning_rate": 6.527235786226937e-09, + "loss": 0.73967171, + "num_input_tokens_seen": 349949060, + "step": 16217, + "time_per_iteration": 2.7828638553619385 + }, + { + "auxiliary_loss_clip": 0.01115099, + "auxiliary_loss_mlp": 0.01099937, + "balance_loss_clip": 1.00169563, + "balance_loss_mlp": 1.00046861, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.6574644249499855, + "language_loss": 0.78543496, + "learning_rate": 6.495834146306167e-09, + "loss": 0.8075853, + "num_input_tokens_seen": 349968010, + "step": 16218, + "time_per_iteration": 2.711554765701294 + }, + { + "auxiliary_loss_clip": 0.01130594, + "auxiliary_loss_mlp": 0.01099989, + "balance_loss_clip": 1.00174534, + "balance_loss_mlp": 1.00042534, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 6.516155695915701, + "language_loss": 0.77589834, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79820418, + "num_input_tokens_seen": 349985270, + "step": 16219, + "time_per_iteration": 2.588658094406128 + }, + { + "auxiliary_loss_clip": 0.01132168, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_clip": 1.00183451, + "balance_loss_mlp": 1.00054777, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.8786168820799858, + "language_loss": 0.81395966, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83628529, + "num_input_tokens_seen": 350003935, + "step": 16220, + "time_per_iteration": 2.654545307159424 + }, + { + "auxiliary_loss_clip": 0.01163999, + "auxiliary_loss_mlp": 0.01099831, + "balance_loss_clip": 1.00179577, + "balance_loss_mlp": 1.00050592, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 1.824639736458807, + "language_loss": 0.75493836, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77757663, + "num_input_tokens_seen": 350023595, + "step": 16221, + "time_per_iteration": 2.647686004638672 + }, + { + "auxiliary_loss_clip": 0.01116166, + "auxiliary_loss_mlp": 0.01099186, + "balance_loss_clip": 1.00164843, + "balance_loss_mlp": 1.00048089, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.5464625881615064, + "language_loss": 0.66502678, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68718028, + "num_input_tokens_seen": 350045920, + "step": 16222, + "time_per_iteration": 2.7121787071228027 + }, + { + "auxiliary_loss_clip": 0.01147295, + "auxiliary_loss_mlp": 0.01099997, + "balance_loss_clip": 1.00177848, + "balance_loss_mlp": 1.00052917, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.921938455548984, + "language_loss": 0.88322711, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90570009, + "num_input_tokens_seen": 350063925, + "step": 16223, + "time_per_iteration": 2.5812416076660156 + }, + { + "auxiliary_loss_clip": 0.01082423, + "auxiliary_loss_mlp": 0.01100417, + "balance_loss_clip": 1.00152659, + "balance_loss_mlp": 1.00056767, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.8772234853106082, + "language_loss": 0.74962938, + "learning_rate": 6.309011819690457e-09, + "loss": 0.77145779, + "num_input_tokens_seen": 350080900, + "step": 16224, + "time_per_iteration": 2.685194253921509 + }, + { + "auxiliary_loss_clip": 0.01126915, + "auxiliary_loss_mlp": 0.01074185, + "balance_loss_clip": 1.00074625, + "balance_loss_mlp": 1.00017953, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8069807247534049, + "language_loss": 0.59110516, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61311615, + "num_input_tokens_seen": 350144550, + "step": 16225, + "time_per_iteration": 3.126721143722534 + }, + { + "auxiliary_loss_clip": 0.0111396, + "auxiliary_loss_mlp": 0.0074723, + "balance_loss_clip": 1.00171733, + "balance_loss_mlp": 1.00057387, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 2.2942155878581696, + "language_loss": 0.68975431, + "learning_rate": 6.247342505960818e-09, + "loss": 0.70836622, + "num_input_tokens_seen": 350164050, + "step": 16226, + "time_per_iteration": 5.713395118713379 + }, + { + "auxiliary_loss_clip": 0.0114946, + "auxiliary_loss_mlp": 0.01101122, + "balance_loss_clip": 1.0019151, + "balance_loss_mlp": 1.00079572, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.7303971431442333, + "language_loss": 0.82584608, + "learning_rate": 6.216621253462894e-09, + "loss": 0.84835196, + "num_input_tokens_seen": 350181350, + "step": 16227, + "time_per_iteration": 2.6430304050445557 + }, + { + "auxiliary_loss_clip": 0.01164069, + "auxiliary_loss_mlp": 0.01098923, + "balance_loss_clip": 1.00186145, + "balance_loss_mlp": 1.00050342, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.7173190300500087, + "language_loss": 0.77444971, + "learning_rate": 6.185975605430549e-09, + "loss": 0.79707956, + "num_input_tokens_seen": 350199765, + "step": 16228, + "time_per_iteration": 2.5873043537139893 + }, + { + "auxiliary_loss_clip": 0.01143206, + "auxiliary_loss_mlp": 0.0107379, + "balance_loss_clip": 1.00076914, + "balance_loss_mlp": 1.00016677, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8485494795471282, + "language_loss": 0.55794907, + "learning_rate": 6.155405563025962e-09, + "loss": 0.58011901, + "num_input_tokens_seen": 350256420, + "step": 16229, + "time_per_iteration": 3.0487234592437744 + }, + { + "auxiliary_loss_clip": 0.01149444, + "auxiliary_loss_mlp": 0.0110009, + "balance_loss_clip": 1.00192988, + "balance_loss_mlp": 1.0004313, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.6716817907791983, + "language_loss": 0.75104886, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77354419, + "num_input_tokens_seen": 350276270, + "step": 16230, + "time_per_iteration": 2.5845561027526855 + }, + { + "auxiliary_loss_clip": 0.01130543, + "auxiliary_loss_mlp": 0.01099368, + "balance_loss_clip": 1.00175142, + "balance_loss_mlp": 1.00032854, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 1.7669398997243693, + "language_loss": 0.71826863, + "learning_rate": 6.094492299733245e-09, + "loss": 0.74056768, + "num_input_tokens_seen": 350295000, + "step": 16231, + "time_per_iteration": 2.571197271347046 + }, + { + "auxiliary_loss_clip": 0.01130232, + "auxiliary_loss_mlp": 0.01101201, + "balance_loss_clip": 1.00176251, + "balance_loss_mlp": 1.00049329, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 2.265000593784899, + "language_loss": 0.76617789, + "learning_rate": 6.064149081155267e-09, + "loss": 0.7884922, + "num_input_tokens_seen": 350314980, + "step": 16232, + "time_per_iteration": 2.618514060974121 + }, + { + "auxiliary_loss_clip": 0.01127639, + "auxiliary_loss_mlp": 0.01073966, + "balance_loss_clip": 1.00091088, + "balance_loss_mlp": 1.00034237, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.747242149626519, + "language_loss": 0.53862095, + "learning_rate": 6.033881472824465e-09, + "loss": 0.560637, + "num_input_tokens_seen": 350371985, + "step": 16233, + "time_per_iteration": 3.005497932434082 + }, + { + "auxiliary_loss_clip": 0.01164022, + "auxiliary_loss_mlp": 0.01099865, + "balance_loss_clip": 1.00186419, + "balance_loss_mlp": 1.00049174, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1.8525329901511338, + "language_loss": 0.71487904, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73751795, + "num_input_tokens_seen": 350390590, + "step": 16234, + "time_per_iteration": 2.6068243980407715 + }, + { + "auxiliary_loss_clip": 0.01147115, + "auxiliary_loss_mlp": 0.01101463, + "balance_loss_clip": 1.00175929, + "balance_loss_mlp": 1.00056481, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.8203630523209524, + "language_loss": 0.79287463, + "learning_rate": 5.973573091493156e-09, + "loss": 0.81536043, + "num_input_tokens_seen": 350403770, + "step": 16235, + "time_per_iteration": 2.5219528675079346 + }, + { + "auxiliary_loss_clip": 0.01148011, + "auxiliary_loss_mlp": 0.01101087, + "balance_loss_clip": 1.00195682, + "balance_loss_mlp": 1.00047493, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.9609340980912386, + "language_loss": 0.77143216, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79392326, + "num_input_tokens_seen": 350421870, + "step": 16236, + "time_per_iteration": 2.567002296447754 + }, + { + "auxiliary_loss_clip": 0.01149082, + "auxiliary_loss_mlp": 0.01100248, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00039876, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 2.3094857956357413, + "language_loss": 0.75877357, + "learning_rate": 5.913567164886446e-09, + "loss": 0.78126687, + "num_input_tokens_seen": 350440025, + "step": 16237, + "time_per_iteration": 2.55124568939209 + }, + { + "auxiliary_loss_clip": 0.01101445, + "auxiliary_loss_mlp": 0.01100569, + "balance_loss_clip": 1.00167358, + "balance_loss_mlp": 1.00052822, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.8665799657683069, + "language_loss": 0.7272532, + "learning_rate": 5.8836776249509e-09, + "loss": 0.7492733, + "num_input_tokens_seen": 350459435, + "step": 16238, + "time_per_iteration": 2.722271680831909 + }, + { + "auxiliary_loss_clip": 0.01132156, + "auxiliary_loss_mlp": 0.00747391, + "balance_loss_clip": 1.00170875, + "balance_loss_mlp": 1.00045061, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 3.007926786588183, + "language_loss": 0.83838677, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85718226, + "num_input_tokens_seen": 350472655, + "step": 16239, + "time_per_iteration": 2.602323532104492 + }, + { + "auxiliary_loss_clip": 0.01120153, + "auxiliary_loss_mlp": 0.0110081, + "balance_loss_clip": 1.00188589, + "balance_loss_mlp": 1.00057864, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 2.698343119078909, + "language_loss": 0.60939914, + "learning_rate": 5.824125397483115e-09, + "loss": 0.63160878, + "num_input_tokens_seen": 350488160, + "step": 16240, + "time_per_iteration": 2.622448444366455 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01100107, + "balance_loss_clip": 1.00172639, + "balance_loss_mlp": 1.00054383, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 2.8073367106855542, + "language_loss": 0.8222031, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84437102, + "num_input_tokens_seen": 350506065, + "step": 16241, + "time_per_iteration": 2.627772569656372 + }, + { + "auxiliary_loss_clip": 0.0111621, + "auxiliary_loss_mlp": 0.01099618, + "balance_loss_clip": 1.00174749, + "balance_loss_mlp": 1.00072205, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 3.3087800250102752, + "language_loss": 0.83239377, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85455203, + "num_input_tokens_seen": 350524495, + "step": 16242, + "time_per_iteration": 2.747892379760742 + }, + { + "auxiliary_loss_clip": 0.01147418, + "auxiliary_loss_mlp": 0.01100454, + "balance_loss_clip": 1.00186431, + "balance_loss_mlp": 1.00046134, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.5666258719265516, + "language_loss": 0.75483495, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77731371, + "num_input_tokens_seen": 350544185, + "step": 16243, + "time_per_iteration": 3.9536397457122803 + }, + { + "auxiliary_loss_clip": 0.01149471, + "auxiliary_loss_mlp": 0.01101306, + "balance_loss_clip": 1.00184822, + "balance_loss_mlp": 1.00059843, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.5951857365400484, + "language_loss": 0.70000124, + "learning_rate": 5.705928383713754e-09, + "loss": 0.72250903, + "num_input_tokens_seen": 350562675, + "step": 16244, + "time_per_iteration": 2.55263352394104 + }, + { + "auxiliary_loss_clip": 0.0113238, + "auxiliary_loss_mlp": 0.01100961, + "balance_loss_clip": 1.00186431, + "balance_loss_mlp": 1.00049186, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.9552149971676525, + "language_loss": 0.83766735, + "learning_rate": 5.676568187055197e-09, + "loss": 0.86000073, + "num_input_tokens_seen": 350581535, + "step": 16245, + "time_per_iteration": 2.694383144378662 + }, + { + "auxiliary_loss_clip": 0.01099733, + "auxiliary_loss_mlp": 0.01098987, + "balance_loss_clip": 1.00167978, + "balance_loss_mlp": 1.00052047, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.378482971618184, + "language_loss": 0.78714573, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80913287, + "num_input_tokens_seen": 350601615, + "step": 16246, + "time_per_iteration": 2.6673219203948975 + }, + { + "auxiliary_loss_clip": 0.0116379, + "auxiliary_loss_mlp": 0.01098493, + "balance_loss_clip": 1.00184798, + "balance_loss_mlp": 1.0005033, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.4836476777518905, + "language_loss": 0.73880082, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76142365, + "num_input_tokens_seen": 350619580, + "step": 16247, + "time_per_iteration": 4.000448942184448 + }, + { + "auxiliary_loss_clip": 0.01088253, + "auxiliary_loss_mlp": 0.01100579, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.0004437, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.9955446165161648, + "language_loss": 0.79947543, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82136369, + "num_input_tokens_seen": 350640015, + "step": 16248, + "time_per_iteration": 2.7719664573669434 + }, + { + "auxiliary_loss_clip": 0.01082998, + "auxiliary_loss_mlp": 0.01101601, + "balance_loss_clip": 1.00173831, + "balance_loss_mlp": 1.00070262, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 2.5696705802529243, + "language_loss": 0.78828096, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81012702, + "num_input_tokens_seen": 350659155, + "step": 16249, + "time_per_iteration": 2.7617249488830566 + }, + { + "auxiliary_loss_clip": 0.01146779, + "auxiliary_loss_mlp": 0.01099597, + "balance_loss_clip": 1.00182974, + "balance_loss_mlp": 1.00050998, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 2.74418791182791, + "language_loss": 0.6687367, + "learning_rate": 5.530901600093507e-09, + "loss": 0.69120044, + "num_input_tokens_seen": 350676615, + "step": 16250, + "time_per_iteration": 2.5638482570648193 + }, + { + "auxiliary_loss_clip": 0.01157976, + "auxiliary_loss_mlp": 0.01073979, + "balance_loss_clip": 1.00073934, + "balance_loss_mlp": 0.99997371, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.7734056982636225, + "language_loss": 0.59877586, + "learning_rate": 5.501995169700846e-09, + "loss": 0.62109542, + "num_input_tokens_seen": 350736805, + "step": 16251, + "time_per_iteration": 3.12978196144104 + }, + { + "auxiliary_loss_clip": 0.01149423, + "auxiliary_loss_mlp": 0.01100211, + "balance_loss_clip": 1.00186455, + "balance_loss_mlp": 1.00045633, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.934029845016877, + "language_loss": 0.78726804, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80976439, + "num_input_tokens_seen": 350753600, + "step": 16252, + "time_per_iteration": 2.5891973972320557 + }, + { + "auxiliary_loss_clip": 0.01149674, + "auxiliary_loss_mlp": 0.01100842, + "balance_loss_clip": 1.00190425, + "balance_loss_mlp": 1.0004679, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.336535584617772, + "language_loss": 0.64323103, + "learning_rate": 5.444409204701461e-09, + "loss": 0.6657362, + "num_input_tokens_seen": 350771225, + "step": 16253, + "time_per_iteration": 2.5659217834472656 + }, + { + "auxiliary_loss_clip": 0.01149572, + "auxiliary_loss_mlp": 0.01101634, + "balance_loss_clip": 1.00194192, + "balance_loss_mlp": 1.00054502, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 2.044231137541712, + "language_loss": 0.76920462, + "learning_rate": 5.415729672278324e-09, + "loss": 0.79171669, + "num_input_tokens_seen": 350789100, + "step": 16254, + "time_per_iteration": 2.590902328491211 + }, + { + "auxiliary_loss_clip": 0.0114901, + "auxiliary_loss_mlp": 0.01101435, + "balance_loss_clip": 1.0018605, + "balance_loss_mlp": 1.00063181, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.6382244968126218, + "language_loss": 0.63739693, + "learning_rate": 5.387125774690471e-09, + "loss": 0.65990138, + "num_input_tokens_seen": 350811085, + "step": 16255, + "time_per_iteration": 2.712100028991699 + }, + { + "auxiliary_loss_clip": 0.01115773, + "auxiliary_loss_mlp": 0.00747444, + "balance_loss_clip": 1.00166059, + "balance_loss_mlp": 1.00048375, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 3.679095215416321, + "language_loss": 0.75763762, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77626979, + "num_input_tokens_seen": 350831065, + "step": 16256, + "time_per_iteration": 2.669086456298828 + }, + { + "auxiliary_loss_clip": 0.01164012, + "auxiliary_loss_mlp": 0.01100333, + "balance_loss_clip": 1.00195932, + "balance_loss_mlp": 1.00053084, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 2.04115074058276, + "language_loss": 0.7815963, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80423975, + "num_input_tokens_seen": 350849675, + "step": 16257, + "time_per_iteration": 2.548110008239746 + }, + { + "auxiliary_loss_clip": 0.01147762, + "auxiliary_loss_mlp": 0.0110064, + "balance_loss_clip": 1.00185704, + "balance_loss_mlp": 1.00059974, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.6244055787387341, + "language_loss": 0.75271803, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77520204, + "num_input_tokens_seen": 350868955, + "step": 16258, + "time_per_iteration": 2.62389874458313 + }, + { + "auxiliary_loss_clip": 0.01142945, + "auxiliary_loss_mlp": 0.01074109, + "balance_loss_clip": 1.00080705, + "balance_loss_mlp": 1.00010419, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.7233182511745416, + "language_loss": 0.59806299, + "learning_rate": 5.273466554344353e-09, + "loss": 0.62023354, + "num_input_tokens_seen": 350935110, + "step": 16259, + "time_per_iteration": 3.2519690990448 + }, + { + "auxiliary_loss_clip": 0.01134231, + "auxiliary_loss_mlp": 0.01101478, + "balance_loss_clip": 1.00191021, + "balance_loss_mlp": 1.00048351, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.9310070147782232, + "language_loss": 0.73733598, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75969303, + "num_input_tokens_seen": 350953220, + "step": 16260, + "time_per_iteration": 2.633749485015869 + }, + { + "auxiliary_loss_clip": 0.01147886, + "auxiliary_loss_mlp": 0.01100262, + "balance_loss_clip": 1.00185132, + "balance_loss_mlp": 1.00050783, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 1.8578335791846758, + "language_loss": 0.79831314, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.82079458, + "num_input_tokens_seen": 350971915, + "step": 16261, + "time_per_iteration": 2.560088634490967 + }, + { + "auxiliary_loss_clip": 0.01149739, + "auxiliary_loss_mlp": 0.01100444, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.00040317, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.1451775195446414, + "language_loss": 0.74340975, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76591158, + "num_input_tokens_seen": 350990470, + "step": 16262, + "time_per_iteration": 2.580226182937622 + }, + { + "auxiliary_loss_clip": 0.0114754, + "auxiliary_loss_mlp": 0.01100802, + "balance_loss_clip": 1.00183201, + "balance_loss_mlp": 1.00057054, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 2.3095831322383757, + "language_loss": 0.69962823, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72211158, + "num_input_tokens_seen": 351010755, + "step": 16263, + "time_per_iteration": 3.994826555252075 + }, + { + "auxiliary_loss_clip": 0.01147493, + "auxiliary_loss_mlp": 0.01101271, + "balance_loss_clip": 1.00192237, + "balance_loss_mlp": 1.00046778, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 1.8981906520320777, + "language_loss": 0.66777068, + "learning_rate": 5.133094442018038e-09, + "loss": 0.69025832, + "num_input_tokens_seen": 351029965, + "step": 16264, + "time_per_iteration": 3.9942221641540527 + }, + { + "auxiliary_loss_clip": 0.01097819, + "auxiliary_loss_mlp": 0.01101293, + "balance_loss_clip": 1.00163698, + "balance_loss_mlp": 1.00048995, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 1.9819826734421322, + "language_loss": 0.72698522, + "learning_rate": 5.105246951967679e-09, + "loss": 0.74897635, + "num_input_tokens_seen": 351046205, + "step": 16265, + "time_per_iteration": 2.6736721992492676 + }, + { + "auxiliary_loss_clip": 0.01149328, + "auxiliary_loss_mlp": 0.01100319, + "balance_loss_clip": 1.00183976, + "balance_loss_mlp": 1.00046909, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 2.0194955650369435, + "language_loss": 0.68995327, + "learning_rate": 5.077475108526297e-09, + "loss": 0.71244973, + "num_input_tokens_seen": 351065390, + "step": 16266, + "time_per_iteration": 2.634054183959961 + }, + { + "auxiliary_loss_clip": 0.01100253, + "auxiliary_loss_mlp": 0.01099263, + "balance_loss_clip": 1.00162268, + "balance_loss_mlp": 1.00060534, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 2.083897790657292, + "language_loss": 0.86551219, + "learning_rate": 5.049778912747049e-09, + "loss": 0.88750738, + "num_input_tokens_seen": 351084355, + "step": 16267, + "time_per_iteration": 2.713047742843628 + }, + { + "auxiliary_loss_clip": 0.01067589, + "auxiliary_loss_mlp": 0.01101466, + "balance_loss_clip": 1.00152731, + "balance_loss_mlp": 1.00047219, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 1.7996930930348198, + "language_loss": 0.69796002, + "learning_rate": 5.022158365679985e-09, + "loss": 0.71965063, + "num_input_tokens_seen": 351105870, + "step": 16268, + "time_per_iteration": 2.8658664226531982 + }, + { + "auxiliary_loss_clip": 0.01134352, + "auxiliary_loss_mlp": 0.01100596, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00041246, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.5411617796807622, + "language_loss": 0.73957765, + "learning_rate": 4.994613468372711e-09, + "loss": 0.76192713, + "num_input_tokens_seen": 351124760, + "step": 16269, + "time_per_iteration": 2.6490094661712646 + }, + { + "auxiliary_loss_clip": 0.01134191, + "auxiliary_loss_mlp": 0.01100303, + "balance_loss_clip": 1.00184989, + "balance_loss_mlp": 1.00040603, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 1.636441457678307, + "language_loss": 0.70529515, + "learning_rate": 4.967144221869501e-09, + "loss": 0.72764003, + "num_input_tokens_seen": 351142820, + "step": 16270, + "time_per_iteration": 2.6307106018066406 + }, + { + "auxiliary_loss_clip": 0.01164269, + "auxiliary_loss_mlp": 0.01100927, + "balance_loss_clip": 1.00206542, + "balance_loss_mlp": 1.00050509, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 1.6282415658687075, + "language_loss": 0.63903749, + "learning_rate": 4.939750627212191e-09, + "loss": 0.66168946, + "num_input_tokens_seen": 351164805, + "step": 16271, + "time_per_iteration": 2.6521265506744385 + }, + { + "auxiliary_loss_clip": 0.01130588, + "auxiliary_loss_mlp": 0.01100329, + "balance_loss_clip": 1.00192094, + "balance_loss_mlp": 1.00052714, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.5371166877634543, + "language_loss": 0.70746249, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72977167, + "num_input_tokens_seen": 351187005, + "step": 16272, + "time_per_iteration": 2.6377153396606445 + }, + { + "auxiliary_loss_clip": 0.01083449, + "auxiliary_loss_mlp": 0.01100358, + "balance_loss_clip": 1.00153267, + "balance_loss_mlp": 1.00055552, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 2.223305246510746, + "language_loss": 0.66663861, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68847662, + "num_input_tokens_seen": 351208450, + "step": 16273, + "time_per_iteration": 2.7606041431427 + }, + { + "auxiliary_loss_clip": 0.01116165, + "auxiliary_loss_mlp": 0.01100437, + "balance_loss_clip": 1.00172353, + "balance_loss_mlp": 1.00039649, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.529830393153967, + "language_loss": 0.73905122, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76121724, + "num_input_tokens_seen": 351229585, + "step": 16274, + "time_per_iteration": 2.6770706176757812 + }, + { + "auxiliary_loss_clip": 0.01132897, + "auxiliary_loss_mlp": 0.01099346, + "balance_loss_clip": 1.00196087, + "balance_loss_mlp": 1.00054538, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.5423279558976772, + "language_loss": 0.77740812, + "learning_rate": 4.830932787773579e-09, + "loss": 0.79973066, + "num_input_tokens_seen": 351249525, + "step": 16275, + "time_per_iteration": 2.650838851928711 + }, + { + "auxiliary_loss_clip": 0.01070418, + "auxiliary_loss_mlp": 0.01100729, + "balance_loss_clip": 1.00174081, + "balance_loss_mlp": 1.00049782, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 1.6736415324078007, + "language_loss": 0.70789963, + "learning_rate": 4.803917467869567e-09, + "loss": 0.7296111, + "num_input_tokens_seen": 351272530, + "step": 16276, + "time_per_iteration": 2.8623266220092773 + }, + { + "auxiliary_loss_clip": 0.01132672, + "auxiliary_loss_mlp": 0.01098317, + "balance_loss_clip": 1.00171661, + "balance_loss_mlp": 1.00056529, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 1.8194210361790193, + "language_loss": 0.86017621, + "learning_rate": 4.776977806000726e-09, + "loss": 0.8824861, + "num_input_tokens_seen": 351288530, + "step": 16277, + "time_per_iteration": 2.7207205295562744 + }, + { + "auxiliary_loss_clip": 0.01149539, + "auxiliary_loss_mlp": 0.01100003, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00043976, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 1.8556370922083112, + "language_loss": 0.71112454, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73361993, + "num_input_tokens_seen": 351305890, + "step": 16278, + "time_per_iteration": 2.5473146438598633 + }, + { + "auxiliary_loss_clip": 0.01148049, + "auxiliary_loss_mlp": 0.0109992, + "balance_loss_clip": 1.00200737, + "balance_loss_mlp": 1.00040424, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 1.8481375870647325, + "language_loss": 0.84187376, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86435342, + "num_input_tokens_seen": 351325010, + "step": 16279, + "time_per_iteration": 2.5979835987091064 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01100622, + "balance_loss_clip": 1.00184667, + "balance_loss_mlp": 1.00048614, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 1.9960807551660296, + "language_loss": 0.79006231, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81256264, + "num_input_tokens_seen": 351343060, + "step": 16280, + "time_per_iteration": 2.5594520568847656 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_clip": 1.00180483, + "balance_loss_mlp": 1.00064349, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.9123099831996626, + "language_loss": 0.79272079, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81486881, + "num_input_tokens_seen": 351363260, + "step": 16281, + "time_per_iteration": 4.348801374435425 + }, + { + "auxiliary_loss_clip": 0.01147491, + "auxiliary_loss_mlp": 0.01100944, + "balance_loss_clip": 1.00176597, + "balance_loss_mlp": 1.00061703, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.7231533269752521, + "language_loss": 0.80234623, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82483065, + "num_input_tokens_seen": 351382610, + "step": 16282, + "time_per_iteration": 2.650912284851074 + }, + { + "auxiliary_loss_clip": 0.01130832, + "auxiliary_loss_mlp": 0.01099859, + "balance_loss_clip": 1.00168538, + "balance_loss_mlp": 1.0005815, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 2.145885951540198, + "language_loss": 0.83345008, + "learning_rate": 4.616928710538204e-09, + "loss": 0.855757, + "num_input_tokens_seen": 351401075, + "step": 16283, + "time_per_iteration": 2.598062515258789 + }, + { + "auxiliary_loss_clip": 0.01147777, + "auxiliary_loss_mlp": 0.01100324, + "balance_loss_clip": 1.00177383, + "balance_loss_mlp": 1.00047433, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 1.8828172312370781, + "language_loss": 0.71645224, + "learning_rate": 4.590518683360134e-09, + "loss": 0.73893327, + "num_input_tokens_seen": 351419275, + "step": 16284, + "time_per_iteration": 2.5488734245300293 + }, + { + "auxiliary_loss_clip": 0.01147473, + "auxiliary_loss_mlp": 0.01099219, + "balance_loss_clip": 1.00191534, + "balance_loss_mlp": 1.00051355, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 2.8709853475259863, + "language_loss": 0.6468209, + "learning_rate": 4.56418432230965e-09, + "loss": 0.6692878, + "num_input_tokens_seen": 351437375, + "step": 16285, + "time_per_iteration": 4.011488676071167 + }, + { + "auxiliary_loss_clip": 0.01130722, + "auxiliary_loss_mlp": 0.01099775, + "balance_loss_clip": 1.00178814, + "balance_loss_mlp": 1.00040245, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.6793856481232847, + "language_loss": 0.70840275, + "learning_rate": 4.537925628385286e-09, + "loss": 0.73070776, + "num_input_tokens_seen": 351457810, + "step": 16286, + "time_per_iteration": 2.636897563934326 + }, + { + "auxiliary_loss_clip": 0.01147566, + "auxiliary_loss_mlp": 0.01098871, + "balance_loss_clip": 1.00171781, + "balance_loss_mlp": 1.00049949, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 1.5958634603094812, + "language_loss": 0.5838992, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60636359, + "num_input_tokens_seen": 351478825, + "step": 16287, + "time_per_iteration": 2.6296675205230713 + }, + { + "auxiliary_loss_clip": 0.01147307, + "auxiliary_loss_mlp": 0.01100106, + "balance_loss_clip": 1.00180793, + "balance_loss_mlp": 1.0005424, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 1.7375918091757996, + "language_loss": 0.81831676, + "learning_rate": 4.485635245894626e-09, + "loss": 0.84079087, + "num_input_tokens_seen": 351498785, + "step": 16288, + "time_per_iteration": 2.6238343715667725 + }, + { + "auxiliary_loss_clip": 0.01132745, + "auxiliary_loss_mlp": 0.00747419, + "balance_loss_clip": 1.00197256, + "balance_loss_mlp": 1.00048852, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.4638298329389323, + "language_loss": 0.71700799, + "learning_rate": 4.459603559311631e-09, + "loss": 0.73580962, + "num_input_tokens_seen": 351520235, + "step": 16289, + "time_per_iteration": 2.689415454864502 + }, + { + "auxiliary_loss_clip": 0.01115315, + "auxiliary_loss_mlp": 0.01100334, + "balance_loss_clip": 1.00162828, + "balance_loss_mlp": 1.00048399, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.3229042313059316, + "language_loss": 0.75381291, + "learning_rate": 4.43364754382003e-09, + "loss": 0.77596939, + "num_input_tokens_seen": 351538900, + "step": 16290, + "time_per_iteration": 2.6349105834960938 + }, + { + "auxiliary_loss_clip": 0.0114761, + "auxiliary_loss_mlp": 0.0110055, + "balance_loss_clip": 1.00189281, + "balance_loss_mlp": 1.00046158, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.640967236252186, + "language_loss": 0.67236376, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69484532, + "num_input_tokens_seen": 351558715, + "step": 16291, + "time_per_iteration": 2.5918326377868652 + }, + { + "auxiliary_loss_clip": 0.01164236, + "auxiliary_loss_mlp": 0.00747388, + "balance_loss_clip": 1.00192547, + "balance_loss_mlp": 1.00049376, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 2.056782395278363, + "language_loss": 0.62839603, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64751232, + "num_input_tokens_seen": 351578450, + "step": 16292, + "time_per_iteration": 2.61405611038208 + }, + { + "auxiliary_loss_clip": 0.01113215, + "auxiliary_loss_mlp": 0.01100534, + "balance_loss_clip": 1.00168371, + "balance_loss_mlp": 1.000494, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 2.2282950484935116, + "language_loss": 0.73318422, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75532174, + "num_input_tokens_seen": 351597195, + "step": 16293, + "time_per_iteration": 2.6907403469085693 + }, + { + "auxiliary_loss_clip": 0.01149094, + "auxiliary_loss_mlp": 0.01101305, + "balance_loss_clip": 1.00184739, + "balance_loss_mlp": 1.00045395, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 1.8581200453365287, + "language_loss": 0.83921719, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86172116, + "num_input_tokens_seen": 351617460, + "step": 16294, + "time_per_iteration": 2.6196811199188232 + }, + { + "auxiliary_loss_clip": 0.01116115, + "auxiliary_loss_mlp": 0.01099402, + "balance_loss_clip": 1.00176263, + "balance_loss_mlp": 1.00055385, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 2.0462708998628116, + "language_loss": 0.72103, + "learning_rate": 4.305002567088767e-09, + "loss": 0.74318516, + "num_input_tokens_seen": 351635900, + "step": 16295, + "time_per_iteration": 2.66909122467041 + }, + { + "auxiliary_loss_clip": 0.01149729, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_clip": 1.00190854, + "balance_loss_mlp": 1.00058031, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 2.2213562167266785, + "language_loss": 0.81106353, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.83356744, + "num_input_tokens_seen": 351655400, + "step": 16296, + "time_per_iteration": 2.5641064643859863 + }, + { + "auxiliary_loss_clip": 0.01132636, + "auxiliary_loss_mlp": 0.01100436, + "balance_loss_clip": 1.00189376, + "balance_loss_mlp": 1.00053859, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 1.8853774791610645, + "language_loss": 0.75553304, + "learning_rate": 4.254074308266853e-09, + "loss": 0.77786374, + "num_input_tokens_seen": 351675505, + "step": 16297, + "time_per_iteration": 2.693117141723633 + }, + { + "auxiliary_loss_clip": 0.01149592, + "auxiliary_loss_mlp": 0.01100619, + "balance_loss_clip": 1.00181365, + "balance_loss_mlp": 1.00057888, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 1.608114533570805, + "language_loss": 0.7843768, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80687892, + "num_input_tokens_seen": 351697920, + "step": 16298, + "time_per_iteration": 2.6190502643585205 + }, + { + "auxiliary_loss_clip": 0.01147533, + "auxiliary_loss_mlp": 0.01099824, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00049877, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 1.464072160140761, + "language_loss": 0.72955978, + "learning_rate": 4.203448764984019e-09, + "loss": 0.75203335, + "num_input_tokens_seen": 351717615, + "step": 16299, + "time_per_iteration": 2.57474946975708 + }, + { + "auxiliary_loss_clip": 0.01132905, + "auxiliary_loss_mlp": 0.01100335, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00053334, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 2.0855262796632217, + "language_loss": 0.89298344, + "learning_rate": 4.178249514071419e-09, + "loss": 0.91531587, + "num_input_tokens_seen": 351735260, + "step": 16300, + "time_per_iteration": 4.098207235336304 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01100874, + "balance_loss_clip": 1.00179029, + "balance_loss_mlp": 1.00045168, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.3221466965913096, + "language_loss": 0.78011191, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80259579, + "num_input_tokens_seen": 351755800, + "step": 16301, + "time_per_iteration": 2.606881618499756 + }, + { + "auxiliary_loss_clip": 0.01130888, + "auxiliary_loss_mlp": 0.01100593, + "balance_loss_clip": 1.00180817, + "balance_loss_mlp": 1.00055265, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 4.083842433323422, + "language_loss": 0.75149012, + "learning_rate": 4.128078058480921e-09, + "loss": 0.7738049, + "num_input_tokens_seen": 351774790, + "step": 16302, + "time_per_iteration": 4.007430791854858 + }, + { + "auxiliary_loss_clip": 0.01131352, + "auxiliary_loss_mlp": 0.01100657, + "balance_loss_clip": 1.00186539, + "balance_loss_mlp": 1.00042582, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 3.1175513782356394, + "language_loss": 0.79581702, + "learning_rate": 4.103105855705724e-09, + "loss": 0.81813705, + "num_input_tokens_seen": 351792855, + "step": 16303, + "time_per_iteration": 2.6758272647857666 + }, + { + "auxiliary_loss_clip": 0.01118386, + "auxiliary_loss_mlp": 0.01100847, + "balance_loss_clip": 1.00184894, + "balance_loss_mlp": 1.000664, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 2.675896859790697, + "language_loss": 0.83493054, + "learning_rate": 4.078209337540883e-09, + "loss": 0.8571229, + "num_input_tokens_seen": 351811450, + "step": 16304, + "time_per_iteration": 2.6431798934936523 + }, + { + "auxiliary_loss_clip": 0.01115824, + "auxiliary_loss_mlp": 0.01099174, + "balance_loss_clip": 1.0017271, + "balance_loss_mlp": 1.00046873, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.8742060640583516, + "language_loss": 0.7009021, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72305208, + "num_input_tokens_seen": 351831960, + "step": 16305, + "time_per_iteration": 2.671325206756592 + }, + { + "auxiliary_loss_clip": 0.01132968, + "auxiliary_loss_mlp": 0.01101384, + "balance_loss_clip": 1.00178754, + "balance_loss_mlp": 1.00048506, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 1.837820935098727, + "language_loss": 0.71998489, + "learning_rate": 4.028643358815032e-09, + "loss": 0.74232841, + "num_input_tokens_seen": 351851585, + "step": 16306, + "time_per_iteration": 2.5950024127960205 + }, + { + "auxiliary_loss_clip": 0.01134628, + "auxiliary_loss_mlp": 0.01098948, + "balance_loss_clip": 1.00186813, + "balance_loss_mlp": 1.00043309, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.5555695796687634, + "language_loss": 0.7364465, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75878227, + "num_input_tokens_seen": 351871085, + "step": 16307, + "time_per_iteration": 2.6321377754211426 + }, + { + "auxiliary_loss_clip": 0.01116514, + "auxiliary_loss_mlp": 0.01098159, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00045466, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.490097300049517, + "language_loss": 0.74885297, + "learning_rate": 3.979380129822018e-09, + "loss": 0.77099979, + "num_input_tokens_seen": 351891775, + "step": 16308, + "time_per_iteration": 2.659475564956665 + }, + { + "auxiliary_loss_clip": 0.01126602, + "auxiliary_loss_mlp": 0.01074062, + "balance_loss_clip": 1.00084293, + "balance_loss_mlp": 1.00005651, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.755854529896517, + "language_loss": 0.5779916, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59999824, + "num_input_tokens_seen": 351946770, + "step": 16309, + "time_per_iteration": 3.0591111183166504 + }, + { + "auxiliary_loss_clip": 0.0109942, + "auxiliary_loss_mlp": 0.01100489, + "balance_loss_clip": 1.00161898, + "balance_loss_mlp": 1.00049663, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.825570450987201, + "language_loss": 0.66140497, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68340403, + "num_input_tokens_seen": 351966155, + "step": 16310, + "time_per_iteration": 2.7430732250213623 + }, + { + "auxiliary_loss_clip": 0.01125203, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_clip": 1.00083923, + "balance_loss_mlp": 1.00060034, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 0.821089350448553, + "language_loss": 0.54493845, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56694031, + "num_input_tokens_seen": 352031655, + "step": 16311, + "time_per_iteration": 3.244206666946411 + }, + { + "auxiliary_loss_clip": 0.01149609, + "auxiliary_loss_mlp": 0.01100463, + "balance_loss_clip": 1.00189757, + "balance_loss_mlp": 1.00042295, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 1.5153207293300637, + "language_loss": 0.79705834, + "learning_rate": 3.881761950876638e-09, + "loss": 0.8195591, + "num_input_tokens_seen": 352051920, + "step": 16312, + "time_per_iteration": 2.637035846710205 + }, + { + "auxiliary_loss_clip": 0.01132642, + "auxiliary_loss_mlp": 0.0109962, + "balance_loss_clip": 1.00185823, + "balance_loss_mlp": 1.00053334, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 1.948940142803489, + "language_loss": 0.63387465, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65619725, + "num_input_tokens_seen": 352069315, + "step": 16313, + "time_per_iteration": 2.6564013957977295 + }, + { + "auxiliary_loss_clip": 0.01147519, + "auxiliary_loss_mlp": 0.01100542, + "balance_loss_clip": 1.00185657, + "balance_loss_mlp": 1.00045383, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 2.0409302838494776, + "language_loss": 0.72788012, + "learning_rate": 3.833407015731316e-09, + "loss": 0.75036073, + "num_input_tokens_seen": 352089480, + "step": 16314, + "time_per_iteration": 2.6101322174072266 + }, + { + "auxiliary_loss_clip": 0.01109949, + "auxiliary_loss_mlp": 0.01073761, + "balance_loss_clip": 1.00084496, + "balance_loss_mlp": 1.00013757, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.6913619137725875, + "language_loss": 0.51687551, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53871262, + "num_input_tokens_seen": 352150000, + "step": 16315, + "time_per_iteration": 3.215620279312134 + }, + { + "auxiliary_loss_clip": 0.01147617, + "auxiliary_loss_mlp": 0.01100188, + "balance_loss_clip": 1.00179052, + "balance_loss_mlp": 1.0007199, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.3562876483922552, + "language_loss": 0.69809377, + "learning_rate": 3.785354859932033e-09, + "loss": 0.72057182, + "num_input_tokens_seen": 352170990, + "step": 16316, + "time_per_iteration": 2.6248950958251953 + }, + { + "auxiliary_loss_clip": 0.01164157, + "auxiliary_loss_mlp": 0.01099747, + "balance_loss_clip": 1.00185633, + "balance_loss_mlp": 1.00037432, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 1.8481553411166278, + "language_loss": 0.55383897, + "learning_rate": 3.76144232656661e-09, + "loss": 0.576478, + "num_input_tokens_seen": 352195335, + "step": 16317, + "time_per_iteration": 2.6652472019195557 + }, + { + "auxiliary_loss_clip": 0.01105703, + "auxiliary_loss_mlp": 0.0109992, + "balance_loss_clip": 1.00191069, + "balance_loss_mlp": 1.00059533, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.708477370771327, + "language_loss": 0.72967929, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75173557, + "num_input_tokens_seen": 352214170, + "step": 16318, + "time_per_iteration": 2.7069091796875 + }, + { + "auxiliary_loss_clip": 0.0113078, + "auxiliary_loss_mlp": 0.01099782, + "balance_loss_clip": 1.0017283, + "balance_loss_mlp": 1.00050473, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.22910441163369, + "language_loss": 0.81977028, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84207588, + "num_input_tokens_seen": 352231470, + "step": 16319, + "time_per_iteration": 4.004100799560547 + }, + { + "auxiliary_loss_clip": 0.01143454, + "auxiliary_loss_mlp": 0.01074143, + "balance_loss_clip": 1.00072527, + "balance_loss_mlp": 1.00013769, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7114831747559729, + "language_loss": 0.53542626, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55760229, + "num_input_tokens_seen": 352291770, + "step": 16320, + "time_per_iteration": 3.0088117122650146 + }, + { + "auxiliary_loss_clip": 0.01101845, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_clip": 1.00176644, + "balance_loss_mlp": 1.00064886, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 2.298513714498822, + "language_loss": 0.73395157, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75598359, + "num_input_tokens_seen": 352310735, + "step": 16321, + "time_per_iteration": 2.762345314025879 + }, + { + "auxiliary_loss_clip": 0.01134777, + "auxiliary_loss_mlp": 0.01099988, + "balance_loss_clip": 1.00196695, + "balance_loss_mlp": 1.00047243, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 1.692503329876759, + "language_loss": 0.78710282, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80945051, + "num_input_tokens_seen": 352329545, + "step": 16322, + "time_per_iteration": 2.6367881298065186 + }, + { + "auxiliary_loss_clip": 0.0114738, + "auxiliary_loss_mlp": 0.010996, + "balance_loss_clip": 1.00179923, + "balance_loss_mlp": 1.00051332, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.780769791659461, + "language_loss": 0.8085385, + "learning_rate": 3.619556806799595e-09, + "loss": 0.83100832, + "num_input_tokens_seen": 352352080, + "step": 16323, + "time_per_iteration": 4.033668041229248 + }, + { + "auxiliary_loss_clip": 0.01164265, + "auxiliary_loss_mlp": 0.01101391, + "balance_loss_clip": 1.00190282, + "balance_loss_mlp": 1.00049233, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 2.1042269398494757, + "language_loss": 0.84706587, + "learning_rate": 3.596174175278799e-09, + "loss": 0.86972249, + "num_input_tokens_seen": 352366455, + "step": 16324, + "time_per_iteration": 2.5224430561065674 + }, + { + "auxiliary_loss_clip": 0.01130977, + "auxiliary_loss_mlp": 0.0110067, + "balance_loss_clip": 1.00170934, + "balance_loss_mlp": 1.00043845, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.4219982587206568, + "language_loss": 0.7428332, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76514965, + "num_input_tokens_seen": 352386090, + "step": 16325, + "time_per_iteration": 2.778785467147827 + }, + { + "auxiliary_loss_clip": 0.01098684, + "auxiliary_loss_mlp": 0.01098958, + "balance_loss_clip": 1.00170898, + "balance_loss_mlp": 1.00049138, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.8166261381265345, + "language_loss": 0.76503694, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78701335, + "num_input_tokens_seen": 352404000, + "step": 16326, + "time_per_iteration": 2.705606460571289 + }, + { + "auxiliary_loss_clip": 0.01118102, + "auxiliary_loss_mlp": 0.01101173, + "balance_loss_clip": 1.00190365, + "balance_loss_mlp": 1.00041711, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 1.856690985396567, + "language_loss": 0.67494184, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69713461, + "num_input_tokens_seen": 352423540, + "step": 16327, + "time_per_iteration": 2.6595864295959473 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.0110165, + "balance_loss_clip": 1.00177562, + "balance_loss_mlp": 1.00065625, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.5789361414436491, + "language_loss": 0.73992515, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.76243687, + "num_input_tokens_seen": 352445530, + "step": 16328, + "time_per_iteration": 2.6787004470825195 + }, + { + "auxiliary_loss_clip": 0.01135124, + "auxiliary_loss_mlp": 0.01102027, + "balance_loss_clip": 1.0017823, + "balance_loss_mlp": 1.00055611, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 1.890763062544537, + "language_loss": 0.8113358, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.83370727, + "num_input_tokens_seen": 352466325, + "step": 16329, + "time_per_iteration": 2.6882166862487793 + }, + { + "auxiliary_loss_clip": 0.01164318, + "auxiliary_loss_mlp": 0.01100716, + "balance_loss_clip": 1.00185549, + "balance_loss_mlp": 1.00062799, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 5.981668241893896, + "language_loss": 0.76295203, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78560233, + "num_input_tokens_seen": 352485505, + "step": 16330, + "time_per_iteration": 2.6714773178100586 + }, + { + "auxiliary_loss_clip": 0.01164454, + "auxiliary_loss_mlp": 0.01102582, + "balance_loss_clip": 1.00189543, + "balance_loss_mlp": 1.0004909, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.5899592694332143, + "language_loss": 0.66780758, + "learning_rate": 3.434615511252126e-09, + "loss": 0.69047791, + "num_input_tokens_seen": 352505360, + "step": 16331, + "time_per_iteration": 2.6156816482543945 + }, + { + "auxiliary_loss_clip": 0.01147345, + "auxiliary_loss_mlp": 0.01099999, + "balance_loss_clip": 1.00178993, + "balance_loss_mlp": 1.00048268, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 1.7307756644513301, + "language_loss": 0.73149675, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75397015, + "num_input_tokens_seen": 352524035, + "step": 16332, + "time_per_iteration": 2.585479497909546 + }, + { + "auxiliary_loss_clip": 0.01147455, + "auxiliary_loss_mlp": 0.01099852, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00038338, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 1.9204956969426847, + "language_loss": 0.76891857, + "learning_rate": 3.389137269534936e-09, + "loss": 0.79139161, + "num_input_tokens_seen": 352543210, + "step": 16333, + "time_per_iteration": 2.5512335300445557 + }, + { + "auxiliary_loss_clip": 0.01147699, + "auxiliary_loss_mlp": 0.00747248, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.0003823, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.1757632187560634, + "language_loss": 0.73140526, + "learning_rate": 3.366511715771958e-09, + "loss": 0.75035477, + "num_input_tokens_seen": 352559770, + "step": 16334, + "time_per_iteration": 2.599149227142334 + }, + { + "auxiliary_loss_clip": 0.01099813, + "auxiliary_loss_mlp": 0.01100734, + "balance_loss_clip": 1.00170434, + "balance_loss_mlp": 1.00050282, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 3.04862364323562, + "language_loss": 0.78412759, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80613303, + "num_input_tokens_seen": 352577690, + "step": 16335, + "time_per_iteration": 2.681042432785034 + }, + { + "auxiliary_loss_clip": 0.01132485, + "auxiliary_loss_mlp": 0.01101836, + "balance_loss_clip": 1.00181985, + "balance_loss_mlp": 1.00065148, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.0900362652983975, + "language_loss": 0.63906789, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66141111, + "num_input_tokens_seen": 352598850, + "step": 16336, + "time_per_iteration": 2.7284576892852783 + }, + { + "auxiliary_loss_clip": 0.01115695, + "auxiliary_loss_mlp": 0.01101086, + "balance_loss_clip": 1.00157237, + "balance_loss_mlp": 1.00066411, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 2.272585388547222, + "language_loss": 0.73412424, + "learning_rate": 3.299089333152372e-09, + "loss": 0.7562921, + "num_input_tokens_seen": 352616130, + "step": 16337, + "time_per_iteration": 2.699502944946289 + }, + { + "auxiliary_loss_clip": 0.01147747, + "auxiliary_loss_mlp": 0.01099899, + "balance_loss_clip": 1.00178969, + "balance_loss_mlp": 1.00043106, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.6447308265320135, + "language_loss": 0.72968316, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.75215966, + "num_input_tokens_seen": 352636885, + "step": 16338, + "time_per_iteration": 4.049657583236694 + }, + { + "auxiliary_loss_clip": 0.01085158, + "auxiliary_loss_mlp": 0.01099885, + "balance_loss_clip": 1.00173068, + "balance_loss_mlp": 1.00051236, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 2.1829146321257857, + "language_loss": 0.81643182, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83828223, + "num_input_tokens_seen": 352657905, + "step": 16339, + "time_per_iteration": 2.8835391998291016 + }, + { + "auxiliary_loss_clip": 0.01103155, + "auxiliary_loss_mlp": 0.01099815, + "balance_loss_clip": 1.00166917, + "balance_loss_mlp": 1.00053799, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 2.105835911033589, + "language_loss": 0.62609357, + "learning_rate": 3.232348386403405e-09, + "loss": 0.64812326, + "num_input_tokens_seen": 352676320, + "step": 16340, + "time_per_iteration": 4.4439473152160645 + }, + { + "auxiliary_loss_clip": 0.0116434, + "auxiliary_loss_mlp": 0.01101248, + "balance_loss_clip": 1.00196624, + "balance_loss_mlp": 1.00058782, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 2.2008133801997274, + "language_loss": 0.86091208, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.88356793, + "num_input_tokens_seen": 352692665, + "step": 16341, + "time_per_iteration": 2.510239839553833 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.0109879, + "balance_loss_clip": 1.00195885, + "balance_loss_mlp": 1.00046623, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.4492979050856862, + "language_loss": 0.67023844, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69256812, + "num_input_tokens_seen": 352716130, + "step": 16342, + "time_per_iteration": 2.7075424194335938 + }, + { + "auxiliary_loss_clip": 0.01164187, + "auxiliary_loss_mlp": 0.01100057, + "balance_loss_clip": 1.00188398, + "balance_loss_mlp": 1.00039768, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.846577179882591, + "language_loss": 0.77334738, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79598975, + "num_input_tokens_seen": 352734705, + "step": 16343, + "time_per_iteration": 2.5725088119506836 + }, + { + "auxiliary_loss_clip": 0.01113938, + "auxiliary_loss_mlp": 0.01100192, + "balance_loss_clip": 1.00159788, + "balance_loss_mlp": 1.0004375, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.6132159234405707, + "language_loss": 0.75335968, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77550101, + "num_input_tokens_seen": 352756225, + "step": 16344, + "time_per_iteration": 2.7208659648895264 + }, + { + "auxiliary_loss_clip": 0.01134479, + "auxiliary_loss_mlp": 0.01101245, + "balance_loss_clip": 1.00202119, + "balance_loss_mlp": 1.00053716, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 2.031531784970984, + "language_loss": 0.66364896, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68600619, + "num_input_tokens_seen": 352776210, + "step": 16345, + "time_per_iteration": 2.6639404296875 + }, + { + "auxiliary_loss_clip": 0.01147283, + "auxiliary_loss_mlp": 0.01098439, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00040126, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.4560994319577842, + "language_loss": 0.79557955, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81803679, + "num_input_tokens_seen": 352795455, + "step": 16346, + "time_per_iteration": 2.5917627811431885 + }, + { + "auxiliary_loss_clip": 0.01147875, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_clip": 1.00172114, + "balance_loss_mlp": 1.00059843, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 2.040984346184649, + "language_loss": 0.74704063, + "learning_rate": 3.079269666552031e-09, + "loss": 0.76953763, + "num_input_tokens_seen": 352812895, + "step": 16347, + "time_per_iteration": 2.5925655364990234 + }, + { + "auxiliary_loss_clip": 0.01070807, + "auxiliary_loss_mlp": 0.01098808, + "balance_loss_clip": 1.00150585, + "balance_loss_mlp": 1.00062764, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.8621891835304294, + "language_loss": 0.66794926, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68964541, + "num_input_tokens_seen": 352835470, + "step": 16348, + "time_per_iteration": 2.939849376678467 + }, + { + "auxiliary_loss_clip": 0.01132218, + "auxiliary_loss_mlp": 0.01100351, + "balance_loss_clip": 1.00179958, + "balance_loss_mlp": 1.00050139, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 1.7985801478885373, + "language_loss": 0.68936753, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71169317, + "num_input_tokens_seen": 352854295, + "step": 16349, + "time_per_iteration": 2.644315481185913 + }, + { + "auxiliary_loss_clip": 0.01131667, + "auxiliary_loss_mlp": 0.01097973, + "balance_loss_clip": 1.00187802, + "balance_loss_mlp": 1.00045967, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 2.2784241320617333, + "language_loss": 0.76215219, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.78444862, + "num_input_tokens_seen": 352869695, + "step": 16350, + "time_per_iteration": 2.5736398696899414 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01100423, + "balance_loss_clip": 1.00162864, + "balance_loss_mlp": 1.00043023, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 2.2043609125541925, + "language_loss": 0.84492159, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86711407, + "num_input_tokens_seen": 352887430, + "step": 16351, + "time_per_iteration": 2.7076590061187744 + }, + { + "auxiliary_loss_clip": 0.01116672, + "auxiliary_loss_mlp": 0.01099601, + "balance_loss_clip": 1.00164056, + "balance_loss_mlp": 1.00037169, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.6461734156421162, + "language_loss": 0.68878829, + "learning_rate": 2.972199410170795e-09, + "loss": 0.71095103, + "num_input_tokens_seen": 352907555, + "step": 16352, + "time_per_iteration": 2.7884857654571533 + }, + { + "auxiliary_loss_clip": 0.01148269, + "auxiliary_loss_mlp": 0.00747296, + "balance_loss_clip": 1.00204158, + "balance_loss_mlp": 1.00045717, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.474391870349644, + "language_loss": 0.66285568, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68181133, + "num_input_tokens_seen": 352928670, + "step": 16353, + "time_per_iteration": 2.6820709705352783 + }, + { + "auxiliary_loss_clip": 0.01130498, + "auxiliary_loss_mlp": 0.01098734, + "balance_loss_clip": 1.00177538, + "balance_loss_mlp": 1.00045824, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.629129370532964, + "language_loss": 0.74430668, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.766599, + "num_input_tokens_seen": 352948345, + "step": 16354, + "time_per_iteration": 2.681570529937744 + }, + { + "auxiliary_loss_clip": 0.01148686, + "auxiliary_loss_mlp": 0.01099785, + "balance_loss_clip": 1.00181675, + "balance_loss_mlp": 1.00050735, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 2.3355957722134617, + "language_loss": 0.77842462, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.80090934, + "num_input_tokens_seen": 352967250, + "step": 16355, + "time_per_iteration": 2.6958322525024414 + }, + { + "auxiliary_loss_clip": 0.01147291, + "auxiliary_loss_mlp": 0.01100481, + "balance_loss_clip": 1.00180459, + "balance_loss_mlp": 1.00048792, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 1.792647147178772, + "language_loss": 0.73360896, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75608671, + "num_input_tokens_seen": 352984725, + "step": 16356, + "time_per_iteration": 4.037716388702393 + }, + { + "auxiliary_loss_clip": 0.01132726, + "auxiliary_loss_mlp": 0.01100367, + "balance_loss_clip": 1.00176549, + "balance_loss_mlp": 1.000422, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.7434917207964071, + "language_loss": 0.761567, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.783898, + "num_input_tokens_seen": 353003480, + "step": 16357, + "time_per_iteration": 2.6310057640075684 + }, + { + "auxiliary_loss_clip": 0.01132869, + "auxiliary_loss_mlp": 0.01100386, + "balance_loss_clip": 1.00177932, + "balance_loss_mlp": 1.00048828, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 2.215308576482318, + "language_loss": 0.80083871, + "learning_rate": 2.846214118442436e-09, + "loss": 0.82317126, + "num_input_tokens_seen": 353021425, + "step": 16358, + "time_per_iteration": 2.697150230407715 + }, + { + "auxiliary_loss_clip": 0.01149425, + "auxiliary_loss_mlp": 0.01100832, + "balance_loss_clip": 1.00182247, + "balance_loss_mlp": 1.00041056, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 2.0799957308315022, + "language_loss": 0.67803603, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.70053864, + "num_input_tokens_seen": 353039870, + "step": 16359, + "time_per_iteration": 2.763641119003296 + }, + { + "auxiliary_loss_clip": 0.01164067, + "auxiliary_loss_mlp": 0.01099759, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00048113, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 1.6723125897752427, + "language_loss": 0.69409132, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71672952, + "num_input_tokens_seen": 353059750, + "step": 16360, + "time_per_iteration": 4.835330009460449 + }, + { + "auxiliary_loss_clip": 0.01147015, + "auxiliary_loss_mlp": 0.01100887, + "balance_loss_clip": 1.00187302, + "balance_loss_mlp": 1.00051284, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.9109745771112636, + "language_loss": 0.84196579, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86444479, + "num_input_tokens_seen": 353079940, + "step": 16361, + "time_per_iteration": 2.63199782371521 + }, + { + "auxiliary_loss_clip": 0.01164141, + "auxiliary_loss_mlp": 0.01100012, + "balance_loss_clip": 1.0018785, + "balance_loss_mlp": 1.00044847, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 2.10807157561145, + "language_loss": 0.7601248, + "learning_rate": 2.76373855876022e-09, + "loss": 0.78276634, + "num_input_tokens_seen": 353099990, + "step": 16362, + "time_per_iteration": 2.605076789855957 + }, + { + "auxiliary_loss_clip": 0.01164306, + "auxiliary_loss_mlp": 0.01101224, + "balance_loss_clip": 1.00190401, + "balance_loss_mlp": 1.00051606, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.792184954608897, + "language_loss": 0.71254086, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73519617, + "num_input_tokens_seen": 353118710, + "step": 16363, + "time_per_iteration": 2.5699923038482666 + }, + { + "auxiliary_loss_clip": 0.01132602, + "auxiliary_loss_mlp": 0.01098492, + "balance_loss_clip": 1.00173879, + "balance_loss_mlp": 1.00050211, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 1.752318972025302, + "language_loss": 0.63008356, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65239453, + "num_input_tokens_seen": 353136415, + "step": 16364, + "time_per_iteration": 2.6358203887939453 + }, + { + "auxiliary_loss_clip": 0.01099116, + "auxiliary_loss_mlp": 0.01099502, + "balance_loss_clip": 1.00162697, + "balance_loss_mlp": 1.00055861, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 1.571777759057652, + "language_loss": 0.75350308, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77548933, + "num_input_tokens_seen": 353154650, + "step": 16365, + "time_per_iteration": 2.785494804382324 + }, + { + "auxiliary_loss_clip": 0.01114738, + "auxiliary_loss_mlp": 0.01100051, + "balance_loss_clip": 1.0017004, + "balance_loss_mlp": 1.00039196, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 1.77090295789593, + "language_loss": 0.75991315, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78206098, + "num_input_tokens_seen": 353174065, + "step": 16366, + "time_per_iteration": 2.7338638305664062 + }, + { + "auxiliary_loss_clip": 0.01164067, + "auxiliary_loss_mlp": 0.0109997, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00045443, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.6234650013529346, + "language_loss": 0.77180296, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79444331, + "num_input_tokens_seen": 353193560, + "step": 16367, + "time_per_iteration": 2.6107420921325684 + }, + { + "auxiliary_loss_clip": 0.01132907, + "auxiliary_loss_mlp": 0.01099936, + "balance_loss_clip": 1.00188184, + "balance_loss_mlp": 1.00061131, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.4954475311218296, + "language_loss": 0.61561817, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63794661, + "num_input_tokens_seen": 353213525, + "step": 16368, + "time_per_iteration": 2.676393747329712 + }, + { + "auxiliary_loss_clip": 0.01148866, + "auxiliary_loss_mlp": 0.01099631, + "balance_loss_clip": 1.00181794, + "balance_loss_mlp": 1.00063944, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.5113533624090059, + "language_loss": 0.65326869, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67575371, + "num_input_tokens_seen": 353234000, + "step": 16369, + "time_per_iteration": 2.6405234336853027 + }, + { + "auxiliary_loss_clip": 0.0114761, + "auxiliary_loss_mlp": 0.00747313, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.0004828, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.7622115145434019, + "language_loss": 0.68787825, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70682746, + "num_input_tokens_seen": 353254940, + "step": 16370, + "time_per_iteration": 2.6447346210479736 + }, + { + "auxiliary_loss_clip": 0.01164163, + "auxiliary_loss_mlp": 0.01100393, + "balance_loss_clip": 1.00178373, + "balance_loss_mlp": 1.00049615, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.8050577384509483, + "language_loss": 0.74155331, + "learning_rate": 2.582599145159792e-09, + "loss": 0.76419884, + "num_input_tokens_seen": 353272590, + "step": 16371, + "time_per_iteration": 2.5206048488616943 + }, + { + "auxiliary_loss_clip": 0.01143636, + "auxiliary_loss_mlp": 0.01073724, + "balance_loss_clip": 1.00071263, + "balance_loss_mlp": 1.00010037, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7892469493887506, + "language_loss": 0.65138602, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67355967, + "num_input_tokens_seen": 353334380, + "step": 16372, + "time_per_iteration": 3.161843776702881 + }, + { + "auxiliary_loss_clip": 0.01147208, + "auxiliary_loss_mlp": 0.01099746, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.00046861, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 3.004332644736912, + "language_loss": 0.70460969, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.72707921, + "num_input_tokens_seen": 353351640, + "step": 16373, + "time_per_iteration": 2.5706679821014404 + }, + { + "auxiliary_loss_clip": 0.01164046, + "auxiliary_loss_mlp": 0.01099649, + "balance_loss_clip": 1.00192809, + "balance_loss_mlp": 1.00051439, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 3.1702261293888183, + "language_loss": 0.81501281, + "learning_rate": 2.523582674173186e-09, + "loss": 0.8376497, + "num_input_tokens_seen": 353372555, + "step": 16374, + "time_per_iteration": 2.5829882621765137 + }, + { + "auxiliary_loss_clip": 0.01098154, + "auxiliary_loss_mlp": 0.01099647, + "balance_loss_clip": 1.00162053, + "balance_loss_mlp": 1.00065541, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 3.2697559193816415, + "language_loss": 0.69146645, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71344447, + "num_input_tokens_seen": 353391385, + "step": 16375, + "time_per_iteration": 2.67714524269104 + }, + { + "auxiliary_loss_clip": 0.01134545, + "auxiliary_loss_mlp": 0.01101077, + "balance_loss_clip": 1.00176811, + "balance_loss_mlp": 1.0004642, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 1.9283152622084185, + "language_loss": 0.80875206, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83110833, + "num_input_tokens_seen": 353411630, + "step": 16376, + "time_per_iteration": 4.1581034660339355 + }, + { + "auxiliary_loss_clip": 0.0116409, + "auxiliary_loss_mlp": 0.01099894, + "balance_loss_clip": 1.00187981, + "balance_loss_mlp": 1.00042546, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.5430044257677216, + "language_loss": 0.62204885, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64468867, + "num_input_tokens_seen": 353432895, + "step": 16377, + "time_per_iteration": 4.00175404548645 + }, + { + "auxiliary_loss_clip": 0.01117325, + "auxiliary_loss_mlp": 0.01100023, + "balance_loss_clip": 1.0017488, + "balance_loss_mlp": 1.000507, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 2.4437161213726126, + "language_loss": 0.72752535, + "learning_rate": 2.445954472695133e-09, + "loss": 0.74969882, + "num_input_tokens_seen": 353454195, + "step": 16378, + "time_per_iteration": 2.7778096199035645 + }, + { + "auxiliary_loss_clip": 0.01164226, + "auxiliary_loss_mlp": 0.01100796, + "balance_loss_clip": 1.00188708, + "balance_loss_mlp": 1.00066066, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 2.050872090227754, + "language_loss": 0.71100593, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73365617, + "num_input_tokens_seen": 353475125, + "step": 16379, + "time_per_iteration": 2.6449263095855713 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_clip": 1.00179172, + "balance_loss_mlp": 1.00041795, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 3.5117748637849404, + "language_loss": 0.687536, + "learning_rate": 2.407594853716999e-09, + "loss": 0.70968199, + "num_input_tokens_seen": 353493265, + "step": 16380, + "time_per_iteration": 2.695269823074341 + }, + { + "auxiliary_loss_clip": 0.01133738, + "auxiliary_loss_mlp": 0.0110079, + "balance_loss_clip": 1.0019803, + "balance_loss_mlp": 1.00065434, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 3.537875381887493, + "language_loss": 0.78812194, + "learning_rate": 2.38852866722139e-09, + "loss": 0.81046724, + "num_input_tokens_seen": 353511650, + "step": 16381, + "time_per_iteration": 2.6226189136505127 + }, + { + "auxiliary_loss_clip": 0.01147458, + "auxiliary_loss_mlp": 0.01100647, + "balance_loss_clip": 1.00185871, + "balance_loss_mlp": 1.00041533, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.5342021957508816, + "language_loss": 0.82463682, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84711784, + "num_input_tokens_seen": 353534035, + "step": 16382, + "time_per_iteration": 2.702564001083374 + }, + { + "auxiliary_loss_clip": 0.01135217, + "auxiliary_loss_mlp": 0.01101435, + "balance_loss_clip": 1.00189924, + "balance_loss_mlp": 1.00053596, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.883167494565927, + "language_loss": 0.74341774, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76578426, + "num_input_tokens_seen": 353549950, + "step": 16383, + "time_per_iteration": 2.6577179431915283 + }, + { + "auxiliary_loss_clip": 0.01102916, + "auxiliary_loss_mlp": 0.01100604, + "balance_loss_clip": 1.00169468, + "balance_loss_mlp": 1.00042021, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 2.0592017097437227, + "language_loss": 0.66148806, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.6835233, + "num_input_tokens_seen": 353573745, + "step": 16384, + "time_per_iteration": 2.814114809036255 + }, + { + "auxiliary_loss_clip": 0.0113094, + "auxiliary_loss_mlp": 0.01101538, + "balance_loss_clip": 1.00179195, + "balance_loss_mlp": 1.00059104, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 1.7663487955500692, + "language_loss": 0.70467061, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72699535, + "num_input_tokens_seen": 353595335, + "step": 16385, + "time_per_iteration": 2.7754180431365967 + }, + { + "auxiliary_loss_clip": 0.01132928, + "auxiliary_loss_mlp": 0.01100381, + "balance_loss_clip": 1.00182211, + "balance_loss_mlp": 1.0005796, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 2.084577858594333, + "language_loss": 0.81075263, + "learning_rate": 2.294333993509978e-09, + "loss": 0.83308572, + "num_input_tokens_seen": 353614270, + "step": 16386, + "time_per_iteration": 2.6570425033569336 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.0110078, + "balance_loss_clip": 1.00172222, + "balance_loss_mlp": 1.00059628, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 1.903134791975563, + "language_loss": 0.67697519, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.69918042, + "num_input_tokens_seen": 353634900, + "step": 16387, + "time_per_iteration": 2.772719144821167 + }, + { + "auxiliary_loss_clip": 0.01147127, + "auxiliary_loss_mlp": 0.0074714, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00040615, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.6708985166594394, + "language_loss": 0.73991388, + "learning_rate": 2.257186391438237e-09, + "loss": 0.75885653, + "num_input_tokens_seen": 353652890, + "step": 16388, + "time_per_iteration": 2.5969619750976562 + }, + { + "auxiliary_loss_clip": 0.01147561, + "auxiliary_loss_mlp": 0.01099244, + "balance_loss_clip": 1.00171947, + "balance_loss_mlp": 1.00044358, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 1.8963581483678942, + "language_loss": 0.82333708, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84580505, + "num_input_tokens_seen": 353671295, + "step": 16389, + "time_per_iteration": 2.6234493255615234 + }, + { + "auxiliary_loss_clip": 0.01134326, + "auxiliary_loss_mlp": 0.00747266, + "balance_loss_clip": 1.00183773, + "balance_loss_mlp": 1.00041842, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 2.632332480900348, + "language_loss": 0.67129564, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.69011152, + "num_input_tokens_seen": 353690560, + "step": 16390, + "time_per_iteration": 2.67187762260437 + }, + { + "auxiliary_loss_clip": 0.0111516, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_clip": 1.00198317, + "balance_loss_mlp": 1.00054705, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.911337154304883, + "language_loss": 0.77113301, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79330146, + "num_input_tokens_seen": 353710660, + "step": 16391, + "time_per_iteration": 2.80415678024292 + }, + { + "auxiliary_loss_clip": 0.01115751, + "auxiliary_loss_mlp": 0.00747221, + "balance_loss_clip": 1.00183034, + "balance_loss_mlp": 1.00037289, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 1.9654445375292107, + "language_loss": 0.68211854, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.70074832, + "num_input_tokens_seen": 353730440, + "step": 16392, + "time_per_iteration": 2.7115182876586914 + }, + { + "auxiliary_loss_clip": 0.01116353, + "auxiliary_loss_mlp": 0.01101439, + "balance_loss_clip": 1.00170958, + "balance_loss_mlp": 1.00049222, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 1.8251365658796685, + "language_loss": 0.55857062, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.58074856, + "num_input_tokens_seen": 353748360, + "step": 16393, + "time_per_iteration": 2.6439783573150635 + }, + { + "auxiliary_loss_clip": 0.01118163, + "auxiliary_loss_mlp": 0.01101345, + "balance_loss_clip": 1.00185275, + "balance_loss_mlp": 1.00054145, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 2.703958031703162, + "language_loss": 0.78508079, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.80727589, + "num_input_tokens_seen": 353760880, + "step": 16394, + "time_per_iteration": 4.06302547454834 + }, + { + "auxiliary_loss_clip": 0.01149541, + "auxiliary_loss_mlp": 0.01101333, + "balance_loss_clip": 1.00194919, + "balance_loss_mlp": 1.00052977, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 2.9224785663833805, + "language_loss": 0.75970405, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78221279, + "num_input_tokens_seen": 353782255, + "step": 16395, + "time_per_iteration": 2.6180920600891113 + }, + { + "auxiliary_loss_clip": 0.01149569, + "auxiliary_loss_mlp": 0.01100601, + "balance_loss_clip": 1.00193405, + "balance_loss_mlp": 1.00070405, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 2.8702799710833635, + "language_loss": 0.75402045, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.7765221, + "num_input_tokens_seen": 353803580, + "step": 16396, + "time_per_iteration": 2.6287848949432373 + }, + { + "auxiliary_loss_clip": 0.01118165, + "auxiliary_loss_mlp": 0.01098721, + "balance_loss_clip": 1.00175667, + "balance_loss_mlp": 1.00039768, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 2.6888326132361877, + "language_loss": 0.71158803, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.73375696, + "num_input_tokens_seen": 353824200, + "step": 16397, + "time_per_iteration": 4.223249912261963 + }, + { + "auxiliary_loss_clip": 0.01132733, + "auxiliary_loss_mlp": 0.01100199, + "balance_loss_clip": 1.0018959, + "balance_loss_mlp": 1.00058722, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 2.049672489590845, + "language_loss": 0.71513432, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73746359, + "num_input_tokens_seen": 353843350, + "step": 16398, + "time_per_iteration": 2.734748363494873 + }, + { + "auxiliary_loss_clip": 0.01115327, + "auxiliary_loss_mlp": 0.01099733, + "balance_loss_clip": 1.00171137, + "balance_loss_mlp": 1.00045538, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.4839269889655067, + "language_loss": 0.73859787, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76074851, + "num_input_tokens_seen": 353864520, + "step": 16399, + "time_per_iteration": 2.675645112991333 + }, + { + "auxiliary_loss_clip": 0.01164117, + "auxiliary_loss_mlp": 0.01100465, + "balance_loss_clip": 1.00178885, + "balance_loss_mlp": 1.00052023, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 1.8582954043263606, + "language_loss": 0.57388043, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.59652627, + "num_input_tokens_seen": 353882240, + "step": 16400, + "time_per_iteration": 2.528550624847412 + }, + { + "auxiliary_loss_clip": 0.0113133, + "auxiliary_loss_mlp": 0.01101944, + "balance_loss_clip": 1.00161982, + "balance_loss_mlp": 1.0004735, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 1.7532680925978563, + "language_loss": 0.80087966, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82321239, + "num_input_tokens_seen": 353901590, + "step": 16401, + "time_per_iteration": 2.6543490886688232 + }, + { + "auxiliary_loss_clip": 0.01147418, + "auxiliary_loss_mlp": 0.0110101, + "balance_loss_clip": 1.00192606, + "balance_loss_mlp": 1.00054097, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 3.314585021878405, + "language_loss": 0.77645338, + "learning_rate": 2.005638002662069e-09, + "loss": 0.79893768, + "num_input_tokens_seen": 353918785, + "step": 16402, + "time_per_iteration": 2.6275227069854736 + }, + { + "auxiliary_loss_clip": 0.01147393, + "auxiliary_loss_mlp": 0.01101397, + "balance_loss_clip": 1.00179434, + "balance_loss_mlp": 1.0006417, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 2.0696646146277486, + "language_loss": 0.70178401, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.72427183, + "num_input_tokens_seen": 353940390, + "step": 16403, + "time_per_iteration": 2.6311159133911133 + }, + { + "auxiliary_loss_clip": 0.01149428, + "auxiliary_loss_mlp": 0.01099561, + "balance_loss_clip": 1.00180614, + "balance_loss_mlp": 1.00052166, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 2.002318438655095, + "language_loss": 0.74637079, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76886064, + "num_input_tokens_seen": 353962180, + "step": 16404, + "time_per_iteration": 2.6169729232788086 + }, + { + "auxiliary_loss_clip": 0.01149641, + "auxiliary_loss_mlp": 0.00747436, + "balance_loss_clip": 1.00190663, + "balance_loss_mlp": 1.00051594, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 1.927520082897172, + "language_loss": 0.70119321, + "learning_rate": 1.953666699415768e-09, + "loss": 0.720164, + "num_input_tokens_seen": 353984305, + "step": 16405, + "time_per_iteration": 2.760272264480591 + }, + { + "auxiliary_loss_clip": 0.01132075, + "auxiliary_loss_mlp": 0.01100038, + "balance_loss_clip": 1.00188804, + "balance_loss_mlp": 1.0006175, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 2.9603921773946906, + "language_loss": 0.69982266, + "learning_rate": 1.93649446302846e-09, + "loss": 0.72214377, + "num_input_tokens_seen": 354004495, + "step": 16406, + "time_per_iteration": 2.6288928985595703 + }, + { + "auxiliary_loss_clip": 0.01087239, + "auxiliary_loss_mlp": 0.01099856, + "balance_loss_clip": 1.00171137, + "balance_loss_mlp": 1.00048292, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 2.7537665597431156, + "language_loss": 0.75203323, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.77390414, + "num_input_tokens_seen": 354015985, + "step": 16407, + "time_per_iteration": 2.6746938228607178 + }, + { + "auxiliary_loss_clip": 0.01130517, + "auxiliary_loss_mlp": 0.01099684, + "balance_loss_clip": 1.00157309, + "balance_loss_mlp": 1.00045395, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 3.2091898766951563, + "language_loss": 0.77622688, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79852891, + "num_input_tokens_seen": 354033260, + "step": 16408, + "time_per_iteration": 2.5586323738098145 + }, + { + "auxiliary_loss_clip": 0.01148039, + "auxiliary_loss_mlp": 0.01100808, + "balance_loss_clip": 1.00184631, + "balance_loss_mlp": 1.00043368, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 1.7734508284676205, + "language_loss": 0.67857599, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70106447, + "num_input_tokens_seen": 354052825, + "step": 16409, + "time_per_iteration": 2.5831353664398193 + }, + { + "auxiliary_loss_clip": 0.01126823, + "auxiliary_loss_mlp": 0.01073694, + "balance_loss_clip": 1.00072527, + "balance_loss_mlp": 1.00007081, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.7969532870609882, + "language_loss": 0.61059105, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63259625, + "num_input_tokens_seen": 354113920, + "step": 16410, + "time_per_iteration": 3.223634958267212 + }, + { + "auxiliary_loss_clip": 0.01147443, + "auxiliary_loss_mlp": 0.01100724, + "balance_loss_clip": 1.00180829, + "balance_loss_mlp": 1.00039721, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 2.218127618205598, + "language_loss": 0.66220915, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68469077, + "num_input_tokens_seen": 354134210, + "step": 16411, + "time_per_iteration": 2.6631758213043213 + }, + { + "auxiliary_loss_clip": 0.01157942, + "auxiliary_loss_mlp": 0.01073688, + "balance_loss_clip": 1.00074673, + "balance_loss_mlp": 1.00006473, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7199298098791677, + "language_loss": 0.5623244, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58464068, + "num_input_tokens_seen": 354198010, + "step": 16412, + "time_per_iteration": 3.146721363067627 + }, + { + "auxiliary_loss_clip": 0.01115933, + "auxiliary_loss_mlp": 0.01101347, + "balance_loss_clip": 1.00164938, + "balance_loss_mlp": 1.0006392, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 2.010083853278611, + "language_loss": 0.73116577, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75333858, + "num_input_tokens_seen": 354220000, + "step": 16413, + "time_per_iteration": 2.759801149368286 + }, + { + "auxiliary_loss_clip": 0.01097766, + "auxiliary_loss_mlp": 0.01100499, + "balance_loss_clip": 1.00155449, + "balance_loss_mlp": 1.00045884, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.5870259396037831, + "language_loss": 0.7157948, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73777747, + "num_input_tokens_seen": 354240910, + "step": 16414, + "time_per_iteration": 4.177674293518066 + }, + { + "auxiliary_loss_clip": 0.01149587, + "auxiliary_loss_mlp": 0.01099784, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 1.00060248, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.6344285782320367, + "language_loss": 0.7044009, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72689462, + "num_input_tokens_seen": 354259430, + "step": 16415, + "time_per_iteration": 3.967449426651001 + }, + { + "auxiliary_loss_clip": 0.01118049, + "auxiliary_loss_mlp": 0.01098877, + "balance_loss_clip": 1.00175905, + "balance_loss_mlp": 1.00055313, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 1.5830527561377439, + "language_loss": 0.75671256, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77888185, + "num_input_tokens_seen": 354279490, + "step": 16416, + "time_per_iteration": 2.6537301540374756 + }, + { + "auxiliary_loss_clip": 0.01130664, + "auxiliary_loss_mlp": 0.01100604, + "balance_loss_clip": 1.0018028, + "balance_loss_mlp": 1.00051594, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.2318142643117294, + "language_loss": 0.7078203, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.730133, + "num_input_tokens_seen": 354295080, + "step": 16417, + "time_per_iteration": 2.586193799972534 + }, + { + "auxiliary_loss_clip": 0.01132849, + "auxiliary_loss_mlp": 0.01101219, + "balance_loss_clip": 1.00176859, + "balance_loss_mlp": 1.00041556, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 2.0755532940494112, + "language_loss": 0.70772159, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.73006225, + "num_input_tokens_seen": 354314610, + "step": 16418, + "time_per_iteration": 2.6565799713134766 + }, + { + "auxiliary_loss_clip": 0.01157965, + "auxiliary_loss_mlp": 0.01073724, + "balance_loss_clip": 1.00071502, + "balance_loss_mlp": 1.00010037, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6570338628468614, + "language_loss": 0.53709495, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55941182, + "num_input_tokens_seen": 354383115, + "step": 16419, + "time_per_iteration": 3.226290464401245 + }, + { + "auxiliary_loss_clip": 0.01132698, + "auxiliary_loss_mlp": 0.01101511, + "balance_loss_clip": 1.001724, + "balance_loss_mlp": 1.00061274, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.7562699392231726, + "language_loss": 0.77789903, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80024117, + "num_input_tokens_seen": 354403115, + "step": 16420, + "time_per_iteration": 2.637216567993164 + }, + { + "auxiliary_loss_clip": 0.01115597, + "auxiliary_loss_mlp": 0.01099942, + "balance_loss_clip": 1.00192022, + "balance_loss_mlp": 1.00052154, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 1.80727316915884, + "language_loss": 0.70984447, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73199993, + "num_input_tokens_seen": 354424520, + "step": 16421, + "time_per_iteration": 2.68287992477417 + }, + { + "auxiliary_loss_clip": 0.01147885, + "auxiliary_loss_mlp": 0.01101161, + "balance_loss_clip": 1.00172603, + "balance_loss_mlp": 1.00050068, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 2.8055530550293204, + "language_loss": 0.82054526, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.8430357, + "num_input_tokens_seen": 354444800, + "step": 16422, + "time_per_iteration": 2.599307060241699 + }, + { + "auxiliary_loss_clip": 0.01116437, + "auxiliary_loss_mlp": 0.01100675, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.00053906, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.9056908123158378, + "language_loss": 0.85782874, + "learning_rate": 1.656159280223779e-09, + "loss": 0.87999988, + "num_input_tokens_seen": 354464590, + "step": 16423, + "time_per_iteration": 2.6635639667510986 + }, + { + "auxiliary_loss_clip": 0.01149568, + "auxiliary_loss_mlp": 0.01100202, + "balance_loss_clip": 1.00186789, + "balance_loss_mlp": 1.00040007, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 3.866461890350677, + "language_loss": 0.70336759, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72586524, + "num_input_tokens_seen": 354484145, + "step": 16424, + "time_per_iteration": 2.5750412940979004 + }, + { + "auxiliary_loss_clip": 0.01149159, + "auxiliary_loss_mlp": 0.00747285, + "balance_loss_clip": 1.00189662, + "balance_loss_mlp": 1.00045538, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 2.0940793136454614, + "language_loss": 0.80580908, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82477349, + "num_input_tokens_seen": 354502475, + "step": 16425, + "time_per_iteration": 2.614872455596924 + }, + { + "auxiliary_loss_clip": 0.01101719, + "auxiliary_loss_mlp": 0.0110002, + "balance_loss_clip": 1.00175846, + "balance_loss_mlp": 1.0005517, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 2.8711411591456937, + "language_loss": 0.79850012, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.82051754, + "num_input_tokens_seen": 354521855, + "step": 16426, + "time_per_iteration": 2.7701399326324463 + }, + { + "auxiliary_loss_clip": 0.01147451, + "auxiliary_loss_mlp": 0.01100612, + "balance_loss_clip": 1.0018692, + "balance_loss_mlp": 1.00061941, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 2.6376465987395203, + "language_loss": 0.84837937, + "learning_rate": 1.593380599750338e-09, + "loss": 0.87085998, + "num_input_tokens_seen": 354539535, + "step": 16427, + "time_per_iteration": 2.5452232360839844 + }, + { + "auxiliary_loss_clip": 0.01164272, + "auxiliary_loss_mlp": 0.01100066, + "balance_loss_clip": 1.00200558, + "balance_loss_mlp": 1.00045514, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 1.7384464338950716, + "language_loss": 0.70215833, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72480172, + "num_input_tokens_seen": 354557430, + "step": 16428, + "time_per_iteration": 2.533808708190918 + }, + { + "auxiliary_loss_clip": 0.01116064, + "auxiliary_loss_mlp": 0.0110007, + "balance_loss_clip": 1.00170231, + "balance_loss_mlp": 1.00069761, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 2.0800509138749472, + "language_loss": 0.80053127, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82269263, + "num_input_tokens_seen": 354574735, + "step": 16429, + "time_per_iteration": 2.6527583599090576 + }, + { + "auxiliary_loss_clip": 0.01164174, + "auxiliary_loss_mlp": 0.01100575, + "balance_loss_clip": 1.00187969, + "balance_loss_mlp": 1.00053477, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.6655268235985954, + "language_loss": 0.61798775, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64063519, + "num_input_tokens_seen": 354597050, + "step": 16430, + "time_per_iteration": 2.705376148223877 + }, + { + "auxiliary_loss_clip": 0.01164216, + "auxiliary_loss_mlp": 0.01100374, + "balance_loss_clip": 1.00187111, + "balance_loss_mlp": 1.00052404, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.3873276537365609, + "language_loss": 0.72781205, + "learning_rate": 1.531814395687725e-09, + "loss": 0.750458, + "num_input_tokens_seen": 354619095, + "step": 16431, + "time_per_iteration": 2.609886646270752 + }, + { + "auxiliary_loss_clip": 0.01164159, + "auxiliary_loss_mlp": 0.01100719, + "balance_loss_clip": 1.00203586, + "balance_loss_mlp": 1.0006789, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.3730155111288274, + "language_loss": 0.80595422, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.82860303, + "num_input_tokens_seen": 354633790, + "step": 16432, + "time_per_iteration": 3.957946538925171 + }, + { + "auxiliary_loss_clip": 0.01146795, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_clip": 1.00176227, + "balance_loss_mlp": 1.00038683, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.4694739055603112, + "language_loss": 0.80617917, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82863516, + "num_input_tokens_seen": 354653180, + "step": 16433, + "time_per_iteration": 2.575589418411255 + }, + { + "auxiliary_loss_clip": 0.01163994, + "auxiliary_loss_mlp": 0.01100258, + "balance_loss_clip": 1.0019052, + "balance_loss_mlp": 1.00050354, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 2.025427380185317, + "language_loss": 0.65407372, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.67671627, + "num_input_tokens_seen": 354669900, + "step": 16434, + "time_per_iteration": 2.5813186168670654 + }, + { + "auxiliary_loss_clip": 0.01149056, + "auxiliary_loss_mlp": 0.01100977, + "balance_loss_clip": 1.00177872, + "balance_loss_mlp": 1.0005548, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 2.887466865346666, + "language_loss": 0.68832767, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71082801, + "num_input_tokens_seen": 354693165, + "step": 16435, + "time_per_iteration": 4.123489618301392 + }, + { + "auxiliary_loss_clip": 0.01099417, + "auxiliary_loss_mlp": 0.01099914, + "balance_loss_clip": 1.00180101, + "balance_loss_mlp": 1.00058913, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.6030525328887906, + "language_loss": 0.75349867, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77549195, + "num_input_tokens_seen": 354711915, + "step": 16436, + "time_per_iteration": 2.679882526397705 + }, + { + "auxiliary_loss_clip": 0.0113292, + "auxiliary_loss_mlp": 0.01100327, + "balance_loss_clip": 1.0017997, + "balance_loss_mlp": 1.00047708, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.6452429263557375, + "language_loss": 0.7422024, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76453489, + "num_input_tokens_seen": 354729135, + "step": 16437, + "time_per_iteration": 2.6400227546691895 + }, + { + "auxiliary_loss_clip": 0.01113764, + "auxiliary_loss_mlp": 0.01099677, + "balance_loss_clip": 1.00169945, + "balance_loss_mlp": 1.00059068, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.8122014637843213, + "language_loss": 0.60177553, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62390995, + "num_input_tokens_seen": 354752530, + "step": 16438, + "time_per_iteration": 2.7851245403289795 + }, + { + "auxiliary_loss_clip": 0.01134825, + "auxiliary_loss_mlp": 0.01100281, + "balance_loss_clip": 1.00203109, + "balance_loss_mlp": 1.00047946, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 2.8298669543847823, + "language_loss": 0.7191534, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.74150455, + "num_input_tokens_seen": 354771135, + "step": 16439, + "time_per_iteration": 2.674330234527588 + }, + { + "auxiliary_loss_clip": 0.01147308, + "auxiliary_loss_mlp": 0.01100076, + "balance_loss_clip": 1.00186121, + "balance_loss_mlp": 1.00046492, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.8637765850224328, + "language_loss": 0.60085696, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.62333083, + "num_input_tokens_seen": 354791800, + "step": 16440, + "time_per_iteration": 2.660019874572754 + }, + { + "auxiliary_loss_clip": 0.01164173, + "auxiliary_loss_mlp": 0.01100805, + "balance_loss_clip": 1.00183535, + "balance_loss_mlp": 1.00047851, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.6376860567258644, + "language_loss": 0.76425773, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.78690749, + "num_input_tokens_seen": 354809200, + "step": 16441, + "time_per_iteration": 2.503995180130005 + }, + { + "auxiliary_loss_clip": 0.0113279, + "auxiliary_loss_mlp": 0.01100973, + "balance_loss_clip": 1.00171375, + "balance_loss_mlp": 1.00055122, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 3.250457042458943, + "language_loss": 0.6785512, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.70088881, + "num_input_tokens_seen": 354829945, + "step": 16442, + "time_per_iteration": 2.783576488494873 + }, + { + "auxiliary_loss_clip": 0.01146753, + "auxiliary_loss_mlp": 0.01098882, + "balance_loss_clip": 1.00181246, + "balance_loss_mlp": 1.0003674, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.3355814188084065, + "language_loss": 0.74294794, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76540434, + "num_input_tokens_seen": 354845055, + "step": 16443, + "time_per_iteration": 2.5395610332489014 + }, + { + "auxiliary_loss_clip": 0.01132331, + "auxiliary_loss_mlp": 0.0110093, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.0004127, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 3.905708678585301, + "language_loss": 0.73616636, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75849897, + "num_input_tokens_seen": 354864680, + "step": 16444, + "time_per_iteration": 2.6348278522491455 + }, + { + "auxiliary_loss_clip": 0.01099823, + "auxiliary_loss_mlp": 0.01100331, + "balance_loss_clip": 1.00182915, + "balance_loss_mlp": 1.00052893, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 2.3824340098828762, + "language_loss": 0.69302833, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71502984, + "num_input_tokens_seen": 354885685, + "step": 16445, + "time_per_iteration": 2.72377347946167 + }, + { + "auxiliary_loss_clip": 0.01149484, + "auxiliary_loss_mlp": 0.01101157, + "balance_loss_clip": 1.00195396, + "balance_loss_mlp": 1.00040138, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 2.8911960625359496, + "language_loss": 0.60762507, + "learning_rate": 1.311740377491155e-09, + "loss": 0.63013148, + "num_input_tokens_seen": 354901505, + "step": 16446, + "time_per_iteration": 2.558570384979248 + }, + { + "auxiliary_loss_clip": 0.01133122, + "auxiliary_loss_mlp": 0.01099757, + "balance_loss_clip": 1.00186658, + "balance_loss_mlp": 1.00057507, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 2.1382543817779216, + "language_loss": 0.70790577, + "learning_rate": 1.297675079582783e-09, + "loss": 0.73023462, + "num_input_tokens_seen": 354920060, + "step": 16447, + "time_per_iteration": 2.579073190689087 + }, + { + "auxiliary_loss_clip": 0.01164191, + "auxiliary_loss_mlp": 0.00747354, + "balance_loss_clip": 1.00201106, + "balance_loss_mlp": 1.00042248, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 1.7472075726114338, + "language_loss": 0.83766592, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.8567813, + "num_input_tokens_seen": 354938690, + "step": 16448, + "time_per_iteration": 2.592660665512085 + }, + { + "auxiliary_loss_clip": 0.01147126, + "auxiliary_loss_mlp": 0.01099543, + "balance_loss_clip": 1.00177479, + "balance_loss_mlp": 1.00045633, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.5883452189167084, + "language_loss": 0.70063734, + "learning_rate": 1.26977185727406e-09, + "loss": 0.72310406, + "num_input_tokens_seen": 354956955, + "step": 16449, + "time_per_iteration": 2.555154800415039 + }, + { + "auxiliary_loss_clip": 0.01147444, + "auxiliary_loss_mlp": 0.01100734, + "balance_loss_clip": 1.00182939, + "balance_loss_mlp": 1.00050318, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 2.01367787021229, + "language_loss": 0.73836756, + "learning_rate": 1.25593393393153e-09, + "loss": 0.7608493, + "num_input_tokens_seen": 354976800, + "step": 16450, + "time_per_iteration": 2.699948787689209 + }, + { + "auxiliary_loss_clip": 0.01164193, + "auxiliary_loss_mlp": 0.01100969, + "balance_loss_clip": 1.00178742, + "balance_loss_mlp": 1.0004518, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 2.9207414190537713, + "language_loss": 0.79831177, + "learning_rate": 1.242171803164549e-09, + "loss": 0.82096338, + "num_input_tokens_seen": 354996625, + "step": 16451, + "time_per_iteration": 2.5293431282043457 + }, + { + "auxiliary_loss_clip": 0.01117378, + "auxiliary_loss_mlp": 0.01100731, + "balance_loss_clip": 1.00178647, + "balance_loss_mlp": 1.00049949, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 2.283987368542496, + "language_loss": 0.70444906, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72663021, + "num_input_tokens_seen": 355014535, + "step": 16452, + "time_per_iteration": 4.12061882019043 + }, + { + "auxiliary_loss_clip": 0.01164017, + "auxiliary_loss_mlp": 0.01098829, + "balance_loss_clip": 1.00190592, + "balance_loss_mlp": 1.00050533, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.6285980587819204, + "language_loss": 0.73920304, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.76183146, + "num_input_tokens_seen": 355033280, + "step": 16453, + "time_per_iteration": 4.0711829662323 + }, + { + "auxiliary_loss_clip": 0.0110012, + "auxiliary_loss_mlp": 0.01099679, + "balance_loss_clip": 1.0015738, + "balance_loss_mlp": 1.00063944, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.2912896488418717, + "language_loss": 0.6986239, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.72062182, + "num_input_tokens_seen": 355053320, + "step": 16454, + "time_per_iteration": 2.712770700454712 + }, + { + "auxiliary_loss_clip": 0.01132482, + "auxiliary_loss_mlp": 0.01099648, + "balance_loss_clip": 1.00192046, + "balance_loss_mlp": 1.00041795, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 2.441816242061238, + "language_loss": 0.75817466, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.78049594, + "num_input_tokens_seen": 355070230, + "step": 16455, + "time_per_iteration": 2.6410889625549316 + }, + { + "auxiliary_loss_clip": 0.01131432, + "auxiliary_loss_mlp": 0.01098465, + "balance_loss_clip": 1.00181973, + "balance_loss_mlp": 1.00037932, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.641923894060142, + "language_loss": 0.65932488, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.68162382, + "num_input_tokens_seen": 355090125, + "step": 16456, + "time_per_iteration": 2.747333288192749 + }, + { + "auxiliary_loss_clip": 0.01147819, + "auxiliary_loss_mlp": 0.01101002, + "balance_loss_clip": 1.00185728, + "balance_loss_mlp": 1.00048518, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 2.2669266064145335, + "language_loss": 0.735654, + "learning_rate": 1.161190691666203e-09, + "loss": 0.75814229, + "num_input_tokens_seen": 355107890, + "step": 16457, + "time_per_iteration": 2.5697648525238037 + }, + { + "auxiliary_loss_clip": 0.01164238, + "auxiliary_loss_mlp": 0.01100498, + "balance_loss_clip": 1.00189531, + "balance_loss_mlp": 1.00045753, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.0058497269279623, + "language_loss": 0.68808132, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.7107287, + "num_input_tokens_seen": 355126340, + "step": 16458, + "time_per_iteration": 2.6157150268554688 + }, + { + "auxiliary_loss_clip": 0.01147103, + "auxiliary_loss_mlp": 0.01100047, + "balance_loss_clip": 1.00173664, + "balance_loss_mlp": 1.00048399, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 2.1190234036417053, + "language_loss": 0.79357398, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81604552, + "num_input_tokens_seen": 355144025, + "step": 16459, + "time_per_iteration": 2.555143117904663 + }, + { + "auxiliary_loss_clip": 0.01130881, + "auxiliary_loss_mlp": 0.01100791, + "balance_loss_clip": 1.00174594, + "balance_loss_mlp": 1.00055981, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 2.304363461219337, + "language_loss": 0.70531499, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.72763169, + "num_input_tokens_seen": 355163125, + "step": 16460, + "time_per_iteration": 2.6283695697784424 + }, + { + "auxiliary_loss_clip": 0.01131789, + "auxiliary_loss_mlp": 0.01100877, + "balance_loss_clip": 1.00168383, + "balance_loss_mlp": 1.00040793, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 2.467329824566326, + "language_loss": 0.87236369, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89469028, + "num_input_tokens_seen": 355184060, + "step": 16461, + "time_per_iteration": 2.668116569519043 + }, + { + "auxiliary_loss_clip": 0.01146962, + "auxiliary_loss_mlp": 0.01100749, + "balance_loss_clip": 1.0018611, + "balance_loss_mlp": 1.00051808, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 1.961502916658576, + "language_loss": 0.63169181, + "learning_rate": 1.09579082189315e-09, + "loss": 0.6541689, + "num_input_tokens_seen": 355204505, + "step": 16462, + "time_per_iteration": 2.6095943450927734 + }, + { + "auxiliary_loss_clip": 0.01149411, + "auxiliary_loss_mlp": 0.01100527, + "balance_loss_clip": 1.00188518, + "balance_loss_mlp": 1.00048697, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.8002939017688966, + "language_loss": 0.73013061, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75263, + "num_input_tokens_seen": 355223055, + "step": 16463, + "time_per_iteration": 2.563227415084839 + }, + { + "auxiliary_loss_clip": 0.01148677, + "auxiliary_loss_mlp": 0.01100331, + "balance_loss_clip": 1.00177336, + "balance_loss_mlp": 1.00043392, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 1.805201326789645, + "language_loss": 0.70165169, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72414178, + "num_input_tokens_seen": 355242000, + "step": 16464, + "time_per_iteration": 2.5938169956207275 + }, + { + "auxiliary_loss_clip": 0.0111738, + "auxiliary_loss_mlp": 0.01100228, + "balance_loss_clip": 1.00174594, + "balance_loss_mlp": 1.00037813, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 2.0467602773195663, + "language_loss": 0.73109198, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75326812, + "num_input_tokens_seen": 355260175, + "step": 16465, + "time_per_iteration": 2.629730701446533 + }, + { + "auxiliary_loss_clip": 0.01164051, + "auxiliary_loss_mlp": 0.01099116, + "balance_loss_clip": 1.00186968, + "balance_loss_mlp": 1.00055385, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 2.4309563261997833, + "language_loss": 0.86369884, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88633054, + "num_input_tokens_seen": 355281930, + "step": 16466, + "time_per_iteration": 2.597064733505249 + }, + { + "auxiliary_loss_clip": 0.01118253, + "auxiliary_loss_mlp": 0.011012, + "balance_loss_clip": 1.00192678, + "balance_loss_mlp": 1.00039637, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.823291790987292, + "language_loss": 0.71882677, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.74102128, + "num_input_tokens_seen": 355301555, + "step": 16467, + "time_per_iteration": 2.6509146690368652 + }, + { + "auxiliary_loss_clip": 0.01134601, + "auxiliary_loss_mlp": 0.01099505, + "balance_loss_clip": 1.00184727, + "balance_loss_mlp": 1.00041807, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.304665880287783, + "language_loss": 0.6492545, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67159557, + "num_input_tokens_seen": 355324925, + "step": 16468, + "time_per_iteration": 2.7166428565979004 + }, + { + "auxiliary_loss_clip": 0.01115541, + "auxiliary_loss_mlp": 0.01101226, + "balance_loss_clip": 1.00161004, + "balance_loss_mlp": 1.00051796, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 2.019689648187325, + "language_loss": 0.62133765, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.64350533, + "num_input_tokens_seen": 355343875, + "step": 16469, + "time_per_iteration": 4.084724426269531 + }, + { + "auxiliary_loss_clip": 0.01115686, + "auxiliary_loss_mlp": 0.01100769, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.00044227, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 4.348166083615502, + "language_loss": 0.70078492, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72294956, + "num_input_tokens_seen": 355358835, + "step": 16470, + "time_per_iteration": 2.608057975769043 + }, + { + "auxiliary_loss_clip": 0.01141342, + "auxiliary_loss_mlp": 0.01073654, + "balance_loss_clip": 1.00087643, + "balance_loss_mlp": 1.00003052, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6665771784369148, + "language_loss": 0.55469334, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57684326, + "num_input_tokens_seen": 355431225, + "step": 16471, + "time_per_iteration": 3.321800470352173 + }, + { + "auxiliary_loss_clip": 0.01131337, + "auxiliary_loss_mlp": 0.0110048, + "balance_loss_clip": 1.00173473, + "balance_loss_mlp": 1.00053525, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 2.7998979737983634, + "language_loss": 0.83693075, + "learning_rate": 9.706760407131032e-10, + "loss": 0.85924888, + "num_input_tokens_seen": 355448250, + "step": 16472, + "time_per_iteration": 4.069448232650757 + }, + { + "auxiliary_loss_clip": 0.01146956, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_clip": 1.00184572, + "balance_loss_mlp": 1.00045919, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 2.2452981155066247, + "language_loss": 0.85756046, + "learning_rate": 9.585814735431075e-10, + "loss": 0.88003504, + "num_input_tokens_seen": 355467040, + "step": 16473, + "time_per_iteration": 2.5988214015960693 + }, + { + "auxiliary_loss_clip": 0.01164146, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_clip": 1.00183249, + "balance_loss_mlp": 1.00049114, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 2.019349551128443, + "language_loss": 0.84160089, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86423767, + "num_input_tokens_seen": 355487825, + "step": 16474, + "time_per_iteration": 2.58978271484375 + }, + { + "auxiliary_loss_clip": 0.01130548, + "auxiliary_loss_mlp": 0.01099827, + "balance_loss_clip": 1.00162852, + "balance_loss_mlp": 1.00059712, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.9427478773662374, + "language_loss": 0.76416552, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78646922, + "num_input_tokens_seen": 355507445, + "step": 16475, + "time_per_iteration": 2.6153461933135986 + }, + { + "auxiliary_loss_clip": 0.01117728, + "auxiliary_loss_mlp": 0.01100232, + "balance_loss_clip": 1.00170648, + "balance_loss_mlp": 1.00038242, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.5413633946352565, + "language_loss": 0.75779831, + "learning_rate": 9.227525969588423e-10, + "loss": 0.77997792, + "num_input_tokens_seen": 355527205, + "step": 16476, + "time_per_iteration": 2.713191509246826 + }, + { + "auxiliary_loss_clip": 0.01147795, + "auxiliary_loss_mlp": 0.00747364, + "balance_loss_clip": 1.00181007, + "balance_loss_mlp": 1.00050461, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 2.3071735861893408, + "language_loss": 0.67741799, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69636959, + "num_input_tokens_seen": 355544740, + "step": 16477, + "time_per_iteration": 2.619269609451294 + }, + { + "auxiliary_loss_clip": 0.01132568, + "auxiliary_loss_mlp": 0.01100527, + "balance_loss_clip": 1.00176609, + "balance_loss_mlp": 1.00062931, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 1.9323987121099269, + "language_loss": 0.72017765, + "learning_rate": 8.992457045289282e-10, + "loss": 0.74250853, + "num_input_tokens_seen": 355564385, + "step": 16478, + "time_per_iteration": 2.6324570178985596 + }, + { + "auxiliary_loss_clip": 0.01164193, + "auxiliary_loss_mlp": 0.01101734, + "balance_loss_clip": 1.00188923, + "balance_loss_mlp": 1.00073957, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.48561860610853, + "language_loss": 0.80832994, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83098918, + "num_input_tokens_seen": 355579260, + "step": 16479, + "time_per_iteration": 2.493448495864868 + }, + { + "auxiliary_loss_clip": 0.01149637, + "auxiliary_loss_mlp": 0.01100717, + "balance_loss_clip": 1.00182319, + "balance_loss_mlp": 1.00048614, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 2.6467214897628346, + "language_loss": 0.66129386, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68379736, + "num_input_tokens_seen": 355599790, + "step": 16480, + "time_per_iteration": 2.6545398235321045 + }, + { + "auxiliary_loss_clip": 0.01147549, + "auxiliary_loss_mlp": 0.01100553, + "balance_loss_clip": 1.00172913, + "balance_loss_mlp": 1.00051272, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 2.5311872741471833, + "language_loss": 0.72330719, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74578822, + "num_input_tokens_seen": 355620925, + "step": 16481, + "time_per_iteration": 2.708997964859009 + }, + { + "auxiliary_loss_clip": 0.01149395, + "auxiliary_loss_mlp": 0.01099655, + "balance_loss_clip": 1.00189829, + "balance_loss_mlp": 1.00037742, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 1.740857406378462, + "language_loss": 0.77762622, + "learning_rate": 8.531415963912713e-10, + "loss": 0.80011672, + "num_input_tokens_seen": 355639165, + "step": 16482, + "time_per_iteration": 2.6212897300720215 + }, + { + "auxiliary_loss_clip": 0.01149105, + "auxiliary_loss_mlp": 0.01099838, + "balance_loss_clip": 1.00183797, + "balance_loss_mlp": 1.00051248, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 2.6256821154259855, + "language_loss": 0.75289464, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77538407, + "num_input_tokens_seen": 355657320, + "step": 16483, + "time_per_iteration": 2.5762276649475098 + }, + { + "auxiliary_loss_clip": 0.01143701, + "auxiliary_loss_mlp": 0.01073694, + "balance_loss_clip": 1.00072312, + "balance_loss_mlp": 1.00007057, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6720929413218469, + "language_loss": 0.53645182, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55862582, + "num_input_tokens_seen": 355726370, + "step": 16484, + "time_per_iteration": 3.2793478965759277 + }, + { + "auxiliary_loss_clip": 0.01163952, + "auxiliary_loss_mlp": 0.01099698, + "balance_loss_clip": 1.00183094, + "balance_loss_mlp": 1.00065935, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 1.8137033106480402, + "language_loss": 0.82265115, + "learning_rate": 8.19359496165184e-10, + "loss": 0.84528768, + "num_input_tokens_seen": 355745840, + "step": 16485, + "time_per_iteration": 2.5311672687530518 + }, + { + "auxiliary_loss_clip": 0.01118201, + "auxiliary_loss_mlp": 0.01099755, + "balance_loss_clip": 1.00179386, + "balance_loss_mlp": 1.00062037, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1.7841466362164182, + "language_loss": 0.81381482, + "learning_rate": 8.082504137836288e-10, + "loss": 0.83599442, + "num_input_tokens_seen": 355763385, + "step": 16486, + "time_per_iteration": 2.752143144607544 + }, + { + "auxiliary_loss_clip": 0.01148723, + "auxiliary_loss_mlp": 0.01100503, + "balance_loss_clip": 1.00179827, + "balance_loss_mlp": 1.00051045, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.54602218079494, + "language_loss": 0.66113222, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68362451, + "num_input_tokens_seen": 355786075, + "step": 16487, + "time_per_iteration": 2.850783348083496 + }, + { + "auxiliary_loss_clip": 0.01147321, + "auxiliary_loss_mlp": 0.00747367, + "balance_loss_clip": 1.00184977, + "balance_loss_mlp": 1.00052595, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.6608059895435203, + "language_loss": 0.7659741, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78492093, + "num_input_tokens_seen": 355806295, + "step": 16488, + "time_per_iteration": 2.7389769554138184 + }, + { + "auxiliary_loss_clip": 0.01116177, + "auxiliary_loss_mlp": 0.01101691, + "balance_loss_clip": 1.00167918, + "balance_loss_mlp": 1.00050604, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 7.410009689374057, + "language_loss": 0.6852327, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70741135, + "num_input_tokens_seen": 355825730, + "step": 16489, + "time_per_iteration": 2.804515838623047 + }, + { + "auxiliary_loss_clip": 0.0111031, + "auxiliary_loss_mlp": 0.01074437, + "balance_loss_clip": 1.0007515, + "balance_loss_mlp": 1.00043201, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 3.6185967511231545, + "language_loss": 0.52578199, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54762948, + "num_input_tokens_seen": 355891545, + "step": 16490, + "time_per_iteration": 4.645293951034546 + }, + { + "auxiliary_loss_clip": 0.01120375, + "auxiliary_loss_mlp": 0.01101767, + "balance_loss_clip": 1.00175726, + "balance_loss_mlp": 1.00048685, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 2.2041262607949355, + "language_loss": 0.7546829, + "learning_rate": 7.538421534734052e-10, + "loss": 0.77690428, + "num_input_tokens_seen": 355909920, + "step": 16491, + "time_per_iteration": 2.685537815093994 + }, + { + "auxiliary_loss_clip": 0.01099716, + "auxiliary_loss_mlp": 0.01101792, + "balance_loss_clip": 1.00168324, + "balance_loss_mlp": 1.00051141, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.0216428743877386, + "language_loss": 0.70575362, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72776866, + "num_input_tokens_seen": 355923130, + "step": 16492, + "time_per_iteration": 4.005693435668945 + }, + { + "auxiliary_loss_clip": 0.01117731, + "auxiliary_loss_mlp": 0.01099809, + "balance_loss_clip": 1.00172162, + "balance_loss_mlp": 1.00043595, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 1.9563174937962413, + "language_loss": 0.68394184, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70611727, + "num_input_tokens_seen": 355941960, + "step": 16493, + "time_per_iteration": 2.6661181449890137 + }, + { + "auxiliary_loss_clip": 0.011312, + "auxiliary_loss_mlp": 0.01101727, + "balance_loss_clip": 1.00174785, + "balance_loss_mlp": 1.00063765, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 2.383548700103952, + "language_loss": 0.71332586, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73565519, + "num_input_tokens_seen": 355961640, + "step": 16494, + "time_per_iteration": 2.664153575897217 + }, + { + "auxiliary_loss_clip": 0.01147641, + "auxiliary_loss_mlp": 0.01100739, + "balance_loss_clip": 1.00179911, + "balance_loss_mlp": 1.00050819, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 2.339221539820393, + "language_loss": 0.67971396, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70219779, + "num_input_tokens_seen": 355977980, + "step": 16495, + "time_per_iteration": 2.5777602195739746 + }, + { + "auxiliary_loss_clip": 0.01127994, + "auxiliary_loss_mlp": 0.010742, + "balance_loss_clip": 1.00086045, + "balance_loss_mlp": 1.00057626, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7143774783334212, + "language_loss": 0.53462815, + "learning_rate": 7.013291833859458e-10, + "loss": 0.5566501, + "num_input_tokens_seen": 356042900, + "step": 16496, + "time_per_iteration": 3.315324544906616 + }, + { + "auxiliary_loss_clip": 0.01133001, + "auxiliary_loss_mlp": 0.00747357, + "balance_loss_clip": 1.00175965, + "balance_loss_mlp": 1.0003835, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 1.6693016978230784, + "language_loss": 0.70952648, + "learning_rate": 6.91054028607585e-10, + "loss": 0.72833008, + "num_input_tokens_seen": 356063000, + "step": 16497, + "time_per_iteration": 2.739351749420166 + }, + { + "auxiliary_loss_clip": 0.01116345, + "auxiliary_loss_mlp": 0.01101069, + "balance_loss_clip": 1.00178289, + "balance_loss_mlp": 1.00050449, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 5.20950581408011, + "language_loss": 0.81852615, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84070033, + "num_input_tokens_seen": 356078130, + "step": 16498, + "time_per_iteration": 2.6511590480804443 + }, + { + "auxiliary_loss_clip": 0.01116603, + "auxiliary_loss_mlp": 0.01100348, + "balance_loss_clip": 1.00179815, + "balance_loss_mlp": 1.00054622, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.7498441821418649, + "language_loss": 0.68503547, + "learning_rate": 6.707311614246869e-10, + "loss": 0.707205, + "num_input_tokens_seen": 356101655, + "step": 16499, + "time_per_iteration": 2.7198925018310547 + }, + { + "auxiliary_loss_clip": 0.01164324, + "auxiliary_loss_mlp": 0.01100869, + "balance_loss_clip": 1.00193596, + "balance_loss_mlp": 1.00044751, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 1.9835326669608728, + "language_loss": 0.81958127, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84223318, + "num_input_tokens_seen": 356121425, + "step": 16500, + "time_per_iteration": 2.5462353229522705 + }, + { + "auxiliary_loss_clip": 0.01133805, + "auxiliary_loss_mlp": 0.01100231, + "balance_loss_clip": 1.00196171, + "balance_loss_mlp": 1.00047624, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.8094610097326889, + "language_loss": 0.81868339, + "learning_rate": 6.507115533036511e-10, + "loss": 0.84102368, + "num_input_tokens_seen": 356140710, + "step": 16501, + "time_per_iteration": 2.6569957733154297 + }, + { + "auxiliary_loss_clip": 0.01149578, + "auxiliary_loss_mlp": 0.01099784, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.00045884, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 1.7972154507022644, + "language_loss": 0.77024996, + "learning_rate": 6.408154723420711e-10, + "loss": 0.79274356, + "num_input_tokens_seen": 356159835, + "step": 16502, + "time_per_iteration": 2.5657801628112793 + }, + { + "auxiliary_loss_clip": 0.01131283, + "auxiliary_loss_mlp": 0.01101338, + "balance_loss_clip": 1.00178242, + "balance_loss_mlp": 1.00048709, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 2.2793868387529717, + "language_loss": 0.71795666, + "learning_rate": 6.309952072811597e-10, + "loss": 0.74028289, + "num_input_tokens_seen": 356177555, + "step": 16503, + "time_per_iteration": 2.5820517539978027 + }, + { + "auxiliary_loss_clip": 0.01143487, + "auxiliary_loss_mlp": 0.01073694, + "balance_loss_clip": 1.0007298, + "balance_loss_mlp": 1.00007045, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6333143605921256, + "language_loss": 0.55044222, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57261407, + "num_input_tokens_seen": 356244975, + "step": 16504, + "time_per_iteration": 3.2340316772460938 + }, + { + "auxiliary_loss_clip": 0.01116717, + "auxiliary_loss_mlp": 0.01099446, + "balance_loss_clip": 1.00164676, + "balance_loss_mlp": 1.0004549, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 1.776814084181722, + "language_loss": 0.69476104, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71692264, + "num_input_tokens_seen": 356262605, + "step": 16505, + "time_per_iteration": 2.6687045097351074 + }, + { + "auxiliary_loss_clip": 0.01118686, + "auxiliary_loss_mlp": 0.01101691, + "balance_loss_clip": 1.00175118, + "balance_loss_mlp": 1.00050604, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 3.290582691878704, + "language_loss": 0.65278482, + "learning_rate": 6.019893112119146e-10, + "loss": 0.67498857, + "num_input_tokens_seen": 356278935, + "step": 16506, + "time_per_iteration": 2.6920790672302246 + }, + { + "auxiliary_loss_clip": 0.01082611, + "auxiliary_loss_mlp": 0.0110004, + "balance_loss_clip": 1.00157785, + "balance_loss_mlp": 1.00047672, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 4.4043818294193215, + "language_loss": 0.63336742, + "learning_rate": 5.924723134487219e-10, + "loss": 0.65519392, + "num_input_tokens_seen": 356295675, + "step": 16507, + "time_per_iteration": 4.1223883628845215 + }, + { + "auxiliary_loss_clip": 0.01164274, + "auxiliary_loss_mlp": 0.01101281, + "balance_loss_clip": 1.001894, + "balance_loss_mlp": 1.00057316, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.6543283484821965, + "language_loss": 0.72975731, + "learning_rate": 5.830311334193983e-10, + "loss": 0.7524128, + "num_input_tokens_seen": 356312885, + "step": 16508, + "time_per_iteration": 2.5258493423461914 + }, + { + "auxiliary_loss_clip": 0.01164041, + "auxiliary_loss_mlp": 0.01100359, + "balance_loss_clip": 1.00179601, + "balance_loss_mlp": 1.00041425, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.5697154518912353, + "language_loss": 0.70208502, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72472912, + "num_input_tokens_seen": 356334070, + "step": 16509, + "time_per_iteration": 2.5801844596862793 + }, + { + "auxiliary_loss_clip": 0.01147669, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_clip": 1.00172639, + "balance_loss_mlp": 1.00062656, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 1.587014081930863, + "language_loss": 0.68289852, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70538902, + "num_input_tokens_seen": 356359410, + "step": 16510, + "time_per_iteration": 4.446465969085693 + }, + { + "auxiliary_loss_clip": 0.01116155, + "auxiliary_loss_mlp": 0.01101499, + "balance_loss_clip": 1.00176668, + "balance_loss_mlp": 1.00059986, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 2.4684202553517673, + "language_loss": 0.81343096, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83560753, + "num_input_tokens_seen": 356378345, + "step": 16511, + "time_per_iteration": 2.661726713180542 + }, + { + "auxiliary_loss_clip": 0.0111639, + "auxiliary_loss_mlp": 0.01099297, + "balance_loss_clip": 1.00172067, + "balance_loss_mlp": 1.00044894, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.9360460792912224, + "language_loss": 0.9155761, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93773299, + "num_input_tokens_seen": 356397345, + "step": 16512, + "time_per_iteration": 2.7134015560150146 + }, + { + "auxiliary_loss_clip": 0.01110327, + "auxiliary_loss_mlp": 0.01073678, + "balance_loss_clip": 1.00062323, + "balance_loss_mlp": 1.0000546, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.7008654804993781, + "language_loss": 0.55226511, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57410514, + "num_input_tokens_seen": 356459160, + "step": 16513, + "time_per_iteration": 3.2789719104766846 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01100217, + "balance_loss_clip": 1.00181437, + "balance_loss_mlp": 1.00046241, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.4013885688016439, + "language_loss": 0.6480428, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67037141, + "num_input_tokens_seen": 356486405, + "step": 16514, + "time_per_iteration": 2.95391845703125 + }, + { + "auxiliary_loss_clip": 0.01120275, + "auxiliary_loss_mlp": 0.01101162, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00045395, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 1.942135545428143, + "language_loss": 0.73122656, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75344086, + "num_input_tokens_seen": 356502905, + "step": 16515, + "time_per_iteration": 2.6616969108581543 + }, + { + "auxiliary_loss_clip": 0.0111823, + "auxiliary_loss_mlp": 0.01100163, + "balance_loss_clip": 1.00179791, + "balance_loss_mlp": 1.00045633, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.4332211899832306, + "language_loss": 0.77171457, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79389846, + "num_input_tokens_seen": 356523830, + "step": 16516, + "time_per_iteration": 2.707719326019287 + }, + { + "auxiliary_loss_clip": 0.01117773, + "auxiliary_loss_mlp": 0.01098922, + "balance_loss_clip": 1.00172901, + "balance_loss_mlp": 1.00045526, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.4058950027807409, + "language_loss": 0.77811807, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80028498, + "num_input_tokens_seen": 356543965, + "step": 16517, + "time_per_iteration": 2.663771152496338 + }, + { + "auxiliary_loss_clip": 0.01148169, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_clip": 1.00187516, + "balance_loss_mlp": 1.00051785, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.490075940557608, + "language_loss": 0.67359829, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69609982, + "num_input_tokens_seen": 356561530, + "step": 16518, + "time_per_iteration": 2.564702033996582 + }, + { + "auxiliary_loss_clip": 0.01127568, + "auxiliary_loss_mlp": 0.01073832, + "balance_loss_clip": 1.00090432, + "balance_loss_mlp": 1.0002079, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7313962148674059, + "language_loss": 0.53475976, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55677378, + "num_input_tokens_seen": 356616845, + "step": 16519, + "time_per_iteration": 3.0481934547424316 + }, + { + "auxiliary_loss_clip": 0.01118444, + "auxiliary_loss_mlp": 0.01100687, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00045562, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 2.503622059552426, + "language_loss": 0.60059273, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62278402, + "num_input_tokens_seen": 356633560, + "step": 16520, + "time_per_iteration": 2.612865447998047 + }, + { + "auxiliary_loss_clip": 0.01133647, + "auxiliary_loss_mlp": 0.01100327, + "balance_loss_clip": 1.00197446, + "balance_loss_mlp": 1.00052524, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.7581797049985302, + "language_loss": 0.62006468, + "learning_rate": 4.671953657853223e-10, + "loss": 0.64240444, + "num_input_tokens_seen": 356657600, + "step": 16521, + "time_per_iteration": 2.8281774520874023 + }, + { + "auxiliary_loss_clip": 0.01132376, + "auxiliary_loss_mlp": 0.01101229, + "balance_loss_clip": 1.00180268, + "balance_loss_mlp": 1.00052142, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 1.8489480706600172, + "language_loss": 0.74455178, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76688784, + "num_input_tokens_seen": 356675880, + "step": 16522, + "time_per_iteration": 2.599693536758423 + }, + { + "auxiliary_loss_clip": 0.01115986, + "auxiliary_loss_mlp": 0.01099227, + "balance_loss_clip": 1.00171328, + "balance_loss_mlp": 1.00047374, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.6348107017178752, + "language_loss": 0.73249674, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75464886, + "num_input_tokens_seen": 356696000, + "step": 16523, + "time_per_iteration": 2.667771577835083 + }, + { + "auxiliary_loss_clip": 0.01132112, + "auxiliary_loss_mlp": 0.00747308, + "balance_loss_clip": 1.00179529, + "balance_loss_mlp": 1.0004586, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.8558974001777109, + "language_loss": 0.7105664, + "learning_rate": 4.422837480875241e-10, + "loss": 0.72936058, + "num_input_tokens_seen": 356716845, + "step": 16524, + "time_per_iteration": 2.8051247596740723 + }, + { + "auxiliary_loss_clip": 0.01114506, + "auxiliary_loss_mlp": 0.01100777, + "balance_loss_clip": 1.00174212, + "balance_loss_mlp": 1.00064147, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 1.9125632132362704, + "language_loss": 0.79286301, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81501579, + "num_input_tokens_seen": 356732100, + "step": 16525, + "time_per_iteration": 2.655730724334717 + }, + { + "auxiliary_loss_clip": 0.01115206, + "auxiliary_loss_mlp": 0.0110002, + "balance_loss_clip": 1.00181937, + "balance_loss_mlp": 1.00040841, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 1.8081133380790393, + "language_loss": 0.74915516, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.77130747, + "num_input_tokens_seen": 356751480, + "step": 16526, + "time_per_iteration": 2.6402170658111572 + }, + { + "auxiliary_loss_clip": 0.01163937, + "auxiliary_loss_mlp": 0.00747305, + "balance_loss_clip": 1.001899, + "balance_loss_mlp": 1.0004611, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 1.5909194973687177, + "language_loss": 0.7238239, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74293637, + "num_input_tokens_seen": 356772650, + "step": 16527, + "time_per_iteration": 3.9815399646759033 + }, + { + "auxiliary_loss_clip": 0.01131848, + "auxiliary_loss_mlp": 0.01100104, + "balance_loss_clip": 1.0016706, + "balance_loss_mlp": 1.00049281, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.6834682648126416, + "language_loss": 0.76367933, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78599882, + "num_input_tokens_seen": 356788510, + "step": 16528, + "time_per_iteration": 2.59389066696167 + }, + { + "auxiliary_loss_clip": 0.01134449, + "auxiliary_loss_mlp": 0.01100371, + "balance_loss_clip": 1.00175428, + "balance_loss_mlp": 1.00042653, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 1.8963297601078448, + "language_loss": 0.67804682, + "learning_rate": 4.022808578922898e-10, + "loss": 0.70039505, + "num_input_tokens_seen": 356809115, + "step": 16529, + "time_per_iteration": 4.110400199890137 + }, + { + "auxiliary_loss_clip": 0.0114754, + "auxiliary_loss_mlp": 0.01101542, + "balance_loss_clip": 1.00188375, + "balance_loss_mlp": 1.00049996, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 2.9903882405440285, + "language_loss": 0.65168989, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67418069, + "num_input_tokens_seen": 356826410, + "step": 16530, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.01149772, + "auxiliary_loss_mlp": 0.01101091, + "balance_loss_clip": 1.00199437, + "balance_loss_mlp": 1.00047874, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 3.0495258007520496, + "language_loss": 0.71207631, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73458493, + "num_input_tokens_seen": 356844990, + "step": 16531, + "time_per_iteration": 2.5578811168670654 + }, + { + "auxiliary_loss_clip": 0.01149849, + "auxiliary_loss_mlp": 0.0110054, + "balance_loss_clip": 1.00200152, + "balance_loss_mlp": 1.00049996, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.3513288914393455, + "language_loss": 0.74053985, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76304376, + "num_input_tokens_seen": 356866530, + "step": 16532, + "time_per_iteration": 2.674814462661743 + }, + { + "auxiliary_loss_clip": 0.01099239, + "auxiliary_loss_mlp": 0.01099383, + "balance_loss_clip": 1.00158882, + "balance_loss_mlp": 1.00053453, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.5992644591970415, + "language_loss": 0.70742786, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72941411, + "num_input_tokens_seen": 356884660, + "step": 16533, + "time_per_iteration": 2.671887159347534 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01101693, + "balance_loss_clip": 1.00177753, + "balance_loss_mlp": 1.00041306, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 3.681252707181612, + "language_loss": 0.84325415, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86574674, + "num_input_tokens_seen": 356900895, + "step": 16534, + "time_per_iteration": 2.567624807357788 + }, + { + "auxiliary_loss_clip": 0.01115837, + "auxiliary_loss_mlp": 0.0109938, + "balance_loss_clip": 1.00166321, + "balance_loss_mlp": 1.00043583, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.4236878164308788, + "language_loss": 0.65881616, + "learning_rate": 3.567796158934211e-10, + "loss": 0.68096828, + "num_input_tokens_seen": 356920985, + "step": 16535, + "time_per_iteration": 2.713254451751709 + }, + { + "auxiliary_loss_clip": 0.0111385, + "auxiliary_loss_mlp": 0.01100494, + "balance_loss_clip": 1.00171757, + "balance_loss_mlp": 1.00040615, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.7496796129579433, + "language_loss": 0.65223783, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.67438126, + "num_input_tokens_seen": 356939800, + "step": 16536, + "time_per_iteration": 2.631758689880371 + }, + { + "auxiliary_loss_clip": 0.01116697, + "auxiliary_loss_mlp": 0.01100861, + "balance_loss_clip": 1.00193477, + "balance_loss_mlp": 1.00043941, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 1.9945119092067753, + "language_loss": 0.7846995, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.80687511, + "num_input_tokens_seen": 356957780, + "step": 16537, + "time_per_iteration": 2.617135763168335 + }, + { + "auxiliary_loss_clip": 0.01148835, + "auxiliary_loss_mlp": 0.01100861, + "balance_loss_clip": 1.00181067, + "balance_loss_mlp": 1.00043917, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.6939841213808622, + "language_loss": 0.68989384, + "learning_rate": 3.35052651107004e-10, + "loss": 0.71239078, + "num_input_tokens_seen": 356979185, + "step": 16538, + "time_per_iteration": 2.5959386825561523 + }, + { + "auxiliary_loss_clip": 0.01114966, + "auxiliary_loss_mlp": 0.01099126, + "balance_loss_clip": 1.00165439, + "balance_loss_mlp": 1.00046849, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 2.0947960286484255, + "language_loss": 0.75342345, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.77556437, + "num_input_tokens_seen": 356997735, + "step": 16539, + "time_per_iteration": 2.6545913219451904 + }, + { + "auxiliary_loss_clip": 0.01099986, + "auxiliary_loss_mlp": 0.01100891, + "balance_loss_clip": 1.00163567, + "balance_loss_mlp": 1.00056505, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 1.9349866961308009, + "language_loss": 0.70757329, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72958213, + "num_input_tokens_seen": 357015660, + "step": 16540, + "time_per_iteration": 2.690378189086914 + }, + { + "auxiliary_loss_clip": 0.01147245, + "auxiliary_loss_mlp": 0.0109898, + "balance_loss_clip": 1.00181711, + "balance_loss_mlp": 1.00041807, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.900231885502111, + "language_loss": 0.74813855, + "learning_rate": 3.140081337600353e-10, + "loss": 0.7706008, + "num_input_tokens_seen": 357034800, + "step": 16541, + "time_per_iteration": 2.575608015060425 + }, + { + "auxiliary_loss_clip": 0.01132637, + "auxiliary_loss_mlp": 0.01101773, + "balance_loss_clip": 1.00187683, + "balance_loss_mlp": 1.00058842, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 1.9175242330923428, + "language_loss": 0.76338243, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78572643, + "num_input_tokens_seen": 357053785, + "step": 16542, + "time_per_iteration": 2.6214356422424316 + }, + { + "auxiliary_loss_clip": 0.01147479, + "auxiliary_loss_mlp": 0.01101311, + "balance_loss_clip": 1.00180578, + "balance_loss_mlp": 1.00050759, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 2.8152177090186714, + "language_loss": 0.73767459, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76016253, + "num_input_tokens_seen": 357072025, + "step": 16543, + "time_per_iteration": 2.5893144607543945 + }, + { + "auxiliary_loss_clip": 0.01149944, + "auxiliary_loss_mlp": 0.01101889, + "balance_loss_clip": 1.00197649, + "balance_loss_mlp": 1.00060868, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.3630105185633243, + "language_loss": 0.82483947, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.84735775, + "num_input_tokens_seen": 357086960, + "step": 16544, + "time_per_iteration": 2.5926570892333984 + }, + { + "auxiliary_loss_clip": 0.01164065, + "auxiliary_loss_mlp": 0.01100662, + "balance_loss_clip": 1.0018425, + "balance_loss_mlp": 1.00043082, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 2.7921307412373975, + "language_loss": 0.79090005, + "learning_rate": 2.870103745831187e-10, + "loss": 0.81354737, + "num_input_tokens_seen": 357105095, + "step": 16545, + "time_per_iteration": 3.93319034576416 + }, + { + "auxiliary_loss_clip": 0.01120202, + "auxiliary_loss_mlp": 0.01100104, + "balance_loss_clip": 1.00179756, + "balance_loss_mlp": 1.00039768, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.935173825061444, + "language_loss": 0.72137964, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74358267, + "num_input_tokens_seen": 357125065, + "step": 16546, + "time_per_iteration": 2.7098124027252197 + }, + { + "auxiliary_loss_clip": 0.01149429, + "auxiliary_loss_mlp": 0.01100133, + "balance_loss_clip": 1.00178409, + "balance_loss_mlp": 1.00061703, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 2.7080085076949656, + "language_loss": 0.77497458, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79747021, + "num_input_tokens_seen": 357141600, + "step": 16547, + "time_per_iteration": 2.55844783782959 + }, + { + "auxiliary_loss_clip": 0.01132482, + "auxiliary_loss_mlp": 0.01099992, + "balance_loss_clip": 1.00168848, + "balance_loss_mlp": 1.00052381, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 2.3018632001386794, + "language_loss": 0.6995219, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72184664, + "num_input_tokens_seen": 357157880, + "step": 16548, + "time_per_iteration": 4.100517749786377 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01100513, + "balance_loss_clip": 1.00197697, + "balance_loss_mlp": 1.00037694, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.8132279861166158, + "language_loss": 0.75757927, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.78007925, + "num_input_tokens_seen": 357176705, + "step": 16549, + "time_per_iteration": 2.607177734375 + }, + { + "auxiliary_loss_clip": 0.01116964, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_clip": 1.00183582, + "balance_loss_mlp": 1.00050521, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 2.2153676442105485, + "language_loss": 0.74419284, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.7663784, + "num_input_tokens_seen": 357197630, + "step": 16550, + "time_per_iteration": 2.7559874057769775 + }, + { + "auxiliary_loss_clip": 0.01101335, + "auxiliary_loss_mlp": 0.00747297, + "balance_loss_clip": 1.00162351, + "balance_loss_mlp": 1.00053918, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 1.514794389593978, + "language_loss": 0.78124118, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.7997275, + "num_input_tokens_seen": 357215445, + "step": 16551, + "time_per_iteration": 2.712108850479126 + }, + { + "auxiliary_loss_clip": 0.01147017, + "auxiliary_loss_mlp": 0.01098207, + "balance_loss_clip": 1.00180697, + "balance_loss_mlp": 1.00045514, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.6584053558331837, + "language_loss": 0.6652993, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68775153, + "num_input_tokens_seen": 357234285, + "step": 16552, + "time_per_iteration": 2.5781612396240234 + }, + { + "auxiliary_loss_clip": 0.01164096, + "auxiliary_loss_mlp": 0.0110022, + "balance_loss_clip": 1.00179231, + "balance_loss_mlp": 1.00046587, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 1.3890631989610824, + "language_loss": 0.81510711, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83775026, + "num_input_tokens_seen": 357257565, + "step": 16553, + "time_per_iteration": 2.6380698680877686 + }, + { + "auxiliary_loss_clip": 0.0114094, + "auxiliary_loss_mlp": 0.01073805, + "balance_loss_clip": 1.00066602, + "balance_loss_mlp": 1.00018167, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.7178986810787628, + "language_loss": 0.57357895, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59572649, + "num_input_tokens_seen": 357320205, + "step": 16554, + "time_per_iteration": 3.2541732788085938 + }, + { + "auxiliary_loss_clip": 0.01148371, + "auxiliary_loss_mlp": 0.0110055, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 1.00070071, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.826312171183402, + "language_loss": 0.77148747, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79397666, + "num_input_tokens_seen": 357340695, + "step": 16555, + "time_per_iteration": 2.617734670639038 + }, + { + "auxiliary_loss_clip": 0.01119678, + "auxiliary_loss_mlp": 0.0110009, + "balance_loss_clip": 1.00173163, + "balance_loss_mlp": 1.00047898, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 3.0596574993690244, + "language_loss": 0.86170393, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.8839016, + "num_input_tokens_seen": 357357505, + "step": 16556, + "time_per_iteration": 2.6634788513183594 + }, + { + "auxiliary_loss_clip": 0.01099005, + "auxiliary_loss_mlp": 0.0109909, + "balance_loss_clip": 1.00165987, + "balance_loss_mlp": 1.00043201, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.781062797590822, + "language_loss": 0.73107755, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75305855, + "num_input_tokens_seen": 357375395, + "step": 16557, + "time_per_iteration": 2.695371627807617 + }, + { + "auxiliary_loss_clip": 0.01148983, + "auxiliary_loss_mlp": 0.01099719, + "balance_loss_clip": 1.00193763, + "balance_loss_mlp": 1.00044179, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 2.3926824683940553, + "language_loss": 0.76019704, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78268409, + "num_input_tokens_seen": 357397375, + "step": 16558, + "time_per_iteration": 2.6471047401428223 + }, + { + "auxiliary_loss_clip": 0.01132853, + "auxiliary_loss_mlp": 0.01100057, + "balance_loss_clip": 1.00178671, + "balance_loss_mlp": 1.00044596, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 1.908627441583738, + "language_loss": 0.6366086, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.65893769, + "num_input_tokens_seen": 357418880, + "step": 16559, + "time_per_iteration": 2.674288511276245 + }, + { + "auxiliary_loss_clip": 0.01149512, + "auxiliary_loss_mlp": 0.01100646, + "balance_loss_clip": 1.00193703, + "balance_loss_mlp": 1.00051022, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 3.336821559358489, + "language_loss": 0.74219137, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76469296, + "num_input_tokens_seen": 357438310, + "step": 16560, + "time_per_iteration": 2.5651042461395264 + }, + { + "auxiliary_loss_clip": 0.01086577, + "auxiliary_loss_mlp": 0.01099539, + "balance_loss_clip": 1.00161028, + "balance_loss_mlp": 1.00054741, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.711292114568176, + "language_loss": 0.79334819, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.81520939, + "num_input_tokens_seen": 357457155, + "step": 16561, + "time_per_iteration": 2.730973243713379 + }, + { + "auxiliary_loss_clip": 0.01164138, + "auxiliary_loss_mlp": 0.01099457, + "balance_loss_clip": 1.00195348, + "balance_loss_mlp": 1.00051308, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 3.6383232693047582, + "language_loss": 0.65798974, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.68062562, + "num_input_tokens_seen": 357468060, + "step": 16562, + "time_per_iteration": 2.514319658279419 + }, + { + "auxiliary_loss_clip": 0.01131405, + "auxiliary_loss_mlp": 0.00747323, + "balance_loss_clip": 1.00185239, + "balance_loss_mlp": 1.00054717, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 1.868654748450153, + "language_loss": 0.64586937, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66465664, + "num_input_tokens_seen": 357489665, + "step": 16563, + "time_per_iteration": 2.6938419342041016 + }, + { + "auxiliary_loss_clip": 0.01130844, + "auxiliary_loss_mlp": 0.01100104, + "balance_loss_clip": 1.0017333, + "balance_loss_mlp": 1.00044489, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 1.5446934587041472, + "language_loss": 0.64731002, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.6696195, + "num_input_tokens_seen": 357511975, + "step": 16564, + "time_per_iteration": 4.071380853652954 + }, + { + "auxiliary_loss_clip": 0.01130611, + "auxiliary_loss_mlp": 0.00747299, + "balance_loss_clip": 1.00182343, + "balance_loss_mlp": 1.00052333, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 5.438595012519466, + "language_loss": 0.74226725, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.76104629, + "num_input_tokens_seen": 357529345, + "step": 16565, + "time_per_iteration": 2.6823954582214355 + }, + { + "auxiliary_loss_clip": 0.01132899, + "auxiliary_loss_mlp": 0.01099406, + "balance_loss_clip": 1.00161505, + "balance_loss_mlp": 1.00041473, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.820013227848659, + "language_loss": 0.79058504, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81290811, + "num_input_tokens_seen": 357547615, + "step": 16566, + "time_per_iteration": 4.055199384689331 + }, + { + "auxiliary_loss_clip": 0.01099485, + "auxiliary_loss_mlp": 0.00747379, + "balance_loss_clip": 1.00156546, + "balance_loss_mlp": 1.0005579, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.6848825338486497, + "language_loss": 0.70940667, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.72787529, + "num_input_tokens_seen": 357567380, + "step": 16567, + "time_per_iteration": 2.7036690711975098 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.01101153, + "balance_loss_clip": 1.00184464, + "balance_loss_mlp": 1.00044537, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 3.713181488000301, + "language_loss": 0.78804433, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.81055117, + "num_input_tokens_seen": 357586435, + "step": 16568, + "time_per_iteration": 2.595757007598877 + }, + { + "auxiliary_loss_clip": 0.01147157, + "auxiliary_loss_mlp": 0.01098539, + "balance_loss_clip": 1.00180495, + "balance_loss_mlp": 1.00040555, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 2.2176075697990414, + "language_loss": 0.8209455, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.84340245, + "num_input_tokens_seen": 357604720, + "step": 16569, + "time_per_iteration": 2.60109281539917 + }, + { + "auxiliary_loss_clip": 0.01116136, + "auxiliary_loss_mlp": 0.007473, + "balance_loss_clip": 1.0018146, + "balance_loss_mlp": 1.00046611, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 2.0384948284219058, + "language_loss": 0.70345873, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72209311, + "num_input_tokens_seen": 357622345, + "step": 16570, + "time_per_iteration": 2.6708381175994873 + }, + { + "auxiliary_loss_clip": 0.01118256, + "auxiliary_loss_mlp": 0.01099557, + "balance_loss_clip": 1.00181842, + "balance_loss_mlp": 1.00047052, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.9023931102864624, + "language_loss": 0.75281227, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77499044, + "num_input_tokens_seen": 357642710, + "step": 16571, + "time_per_iteration": 2.723437547683716 + }, + { + "auxiliary_loss_clip": 0.01132958, + "auxiliary_loss_mlp": 0.01100442, + "balance_loss_clip": 1.00182617, + "balance_loss_mlp": 1.00049734, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 1.863220631883894, + "language_loss": 0.80257875, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.82491273, + "num_input_tokens_seen": 357659870, + "step": 16572, + "time_per_iteration": 2.5860707759857178 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_clip": 1.00179303, + "balance_loss_mlp": 1.00043249, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 2.165120126281918, + "language_loss": 0.7062524, + "learning_rate": 1.3199841727074e-10, + "loss": 0.728562, + "num_input_tokens_seen": 357677075, + "step": 16573, + "time_per_iteration": 2.655378818511963 + }, + { + "auxiliary_loss_clip": 0.01130382, + "auxiliary_loss_mlp": 0.01101768, + "balance_loss_clip": 1.00170135, + "balance_loss_mlp": 1.00058329, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 1.6388429730683964, + "language_loss": 0.6317004, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65402186, + "num_input_tokens_seen": 357696715, + "step": 16574, + "time_per_iteration": 2.6989986896514893 + }, + { + "auxiliary_loss_clip": 0.01114163, + "auxiliary_loss_mlp": 0.01102059, + "balance_loss_clip": 1.00178623, + "balance_loss_mlp": 1.00058782, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 2.8743996689137643, + "language_loss": 0.76152474, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.783687, + "num_input_tokens_seen": 357712345, + "step": 16575, + "time_per_iteration": 2.6391186714172363 + }, + { + "auxiliary_loss_clip": 0.01132816, + "auxiliary_loss_mlp": 0.01099979, + "balance_loss_clip": 1.00178897, + "balance_loss_mlp": 1.00051069, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 3.2193430400613026, + "language_loss": 0.70409656, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72642446, + "num_input_tokens_seen": 357731815, + "step": 16576, + "time_per_iteration": 2.6934776306152344 + }, + { + "auxiliary_loss_clip": 0.01147658, + "auxiliary_loss_mlp": 0.01099624, + "balance_loss_clip": 1.00182784, + "balance_loss_mlp": 1.00039387, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.6819142406389127, + "language_loss": 0.72056597, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74303877, + "num_input_tokens_seen": 357751640, + "step": 16577, + "time_per_iteration": 2.60965895652771 + }, + { + "auxiliary_loss_clip": 0.01130828, + "auxiliary_loss_mlp": 0.01100239, + "balance_loss_clip": 1.00186741, + "balance_loss_mlp": 1.00057971, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 2.1151187541065335, + "language_loss": 0.78438342, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80669403, + "num_input_tokens_seen": 357769850, + "step": 16578, + "time_per_iteration": 2.5948052406311035 + }, + { + "auxiliary_loss_clip": 0.01099305, + "auxiliary_loss_mlp": 0.00747247, + "balance_loss_clip": 1.0016073, + "balance_loss_mlp": 1.0003233, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 1.669327586267533, + "language_loss": 0.75940919, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.77787471, + "num_input_tokens_seen": 357789550, + "step": 16579, + "time_per_iteration": 2.704190731048584 + }, + { + "auxiliary_loss_clip": 0.01131242, + "auxiliary_loss_mlp": 0.01101718, + "balance_loss_clip": 1.00204813, + "balance_loss_mlp": 1.00053334, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 3.680686298242025, + "language_loss": 0.68956184, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71189141, + "num_input_tokens_seen": 357809525, + "step": 16580, + "time_per_iteration": 2.755563497543335 + }, + { + "auxiliary_loss_clip": 0.01099173, + "auxiliary_loss_mlp": 0.01099696, + "balance_loss_clip": 1.00168633, + "balance_loss_mlp": 1.00046611, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 1.8625296248759904, + "language_loss": 0.79950881, + "learning_rate": 9.862937031113184e-11, + "loss": 0.82149744, + "num_input_tokens_seen": 357829795, + "step": 16581, + "time_per_iteration": 2.73453426361084 + }, + { + "auxiliary_loss_clip": 0.01130966, + "auxiliary_loss_mlp": 0.01098944, + "balance_loss_clip": 1.00175977, + "balance_loss_mlp": 1.00042987, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 1.6990834965319963, + "language_loss": 0.80217797, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82447708, + "num_input_tokens_seen": 357851655, + "step": 16582, + "time_per_iteration": 4.0555126667022705 + }, + { + "auxiliary_loss_clip": 0.01148785, + "auxiliary_loss_mlp": 0.01099415, + "balance_loss_clip": 1.00180507, + "balance_loss_mlp": 1.00047112, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 2.1721049120420113, + "language_loss": 0.60618728, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62866926, + "num_input_tokens_seen": 357871205, + "step": 16583, + "time_per_iteration": 2.58121395111084 + }, + { + "auxiliary_loss_clip": 0.01131819, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_clip": 1.0016942, + "balance_loss_mlp": 1.00053596, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.5610320654229841, + "language_loss": 0.77654052, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79885876, + "num_input_tokens_seen": 357892145, + "step": 16584, + "time_per_iteration": 2.64791202545166 + }, + { + "auxiliary_loss_clip": 0.0113068, + "auxiliary_loss_mlp": 0.01099899, + "balance_loss_clip": 1.00167441, + "balance_loss_mlp": 1.0005734, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.5530742002293365, + "language_loss": 0.69328797, + "learning_rate": 8.376491691697297e-11, + "loss": 0.7155937, + "num_input_tokens_seen": 357911205, + "step": 16585, + "time_per_iteration": 2.610079288482666 + }, + { + "auxiliary_loss_clip": 0.01164035, + "auxiliary_loss_mlp": 0.01100437, + "balance_loss_clip": 1.00193167, + "balance_loss_mlp": 1.00044394, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 2.536189938998142, + "language_loss": 0.81602538, + "learning_rate": 8.023839578363834e-11, + "loss": 0.83867007, + "num_input_tokens_seen": 357928190, + "step": 16586, + "time_per_iteration": 3.985710382461548 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.0110057, + "balance_loss_clip": 1.00177717, + "balance_loss_mlp": 1.00062442, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.7881466715037901, + "language_loss": 0.77683765, + "learning_rate": 7.678771180796851e-11, + "loss": 0.79917347, + "num_input_tokens_seen": 357946985, + "step": 16587, + "time_per_iteration": 2.7180206775665283 + }, + { + "auxiliary_loss_clip": 0.01132076, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_clip": 1.00203216, + "balance_loss_mlp": 1.00059223, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 1.870619403699745, + "language_loss": 0.72857171, + "learning_rate": 7.341286512074773e-11, + "loss": 0.75090313, + "num_input_tokens_seen": 357966720, + "step": 16588, + "time_per_iteration": 2.6450600624084473 + }, + { + "auxiliary_loss_clip": 0.01164318, + "auxiliary_loss_mlp": 0.01101765, + "balance_loss_clip": 1.00190544, + "balance_loss_mlp": 1.00048459, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.3876132409803796, + "language_loss": 0.82760727, + "learning_rate": 7.011385585031781e-11, + "loss": 0.85026813, + "num_input_tokens_seen": 357981375, + "step": 16589, + "time_per_iteration": 2.5235671997070312 + }, + { + "auxiliary_loss_clip": 0.01149565, + "auxiliary_loss_mlp": 0.0110178, + "balance_loss_clip": 1.00181103, + "balance_loss_mlp": 1.00049996, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 1.983860412851463, + "language_loss": 0.7029646, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72547805, + "num_input_tokens_seen": 358000290, + "step": 16590, + "time_per_iteration": 2.674788475036621 + }, + { + "auxiliary_loss_clip": 0.0113334, + "auxiliary_loss_mlp": 0.01100628, + "balance_loss_clip": 1.00179291, + "balance_loss_mlp": 1.00063491, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 2.0823919775503, + "language_loss": 0.63497794, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65731758, + "num_input_tokens_seen": 358022075, + "step": 16591, + "time_per_iteration": 2.711012601852417 + }, + { + "auxiliary_loss_clip": 0.01131262, + "auxiliary_loss_mlp": 0.0109998, + "balance_loss_clip": 1.00167203, + "balance_loss_mlp": 1.00041652, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 1.6737215204305436, + "language_loss": 0.73279464, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75510705, + "num_input_tokens_seen": 358043940, + "step": 16592, + "time_per_iteration": 2.7519114017486572 + }, + { + "auxiliary_loss_clip": 0.01130593, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_clip": 1.00183427, + "balance_loss_mlp": 1.00059032, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 1.8898408262414894, + "language_loss": 0.851538, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87385023, + "num_input_tokens_seen": 358062720, + "step": 16593, + "time_per_iteration": 2.5942633152008057 + }, + { + "auxiliary_loss_clip": 0.01163993, + "auxiliary_loss_mlp": 0.00747178, + "balance_loss_clip": 1.00182819, + "balance_loss_mlp": 1.00035238, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 5.374973409049526, + "language_loss": 0.69191688, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71102858, + "num_input_tokens_seen": 358081560, + "step": 16594, + "time_per_iteration": 2.534698724746704 + }, + { + "auxiliary_loss_clip": 0.01099372, + "auxiliary_loss_mlp": 0.01100719, + "balance_loss_clip": 1.00164843, + "balance_loss_mlp": 1.00044012, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 2.2867857480274987, + "language_loss": 0.72950298, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.75150394, + "num_input_tokens_seen": 358099065, + "step": 16595, + "time_per_iteration": 2.6853649616241455 + }, + { + "auxiliary_loss_clip": 0.01141551, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_clip": 1.00076389, + "balance_loss_mlp": 1.00006521, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.781014421600637, + "language_loss": 0.60309136, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.6252476, + "num_input_tokens_seen": 358156095, + "step": 16596, + "time_per_iteration": 3.008166551589966 + }, + { + "auxiliary_loss_clip": 0.01147696, + "auxiliary_loss_mlp": 0.01100695, + "balance_loss_clip": 1.00189972, + "balance_loss_mlp": 1.00046372, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 1.8700219787676478, + "language_loss": 0.78001809, + "learning_rate": 4.645194309227385e-11, + "loss": 0.80250204, + "num_input_tokens_seen": 358175230, + "step": 16597, + "time_per_iteration": 2.5883073806762695 + }, + { + "auxiliary_loss_clip": 0.01149446, + "auxiliary_loss_mlp": 0.01100368, + "balance_loss_clip": 1.00175488, + "balance_loss_mlp": 1.00037491, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 1.7545059454649365, + "language_loss": 0.82013482, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84263301, + "num_input_tokens_seen": 358197075, + "step": 16598, + "time_per_iteration": 2.637603759765625 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01102327, + "balance_loss_clip": 1.00200486, + "balance_loss_mlp": 1.00066566, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 2.193653946159016, + "language_loss": 0.64641142, + "learning_rate": 4.129484715709175e-11, + "loss": 0.66875041, + "num_input_tokens_seen": 358215925, + "step": 16599, + "time_per_iteration": 2.6358468532562256 + }, + { + "auxiliary_loss_clip": 0.01125644, + "auxiliary_loss_mlp": 0.01073796, + "balance_loss_clip": 1.00097919, + "balance_loss_mlp": 1.00017202, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.9048469736386494, + "language_loss": 0.62324083, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64523524, + "num_input_tokens_seen": 358269035, + "step": 16600, + "time_per_iteration": 3.1076347827911377 + }, + { + "auxiliary_loss_clip": 0.01130916, + "auxiliary_loss_mlp": 0.01099335, + "balance_loss_clip": 1.00180554, + "balance_loss_mlp": 1.00062943, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 1.6984603333381998, + "language_loss": 0.78482819, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80713069, + "num_input_tokens_seen": 358287680, + "step": 16601, + "time_per_iteration": 2.5962865352630615 + }, + { + "auxiliary_loss_clip": 0.01116279, + "auxiliary_loss_mlp": 0.01100771, + "balance_loss_clip": 1.00175428, + "balance_loss_mlp": 1.00053966, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 2.502288286558055, + "language_loss": 0.82609355, + "learning_rate": 3.412799323987414e-11, + "loss": 0.8482641, + "num_input_tokens_seen": 358304080, + "step": 16602, + "time_per_iteration": 4.022325277328491 + }, + { + "auxiliary_loss_clip": 0.01114503, + "auxiliary_loss_mlp": 0.01101845, + "balance_loss_clip": 1.00179768, + "balance_loss_mlp": 1.00056505, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 2.115583602027236, + "language_loss": 0.62537825, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64754176, + "num_input_tokens_seen": 358323670, + "step": 16603, + "time_per_iteration": 2.684150457382202 + }, + { + "auxiliary_loss_clip": 0.01132765, + "auxiliary_loss_mlp": 0.01101753, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00056863, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 1.9107212629702441, + "language_loss": 0.71087664, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73322183, + "num_input_tokens_seen": 358341980, + "step": 16604, + "time_per_iteration": 3.992887258529663 + }, + { + "auxiliary_loss_clip": 0.01164136, + "auxiliary_loss_mlp": 0.01100124, + "balance_loss_clip": 1.00188565, + "balance_loss_mlp": 1.00041747, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 1.5936921348271598, + "language_loss": 0.64697832, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66962093, + "num_input_tokens_seen": 358360400, + "step": 16605, + "time_per_iteration": 2.527327060699463 + }, + { + "auxiliary_loss_clip": 0.01115791, + "auxiliary_loss_mlp": 0.01099276, + "balance_loss_clip": 1.0016942, + "balance_loss_mlp": 1.00052321, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.5637026367627254, + "language_loss": 0.71481538, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73696601, + "num_input_tokens_seen": 358378990, + "step": 16606, + "time_per_iteration": 2.6480302810668945 + }, + { + "auxiliary_loss_clip": 0.01147501, + "auxiliary_loss_mlp": 0.00747402, + "balance_loss_clip": 1.00184202, + "balance_loss_mlp": 1.00049341, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 2.0102414867681735, + "language_loss": 0.81737113, + "learning_rate": 2.370001590090709e-11, + "loss": 0.83632016, + "num_input_tokens_seen": 358395970, + "step": 16607, + "time_per_iteration": 2.59968638420105 + }, + { + "auxiliary_loss_clip": 0.01116453, + "auxiliary_loss_mlp": 0.01100505, + "balance_loss_clip": 1.00162172, + "balance_loss_mlp": 1.00051188, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.530113559679553, + "language_loss": 0.6724413, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69461083, + "num_input_tokens_seen": 358417355, + "step": 16608, + "time_per_iteration": 2.749241590499878 + }, + { + "auxiliary_loss_clip": 0.01102073, + "auxiliary_loss_mlp": 0.01100513, + "balance_loss_clip": 1.00183392, + "balance_loss_mlp": 1.00042439, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 3.298077308445817, + "language_loss": 0.81007862, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.8321045, + "num_input_tokens_seen": 358434345, + "step": 16609, + "time_per_iteration": 2.688930034637451 + }, + { + "auxiliary_loss_clip": 0.01132489, + "auxiliary_loss_mlp": 0.01099202, + "balance_loss_clip": 1.00177908, + "balance_loss_mlp": 1.00059175, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.41156665455102, + "language_loss": 0.62765396, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.64997083, + "num_input_tokens_seen": 358452870, + "step": 16610, + "time_per_iteration": 2.6317195892333984 + }, + { + "auxiliary_loss_clip": 0.01147856, + "auxiliary_loss_mlp": 0.01100094, + "balance_loss_clip": 1.00194955, + "balance_loss_mlp": 1.00057828, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 2.3490272579375007, + "language_loss": 0.67094374, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69342333, + "num_input_tokens_seen": 358472210, + "step": 16611, + "time_per_iteration": 2.614189624786377 + }, + { + "auxiliary_loss_clip": 0.01116498, + "auxiliary_loss_mlp": 0.01101424, + "balance_loss_clip": 1.00172806, + "balance_loss_mlp": 1.00066841, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.473824151151938, + "language_loss": 0.69353771, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71571696, + "num_input_tokens_seen": 358493840, + "step": 16612, + "time_per_iteration": 2.7574784755706787 + }, + { + "auxiliary_loss_clip": 0.01130719, + "auxiliary_loss_mlp": 0.01099753, + "balance_loss_clip": 1.00190151, + "balance_loss_mlp": 1.00057065, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.5348715742850076, + "language_loss": 0.73899472, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76129943, + "num_input_tokens_seen": 358515060, + "step": 16613, + "time_per_iteration": 2.667330741882324 + }, + { + "auxiliary_loss_clip": 0.01115605, + "auxiliary_loss_mlp": 0.00747362, + "balance_loss_clip": 1.00166607, + "balance_loss_mlp": 1.00048685, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 2.2320965772722396, + "language_loss": 0.73969615, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.75832582, + "num_input_tokens_seen": 358528200, + "step": 16614, + "time_per_iteration": 2.6358325481414795 + }, + { + "auxiliary_loss_clip": 0.01149568, + "auxiliary_loss_mlp": 0.01100452, + "balance_loss_clip": 1.00201023, + "balance_loss_mlp": 1.00050712, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 1.631724241266286, + "language_loss": 0.72776908, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.75026923, + "num_input_tokens_seen": 358548360, + "step": 16615, + "time_per_iteration": 2.6068508625030518 + }, + { + "auxiliary_loss_clip": 0.01164333, + "auxiliary_loss_mlp": 0.00747258, + "balance_loss_clip": 1.00197673, + "balance_loss_mlp": 1.00044489, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 2.4012750323273755, + "language_loss": 0.77557075, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79468668, + "num_input_tokens_seen": 358566270, + "step": 16616, + "time_per_iteration": 2.5293262004852295 + }, + { + "auxiliary_loss_clip": 0.01133576, + "auxiliary_loss_mlp": 0.01100634, + "balance_loss_clip": 1.00187612, + "balance_loss_mlp": 1.00045061, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 2.0913960861782144, + "language_loss": 0.82898986, + "learning_rate": 8.532016508855378e-12, + "loss": 0.85133201, + "num_input_tokens_seen": 358584710, + "step": 16617, + "time_per_iteration": 2.582841634750366 + }, + { + "auxiliary_loss_clip": 0.01132629, + "auxiliary_loss_mlp": 0.0109961, + "balance_loss_clip": 1.00170112, + "balance_loss_mlp": 1.00038028, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 2.3996442689687885, + "language_loss": 0.78760219, + "learning_rate": 7.43233506206309e-12, + "loss": 0.8099246, + "num_input_tokens_seen": 358606750, + "step": 16618, + "time_per_iteration": 2.649549722671509 + }, + { + "auxiliary_loss_clip": 0.01164104, + "auxiliary_loss_mlp": 0.01099584, + "balance_loss_clip": 1.00184512, + "balance_loss_mlp": 1.00044918, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.6651878854680395, + "language_loss": 0.74546063, + "learning_rate": 6.408493534060255e-12, + "loss": 0.76809752, + "num_input_tokens_seen": 358624675, + "step": 16619, + "time_per_iteration": 2.527097225189209 + }, + { + "auxiliary_loss_clip": 0.01149233, + "auxiliary_loss_mlp": 0.01098905, + "balance_loss_clip": 1.00176382, + "balance_loss_mlp": 1.00043845, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 1.9432192178980696, + "language_loss": 0.86939031, + "learning_rate": 5.460491963260594e-12, + "loss": 0.89187169, + "num_input_tokens_seen": 358640715, + "step": 16620, + "time_per_iteration": 3.935516834259033 + }, + { + "auxiliary_loss_clip": 0.01115586, + "auxiliary_loss_mlp": 0.010997, + "balance_loss_clip": 1.00167966, + "balance_loss_mlp": 1.00047076, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 2.7713278740756846, + "language_loss": 0.72274101, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74489391, + "num_input_tokens_seen": 358659630, + "step": 16621, + "time_per_iteration": 2.6753427982330322 + }, + { + "auxiliary_loss_clip": 0.01108511, + "auxiliary_loss_mlp": 0.01073916, + "balance_loss_clip": 1.00091028, + "balance_loss_mlp": 1.00029278, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.7325424297753145, + "language_loss": 0.56532609, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58715034, + "num_input_tokens_seen": 358727840, + "step": 16622, + "time_per_iteration": 3.405395746231079 + }, + { + "auxiliary_loss_clip": 0.01118018, + "auxiliary_loss_mlp": 0.0110059, + "balance_loss_clip": 1.00189626, + "balance_loss_mlp": 1.00045455, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 2.874918782457117, + "language_loss": 0.71288145, + "learning_rate": 3.071527340914315e-12, + "loss": 0.73506761, + "num_input_tokens_seen": 358744125, + "step": 16623, + "time_per_iteration": 4.124709129333496 + }, + { + "auxiliary_loss_clip": 0.01117029, + "auxiliary_loss_mlp": 0.01100648, + "balance_loss_clip": 1.00196886, + "balance_loss_mlp": 1.00041652, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 1.865772070420901, + "language_loss": 0.74546218, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.76763892, + "num_input_tokens_seen": 358761420, + "step": 16624, + "time_per_iteration": 2.6653122901916504 + }, + { + "auxiliary_loss_clip": 0.0111909, + "auxiliary_loss_mlp": 0.01100789, + "balance_loss_clip": 1.00177789, + "balance_loss_mlp": 1.00041461, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.5808019367016821, + "language_loss": 0.73678911, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.7589879, + "num_input_tokens_seen": 358782600, + "step": 16625, + "time_per_iteration": 2.7011637687683105 + }, + { + "auxiliary_loss_clip": 0.01147222, + "auxiliary_loss_mlp": 0.01100029, + "balance_loss_clip": 1.00173616, + "balance_loss_mlp": 1.00046575, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.0041193284885948, + "language_loss": 0.76480293, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.78727549, + "num_input_tokens_seen": 358801220, + "step": 16626, + "time_per_iteration": 2.5812253952026367 + }, + { + "auxiliary_loss_clip": 0.01164113, + "auxiliary_loss_mlp": 0.01100964, + "balance_loss_clip": 1.00198019, + "balance_loss_mlp": 1.00058985, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 2.194533326048089, + "language_loss": 0.82188833, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84453905, + "num_input_tokens_seen": 358819190, + "step": 16627, + "time_per_iteration": 2.6082417964935303 + }, + { + "auxiliary_loss_clip": 0.01148097, + "auxiliary_loss_mlp": 0.01100472, + "balance_loss_clip": 1.0019021, + "balance_loss_mlp": 1.0004313, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 2.6925227017047835, + "language_loss": 0.70863622, + "learning_rate": 6.067215747584952e-13, + "loss": 0.7311219, + "num_input_tokens_seen": 358839850, + "step": 16628, + "time_per_iteration": 2.639680862426758 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01099945, + "balance_loss_clip": 1.00173092, + "balance_loss_mlp": 1.00047684, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.2707057575605296, + "language_loss": 0.75221801, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77470654, + "num_input_tokens_seen": 358859805, + "step": 16629, + "time_per_iteration": 2.578444719314575 + }, + { + "auxiliary_loss_clip": 0.01130903, + "auxiliary_loss_mlp": 0.01101547, + "balance_loss_clip": 1.00173318, + "balance_loss_mlp": 1.00050545, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 1.5570050597349718, + "language_loss": 0.60244811, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62477261, + "num_input_tokens_seen": 358877900, + "step": 16630, + "time_per_iteration": 2.6186623573303223 + }, + { + "auxiliary_loss_clip": 0.01101666, + "auxiliary_loss_mlp": 0.01100716, + "balance_loss_clip": 1.00171626, + "balance_loss_mlp": 1.00043726, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 2.359689050619927, + "language_loss": 0.60484922, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62687302, + "num_input_tokens_seen": 358897285, + "step": 16631, + "time_per_iteration": 2.7497358322143555 + }, + { + "auxiliary_loss_clip": 0.01102769, + "auxiliary_loss_mlp": 0.00747263, + "balance_loss_clip": 1.0019033, + "balance_loss_mlp": 1.00044656, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 3.6952621322827053, + "language_loss": 0.72713625, + "learning_rate": 0.0, + "loss": 0.74563658, + "num_input_tokens_seen": 358911570, + "step": 16632, + "time_per_iteration": 2.656825542449951 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.3992169073237033e+18, + "train_loss": 0.7702363172493168, + "train_runtime": 48111.0274, + "train_samples_per_second": 13.828, + "train_steps_per_second": 0.346 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/training_args.bin b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2f78304223465834f90c3ebe857d6fb0466b8a9 --- /dev/null +++ b/CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d184055355e29fdb6a50c07848c245cc1a9210d8493f4fbf355458519a5ea64 +size 7992